In [1]:
%load_ext autoreload 
%autoreload 2

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import shapiro

In [23]:
# User CSV
df = pd.read_csv('sample_data/california_housing_train.csv')
df.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0


In [24]:
class Data:
    def __init__(self, n_samples = 1000):
        self.n_samples = n_samples
        pass
        
    def get_dataframe(self, df, dataset_name, table_name):
        print(f'Processing DataFrame: {dataset_name} {table_name}')
        df_rows = [
            self.get_row(df[col_name], dataset_name, table_name) for col_name in df
        ]
        return pd.concat(df_rows, axis=0).reset_index(drop=True)

    def get_row(self, column, dataset_name, table_name):
        try:
            col = column.sample(self.n_samples)
        except ValueError:
            col = column

        features = {
            "dataset_name": [dataset_name],
            "table_name": [table_name],
            "column_name": [column.name],
            "label": [np.nan],
        }

        feature_functions = {
            "column_values": lambda x: ", ".join(map(str, x.tolist())),
            "column_values_unique": lambda x: x.unique(),
            "n_unique_values": lambda x: x.nunique(),
            "unique_value_counts": lambda x: x.value_counts(),
            'n_values': lambda x: x.shape[0],
            "mean": lambda x: x.mean(),
            "std": lambda x: x.std(),
            "median": lambda x: x.median(),
            "skew": lambda x: x.skew(),
            "kurt": lambda x: x.kurt(),
            "shapiro_wilk_test": lambda x: shapiro(x)[1],
        }

        for col_name, fn in feature_functions.items():
            try:
                val = fn(col)
            except (ValueError, TypeError):
                val = np.nan
            except Exception as e:  # DEBUGGING TYPES OF ERRORS
                val = np.nan
                print(features['column_name'], col_name, 'Exception:', type(e))

            features[col_name] = [val]

        return pd.DataFrame.from_dict(features)

In [29]:
df_transformed = Data().get_dataframe(df=df, dataset_name='sample_data', table_name='california_housing_train')
df_transformed

Processing DataFrame: sample_data california_housing_train


Unnamed: 0,dataset_name,table_name,column_name,label,column_values,column_values_unique,n_unique_values,unique_value_counts,n_values,mean,std,median,skew,kurt,shapiro_wilk_test
0,sample_data,california_housing_train,longitude,,"-121.9, -121.8, -119.83, -118.32, -122.29, -11...","[-121.9, -121.8, -119.83, -118.32, -122.29, -1...",410,-118.37 10 -118.35 9 -118.19 9 -121...,1000,-119.60315,2.012866,-118.75,-0.263858,-1.365533,4.835621e-25
1,sample_data,california_housing_train,latitude,,"32.77, 33.92, 33.07, 33.61, 34.26, 37.82, 34.5...","[32.77, 33.92, 33.07, 33.61, 34.26, 37.82, 34....",386,34.04 15 34.08 14 34.13 13 33.94 1...,1000,35.62565,2.133944,34.27,0.461849,-1.113007,3.8373010000000004e-27
2,sample_data,california_housing_train,housing_median_age,,"36.0, 28.0, 42.0, 33.0, 32.0, 14.0, 44.0, 25.0...","[36.0, 28.0, 42.0, 33.0, 32.0, 14.0, 44.0, 25....",51,52.0 61 36.0 43 34.0 43 16.0 40 17...,1000,28.365,12.439233,28.0,0.094677,-0.740075,1.271147e-10
3,sample_data,california_housing_train,total_rooms,,"2104.0, 2045.0, 3585.0, 11872.0, 1529.0, 1326....","[2104.0, 2045.0, 3585.0, 11872.0, 1529.0, 1326...",892,2465.0 5 1649.0 3 1818.0 3 1476.0 ...,1000,2768.964,2204.739824,2250.5,4.004629,29.059948,8.305118e-39
4,sample_data,california_housing_train,total_bedrooms,,"788.0, 966.0, 560.0, 296.0, 485.0, 378.0, 573....","[788.0, 966.0, 560.0, 296.0, 485.0, 378.0, 573...",609,365.0 6 424.0 6 303.0 6 353.0 ...,1000,569.459,460.177707,452.0,3.87453,26.136141,2.902217e-39
5,sample_data,california_housing_train,population,,"1229.0, 2553.0, 288.0, 975.0, 1150.0, 1749.0, ...","[1229.0, 2553.0, 288.0, 975.0, 1150.0, 1749.0,...",822,725.0 4 837.0 4 1047.0 4 956.0 ...,1000,1462.346,1109.682846,1206.5,3.002527,14.446499,5.5864399999999995e-36
6,sample_data,california_housing_train,households,,"439.0, 437.0, 316.0, 128.0, 483.0, 668.0, 953....","[439.0, 437.0, 316.0, 128.0, 483.0, 668.0, 953...",613,231.0 6 269.0 5 234.0 5 482.0 ...,1000,499.939,396.75204,407.0,3.14387,16.260029,2.2339369999999998e-36
7,sample_data,california_housing_train,median_income,,"1.84, 2.3846, 2.5625, 2.9844, 3.1369, 3.8333, ...","[1.84, 2.3846, 2.5625, 2.9844, 3.1369, 3.8333,...",945,2.6250 4 3.1250 4 1.6250 3 2.5625 ...,1000,3.984221,2.030529,3.6365,1.630879,4.306822,1.838925e-26
8,sample_data,california_housing_train,median_house_value,,"165400.0, 99000.0, 69400.0, 276300.0, 427300.0...","[165400.0, 99000.0, 69400.0, 276300.0, 427300....",770,500001.0 47 112500.0 7 187500.0 7 1...,1000,210885.347,117856.184441,183550.0,0.893467,0.111373,1.327198e-22


In [30]:
def to_json(df):
    return df.to_json()

In [31]:
df_json = to_json(df_transformed)

In [32]:
pd.read_json(df_json)

Unnamed: 0,dataset_name,table_name,column_name,label,column_values,column_values_unique,n_unique_values,unique_value_counts,n_values,mean,std,median,skew,kurt,shapiro_wilk_test
0,sample_data,california_housing_train,longitude,,"-121.9, -121.8, -119.83, -118.32, -122.29, -11...","[-121.9, -121.8, -119.83, -118.32, -122.29, -1...",410,"{'-118.37': 10, '-118.35': 9, '-118.19': 9, '-...",1000,-119.60315,2.012866,-118.75,-0.263858,-1.365533,4.835621e-25
1,sample_data,california_housing_train,latitude,,"32.77, 33.92, 33.07, 33.61, 34.26, 37.82, 34.5...","[32.77, 33.92, 33.07, 33.61, 34.26, 37.82, 34....",386,"{'34.04': 15, '34.08': 14, '34.13': 13, '33.94...",1000,35.62565,2.133944,34.27,0.461849,-1.113007,3.8373010000000004e-27
2,sample_data,california_housing_train,housing_median_age,,"36.0, 28.0, 42.0, 33.0, 32.0, 14.0, 44.0, 25.0...","[36.0, 28.0, 42.0, 33.0, 32.0, 14.0, 44.0, 25....",51,"{'52.0': 61, '36.0': 43, '34.0': 43, '16.0': 4...",1000,28.365,12.439233,28.0,0.094677,-0.740075,1e-10
3,sample_data,california_housing_train,total_rooms,,"2104.0, 2045.0, 3585.0, 11872.0, 1529.0, 1326....","[2104.0, 2045.0, 3585.0, 11872.0, 1529.0, 1326...",892,"{'2465.0': 5, '1649.0': 3, '1818.0': 3, '1476....",1000,2768.964,2204.739824,2250.5,4.004629,29.059948,8.305118e-39
4,sample_data,california_housing_train,total_bedrooms,,"788.0, 966.0, 560.0, 296.0, 485.0, 378.0, 573....","[788.0, 966.0, 560.0, 296.0, 485.0, 378.0, 573...",609,"{'365.0': 6, '424.0': 6, '303.0': 6, '353.0': ...",1000,569.459,460.177707,452.0,3.87453,26.136141,2.902217e-39
5,sample_data,california_housing_train,population,,"1229.0, 2553.0, 288.0, 975.0, 1150.0, 1749.0, ...","[1229.0, 2553.0, 288.0, 975.0, 1150.0, 1749.0,...",822,"{'725.0': 4, '837.0': 4, '1047.0': 4, '956.0':...",1000,1462.346,1109.682846,1206.5,3.002527,14.446499,5.5864399999999995e-36
6,sample_data,california_housing_train,households,,"439.0, 437.0, 316.0, 128.0, 483.0, 668.0, 953....","[439.0, 437.0, 316.0, 128.0, 483.0, 668.0, 953...",613,"{'231.0': 6, '269.0': 5, '234.0': 5, '482.0': ...",1000,499.939,396.75204,407.0,3.14387,16.260029,2.2339369999999998e-36
7,sample_data,california_housing_train,median_income,,"1.84, 2.3846, 2.5625, 2.9844, 3.1369, 3.8333, ...","[1.8399999999999999, 2.3846, 2.5625, 2.9844, 3...",945,"{'2.625': 4, '3.125': 4, '1.625': 3, '2.5625':...",1000,3.984221,2.030529,3.6365,1.630879,4.306822,1.838925e-26
8,sample_data,california_housing_train,median_house_value,,"165400.0, 99000.0, 69400.0, 276300.0, 427300.0...","[165400.0, 99000.0, 69400.0, 276300.0, 427300....",770,"{'500001.0': 47, '112500.0': 7, '187500.0': 7,...",1000,210885.347,117856.184441,183550.0,0.893467,0.111373,1.327198e-22


#### FRONT END
-----
- CSV IN 
- Transform to Model features
- Convert to JSON



#### BACKEND
 ----
- Send to API (SERVER)
- Get Predictions on server
- Return Predictions 




1. CSV IN
2. Get column type predictions (labels)

---
3. Allow user to select columns to keep/drop
4. Allow user to overwrite data type predictions
5. Allow user to select type of imputing
6. Allow user to select type of encoding

---
7. Return Ready to go CSV