# Preprocessing
---

## Clean data

In [4]:
import pandas as pd 

def clean_data(file_input='data/census.csv', file_output='data/census_clean.csv'):
    """Write a de-duplicated copy of a CSV file with incomplete rows removed.

    Parameters
    ----------
    file_input : str
        Path of the raw CSV file to read.
    file_output : str
        Path the cleaned CSV is written to; the index column is not written.

    Returns
    -------
    None
        The result is only persisted to ``file_output``.
    """
    raw = pd.read_csv(file_input)
    # Exact duplicate rows go first, then any row that still has a missing value.
    unique_rows = raw.drop_duplicates()
    complete_rows = unique_rows.dropna()
    complete_rows.to_csv(file_output, index=False)

# Run the cleaning step with its default input and output paths.
clean_data()

## EDA

In [5]:
from pandas_profiling import ProfileReport

def load_data(file_input='data/census_clean.csv'):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_input : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``file_input``.
    """
    frame = pd.read_csv(file_input)
    return frame

# Read the CSV at load_data's default path ('data/census_clean.csv').
df = load_data()

# Build a pandas-profiling report for the frame and export it as an HTML file.
profile = ProfileReport(df)
profile.to_file("reports/census_eda.html")

Summarize dataset: 100%|██████████| 64/64 [00:12<00:00,  5.33it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:06<00:00,  6.22s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.34s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 47.53it/s]


# ML pipeline
--- 

In [19]:
from src.ml.data import load_config
from src.ml.data import preprocess
from src.ml.model import train_model
from src.ml.model import create_automl_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

# Load the project configuration that drives every step below.
config = load_config()

# Load and preprocess the data; the returned pair (X, y) is split and fitted below.
X, y = preprocess(config)

# Hold out a test set; the split size and seed are read from the config.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=config['data']['test_size'],
    random_state=config['data']['random_state']
)

pipeline = create_automl_pipeline(config)

# Fit-time settings. The keys follow scikit-learn's "<step>__<param>" routing
# convention, so they are presumably forwarded to a pipeline step named
# "automl" -- TODO confirm against create_automl_pipeline.
settings_automl = {
    "automl__task": config["automl"]["task"],
    "automl__metric": config["automl"]["metric"],
    "automl__log_training_metric": config["automl"]["log_training_metric"],
    # Plain literal: the path has no placeholders, so no f-string is needed.
    "automl__log_file_name": "logs/flaml.log",
    "automl__time_budget": config["automl"]["time_budget"],
    "automl__seed": config["automl"]["random_state"],
    "automl__estimator_list": config["automl"]["estimator_list"]
}
# Kept as the last expression so the cell displays the fitted pipeline.
pipeline.fit(X_train, y_train, **settings_automl)

[flaml.automl: 11-03 09:34:21] {1463} INFO - Data split method: stratified
INFO:flaml.automl:Data split method: stratified
[flaml.automl: 11-03 09:34:21] {1467} INFO - Evaluation method: holdout
INFO:flaml.automl:Evaluation method: holdout
[flaml.automl: 11-03 09:34:21] {1515} INFO - Minimizing error metric: 1-roc_auc
INFO:flaml.automl:Minimizing error metric: 1-roc_auc
[flaml.automl: 11-03 09:34:21] {1552} INFO - List of ML learners in AutoML Run: ['lgbm']
INFO:flaml.automl:List of ML learners in AutoML Run: ['lgbm']
[flaml.automl: 11-03 09:34:21] {1793} INFO - iteration 0, current learner lgbm
INFO:flaml.automl:iteration 0, current learner lgbm
[flaml.automl: 11-03 09:34:21] {1910} INFO - Estimated sufficient time budget=922s. Estimated necessary time budget=1s.
INFO:flaml.automl:Estimated sufficient time budget=922s. Estimated necessary time budget=1s.
[flaml.automl: 11-03 09:34:21] {1981} INFO -  at 0.2s,	estimator lgbm's best error=0.1454,	best estimator lgbm's best error=0.1454
I

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['workclass', 'education',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country']),
                                                 ('num',
                 

# API
---

In [79]:
from src.ml.data import load_config
from src.ml.data import preprocess
from src.ml.model import load_model
import json
import pandas as pd
from pprint import pprint


# Load the configuration, the preprocessed data and the saved model.
config = load_config()
X, y = preprocess(config)
model = load_model()

# Serialise one randomly sampled row of X into a JSON object string
# (presumably mimicking the request body the API receives -- TODO confirm).
# NOTE: X_test here is a JSON string, unlike the DataFrame named X_test that
# train_test_split produces in the ML pipeline cell.
X_test = json.dumps(X.sample(1).to_dict(orient='records')[0])

# Decode the JSON back into a one-row DataFrame and predict on it.
X_pred = pd.DataFrame(json.loads(X_test), index=[0])
model.predict(X_pred)


   age workclass  fnlgt education  education-num      marital-status  \
0   51   Private  22211   Masters             14  Married-civ-spouse   

       occupation relationship   race   sex  capital-gain  capital-loss  \
0  Prof-specialty      Husband  White  Male             0          1902   

   hours-per-week native-country  
0              60  United-States  


array([1])

In [81]:
# Pretty-print one randomly sampled row of X as a dict of column name -> value.
pprint(X.sample(1).to_dict(orient='records')[0])

{'age': 41,
 'capital-gain': 0,
 'capital-loss': 0,
 'education': '12th',
 'education-num': 8,
 'fnlgt': 327606,
 'hours-per-week': 40,
 'marital-status': 'Separated',
 'native-country': 'United-States',
 'occupation': 'Craft-repair',
 'race': 'Black',
 'relationship': 'Not-in-family',
 'sex': 'Male',
 'workclass': 'Private'}


# Tests

---

## Model

In [59]:
from flaml import AutoML
from src.ml.data import load_config
from src.ml.model import create_automl_pipeline

# Load the project configuration; nothing else in this cell uses it yet.
config = load_config()



## Data