In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, roc_curve, f1_score

import random
import pickle
import os

%matplotlib inline

In [2]:
import mlflow

# Point MLflow at the SQLite tracking store used by this notebook.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
# Select the experiment that subsequent runs are recorded under; left as the
# cell's last expression so the returned experiment object is displayed.
mlflow.set_experiment(experiment_name="income-prediction-01")

2022/08/08 17:42:48 INFO mlflow.tracking.fluent: Experiment with name 'income-prediction-01' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='income-prediction-01', tags={}>

In [3]:
def process_dataframe(filepath, upsample=True, random_state=1):
    """Load an income CSV and return one-hot encoded features plus binary labels.

    The file is read with a fixed list of 15 column names. The
    ``incomeTarget`` column is encoded as 0 for ``<=50K`` and 1 for every
    other value; surrounding whitespace and a trailing ``.`` on the label are
    ignored. ``nativeCountry`` is dropped and the remaining columns are
    one-hot encoded with ``pd.get_dummies``.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    upsample : bool, default True
        When True, class-1 rows are resampled with replacement until there
        are as many of them as class-0 rows, and are appended after the
        class-0 rows. When False, the rows are returned in file order without
        any resampling (useful for a held-out evaluation set).
    random_state : int, default 1
        Seed forwarded to ``sklearn.utils.resample``.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        Feature frame with a fresh 0..n-1 index, and the matching labels.
    """
    columns = ['age', 'workClass', 'financialWeight', 'education', 'educationNum', 'maritalStatus', 'occupation',
               'relationship', 'race', 'sex', 'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry',
               'incomeTarget']
    target = 'incomeTarget'

    df = pd.read_csv(filepath, names=columns)

    # Vectorized label encoding (Series.iteritems() no longer exists in
    # pandas 2.x). Normalizing first keeps ' <=50K', '<=50K' and ' <=50K.'
    # from being silently counted as the positive class.
    labels = df[target].astype(str).str.strip().str.rstrip('.')
    y = (labels != '<=50K').astype('int64')

    features = df.drop(columns=[target, 'nativeCountry'])
    # Explicit integer dummies so the output dtype does not depend on the
    # pandas version (bool in 2.x, uint8 before).
    X = pd.get_dummies(features, dtype=np.int64)

    if not upsample:
        return X.reset_index(drop=True), y.to_numpy()

    is_positive = y == 1
    n_negative = int((~is_positive).sum())

    X_pos_resampled, y_pos_resampled = resample(X[is_positive],
                                                y[is_positive],
                                                replace=True,
                                                n_samples=n_negative,
                                                random_state=random_state)

    # pd.concat keeps per-column dtypes; np.concatenate on frames would
    # collapse everything to a single common dtype.
    X_balanced = pd.concat([X[~is_positive], X_pos_resampled], ignore_index=True)
    y_balanced = np.concatenate((y[~is_positive], y_pos_resampled))

    return X_balanced, y_balanced

In [4]:
# Run both CSVs through the same preprocessing, training file first.
(X_train, y_train), (X_val, y_val) = (
    process_dataframe(csv_path)
    for csv_path in ('../data/adult-train.csv', '../data/adult-test.csv')
)

# Display the row counts so features and labels can be checked to line up.
tuple(len(part) for part in (X_train, y_train, X_val, y_val))

(49440, 49440, 24870, 24870)

In [5]:
# Fit the scaler on the training features only, then apply that same fitted
# scaler to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Wrap the scaled arrays back into frames that carry the original column names.
# NOTE(review): the later cells visible in this notebook vectorize X_train /
# X_val rather than X_train_df / X_val_df, so the scaled frames appear to be
# unused by the model -- confirm whether that is intended.
X_train_df = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
X_val_df = pd.DataFrame(data=X_val_scaled, columns=X_val.columns)

In [6]:
# Turn each row into a {column: value} dict, then let DictVectorizer build the
# feature matrices: fitted on the training rows, re-applied to validation.
dv = DictVectorizer()

train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')

# NOTE(review): X_train / X_val are rebound here from DataFrames to the
# vectorizer output, so this cell is presumably not safe to re-run without
# re-running the data-loading cell first.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [7]:
# Baseline classifier with default settings, fitted on the vectorized
# training data (fit returns the estimator itself).
logreg = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the validation split.
y_pred = logreg.predict(X_val)

In [13]:
# Create the output folder if needed; exist_ok avoids an error when it is
# already there.
os.makedirs('models/', exist_ok=True)

# Store the vectorizer together with the classifier as a single pickled tuple
# so both can be restored from one file.
with open('models/logreg.bin', mode='wb') as f_out:
    pickle.dump((dv, logreg), f_out)

In [16]:
# Train, evaluate and record one logistic-regression run in MLflow.
with mlflow.start_run():
    mlflow.set_tag("developer", "enchristos")

    mlflow.log_param("train-data-path", "../data/adult-train.csv")
    mlflow.log_param("valid-data-path", "../data/adult-test.csv")

    max_iter = 500
    mlflow.log_param("max_iter", max_iter)
    logreg = LogisticRegression(max_iter=max_iter)
    logreg.fit(X_train, y_train)

    # Hard labels feed the F1 score; ROC AUC is a ranking metric, so it needs
    # the positive-class probability rather than thresholded 0/1 predictions.
    y_pred = logreg.predict(X_val)
    y_score = logreg.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_score)
    f1 = f1_score(y_val, y_pred)
    mlflow.log_metric("auc", auc)
    # Metric key corrected from the earlier typo "fi_score".
    mlflow.log_metric("f1_score", f1)

    # Re-serialize the model fitted in THIS run before logging it; otherwise
    # the artifact on disk is whatever an earlier cell pickled, which is not
    # the estimator whose metrics are recorded above.
    model_path = "models/logreg.bin"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    with open(model_path, "wb") as f_out:
        pickle.dump((dv, logreg), f_out)

    mlflow.log_artifact(local_path=model_path, artifact_path="models_pickle")