In [1]:
import pandas as pd
from config import config
import numpy as np
import matplotlib.pyplot as plt
from pipeline import visualisation_pipeline
from pipeline import rf_pipeline, lr_pipeline, sm_rf_pipeline, sm_lr_pipeline
from helpers import get_meta_columns, plot_categoricial, plot_discrete, plot_continuous
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score

In [5]:
def get_data(mode='train'):
    """Loads the training or testing CSV and encodes the target column.

    mode: 'train' reads config.TRAIN_DIR; any other value reads config.TEST_DIR
    The column names come from get_meta_columns() and the literal ' ?'
    (leading space included) is parsed as a missing value.
    returns: DataFrame
    """
    # picks the source file for the requested mode
    csv_path = config.TRAIN_DIR if mode == 'train' else config.TEST_DIR
    frame = pd.read_csv(csv_path, names=get_meta_columns(), na_values=' ?')

    # replaces the raw target labels using the mapping in config.TARGET_ENCODING
    frame[config.TARGET] = frame[config.TARGET].map(config.TARGET_ENCODING)

    return frame

data = get_data(mode='train')

# splits the data into a training and validation set (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    data[config.FEATURES],
    data[config.TARGET],
    test_size=0.2,
    random_state=0,
)

In [None]:
def run_visualisation(X_train, y_train):
    """Preprocesses the training data and draws one plot per configured variable.

    The features are passed through visualisation_pipeline, which
    (per the original notes) performs:
        Encoding of NA values,
        Removal of duplicate columns,
        Feature Engineering,
        Ordinal encoding for Categorical variables
    The target is then joined back on column-wise so each plot helper
    receives features and target in a single frame.
    returns: None
    """
    sns.set_style("whitegrid")

    features = X_train[config.FEATURES]
    processed = visualisation_pipeline.fit_transform(features, y_train)
    combined = pd.concat([processed, y_train], axis=1)
    plotting_df = pd.DataFrame(combined)

    print('----Categorical Variables -----')
    for name in config.VIS_CATEGORICAL_VALS:
        # skips variables explicitly excluded from plotting in the config
        if name in config.CATEGORICAL_VALS_NOT_PLOTTED:
            continue
        plot_categoricial(plotting_df, name)

    print('----Discrete Variables -----')
    for name in config.DISCRETE_NUMERIC_VARS:
        if name in config.DISCRETE_NOT_PLOTTED:
            continue
        plot_discrete(plotting_df, name)

    print('----Continuous Variables -----')
    for name in config.CONTINUOUS_NUMERIC_VARS:
        plot_continuous(plotting_df, name)

# draws the exploratory plots using the training split only
run_visualisation(X_train, y_train)

In [4]:
def train_models(X_train, X_test, y_train, y_test):
    """Trains the models using sklearn pipelines and scores them on the held-out split

    Preprocessing steps shared by all pipelines (defined in the pipeline module) are:
        1. Encode Not in universe as np.NAN
        2. Fill NA's with "missing"
        3. Encode Categorical as ordinal
        4. Encode education using feature engineering
        5. log transform skewed numeric variables
        6. Normalises all variable using min max
    Variability in resampling methods:
        1. No method
        2. SMOTE (Synthetic Minority Over-sampling Technique)

    X_train, y_train: features and target used to fit every pipeline
    X_test, y_test: features and target used to compute the metrics
    returns: (models, results)
        models: Dict mapping model name to its fitted pipeline
        results: Dict of equal-length lists keyed by 'Model', 'Accuracy',
                 'F1_Score' and 'Recall_Score' (one entry per model)
    """
    models = {
        'Random_Forest': rf_pipeline,
        'Smote_Random_Forest': sm_rf_pipeline,
        'Logistic Regression': lr_pipeline,
        'Smote_Logistic_Regression': sm_lr_pipeline
    }

    results = {
        'Model': [],
        'Accuracy': [],
        'F1_Score': [],
        'Recall_Score': []
    }

    for name, model in models.items():
        # the pipelines are fitted in place, so the returned dict holds trained models
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        results['Model'].append(name)
        results['Accuracy'].append(accuracy_score(y_test, y_pred))
        results['F1_Score'].append(f1_score(y_test, y_pred))
        results['Recall_Score'].append(recall_score(y_test, y_pred))

    # prints a tabular summary; the raw dict is what gets returned
    print(pd.DataFrame(results))
    return models, results



# fits every pipeline on the training split and scores it on the held-out split
models, results = train_models(X_train, X_test, y_train, y_test)

                       Model  Accuracy  F1_Score  Recall_Score
0              Random_Forest  0.955169  0.536649      0.427569
1        Smote_Random_Forest  0.946999  0.565083      0.567066
2        Logistic Regression  0.950357  0.460071      0.348329
3  Smote_Logistic_Regression  0.846160  0.407490      0.871234


In [None]:

# loads the separate test file (config.TEST_DIR) with the target column already encoded
test_data = get_data(mode='test')


 

