In [2]:
### basic package for data science project
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import helpers

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
sns.set()

In [3]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [86]:
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
imp_mean = IterativeImputer(random_state=0)

1. Remove columns with missing value more than 75% 
2. Remove col 'x39' and 'x99' since those two columns don't bring additional information 
3. Impute NAN for missing values in numerical methods 
4. We can use Catboost or Imputing categorical features is to replace missing values with the most common class. 


Alternative (if time permitted):
    - Check each features to see if there exists outliers.

In [5]:
train_data = pd.read_csv('data/exercise_40_train.csv')
test_data = pd.read_csv('data/exercise_40_test.csv')

In [6]:
most_missing_cols = list(set(train_data.columns[train_data.isnull().mean() > 0.4]))
remove_cat_col = ['x39', 'x99']
remove_cols = most_missing_cols + remove_cat_col

In [68]:
def create_dummy_df(df, cat_cols, dummy_na):
    for col in  cat_cols:
        try:
            # for each cat add dummy var, drop original column
            df = pd.concat([df.drop(col, axis=1),\
                            pd.get_dummies(df[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=dummy_na)], axis=1)
        except:
            continue
    return df

def clean_catergorical_ft(df):
    
    cat_df = df.select_dtypes(include=['object'])
    
    #### clean col x7 and x19
    cat_df['x7'] = cat_df['x7'].str.replace('%','')
    cat_df["x19"] = cat_df['x19'].str.replace('$','')
    cat_df = cat_df.astype({'x7': 'float', 'x19': 'float'})

    #### clean x3 col
    cat_df['x3'] = cat_df.apply(lambda x: helpers.clean_day_x3_col(x['x3']), axis = 1)

    ### convert x7 and x19 back to numerical fts 
    df['x7'] =  cat_df['x7']
    df['x19'] = cat_df["x19"]

    #### categorical cols 
    cat_df.drop(['x7','x19'], axis = 1, inplace = True)
    cat_cols = cat_df.select_dtypes(include=['object']).columns
    
    return cat_cols, cat_df, df
    

In [166]:
def ft_engineering(df, remove_cols, data_type = "train", label_encoder_path = "le_dict.pickle", use_catboost = True):
    df = train_data.copy()
    df = df.loc[:, (~df.columns.isin(remove_cols))]
    print(df.shape)
    #### clean categorical features
    cat_cols, cat_df, df = clean_catergorical_ft(df)
    
    #### clean numerical features 
    numerical_df = df.loc[:, (~df.columns.isin(cat_cols))]

    ### imputing missing values 
#     num_df_trasformed = pd.DataFrame(imp_mean.fit_transform(numerical_df.loc[:, ~(numerical_df.columns.isin(["y"]))]))
    null_cols = list(set(numerical_df.columns[numerical_df.isnull().mean()!=0]))
    for col in null_cols:
        mean_freq = numerical_df[col].median()
        numerical_df[col][pd.isna(numerical_df[col])] = mean_freq
        
    num_df_trasformed = numerical_df
    ### if not using catboost model, then imputing categorical features 
    ### by replacing missing values with the most common class
    ### use label encoder for col 'x33' and one-hot encoder for the rest
    if use_catboost == False:
        
        ### replace missing value with most common value
        for col in cat_cols:
            max_freq = cat_df[col].value_counts().index[0]
            cat_df[col][pd.isna(cat_df[col])] = max_freq
    
        one_hot_cols = cat_df.columns[cat_df.columns != 'x33']  
        cat_df_transformed = create_dummy_df(cat_df, list(one_hot_cols), dummy_na= False)

        if data_type == "train":
            ### user label encoder for col 'x33'
            le = LabelEncoder()
            cat_df_transformed['x33'] = le.fit_transform(cat_df_transformed['x33'])

            ### save labelEncoder for test set
            with open(label_encoder_path, 'wb') as l:
                pickle.dump(le, l, pickle.HIGHEST_PROTOCOL)
                
        else: 
            with open(label_encoder_path, 'rb') as f:
                le = pickle.load(f)
            cat_df_transformed = le.transform(cat_df_transformed['x33'])
    else:
        cat_df_transformed = cat_df
        cat_df_transformed = cat_df_transformed.fillna('NaN')
    
    cleaned_data = pd.concat([num_df_trasformed, cat_df_transformed], axis = 1)
    cleaned_data["y"] = df['y']

    return cleaned_data


In [167]:
data = ft_engineering(df, remove_cols, data_type = "train", label_encoder_path = "le_dict.pickle", use_catboost = False)

(40000, 94)


In [168]:
data.head()

Unnamed: 0,y,x1,x2,x4,x5,x6,x7,x8,x9,x10,...,x65_farmers,x65_geico,x65_progressive,x77_chevrolet,x77_ford,x77_mercedes,x77_nissan,x77_subaru,x77_toyota,x93_yes
0,0,0.165254,18.060003,1.07738,-1.339233,-1.584341,0.0062,0.220784,1.816481,1.171788,...,1,0,0,0,0,1,0,0,0,0
1,1,2.441471,18.416307,1.482586,0.920817,-0.759931,0.0064,1.192441,3.51395,1.4199,...,0,0,0,0,0,1,0,0,0,0
2,1,4.427278,19.188092,0.145652,0.366093,0.709962,-0.0008,0.952323,0.782974,-1.247022,...,0,1,0,0,0,0,0,1,0,0
3,0,3.925235,19.901257,1.763602,-0.251926,-0.827461,-0.0057,-0.520756,1.825586,2.223038,...,0,1,0,0,0,0,1,0,0,0
4,0,2.868802,22.202473,3.405119,0.083162,1.381504,0.0109,-0.732739,2.15199,-0.275406,...,0,1,0,0,0,0,0,0,1,1


In [169]:
set(data.columns[data.isnull().mean()!=0])

set()

In [170]:
X_train, X_test, y_train, y_test = train_test_split(
                                    data.drop(['y'], axis=1),
                                    data['y'],
                                    test_size=0.15,
                                    random_state=0)

scaler = StandardScaler()
scaler.fit(X_train)


StandardScaler()

### Logistic Regression 

In [171]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, roc_auc_score, auc, precision_recall_curve, roc_curve, average_precision_score

In [172]:
clf = LogisticRegression(random_state=0, C = 0.2)

In [173]:
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores

array([0.85602941, 0.85617647, 0.85617647, 0.85632353, 0.85617647])

In [174]:
clf.fit(X_train, y_train)

LogisticRegression(C=0.2, random_state=0)

In [175]:
#Generate predicted probabilites
clf_probs = clf.predict_proba(X_test)
print('AUC: ', roc_auc_score(y_test, clf_probs[:,1]))
print('Accuracy: ', clf.score(X_test, y_test))

AUC:  0.6065128322500647
Accuracy:  0.8476666666666667


In [197]:
reporting(clf.predict_proba(X_test)[:,1], y_test)


AUC score:       0.6065
Best threshold       0.1500

Confusion Matrix : 
 [[3278 1809]
 [ 442  471]]

Accuracy    :       0.6248
Sensitivity :       0.6444
Specificity :       0.5159


In [177]:
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFECV

score = 'roc_auc'
#Setup recursive feature reduction w/ cross validation
clf2 = RFECV(LogisticRegression(),
      scoring = score,
      n_jobs = -1,
      cv = 3,
      step = 5)
clf2.fit(X_train, y_train)


RFECV(cv=3, estimator=LogisticRegression(), n_jobs=-1, scoring='roc_auc',
      step=5)

In [178]:
#Generate predicted probabilites
clf2_probs = clf2.predict_proba(X_test)
print('AUC: ', roc_auc_score(y_test, clf2_probs[:,1]))
print('Accuracy: ', clf2.score(X_test, y_test))

AUC:  0.6947079200875199
Accuracy:  0.8478333333333333


In [115]:
def reporting(ensem_preds, targets):
    best_th = 0
    best_score = 0

    for th in np.arange(0.0, 0.6, 0.01):
        pred = (ensem_preds > th).astype(int)
        score = f1_score(targets, pred)
        if score > best_score:
            best_th = th
            best_score = score

    print(f"\nAUC score: {roc_auc_score(targets, ensem_preds):12.4f}")
    print(f"Best threshold {best_th:12.4f}")

    preds = (ensem_preds > best_th).astype(int)
    # print(classification_report(targets, preds, digits=3))

    cm1 = confusion_matrix(targets, preds)
    print('\nConfusion Matrix : \n', cm1)
    total1=sum(sum(cm1))

    print('\n=============')
    accuracy1=(cm1[0,0]+cm1[1,1])/total1
    print (f'Accuracy    : {accuracy1:12.4f}')

    sensitivity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    print(f'Sensitivity : {sensitivity1:12.4f}')

    specificity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    print(f'Specificity : {specificity1:12.4f}')

### Removing features using lasso

In [185]:
sel_ = SelectFromModel(LogisticRegression(penalty='l1', solver='saga', C = 0.1))
sel_.fit(scaler.transform(X_train), y_train)

SelectFromModel(estimator=LogisticRegression(C=0.1, penalty='l1',
                                             solver='saga'))

In [186]:
selected_feat = X_train.columns[(sel_.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 116
selected features: 100
features with coefficients shrank to zero: 15


In [187]:
removed_feats = X_train.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

Index(['x22', 'x26', 'x48', 'x59', 'x69', 'x72', 'x79', 'x81', 'x88', 'x90',
       'x24_male', 'x60_January', 'x65_esurance', 'x77_chevrolet', 'x77_ford'],
      dtype='object')

In [188]:
X_train_selected = sel_.transform(X_train)
X_test_selected = sel_.transform(X_test)

In [189]:
clf_selected = LogisticRegression(random_state=0, C = 0.2)

In [190]:
scores = cross_val_score(clf_selected, X_train, y_train, cv=5)
scores

array([0.85602941, 0.85617647, 0.85617647, 0.85632353, 0.85617647])

In [193]:
clf_selected.fit(X_train_selected, y_train)

LogisticRegression(C=0.2, random_state=0)

In [194]:
#Generate predicted probabilites
clf_probs = clf_selected.predict_proba(X_test_selected)
print('AUC: ', roc_auc_score(y_test, clf_probs[:,1]))
print('Accuracy: ', clf_selected.score(X_test_selected, y_test))

AUC:  0.6067447228734801
Accuracy:  0.8476666666666667


In [196]:
reporting(clf_selected.predict_proba(X_test_selected)[:,1], y_test)


AUC score:       0.6067
Best threshold       0.1500

Confusion Matrix : 
 [[3222 1865]
 [ 434  479]]

Accuracy    :       0.6168
Sensitivity :       0.6334
Specificity :       0.5246


## Ensemble 