In [1]:
# house keeping and import libiraries
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from IPython.display import display
from scipy.stats import skew
from itertools import combinations

import sklearn.feature_selection
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import Imputer, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv(r'C:\Users\Eric Yang\Desktop\sample.csv') # replace the path to load df
df.drop(['Unnamed: 0', 'ACCOUNT_NUMBER'], axis = 1, inplace = True)

In [3]:
df.prim_CustomerOwnHome.replace({'Yes': 1, 
                                'No': 0,
                                 np.nan: 0}, inplace = True)

In [4]:
numeric_features = list(df.dtypes[df.dtypes != "object"].index)
categorical_features = list(df.dtypes[df.dtypes == 'object'].index)

In [4]:
skewed_features.drop('FLAG')

Index(['CashPrice', 'AmountFinanced', 'TradeAllowance', 'TradePayoff',
       'CashDown', 'Term', 'PaymentAmount', 'VehicleBook', 'VehicleMileage',
       'VehicleClass', 'BlackBookValue', 'prim_YearsJob', 'prim_Income1',
       'prim_ResidenceStability', 'prim_GoodItems', 'prim_HighGood',
       'prim_DerogItems', 'prim_HighDerog', 'prim_Repos',
       'prim_OtherMonthlyDebt', 'prim_PreviousBankruptcyCount',
       'prim_YearsOnBureau', 'prim_PaidAutoLoan', 'CustFactor',
       'CheckToDealer'],
      dtype='object')

In [5]:
def benchmark(dataframe):
    global categorical_features
    for variable in categorical_features:
        dataframe[variable].fillna('Missing', inplace = True)
        dummies = pd.get_dummies(dataframe[variable], prefix = variable)
        dataframe = pd.concat([dataframe, dummies], axis = 1)
        dataframe.drop([variable], axis = 1, inplace = True)
    dataframe.dropna(axis = 0, inplace = True) 
    X_benchmark = dataframe.drop('FLAG', axis = 1)
    y_benchmark = dataframe.FLAG
    X_train, X_test, y_train, y_test = train_test_split(X_benchmark, y_benchmark, test_size = 0.3, random_state = 42)
    bm_model = RandomForestClassifier(100)
    bm_model.fit(X_train, y_train)
    y_bm_predict = bm_model.predict(X_test)
    bm_score = roc_auc_score(y_test, y_bm_predict)
    print('the bench mark auc is: %s' %(bm_score))

In [22]:
benchmark(df)

the bench mark aoc is: 0.511109539172


In [5]:
df = pd.read_csv(r'C:\Users\Eric Yang\Desktop\sample.csv') #reload data frame
df.drop(['Unnamed: 0', 'ACCOUNT_NUMBER'], axis = 1, inplace = True)
numeric_features = list(df.dtypes[df.dtypes != "object"].index)
categorical_features = list(df.dtypes[df.dtypes == 'object'].index)

In [5]:
def preprocess(dataframe):
    global numeric_features, df, categorical_features
    dataframe = dataframe[dataframe['TradePayoff'] < 50000]
    dataframe = dataframe[dataframe['prim_YearsJob'] < 80]
    dataframe = dataframe[dataframe['Term'] < 120]
    dataframe.VehicleClass.fillna(dataframe.VehicleClass.median(), axis = 0, inplace = True)
    dataframe.ExpectedLoss.fillna(dataframe.ExpectedLoss.median(), axis = 0, inplace = True)
    dataframe.prim_CustomerOwnHome.fillna('No', axis = 0, inplace = True)
    for variable in categorical_features:
        dataframe[variable].fillna('Missing', inplace = True)
        dummies = pd.get_dummies(dataframe[variable], prefix = variable)
        dataframe = pd.concat([dataframe, dummies], axis = 1)
        dataframe.drop([variable], axis = 1, inplace = True) 
    #dataframe = dataframe.fillna(dataframe.mean())
    df = dataframe

In [6]:
preprocess(df)

In [13]:
df.isnull().values.any()

False

# Feature Engineering 
## get the interactions of all features

In [7]:
def interactions(dataframe):
    combos = list(combinations(list(dataframe.columns), 2))
    column_names = list(dataframe.columns) + ['_'.join(x) for x in combos]
    polynomial = PolynomialFeatures(interaction_only = True, include_bias = False)
    dataframe = polynomial.fit_transform(dataframe)
    dataframe, dataframe.columns = pd.DataFrame(dataframe), column_names
    noint_indicies = [i for i, x in enumerate(list((dataframe == 0).all())) if x]
    dataframe = dataframe.drop(dataframe.columns[noint_indicies], axis = 1)
    return dataframe 

In [8]:
df = interactions(df)

In [9]:
def inpute_skew(dataframe):
    skewed_features = dataframe[numeric_features].apply(lambda x: skew(x)) #compute skewness
    skewed_features = numeric_features[skewed_features > 1.5]
    skewed_features = skewed_features.index
    skewed_features.drop('FLAG')
    dataframe[skewed_features] = np.log1p(dataframe[skewed_features])

In [12]:
inpute_skew(df)



In [10]:
df = df.fillna(df.mean(), axis = 0)

In [11]:
X = df.drop('FLAG', axis = 1)
y = df.FLAG
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [22]:
select = sklearn.feature_selection.SelectKBest(k = 340)
selected_features = select.fit(X_train, y_train)
selected_indices =  selected_features.get_support(indices = True)
selected_columns = [df.columns[i] for i in selected_indices]

 3093 3101 3109 3122 3141 3155 3162 3174 3187 3191 3215 3217 3219 3238 3296
 3354 3362 3383] are constant.
  f = msb / msw


In [23]:
X_train_final, X_test_final = X_train[selected_columns], X_test[selected_columns]

In [14]:
def model_building(X_train, y_train, X_test, y_test):
    model = GradientBoostingClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    roc = roc_auc_score(y_test, y_pred)
    print('the ROC AUC score for our boosting model is: %s' %(roc))

In [24]:
model_building(X_train_final, y_train, X_test_final, y_test)

the ROC AUC score for our boosting model is: 0.939519852262
