In [160]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

In [138]:
df = pd.read_csv('dataset/Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [139]:
features = ['CreditScore','Geography','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary']
merge_count=0
df_X = df[features]
df_X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [140]:
df_y = df['Exited']
df_y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [141]:
#df_X_train,df_X_test,df_y_train,df_y_test = train_test_split( df_X, df_y, train_size=0.8, test_size=0.2)
#print(df_X_train.shape,df_X_test.shape,df_y_train.shape,df_y_test.shape)

In [142]:
df_X.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
dtype: object

In [143]:
df_X.nunique()

CreditScore         460
Geography             3
Gender                2
Age                  70
Tenure               11
Balance            6382
NumOfProducts         4
HasCrCard             2
IsActiveMember        2
EstimatedSalary    9999
dtype: int64

In [144]:
catagorical_col = list(df_X.columns[df_X.dtypes == 'object'])
catagorical_col

['Geography', 'Gender']

In [145]:
continues_col = list(df_X.columns[df_X.nunique() > 4])
continues_col

['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']

In [146]:
zero_containing_col = ['HasCrCard','IsActiveMember'] 
zero_containing_col

['HasCrCard', 'IsActiveMember']

In [147]:
df_X.loc[df_X.HasCrCard == 0, 'HasCrCard'] = -1
df_X.loc[df_X.IsActiveMember == 0, 'IsActiveMember'] = -1
df_X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,-1,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,-1,113931.57
3,699,France,Female,39,1,0.0,2,-1,-1,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [148]:
def handle_cat(catag_col,df_temp):
    for var in catag_col:
        for val in df_temp[var].unique():
            df_temp[var+"_"+val] = np.where(df_temp[var] == val,1,-1)
        df_temp = df_temp.drop(var,axis=1)
    return df_temp

df_X = handle_cat(catagorical_col,df_X)

df_X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Spain,Geography_Germany,Gender_Female,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,-1,-1,1,-1
1,608,41,1,83807.86,1,-1,1,112542.58,-1,1,-1,1,-1
2,502,42,8,159660.8,3,1,-1,113931.57,1,-1,-1,1,-1
3,699,39,1,0.0,2,-1,-1,93826.63,1,-1,-1,1,-1
4,850,43,2,125510.82,1,1,1,79084.1,-1,1,-1,1,-1


In [149]:
continues_col

['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']

In [150]:
#df_X_train['Balance/EstimatedSalary'] = df_X_train['Balance']/df_X_train['EstimatedSalary']

def divide_and_add_col(col1,col2,df_temp):
    df_temp[col1+"/"+col2] = df_temp[col1]/df_temp[col2]
    return df_temp

#divide_and_add_col.counter=0
#Balance/EstimatedSalary
df_X = divide_and_add_col('Balance','EstimatedSalary',df_X)


#Age/Tenure
df_X = divide_and_add_col('Tenure','Age',df_X)


#CreditedScore/Tenure
df_X = divide_and_add_col('Tenure','CreditScore',df_X)


#EstimatedSalary/NumOFProducts
#df_X = divide_and_add_col(continues_col[5],continues_col[4],df_X)
df_X = df_X.drop(continues_col,axis=1)


In [151]:
df_X.isin([np.inf,-np.inf])

Unnamed: 0,NumOfProducts,HasCrCard,IsActiveMember,Geography_France,Geography_Spain,Geography_Germany,Gender_Female,Gender_Male,Balance/EstimatedSalary,Tenure/Age,Tenure/CreditScore
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
9995,False,False,False,False,False,False,False,False,False,False,False
9996,False,False,False,False,False,False,False,False,False,False,False
9997,False,False,False,False,False,False,False,False,False,False,False
9998,False,False,False,False,False,False,False,False,False,False,False


In [152]:
def normalization_fun(col, df_temp):
    df_temp[col] = (df_X[col]-df_X[col].min())/(df_X.loc[df_X[col] != np.inf, col].max() -df_X[col].min())
    return df_temp

df_X = normalization_fun('Balance/EstimatedSalary',df_X)
df_X = normalization_fun('Tenure/Age',df_X)
df_X = normalization_fun('Tenure/CreditScore',df_X)
#df_X = normalization_fun('EstimatedSalary/NumOfProducts',df_X)




TypeError: 'Index' object is not callable

In [153]:
df_X.head()

Unnamed: 0,NumOfProducts,HasCrCard,IsActiveMember,Geography_France,Geography_Spain,Geography_Germany,Gender_Female,Gender_Male,Balance/EstimatedSalary,Tenure/Age,Tenure/CreditScore
0,1,1,1,1,-1,-1,1,-1,0.0,0.085714,0.113086
1,1,-1,1,-1,1,-1,1,-1,7e-05,0.043902,0.057566
2,3,1,-1,1,-1,-1,1,-1,0.000132,0.342857,0.557769
3,2,-1,-1,1,-1,-1,1,-1,0.0,0.046154,0.050072
4,1,1,1,-1,1,-1,1,-1,0.00015,0.083721,0.082353


In [154]:
df_X_train,df_X_test,df_y_train,df_y_test = train_test_split( df_X, df_y, train_size=0.8, test_size=0.2)
print(df_X_train.shape,df_X_test.shape,df_y_train.shape,df_y_test.shape)

(8000, 11) (2000, 11) (8000,) (2000,)


In [157]:
def best_model(model):
    print(model.best_score_)    
    print(model.best_params_)
    print(model.best_estimator_)
    


In [170]:
# Fit primal logistic regression
param_grid = {'C': [0.1,0.5,1,10,50,100,200,300,400,500], 'max_iter': [100], 'fit_intercept':[True],'intercept_scaling':[1],
              'penalty':['l2'], 'tol':[0.00001,0.0001,0.000001]}
log_primal_Grid = GridSearchCV(LogisticRegression(solver='lbfgs'),param_grid, cv=10, refit=True, verbose=0)
log_primal_Grid.fit(df_X_train,df_y_train)
best_model(log_primal_Grid)

0.80475
{'C': 200, 'fit_intercept': True, 'intercept_scaling': 1, 'max_iter': 100, 'penalty': 'l2', 'tol': 1e-05}
LogisticRegression(C=200, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=1e-05, verbose=0,
                   warm_start=False)


In [172]:
logistic_alg = LogisticRegression(C=200, class_weight=None, dual=False, fit_intercept=True,intercept_scaling=1, max_iter=250, multi_class='warn',n_jobs=None, 
                                penalty='l2', random_state=1, solver='lbfgs',tol=1e-05, verbose=0, warm_start=False)
logistic_alg.fit(df_X_train,df_y_train)

LogisticRegression(C=200, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=250,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=1, solver='lbfgs', tol=1e-05, verbose=0,
                   warm_start=False)

In [174]:
df_y_pred = logistic_alg.predict(df_X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logistic_alg.score(df_X_test, df_y_test)))

Accuracy of logistic regression classifier on test set: 0.81


In [None]:
print(classification_report())