In [154]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [155]:
df = pd.read_csv('dataset/Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [157]:
features = ['CreditScore','Geography','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary']
df_X = df[features]
df_X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.8,3,1,0,113931.57
3,699,France,Female,39,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1


In [158]:
df_y = df['Exited']
df_y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

In [159]:
df_X_train,df_X_test,df_y_train,df_y_test = train_test_split( df_X, df_y, train_size=0.8, test_size=0.2)
print(df_X_train.shape,df_X_test.shape,df_y_train.shape,df_y_test.shape)

(8000, 10) (2000, 10) (8000,) (2000,)


In [160]:
df_X_train.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
dtype: object

In [161]:
df_X_train.nunique()

CreditScore         458
Geography             3
Gender                2
Age                  70
Tenure               11
Balance            5123
NumOfProducts         4
HasCrCard             2
IsActiveMember        2
EstimatedSalary    8000
dtype: int64

In [162]:
catagorical_col = list(df_X_train.columns[df_X_train.dtypes == 'object'])
catagorical_col

['Geography', 'Gender']

In [163]:
continues_col = list(df_X_train.columns[df_X_train.nunique() > 3])
continues_col

['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [164]:
zero_containing_col = ['HasCrCard','IsActiveMember'] 
zero_containing_col

['HasCrCard', 'IsActiveMember']

In [165]:
df_X_train.loc[df_X_train.HasCrCard == 0, 'HasCrCard'] = -1
df_X_train.loc[df_X_train.IsActiveMember == 0, 'IsActiveMember'] = -1
df_X_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
2666,740,France,Female,37,7,0.0,2,1,1,194270.91
8414,600,Germany,Female,29,6,74430.1,2,1,1,96051.1
8247,505,Germany,Female,25,5,114268.85,2,1,1,126728.27
4770,615,Spain,Male,19,5,0.0,2,1,-1,159920.92
5836,618,Germany,Female,29,10,100315.1,2,1,1,32526.64


In [166]:
def handle_cat(catag_col,df_temp):
    for var in catag_col:
        for val in df_temp[var].unique():
            df_temp[var+"_"+val] = np.where(df_temp[var] == val,1,-1)
        df_temp = df_temp.drop(var,axis=1)
    return df_temp

df_X_train = handle_cat(catagorical_col,df_X_train)

df_X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
2666,740,37,7,0.0,2,1,1,194270.91,1,-1,-1,1,-1
8414,600,29,6,74430.1,2,1,1,96051.1,-1,1,-1,1,-1
8247,505,25,5,114268.85,2,1,1,126728.27,-1,1,-1,1,-1
4770,615,19,5,0.0,2,1,-1,159920.92,-1,-1,1,-1,1
5836,618,29,10,100315.1,2,1,1,32526.64,-1,1,-1,1,-1


In [167]:
continues_col


['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [168]:
#df_X_train['Balance/EstimatedSalary'] = df_X_train['Balance']/df_X_train['EstimatedSalary']

def devide_and_add_col(col1,col2,df_temp):
    df_temp[col1+"/"+col2] = df_temp[col1]/df_temp[col2]
    return df_temp

#Balance/EstimatedSalary
df_X_train = devide_and_add_col(continues_col[3],continues_col[5],df_X_train)


#Tenure/Age
df_X_train = devide_and_add_col(continues_col[2],continues_col[1],df_X_train)


#CreditedScore/Tenure
df_X_train = devide_and_add_col(continues_col[0],continues_col[2],df_X_train)


#EstimatedSalary/NumOFProducts
df_X_train = devide_and_add_col(continues_col[5],continues_col[4],df_X_train)
df_X_train = df_X_train.drop(continues_col,axis=1)
df_X_train.head()

Unnamed: 0,HasCrCard,IsActiveMember,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Balance/EstimatedSalary,Tenure/Age,CreditScore/Tenure,EstimatedSalary/NumOfProducts
2666,1,1,1,-1,-1,1,-1,0.0,0.189189,105.714286,97135.455
8414,1,1,-1,1,-1,1,-1,0.774901,0.206897,100.0,48025.55
8247,1,1,-1,1,-1,1,-1,0.901684,0.2,101.0,63364.135
4770,1,-1,-1,-1,1,-1,1,0.0,0.263158,123.0,79960.46
5836,1,1,-1,1,-1,1,-1,3.08409,0.344828,61.8,16263.32
