In [35]:
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [36]:
df = pd.read_csv('Churn_Modelling.csv')
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1.0,15634602.0,Hargrave,602.0,France,Female,42.0,2.0,0.0,1.0,1.0,1.0,101348.88,1.0
1,,,,,France,,,,,,,,,
2,3.0,15619304.0,Onio,,France,Female,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57,1.0
3,4.0,15701354.0,Boni,699.0,France,Female,39.0,1.0,0.0,2.0,0.0,0.0,93826.63,0.0
4,5.0,15737888.0,Mitchell,850.0,Spain,Female,,2.0,125510.82,1.0,1.0,1.0,79084.1,0.0


# **Check For Missing Values**

In [37]:
df.isnull().sum()

RowNumber            1
CustomerId           1
Surname              1
CreditScore          2
Geography            0
Gender               1
Age                136
Tenure              74
Balance             74
NumOfProducts       81
HasCrCard           74
IsActiveMember      74
EstimatedSalary     79
Exited              74
dtype: int64

# **Drop Unnecessary Features**

In [38]:
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

# **Handle Missing Values**

In [39]:
columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited']

for column in columns:
    median = math.floor(df[column].median())
    df[column] = df[column].fillna(median)


In [40]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,602.0,France,Female,42.0,2.0,0.0,1.0,1.0,1.0,101348.88,1.0
1,652.0,France,,37.0,5.0,97267.0,1.0,1.0,1.0,100200.0,0.0
2,652.0,France,Female,42.0,8.0,159660.8,3.0,1.0,0.0,113931.57,1.0
3,699.0,France,Female,39.0,1.0,0.0,2.0,0.0,0.0,93826.63,0.0
4,850.0,Spain,Female,37.0,2.0,125510.82,1.0,1.0,1.0,79084.1,0.0


In [41]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode().iloc[0])
Y = df['EstimatedSalary']
df = df.drop(['EstimatedSalary'], axis = 1)

In [42]:
Y

0       101348.88
1       100200.00
2       113931.57
3        93826.63
4        79084.10
          ...    
9995     96270.64
9996    101699.77
9997     42085.58
9998     92888.52
9999     38190.78
Name: EstimatedSalary, Length: 10000, dtype: float64

In [43]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,602.0,France,Female,42.0,2.0,0.0,1.0,1.0,1.0,1.0
1,652.0,France,Male,37.0,5.0,97267.0,1.0,1.0,1.0,0.0
2,652.0,France,Female,42.0,8.0,159660.8,3.0,1.0,0.0,1.0
3,699.0,France,Female,39.0,1.0,0.0,2.0,0.0,0.0,0.0
4,850.0,Spain,Female,37.0,2.0,125510.82,1.0,1.0,1.0,0.0


# **Label Encoding**

In [44]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

columns = ['Geography', 'Gender']

for column in columns:
  df[column]= label_encoder.fit_transform(df[column])
  df[column].unique()

df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,602.0,0,0,42.0,2.0,0.0,1.0,1.0,1.0,1.0
1,652.0,0,1,37.0,5.0,97267.0,1.0,1.0,1.0,0.0
2,652.0,0,0,42.0,8.0,159660.8,3.0,1.0,0.0,1.0
3,699.0,0,0,39.0,1.0,0.0,2.0,0.0,0.0,0.0
4,850.0,2,0,37.0,2.0,125510.82,1.0,1.0,1.0,0.0


# **Feature Selection**

**Principal Component Analysis**

In [45]:
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
pca.fit(df)

In [46]:
x_pca = pca.transform(df)

In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(x_pca, Y, test_size=0.25, random_state=42)

In [48]:
from sklearn.linear_model import LinearRegression

In [49]:
reg = LinearRegression().fit(X_train, Y_train)
reg.score(X_test,Y_test)


-0.003027044820623992

In [50]:
from sklearn.ensemble import ExtraTreesRegressor
reg = ExtraTreesRegressor(n_estimators=10000, random_state=0).fit(X_train, Y_train)
reg.score(X_test,Y_test)

-0.15648957670362584