## Standardization
### Bring all features onto a similar scale. Standardisation centres each variable at zero and rescales it to unit standard deviation.
### $z = \dfrac{x_i - \bar{x}}{\sigma}$, where $\bar{x}$ is the feature mean and $\sigma$ is its standard deviation

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Silence all Python warnings in this notebook to keep cell output clean.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings.
warnings.filterwarnings('ignore')

In [7]:
# Read only the target (Survived) and the Pclass, Age and Fare columns
# from the Titanic training file, then preview the first rows.
df = pd.read_csv(
    'titanic_train.csv',
    usecols=['Pclass', 'Age', 'Fare', 'Survived'],
)
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [8]:
# Impute missing ages with the median age.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas and
# leaves `df` unchanged when Copy-on-Write is enabled.
# NOTE(review): the median is computed over every row of `df`, i.e. before the
# train/test split further down, so test rows influence the imputed value.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [9]:
# Count the remaining missing values in each column.
df.isnull().sum(axis=0)

Survived    0
Pclass      0
Age         0
Fare        0
dtype: int64

In [14]:
# Feature matrix: select the predictors by name rather than by position.
# Positional slicing (iloc[:, 1:]) silently depends on the column order of the
# CSV file, which the `usecols` list passed to read_csv does not control.
X = df[['Pclass', 'Age', 'Fare']]
X

Unnamed: 0,Pclass,Age,Fare
0,3,22.0,7.2500
1,1,38.0,71.2833
2,3,26.0,7.9250
3,1,35.0,53.1000
4,3,35.0,8.0500
...,...,...,...
886,2,27.0,13.0000
887,1,19.0,30.0000
888,3,28.0,23.4500
889,1,26.0,30.0000


In [15]:
# Target vector: select the label column by name rather than by position,
# so the notebook does not depend on 'Survived' being the first CSV column.
y = df['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
# Scaler that standardises each feature (zero mean, unit variance with the
# default settings).
scaler = StandardScaler()

In [29]:
# Training features after standardisation: the scaler is fitted on the
# training split and applied to it in a single step.
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

array([[-1.59418307,  0.05905201,  1.39097165],
       [ 0.81936748, -0.01900244, -0.52170584],
       [-0.38740779, -1.26787352, -0.25961883],
       ...,
       [-1.59418307,  0.91765088,  2.305488  ],
       [-1.59418307,  1.1518142 ,  0.59867038],
       [ 0.81936748, -0.09705688,  0.85676027]])

In [28]:
# Training features on their original scale, for comparison with X_train_scaled.
X_train

Unnamed: 0,Pclass,Age,Fare
520,1,30.0,93.5000
510,3,29.0,7.7500
446,2,13.0,19.5000
2,3,26.0,7.9250
691,3,4.0,13.4167
...,...,...,...
575,3,19.0,14.5000
838,3,32.0,56.4958
337,1,41.0,134.5000
523,1,44.0,57.9792


In [27]:
# Held-out test features on their original (unscaled) scale.
X_test

Unnamed: 0,Pclass,Age,Fare
331,1,45.5,28.5000
700,1,18.0,227.5250
748,1,19.0,53.1000
751,3,6.0,12.4750
481,2,28.0,0.0000
...,...,...,...
388,3,28.0,7.7292
416,2,34.0,32.5000
407,2,3.0,18.7500
482,3,50.0,8.0500


In [31]:
# Use transform() only -- not fit_transform() -- on the test split.
# The scaler keeps the mean and standard deviation it learned from X_train, so
# the test rows are scaled exactly like the training rows and no test-set
# statistics leak into the preprocessing step.
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-1.59418307e+00,  1.26889587e+00, -5.88713411e-02],
       [-1.59418307e+00, -8.77601304e-01,  4.38043638e+00],
       [-1.59418307e+00, -7.99546861e-01,  4.89838469e-01],
       [ 8.19367482e-01, -1.81425461e+00, -4.16313402e-01],
       [-3.87407793e-01, -9.70568780e-02, -6.94571730e-01],
       [ 8.19367482e-01,  9.95705318e-01, -5.26166890e-01],
       [-1.59418307e+00, -9.70568780e-02, -2.54134258e-02],
       [ 8.19367482e-01, -5.65383533e-01, -5.18453726e-01],
       [-1.59418307e+00, -9.70568780e-02,  4.65302664e-01],
       [ 8.19367482e-01, -9.70568780e-02, -3.48839940e-01],
       [-3.87407793e-01, -9.70568780e-02, -6.94571730e-01],
       [ 8.19367482e-01,  2.15160892e-01, -5.17802412e-01],
       [ 8.19367482e-01, -1.89230906e+00, -2.65010018e-01],
       [ 8.19367482e-01, -4.09274648e-01, -5.35274135e-01],
       [-1.59418307e+00,  3.96177414e+00, -2.54134258e-02],
       [-3.87407793e-01,  3.18122971e+00, -4.60366324e-01],
       [-1.59418307e+00, -2.53165763e-01

### Model Building
#### Use `.fit()` on the training data and `.predict()` on the test data

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
# Logistic regression classifier with scikit-learn's default settings.
log = LogisticRegression()

In [35]:
# Train the classifier on the standardised training features and their labels.
log.fit(X_train_scaled, y_train)

LogisticRegression()

In [36]:
# Predict the Survived label (0 or 1) for each standardised test row.
log.predict(X_test_scaled)

array([1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0], dtype=int64)