## Standard Scaler with Scikit-learn

## Presented by Mohammad Kahkeshani

### In this session, we use the Titanic dataset to demonstrate feature standardization. The target of our model is whether a passenger survived the accident.

### Let's import the libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

### Check the data

In [2]:
# Load the Titanic passenger records from the CSV file in the working directory.
titanic = pd.read_csv(filepath_or_buffer='titanic.csv')
# Leaving the frame as the last expression makes the notebook render it.
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## in this dataset:
### target = survived
### features = pclass, fare, alone, ismale

## Let's preprocess some features:
### sex (encoded as ismale)

In [3]:
# Encode the text column `sex` as a new numeric 0/1 column `ismale` so the
# model can consume it.
# `Series.map` performs an explicit dictionary lookup and returns a numeric
# column directly; `Series.replace` only produced numbers through pandas'
# silent downcasting, which is deprecated and emits a FutureWarning.
titanic['ismale'] = titanic['sex'].map({'female': 0, 'male': 1})
# `map` yields NaN for any value that is not a key of the dictionary; fail
# here with a clear message instead of later inside the scaler or the model.
assert titanic['ismale'].notna().all(), "unexpected value in 'sex' column"
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,ismale
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,0
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,0
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,1
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,0
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,0
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,1


## your practice:

In [4]:
# Exercise: convert the boolean `alone` column (True/False) to a numeric value


## Select features

In [5]:
# Keep only the four model inputs plus the label column, in this order.
cdf = titanic.loc[:, ['pclass', 'fare', 'alone', 'ismale', 'survived']]
cdf

Unnamed: 0,pclass,fare,alone,ismale,survived
0,3,7.2500,False,1,0
1,1,71.2833,False,0,1
2,3,7.9250,True,0,1
3,1,53.1000,False,0,1
4,3,8.0500,True,1,0
...,...,...,...,...,...
886,2,13.0000,True,1,0
887,1,30.0000,True,0,1
888,3,23.4500,False,0,0
889,1,30.0000,True,1,1


In [6]:
# Every column of `cdf` except the label is a model input.
features = cdf.drop(columns='survived')
# The label the model learns to predict.
target = cdf.loc[:, 'survived']
features

Unnamed: 0,pclass,fare,alone,ismale
0,3,7.2500,False,1
1,1,71.2833,False,0
2,3,7.9250,True,0
3,1,53.1000,False,0
4,3,8.0500,True,1
...,...,...,...,...
886,2,13.0000,True,1
887,1,30.0000,True,0
888,3,23.4500,False,0
889,1,30.0000,True,1


## Split data

In [7]:
# Hold out 33% of the rows for testing. With shuffling disabled the split
# follows the row order, so it is identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    shuffle=False,
)
x_train

Unnamed: 0,pclass,fare,alone,ismale
0,3,7.2500,False,1
1,1,71.2833,False,0
2,3,7.9250,True,0
3,1,53.1000,False,0
4,3,8.0500,True,1
...,...,...,...,...
591,1,78.2667,False,0
592,3,7.2500,True,1
593,3,7.7500,False,0
594,2,26.0000,False,1


# Standard Scaler

In [8]:
# Learn the per-feature statistics from the training rows only.
# `fit` returns the estimator itself, so construction and fitting are chained.
scaler = StandardScaler().fit(x_train)
scaler.mean_  # per-feature mean learned during fit (optional to inspect)

array([ 2.30872483, 31.89675537,  0.58724832,  0.62751678])

In [9]:
scaler.var_ # per-feature variance learned from the training data (optional to inspect)

array([6.96635287e-01, 2.14942645e+03, 2.42387730e-01, 2.33739471e-01])

In [10]:
# lets scale featurs
x_test_scaled = scaler.transform(x_test)
x_train_scaled = scaler.transform(x_train)
x_test

Unnamed: 0,pclass,fare,alone,ismale
596,2,33.0000,True,0
597,3,0.0000,True,1
598,3,7.2250,True,1
599,1,56.9292,False,1
600,2,27.0000,False,0
...,...,...,...,...
886,2,13.0000,True,1
887,1,30.0000,True,0
888,3,23.4500,False,0
889,1,30.0000,True,1


In [11]:
# Inspect the standardized training features produced by the scaler.
x_train_scaled

array([[ 0.82822478, -0.53161651, -1.19279681,  0.77044331],
       [-1.56799838,  0.84954539, -1.19279681, -1.29795404],
       [ 0.82822478, -0.51705714,  0.83836576, -1.29795404],
       ...,
       [ 0.82822478, -0.52083179, -1.19279681, -1.29795404],
       [-0.3698868 , -0.12718966, -1.19279681,  0.77044331],
       [ 0.82822478, -0.16709311, -1.19279681,  0.77044331]])

## Modeling

Using sklearn package to model data

In [12]:
# modeling without scaled values
log_regr = LogisticRegression()
log_regr.fit(x_train, y_train)

In [13]:
y_pre = log_regr.predict(x_test) # predicted labels for the unscaled test features

In [14]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pre)
accuracy

0.7762711864406779

## Modeling with scaled values

In [15]:
# Same model as the baseline, this time trained on the standardized features.
new_log_regr = LogisticRegression()
new_log_regr.fit(X=x_train_scaled, y=y_train)

In [16]:
# Predict on the standardized test features and score against the true labels.
y_pred = new_log_regr.predict(X=x_test_scaled)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7762711864406779

دلایل کمتر بودن اهمیت مقیاس‌بندی در رگرسیون لجستیک:

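تابع پیوند لجستیک: در رگرسیون لجستیک، از تابع پیوند لجستیک (سیگموید) استفاده می‌شود که خروجی را به یک احتمال بین 0 تا 1 محدود می‌کند. ضرایب مدل خود را با مقیاس هر متغیر مستقل تطبیق می‌دهند؛ یعنی تغییر مقیاس خطی یک ویژگی عمدتاً اندازهٔ ضریب آن را تغییر می‌دهد، نه پیش‌بینی نهایی را. به همین دلیل در این مثال دقت مدل با داده‌های مقیاس‌شده و مقیاس‌نشده یکسان به دست آمده است.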

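هدف مدل: هدف اصلی رگرسیون لجستیک، پیش‌بینی احتمال وقوع یک رویداد است. در این مدل، مقیاس مطلق متغیرها به اندازه رابطه بین متغیرها و متغیر وابسته اهمیت ندارد. توجه: کلاس LogisticRegression در scikit-learn به طور پیش‌فرض از منظم‌سازی L2 استفاده می‌کند؛ بنابراین مقیاس‌بندی همچنان می‌تواند بر ضرایب و سرعت همگرایی بهینه‌ساز اثر بگذارد، حتی اگر دقت نهایی تغییر نکند.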

In [17]:
# Placeholder: to predict for a single passenger, fill the inner list with
# values for pclass, fare, alone, ismale (the columns the model was trained on).
#log_regr.predict([[]])