# Skalowanie zmiennych

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston, fetch_california_housing
from sklearn.preprocessing import scale, minmax_scale
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler, MinMaxScaler

Załadujmy zbiór danych Boston i przeprowadźmy na nim transformacje zmiennych.

In [2]:
# Load the Boston housing dataset and expose the features as a labelled DataFrame.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably only runs on older scikit-learn releases -- confirm the
# pinned version before re-running.
data = load_boston()
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = data["target"]

Importujemy również funkcję do losowego podziału zbioru na część testową i treningową.
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Seed the global NumPy RNG (kept for any later code that still draws from it)
# and also pass the seed explicitly, so the split is reproducible on its own
# and does not depend on hidden global RNG state.
np.random.seed(10)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=10
)

In [5]:
# X_train already is a DataFrame with its column labels intact (X was a
# DataFrame before the split), so re-wrapping it is redundant; just preview it.
X_train.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
147,2.36862,0.0,19.58,0.0,0.871,4.926,95.7,1.4608,5.0,403.0,14.7,391.71,29.53
330,0.04544,0.0,3.24,0.0,0.46,6.144,32.2,5.8736,4.0,430.0,16.9,368.57,9.09
388,14.3337,0.0,18.1,0.0,0.7,4.88,100.0,1.5895,24.0,666.0,20.2,372.92,30.62
238,0.08244,30.0,4.93,0.0,0.428,6.481,18.5,6.1899,6.0,300.0,16.6,379.41,6.36
113,0.22212,0.0,10.01,0.0,0.547,6.092,95.4,2.548,6.0,432.0,17.8,396.9,17.09


In [6]:
# Per-column means of the raw training features; the columns live on very
# different scales, which motivates the scaling below.
X_train.mean()

CRIM         3.494978
ZN          11.676991
INDUS       11.453687
CHAS         0.058997
NOX          0.555360
RM           6.254643
AGE         68.591740
DIS          3.813495
RAD          9.778761
TAX        413.000000
PTRATIO     18.543068
B          354.687699
LSTAT       12.894867
dtype: float64

In [7]:
# Baseline model: linear regression fitted on the raw (unscaled) training features.
reg = LinearRegression()
reg.fit(X_train, y_train);  # trailing ';' keeps the estimator repr out of the cell output

Tym razem testujemy na zbiorze testowym.

In [8]:
# Score of the baseline model on the held-out test split (R^2 for a regressor).
reg.score(X_test, y_test)

0.7165219393967551

Teraz zamiast funkcji scale użyjemy klasy StandardScaler. Dzięki niej można przechować średnią i odchylenie standardowe "na później" i zastosować je dla zbioru testowego.
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [9]:
# Unfitted standardiser; once fitted it keeps the training statistics so they
# can be re-applied to other data.
scaler = StandardScaler()

Zwróćmy uwagę, że nie dopasowujemy standard scalera do danych testowych, a jedynie używamy tego wcześniej dopasowanego do danych treningowych.

In [10]:
# Learn the scaling parameters from the training split only, then reuse the
# fitted scaler unchanged on the test split (never fit on test data).
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Standardise the same training data with two function-style alternatives so
# they can be compared against the StandardScaler result.
X_train_scaled1 = zscore(X_train)
X_train_scaled2 = scale(X_train)

In [12]:
# Compare with a floating-point tolerance: the two arrays come from separate
# computations, so exact `==` equality of every element is not guaranteed.
np.allclose(X_train_scaled, X_train_scaled2)

True

In [13]:
# Compare with a floating-point tolerance: scipy's zscore and StandardScaler
# compute their results separately, so exact `==` equality is brittle.
np.allclose(X_train_scaled, X_train_scaled1)

True

Tak jak wcześniej tracimy też nazwy kolumn.

In [14]:
# Wrap the scaled values in a DataFrame for inspection. No column labels are
# passed, so the columns get default integer labels (the original feature
# names are lost). Note that this rebinds X_train_scaled to the DataFrame.
X_train_scaled = pd.DataFrame(X_train_scaled)
X_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.136415,-0.47928,1.167876,-0.250392,2.749511,-1.992829,0.964187,-1.103664,-0.540043,-0.058172,-1.777315,0.392614,2.365979
1,-0.417778,-0.47928,-1.180433,-0.250392,-0.830671,-0.165953,-1.294382,0.966408,-0.653052,0.098893,-0.759875,0.147219,-0.541158
2,1.312692,-0.47928,0.955177,-0.250392,1.259946,-2.061824,1.117129,-1.04329,1.607129,1.471764,0.766286,0.19335,2.521007
3,-0.413297,0.752065,-0.937554,-0.250392,-1.10942,0.339512,-1.781664,1.114787,-0.427034,-0.657349,-0.898616,0.262175,-0.92944
4,-0.39638,-0.47928,-0.20748,-0.250392,-0.072822,-0.243948,0.953516,-0.593652,-0.427034,0.110528,-0.343649,0.447653,0.596665


In [15]:
# Column means after standardisation: zero up to floating-point round-off.
X_train_scaled.mean()

0    -2.033771e-16
1    -3.176744e-17
2     4.643942e-15
3     3.196394e-16
4    -3.819626e-15
5    -1.969581e-15
6     1.097532e-15
7    -1.801247e-16
8    -3.667993e-17
9     2.816495e-17
10    1.954386e-14
11    3.910343e-15
12   -8.678735e-18
dtype: float64

In [16]:
# Column standard deviations after standardisation. pandas uses the sample
# estimator (ddof=1) by default, so with 339 rows the value is
# sqrt(339 / 338) ~= 1.001478 rather than exactly 1.
X_train_scaled.std()

0     1.001478
1     1.001478
2     1.001478
3     1.001478
4     1.001478
5     1.001478
6     1.001478
7     1.001478
8     1.001478
9     1.001478
10    1.001478
11    1.001478
12    1.001478
dtype: float64

In [17]:
# Same linear model, this time fitted on the standardised training features.
reg1 = LinearRegression()
reg1.fit(X_train_scaled, y_train);  # trailing ';' keeps the estimator repr out of the cell output

In [106]:
# Test-set score of the model trained on standardised features; the test split
# was transformed with the scaler fitted on the training split.
reg1.score(X_test_scaled, y_test)

0.7165219393967556

In [18]:
# Unfitted min-max scaler, used as an alternative to standardisation.
scaler1 = MinMaxScaler()

In [19]:
# Fit the min-max scaler on the training split only and apply the same fitted
# transformation to the test split.
X_train_scaled_1 = scaler1.fit_transform(X_train)
X_test_scaled_1 = scaler1.transform(X_test)

In [20]:
# Function-style min-max scaling of the training data, for comparison with
# the MinMaxScaler result.
X_train_scaled_2 = minmax_scale(X_train)

In [21]:
# Compare with a floating-point tolerance instead of exact `==` equality,
# which is brittle for independently computed float arrays.
np.allclose(X_train_scaled_1, X_train_scaled_2)

True

In [22]:
# Summary statistics of the min-max scaled training data: every column now
# spans exactly [0, 1].
pd.DataFrame(X_train_scaled_2).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,339.0,339.0,339.0,339.0,339.0,339.0,339.0,339.0,339.0,339.0,339.0,339.0,339.0
mean,0.039212,0.11677,0.402994,0.058997,0.350535,0.516122,0.676537,0.24355,0.381685,0.431298,0.632241,0.893559,0.304435
std,0.092942,0.243996,0.255443,0.235967,0.23656,0.127936,0.289976,0.194264,0.385301,0.328543,0.230371,0.238127,0.195322
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.000853,0.0,0.173387,0.0,0.139918,0.444721,0.42688,0.086496,0.130435,0.171756,0.510638,0.94382,0.152288
50%,0.00287,0.0,0.346041,0.0,0.314815,0.498371,0.760041,0.193439,0.173913,0.272901,0.691489,0.98598,0.265187
75%,0.042779,0.125,0.646628,0.0,0.49177,0.577888,0.941813,0.362196,1.0,0.914122,0.808511,0.997983,0.422607
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Zadanie 1
 1.1) Dokonaj podziału danych na część trenującą i testową (80% - część trenująca, 20% - część testowa);
 
 1.2) Wykonaj standaryzację danych;
 
 1.3) Utwórz model regresji liniowej i sprawdź jego działanie na zbiorze testowym (przy użyciu miary R^2);
 
 1.4) Porównaj działanie modelu z modelem utworzonym na danych bez wykonanej standaryzacji.

In [113]:
# Load the California housing dataset and expose the features as a labelled
# DataFrame; this rebinds data, X and y from the Boston example above.
data = fetch_california_housing()
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = data["target"]

In [114]:
# Preview the first rows of the California housing features.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


# 1.1

In [115]:
# 80% / 20% train/test split. The seed is also passed explicitly so the split
# is reproducible on its own and does not depend on hidden global RNG state;
# the global seed call is kept for any later code that draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# 1.2

In [None]:
# Standardise the features: the scaler learns its statistics from the training
# split only and is then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1.3

In [116]:
# Fit linear regression on the standardised training features and report the
# score on the standardised test split.
reg1 = LinearRegression()
reg1.fit(X_train_scaled, y_train)
reg1.score(X_test_scaled, y_test)

0.593595852643664

# 1.4

In [117]:
# Same model on the raw (unscaled) features for comparison. Note that this
# rebinds `reg1`, replacing the model fitted on the standardised features.
reg1 = LinearRegression()
reg1.fit(X_train, y_train)
reg1.score(X_test, y_test)

0.5935958526436642