# Schulung Bias in Machine Learning

In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, Lasso
import datetime
from statistics import mean

In [89]:
# CSV Datei einlesen
df = pd.read_csv('weatherHistory.csv')
df

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.
...,...,...,...,...,...,...,...,...,...,...,...,...
96448,2016-09-09 19:00:00.000 +0200,Partly Cloudy,rain,26.016667,26.016667,0.43,10.9963,31.0,16.1000,0.0,1014.36,Partly cloudy starting in the morning.
96449,2016-09-09 20:00:00.000 +0200,Partly Cloudy,rain,24.583333,24.583333,0.48,10.0947,20.0,15.5526,0.0,1015.16,Partly cloudy starting in the morning.
96450,2016-09-09 21:00:00.000 +0200,Partly Cloudy,rain,22.038889,22.038889,0.56,8.9838,30.0,16.1000,0.0,1015.66,Partly cloudy starting in the morning.
96451,2016-09-09 22:00:00.000 +0200,Partly Cloudy,rain,21.522222,21.522222,0.60,10.5294,20.0,16.1000,0.0,1015.95,Partly cloudy starting in the morning.


## Zielvariable bestimmen

## Datenreduktion & -bereinigung

- Nicht benötigte Spalten entfernen
- Null-Werte ersetzen

#### Nicht benötigte Spalten entfernen

In [90]:
df.drop(['Daily Summary', 'Pressure (millibars)', 'Wind Speed (km/h)', 'Humidity', 'Loud Cover'], axis=1, inplace=True)

#### Fehlende Werte finden und ersetzen

In [91]:
df.isna().sum()

Formatted Date                0
Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Wind Bearing (degrees)        0
Visibility (km)               0
dtype: int64

In [92]:
df['Precip Type'] = df['Precip Type'].replace(np.nan, 0)
df.isna().sum()

Formatted Date              0
Summary                     0
Precip Type                 0
Temperature (C)             0
Apparent Temperature (C)    0
Wind Bearing (degrees)      0
Visibility (km)             0
dtype: int64

## Datentransformation
- Numerische Kodierung und Normierung
- Datumsspalte in Datentyp date umwandeln

#### Monat aus Datumsspalte extrahieren

In [93]:
df['Date'] = pd.to_datetime(df['Formatted Date'], utc=True)
df['Month'] = df['Date'].dt.month
df

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Wind Bearing (degrees),Visibility (km),Date,Month
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,251.0,15.8263,2006-03-31 22:00:00+00:00,3
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,259.0,15.8263,2006-03-31 23:00:00+00:00,3
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,204.0,14.9569,2006-04-01 00:00:00+00:00,4
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,269.0,15.8263,2006-04-01 01:00:00+00:00,4
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,259.0,15.8263,2006-04-01 02:00:00+00:00,4
...,...,...,...,...,...,...,...,...,...
96448,2016-09-09 19:00:00.000 +0200,Partly Cloudy,rain,26.016667,26.016667,31.0,16.1000,2016-09-09 17:00:00+00:00,9
96449,2016-09-09 20:00:00.000 +0200,Partly Cloudy,rain,24.583333,24.583333,20.0,15.5526,2016-09-09 18:00:00+00:00,9
96450,2016-09-09 21:00:00.000 +0200,Partly Cloudy,rain,22.038889,22.038889,30.0,16.1000,2016-09-09 19:00:00+00:00,9
96451,2016-09-09 22:00:00.000 +0200,Partly Cloudy,rain,21.522222,21.522222,20.0,16.1000,2016-09-09 20:00:00+00:00,9


#### Kategorische Spalten in numerische umwandeln

In [94]:
dfCopy = df.copy()
label_encoder = preprocessing.LabelEncoder()
dfCopy['Summary'] = label_encoder.fit_transform(dfCopy['Summary'])
dfCopy['Summary'].unique()
dfCopy

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Wind Bearing (degrees),Visibility (km),Date,Month
0,2006-04-01 00:00:00.000 +0200,19,rain,9.472222,7.388889,251.0,15.8263,2006-03-31 22:00:00+00:00,3
1,2006-04-01 01:00:00.000 +0200,19,rain,9.355556,7.227778,259.0,15.8263,2006-03-31 23:00:00+00:00,3
2,2006-04-01 02:00:00.000 +0200,17,rain,9.377778,9.377778,204.0,14.9569,2006-04-01 00:00:00+00:00,4
3,2006-04-01 03:00:00.000 +0200,19,rain,8.288889,5.944444,269.0,15.8263,2006-04-01 01:00:00+00:00,4
4,2006-04-01 04:00:00.000 +0200,17,rain,8.755556,6.977778,259.0,15.8263,2006-04-01 02:00:00+00:00,4
...,...,...,...,...,...,...,...,...,...
96448,2016-09-09 19:00:00.000 +0200,19,rain,26.016667,26.016667,31.0,16.1000,2016-09-09 17:00:00+00:00,9
96449,2016-09-09 20:00:00.000 +0200,19,rain,24.583333,24.583333,20.0,15.5526,2016-09-09 18:00:00+00:00,9
96450,2016-09-09 21:00:00.000 +0200,19,rain,22.038889,22.038889,30.0,16.1000,2016-09-09 19:00:00+00:00,9
96451,2016-09-09 22:00:00.000 +0200,19,rain,21.522222,21.522222,20.0,16.1000,2016-09-09 20:00:00+00:00,9


In [95]:
dfCopy['Precip Type'] = dfCopy['Precip Type'].replace(['rain', 'snow'], [0, 1])
dfCopy

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Wind Bearing (degrees),Visibility (km),Date,Month
0,2006-04-01 00:00:00.000 +0200,19,0,9.472222,7.388889,251.0,15.8263,2006-03-31 22:00:00+00:00,3
1,2006-04-01 01:00:00.000 +0200,19,0,9.355556,7.227778,259.0,15.8263,2006-03-31 23:00:00+00:00,3
2,2006-04-01 02:00:00.000 +0200,17,0,9.377778,9.377778,204.0,14.9569,2006-04-01 00:00:00+00:00,4
3,2006-04-01 03:00:00.000 +0200,19,0,8.288889,5.944444,269.0,15.8263,2006-04-01 01:00:00+00:00,4
4,2006-04-01 04:00:00.000 +0200,17,0,8.755556,6.977778,259.0,15.8263,2006-04-01 02:00:00+00:00,4
...,...,...,...,...,...,...,...,...,...
96448,2016-09-09 19:00:00.000 +0200,19,0,26.016667,26.016667,31.0,16.1000,2016-09-09 17:00:00+00:00,9
96449,2016-09-09 20:00:00.000 +0200,19,0,24.583333,24.583333,20.0,15.5526,2016-09-09 18:00:00+00:00,9
96450,2016-09-09 21:00:00.000 +0200,19,0,22.038889,22.038889,30.0,16.1000,2016-09-09 19:00:00+00:00,9
96451,2016-09-09 22:00:00.000 +0200,19,0,21.522222,21.522222,20.0,16.1000,2016-09-09 20:00:00+00:00,9


In [96]:
dfCopy = dfCopy.drop(['Formatted Date', 'Date'], axis=1)
dfCopy

Unnamed: 0,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Wind Bearing (degrees),Visibility (km),Month
0,19,0,9.472222,7.388889,251.0,15.8263,3
1,19,0,9.355556,7.227778,259.0,15.8263,3
2,17,0,9.377778,9.377778,204.0,14.9569,4
3,19,0,8.288889,5.944444,269.0,15.8263,4
4,17,0,8.755556,6.977778,259.0,15.8263,4
...,...,...,...,...,...,...,...
96448,19,0,26.016667,26.016667,31.0,16.1000,9
96449,19,0,24.583333,24.583333,20.0,15.5526,9
96450,19,0,22.038889,22.038889,30.0,16.1000,9
96451,19,0,21.522222,21.522222,20.0,16.1000,9


## Teilung in Trainings- & Testdaten

#### Zielattribut von Dataframe separieren

In [97]:
x = dfCopy.drop(['Apparent Temperature (C)'], axis = 1)
y = dfCopy['Apparent Temperature (C)']
print(x)

       Summary  Precip Type  Temperature (C)  Wind Bearing (degrees)  \
0           19            0         9.472222                   251.0   
1           19            0         9.355556                   259.0   
2           17            0         9.377778                   204.0   
3           19            0         8.288889                   269.0   
4           17            0         8.755556                   259.0   
...        ...          ...              ...                     ...   
96448       19            0        26.016667                    31.0   
96449       19            0        24.583333                    20.0   
96450       19            0        22.038889                    30.0   
96451       19            0        21.522222                    20.0   
96452       19            0        20.438889                    39.0   

       Visibility (km)  Month  
0              15.8263      3  
1              15.8263      3  
2              14.9569      4  
3      

In [39]:
y

0         7.388889
1         7.227778
2         9.377778
3         5.944444
4         6.977778
           ...    
96448    26.016667
96449    24.583333
96450    22.038889
96451    21.522222
96452    20.438889
Name: Apparent Temperature (C), Length: 96453, dtype: float64

#### Daten in Trainings- und Testdaten aufteilen

In [117]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                    random_state = 101, test_size = 0.2)

In [118]:
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression()

#### Vorhersage durchführen

In [119]:
pred = model.predict(x_test)

## Bias berechnen ( + SSE + Varianz)

#### Varianz anzeigen lassen

In [120]:
var = np.var(pred)
print(var)

112.64120527582348


#### Sum of Squared Error berechnen

In [121]:
SSE = np.mean((np.mean(pred) - y)** 2)
print(SSE)

114.43645359045047


#### Bias berechnen

In [122]:
bias = SSE - var
print(bias)

1.7952483146269884


In [123]:
from sklearn import linear_model

lasso_model = linear_model.LassoLars()
lasso_model.fit(x_train, y_train)
pred2 = model.predict(x_test)

In [124]:
var2 = np.var(pred2)
SSE2 = np.mean((np.mean(pred2) - y)** 2)
bias2 = SSE2 - var2

In [125]:
print("Bias Lineare Regression:", bias)
print("Bias Lasso Lars:", bias2)

Bias Lineare Regression: 1.7952483146269884
Bias Lasso Lars: 1.7952483146269884


In [128]:
sdg_model = linear_model.SGDRegressor()
sdg_model.fit(x_train, y_train)
pred4 = model.predict(x_test)

In [129]:
var4 = np.var(pred4)
SSE4 = np.mean((np.mean(pred4) - y)** 2)
bias4 = SSE4 - var4
print(bias4)

1.7952483146269884


In [114]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
XYregressor = RandomForestRegressor(max_depth=2, random_state=0)

pred3 = cross_val_predict(XYregressor, x,y,cv=10)

In [116]:
var3 = np.var(pred3)
SSE3 = np.mean((np.mean(pred3) - y)** 2)
bias3 = SSE3 - var3
print(bias3)

11.064476827029566
