# Preprocessing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.rcParams["figure.dpi"] = 200

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Fetch the California housing data and pull out the feature matrix and target.
california = fetch_california_housing()
X = california.data
y = california.target

# Hold out a test split with train_test_split's default proportions;
# the fixed seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Print (rather than echo) the dataset description so its newlines render.
print(california["DESCR"])

In [None]:
# One scatter panel per feature, each plotted against the target.
fig, axes = plt.subplots(4, 2, figsize=(20, 10))
for i, ax in enumerate(axes.ravel()):
    # Hide any panel that has no matching feature column. The bound comes from
    # the data instead of a hard-coded count, so it stays correct if the grid
    # or the dataset changes.
    if i >= X.shape[1]:
        ax.set_visible(False)
        continue
    ax.plot(X[:, i], y, 'o', alpha=.5)
    ax.set_title("{}: {}".format(i, california.feature_names[i]))
    # The California housing target is the median house value per district.
    ax.set_ylabel("MedHouseVal")

In [None]:
# Draw one box per feature column on a shared axis to compare their raw scales.
plt.boxplot(X)
# Boxes sit at positions 1..n_features, so label those ticks with feature names.
plt.xticks(ticks=np.arange(X.shape[1]) + 1,
           labels=california.feature_names,
           rotation=30,
           ha="right");

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only,
# then apply them to that same split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline: 10-fold cross-validation of a default KNN regressor on the raw
# (unscaled) training features.
scores = cross_val_score(estimator=KNeighborsRegressor(),
                         X=X_train,
                         y=y_train,
                         cv=10)
# Mean and spread of the per-fold scores.
scores.mean(), scores.std()

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Same KNN model on standardized features. The scaler lives inside the
# pipeline so cross_val_score refits it on each training fold; cross-validating
# on the pre-scaled X_train_scaled would let the held-out fold influence the
# scaling statistics (data leakage).
scores = cross_val_score(make_pipeline(StandardScaler(), KNeighborsRegressor()),
                         X_train, y_train, cv=10)
# Mean and spread of the per-fold scores.
np.mean(scores), np.std(scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Random forest on standardized features. Scaling happens inside the pipeline
# so each cross-validation fold fits its own scaler on the training part only,
# instead of reusing statistics computed from the whole training set.
scores = cross_val_score(
    make_pipeline(StandardScaler(),
                  RandomForestRegressor(n_estimators=100, random_state=0)),
    X_train, y_train, cv=10)
# Mean and spread of the per-fold scores.
np.mean(scores), np.std(scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Same forest configuration, this time on the raw (unscaled) training
# features, to compare against the scaled run.
scores = cross_val_score(
    estimator=RandomForestRegressor(n_estimators=100, random_state=0),
    X=X_train,
    y=y_train,
    cv=10,
)
# Mean and spread of the per-fold scores.
scores.mean(), scores.std()

# Categorical Variables

In [None]:
import pandas as pd
# Toy frame: one numeric column and one categorical column stored as strings.
df = pd.DataFrame(dict(
    salary=[103, 89, 142, 54, 63, 219],
    boro=['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx'],
))
df

In [None]:
# One-hot encode the frame with get_dummies' default column selection.
pd.get_dummies(data=df)

In [None]:
# Same toy frame, but the borough is now stored as integer codes.
df = pd.DataFrame(dict(
    salary=[103, 89, 142, 54, 63, 219],
    boro=[0, 1, 0, 2, 2, 3],
))
df

In [None]:
# Default get_dummies call; no columns are named explicitly here.
pd.get_dummies(data=df)

In [None]:
# Name the column to encode explicitly so 'boro' is expanded into indicators.
pd.get_dummies(data=df, columns=["boro"])

### Ensuring consistent encoding

In [None]:
# Toy frame with the borough stored as strings (spelled consistently with the
# earlier example: 'Manhattan').
df = pd.DataFrame({'salary': [103, 89, 142, 54, 63, 219],
                   'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx']})

# Declare the complete set of categories up front, including 'Staten Island',
# which does not occur in this sample, so the encoding does not depend on
# which values happen to be present.
df['boro'] = pd.Categorical(df['boro'],
                            categories=['Manhattan', 'Queens', 'Brooklyn', 'Bronx', 'Staten Island'])
df

In [None]:
# One-hot encode the frame whose 'boro' column is now a Categorical.
pd.get_dummies(data=df)

# Exercises

## Oefening 1
Laad de "adult" dataset met inkomensgegevens uit de volkstelling, waarbij volwassenen worden ingedeeld in degenen die meer dan $50k per jaar verdienen tegenover degenen die minder verdienen.

## Oefening 2

Experimenteer met het visualiseren van de gegevens. Kun je ontdekken welke kenmerken het meest van invloed zijn op het inkomen?

## Oefening 3

Split de data in een trainings- en testset.
Pas dummy encoding en schaling toe.
Hoe heeft dit het aantal variabelen veranderd?

## Oefening 4

Bouw en evalueer een LogisticRegression-model op de data.


In [None]:
# Read adult.csv from the working directory, using its first column as the row index.
data = pd.read_csv(filepath_or_buffer="adult.csv", index_col=0)
