In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib notebook

In [2]:
# Read adult.csv into a DataFrame; index_col=0 makes the file's first column the row index.
data = pd.read_csv("adult.csv", index_col=0)

In [3]:
# Size of the loaded frame as (rows, columns).
data.shape

(32561, 14)

In [4]:
# Preview the first five records to see the column names and how values are formatted.
data.head(n=5)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# Work with a smaller frame: two numeric features, four categorical
# features, and the `income` target.
columns = [
    'age',
    'workclass',
    'education',
    'gender',
    'hours-per-week',
    'occupation',
    'income',
]
data_simple = data.loc[:, columns]

In [7]:
# Confirm that only the selected columns remain.
data_simple.head(n=5)

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [8]:
# Separate the features (everything except `income`) from the target column.
X = data_simple.drop(columns="income")
y = data_simple["income"]

# Dummy variables / one-hot encoding

In [9]:
# One-hot encode the features: each string column is replaced by 0/1
# indicator columns named "<column>_<value>", while the numeric columns
# (age, hours-per-week) are carried over as they are.
# NOTE(review): the space after the underscore in the generated names
# (e.g. "workclass_ ?") suggests the raw category values start with a
# space -- confirm against adult.csv.
X_dummies = pd.get_dummies(data=X)
X_dummies.head(n=5)

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Farming-fishing,occupation_ Handlers-cleaners,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,38,40,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,53,40,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,28,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [10]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for testing (split size left at the library default).
# The seed is fixed so that the split, and every number computed from it in
# the cells below, is identical after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X_dummies, y, random_state=0
)

# Data Scaling

In [11]:
# Per-column minimum of the unscaled training features.
X_train.min(axis=0)

age                              17
hours-per-week                    1
workclass_ ?                      0
workclass_ Federal-gov            0
workclass_ Local-gov              0
workclass_ Never-worked           0
workclass_ Private                0
workclass_ Self-emp-inc           0
workclass_ Self-emp-not-inc       0
workclass_ State-gov              0
workclass_ Without-pay            0
education_ 10th                   0
education_ 11th                   0
education_ 12th                   0
education_ 1st-4th                0
education_ 5th-6th                0
education_ 7th-8th                0
education_ 9th                    0
education_ Assoc-acdm             0
education_ Assoc-voc              0
education_ Bachelors              0
education_ Doctorate              0
education_ HS-grad                0
education_ Masters                0
education_ Preschool              0
education_ Prof-school            0
education_ Some-college           0
gender_ Female              

In [12]:
# Per-column maximum of the unscaled training features; compare with the
# minima above to see how differently the columns are scaled.
X_train.max(axis=0)

age                              90
hours-per-week                   99
workclass_ ?                      1
workclass_ Federal-gov            1
workclass_ Local-gov              1
workclass_ Never-worked           1
workclass_ Private                1
workclass_ Self-emp-inc           1
workclass_ Self-emp-not-inc       1
workclass_ State-gov              1
workclass_ Without-pay            1
education_ 10th                   1
education_ 11th                   1
education_ 12th                   1
education_ 1st-4th                1
education_ 5th-6th                1
education_ 7th-8th                1
education_ 9th                    1
education_ Assoc-acdm             1
education_ Assoc-voc              1
education_ Bachelors              1
education_ Doctorate              1
education_ HS-grad                1
education_ Masters                1
education_ Preschool              1
education_ Prof-school            1
education_ Some-college           1
gender_ Female              

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits so the test data never influences the
# scaling parameters.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

  return self.partial_fit(X, y)


In [14]:
# After scaling, the minimum of every training column should be 0.
np.min(X_train_scaled, axis=0)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [15]:
# ...and the maximum of every training column should be 1.
np.max(X_train_scaled, axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

# Exercises
Load the Boston housing dataset from ``boston_house_prices.csv``. Extract the features (all numeric) and split the data into training and test sets.
Rescale using the ``StandardScaler`` from ``sklearn.preprocessing``, which removes the mean of each feature and scales it to a standard deviation of 1. As before, call ``fit`` on the training set to compute the mean and standard deviation, and use ``transform`` to transform the data. What is the mean of the rescaled training data? What is the mean of the rescaled test data? Why?


In [16]:
# %load solutions/scale_boston.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# we're setting some options for nicer printing here
np.set_printoptions(suppress=True, precision=4)

# Every column except the target MEDV is used as a feature.
data = pd.read_csv("boston_house_prices.csv")
X = data.drop("MEDV", axis=1)
y = data.MEDV

# Seed the split so the means printed below are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(X_train.shape)

# The scaling parameters are estimated from the training rows only and then
# applied, unchanged, to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The training means come out as zero because the scaler was fitted on
# exactly these rows. The test means are only close to zero: the test rows
# were centred with the training statistics, not with their own.
print("X_train_scaled mean:")
print(X_train_scaled.mean(axis=0))
print("X_test_scaled mean:")
print(X_test_scaled.mean(axis=0))


(379, 13)
X_train_scaled mean:
[ 0. -0. -0.  0. -0. -0. -0.  0. -0.  0.  0.  0. -0.]
X_test_scaled mean:
[-0.0979 -0.0222  0.0746  0.2397  0.1415 -0.1196  0.16   -0.112  -0.0498
 -0.0309 -0.079   0.0219  0.0962]


  return self.partial_fit(X, y)
