In [1]:
import pandas as pd
import numpy as np
import sklearn.impute
import sklearn.model_selection
import sklearn.preprocessing
from acquire import get_titanic_data, get_iris_data

## Iris Data

- Use the function defined in acquire.py to load the iris data.

In [2]:
df = get_iris_data()

In [3]:
df.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


- Rename the species_name column to just species.

In [4]:
# Rebind to the renamed frame rather than mutating in place, so the frame shown
# by earlier cells is not changed underneath the reader.
df = df.rename(columns={'species_name': 'species'})
df.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


- Drop the species_id and measurement_id columns.

In [5]:
# The id columns carry no predictive information; rebind instead of using inplace=True.
df = df.drop(columns=['species_id', 'measurement_id'])

In [11]:
# Keep 80% of the rows for training and the rest for testing; random_state pins
# the shuffle so the same split is produced on every run.
train, test = sklearn.model_selection.train_test_split(df, random_state=47, train_size = .8)

In [12]:
train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
108,6.7,2.5,5.8,1.8,virginica
91,6.1,3.0,4.6,1.4,versicolor
107,7.3,2.9,6.3,1.8,virginica
46,5.1,3.8,1.6,0.2,setosa
138,6.0,3.0,4.8,1.8,virginica


- Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?

In [13]:
# One-hot encoder for the species labels. It is fitted on the training split
# only, so the learned categories come from train rather than test.
encoder = sklearn.preprocessing.OneHotEncoder()

encoder.fit(train[['species']])
                  


OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [14]:
encoder.categories_

[array(['setosa', 'versicolor', 'virginica'], dtype=object)]

In [15]:
# Fit on the training split only, then apply the same encoding to both splits.
encoder = sklearn.preprocessing.OneHotEncoder()
encoder.fit(train[['species']])
# One dummy column per category, named after the column being encoded.
cols = ['species_' + c for c in encoder.categories_[0]]

# toarray() gives a plain ndarray (todense() would return an np.matrix).
m = encoder.transform(train[['species']]).toarray()
train = pd.concat([
    train,
    pd.DataFrame(m, columns = cols, index=train.index)
], axis = 1).drop(columns='species')

m = encoder.transform(test[['species']]).toarray()
test = pd.concat([
    test,
    pd.DataFrame(m, columns = cols, index=test.index)
], axis = 1).drop(columns='species')




In [None]:
train.head()

- Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [28]:
def drop_columns(df):
    """Return the iris frame without its ``species_id`` and ``measurement_id`` columns.

    Takes in a DataFrame holding both id columns,
    Returns a new DataFrame without them; the frame passed in is left untouched.
    Raises KeyError if either column is missing.
    """
    # Non-inplace drop: the caller's frame is not mutated, so a cell that calls
    # this on the same raw frame twice does not fail on the second pass.
    return df.drop(columns=['species_id', 'measurement_id'])

def rename_columns(df):
    """Return the iris frame with ``species_name`` renamed to ``species``.

    Takes in a DataFrame,
    Returns a new DataFrame with the column renamed; the frame passed in is
    left untouched. A frame without ``species_name`` comes back with its
    columns unchanged.
    """
    # Non-inplace rename keeps the caller's frame intact.
    return df.rename(columns={'species_name': 'species'})

def encode_species(train, test):
    """One-hot encode the ``species`` column of the train and test splits.

    Takes in train and test DataFrames that each have a ``species`` column,
    Returns the fitted encoder plus new train and test frames in which
    ``species`` is replaced by one ``species_<category>`` column per category.
    The encoder is fitted on train only. With the encoder's default settings a
    category that appears only in test is expected to raise during transform.
    """
    encoder = sklearn.preprocessing.OneHotEncoder().fit(train[['species']])
    # Name the dummy columns after the column being encoded.
    cols = ['species_' + c for c in encoder.categories_[0]]

    def _encode(split):
        # Replace `species` in one split with its dummy columns, keeping the index.
        dummies = pd.DataFrame(
            # toarray() gives a plain ndarray (todense() would return an np.matrix).
            encoder.transform(split[['species']]).toarray(),
            columns=cols,
            index=split.index,
        )
        return pd.concat([split, dummies], axis=1).drop(columns='species')

    return encoder, _encode(train), _encode(test)
    

def prep_iris(df):
    """Run the full iris preparation pipeline on the raw frame.

    Takes in the untransformed iris DataFrame,
    Returns the fitted species encoder and the encoded train and test splits.
    The id columns are dropped, ``species_name`` is renamed, the rows are
    split 80/20 with a fixed seed, and species is one-hot encoded.
    """
    cleaned = rename_columns(drop_columns(df))
    train_split, test_split = sklearn.model_selection.train_test_split(
        cleaned,
        train_size=.8,
        random_state=123,
    )
    return encode_species(train_split, test_split)

In [29]:
df = get_iris_data()
df.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


In [30]:
encoder, train, test = prep_iris(df)

In [31]:
encoder

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

## Titanic Data

- *Use the function you defined in acquire.py to load the titanic data set.*
- *Handle the missing values in the embark_town and embarked columns.*
- *Remove the deck column.*
- *Use a label encoder to transform the embarked column.*
- Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?
- *Fill the missing values in age. The way you fill these values is up to you. Consider the tradeoffs of different methods.*
- Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [None]:
df = get_titanic_data()
df.head(1)

In [None]:
def drop_columns_titanic(df):
    """Return the titanic frame without the ``deck``, ``embarked`` and ``class`` columns.

    Takes in a DataFrame holding all three columns,
    Returns a new DataFrame without them; the frame passed in is left untouched.
    Raises KeyError if any of the columns is missing.
    """
    # Non-inplace drop so the caller's raw frame survives and the cell can be re-run.
    return df.drop(columns=['deck', 'embarked', 'class'])
df = drop_columns_titanic(df)

In [None]:
# Split before imputing/encoding/scaling so those steps can be fitted on the
# training rows only. 80% of rows go to train; random_state fixes the shuffle.
train, test = sklearn.model_selection.train_test_split(df, random_state=123, train_size=.8)

In [None]:
train.head(1)

In [None]:
def impute_embark_town(train, test, fill_value="Southampton"):
    """Fill missing ``embark_town`` values in both splits with a constant.

    Takes in train and test DataFrames that each have an ``embark_town`` column,
    and an optional fill_value (defaults to "Southampton").
    Returns new train and test frames with the gaps filled; the frames passed
    in are left untouched.
    """
    # assign() builds new frames, which avoids writing into the slices produced
    # by train_test_split (the source of SettingWithCopyWarning).
    train = train.assign(embark_town=train['embark_town'].fillna(fill_value))
    test = test.assign(embark_town=test['embark_town'].fillna(fill_value))
    return train, test

In [None]:
train, test = impute_embark_town(train, test)
train.head()

In [None]:
def encode_embark_town(train, test):
    """One-hot encode the ``embark_town`` column of the train and test splits.

    Takes in train and test DataFrames that each have an ``embark_town`` column,
    Returns train and test with ``embark_town`` replaced by one
    ``embark_town_<category>`` column per category seen in train.
    The encoder is fitted on train only and is not returned.
    """
    source_col = 'embark_town'
    ohe = sklearn.preprocessing.OneHotEncoder()
    ohe.fit(train[[source_col]])
    dummy_names = ['embark_town_' + category for category in ohe.categories_[0]]

    def with_dummies(frame):
        # Append the dummy columns (aligned on the frame's index), then drop the source column.
        dense = ohe.transform(frame[[source_col]]).todense()
        dummies = pd.DataFrame(dense, columns=dummy_names, index=frame.index)
        return pd.concat([frame, dummies], axis=1).drop(columns=source_col)

    train_encoded = with_dummies(train)
    test_encoded = with_dummies(test)
    return train_encoded, test_encoded

In [None]:
train, test = encode_embark_town(train, test)

In [None]:
def impute_age(train, test):
    """Fill missing ``age`` values in both splits with the mean age of train.

    Takes in train and test DataFrames that each have an ``age`` column,
    Returns new train and test frames with ``age`` imputed; the frames passed
    in are left untouched.
    The imputer is fitted on train only, so test is filled with train's mean.
    """
    imputer = sklearn.impute.SimpleImputer(strategy='mean')
    imputer.fit(train[['age']])
    # transform() returns an (n, 1) array; ravel() flattens it to the 1-D shape
    # a single column expects. assign() builds new frames instead of writing
    # into the slices produced by train_test_split.
    train = train.assign(age=imputer.transform(train[['age']]).ravel())
    test = test.assign(age=imputer.transform(test[['age']]).ravel())
    return train, test

In [None]:
train, test = impute_age(train, test)
train.age.isnull().sum()

In [None]:
def scale_titanic(train, test):
    """Add min-max scaled copies of ``age`` and ``fare`` to both splits.

    Takes in train and test DataFrames that each have ``age`` and ``fare`` columns,
    Returns the fitted scaler plus new train and test frames carrying two extra
    columns, ``age_scaled`` and ``fare_scaled``; the frames passed in are left
    untouched.
    The scaler is fitted on train only, so test values can fall outside [0, 1].
    """
    scaler = sklearn.preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1))

    scaler = scaler.fit(train[['age', 'fare']])

    # Work on copies so the new columns are not written into the slices
    # produced by train_test_split.
    train, test = train.copy(), test.copy()

    train[['age_scaled', 'fare_scaled']] = scaler.transform(train[['age', 'fare']])

    test[['age_scaled', 'fare_scaled']] = scaler.transform(test[['age', 'fare']])

    return scaler, train, test

train.head()

In [None]:
def min_max_scaler(X_train, X_test):
    """Min-max scale every column of two feature frames into the range (0, 1).

       Takes in X_train and X_test as DataFrames with the same columns,
       Returns the fitted scaler, then X_train and X_test scaled, each as a
       DataFrame that keeps its original columns and index.
       The scaler is fitted on X_train only. Sensitive to outliers.
    """
    fitted = sklearn.preprocessing.MinMaxScaler(copy=True, feature_range=(0,1))
    fitted.fit(X_train)

    def _as_frame(features):
        # transform() returns a bare array; restore the labels of the input frame.
        return pd.DataFrame(
            fitted.transform(features),
            columns=features.columns,
            index=features.index,
        )

    train_scaled = _as_frame(X_train)
    test_scaled = _as_frame(X_test)
    return fitted, train_scaled, test_scaled

In [None]:
# Double brackets keep each selection a one-column DataFrame, which the scaler
# helper needs for its .columns / .index lookups.
X_train = train[['age']]
X_test = test[['age']]

# min_max_scaler returns three values (scaler, scaled train, scaled test);
# unpacking into only two names raises ValueError.
scaler, X_train_scaled, X_test_scaled = min_max_scaler(X_train, X_test)