In [2]:
import pandas as pd
import numpy as np
import sklearn.impute
import sklearn.model_selection
import sklearn.preprocessing
from acquire import get_titanic_data, get_iris_data

## Iris Data

- Use the function defined in acquire.py to load the iris data.

In [None]:
df = get_iris_data()

In [None]:
df.head()

- Rename the species_name column to just species.

In [None]:
df.rename(columns = {'species_name':'species'}, inplace = True)
df.head()

- Drop the species_id and measurement_id columns.

In [None]:
df.drop(columns = ['species_id', 'measurement_id'], inplace = True)

In [None]:
train, test = sklearn.model_selection.train_test_split(df, random_state=47, train_size = .8)

In [None]:
train.head()

- Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?

In [None]:
encoder = sklearn.preprocessing.OneHotEncoder()

encoder.fit(train[['species']])
                  


In [None]:
encoder.categories_

In [None]:
encoder = sklearn.preprocessing.OneHotEncoder()
encoder.fit(train[['species']])
cols = ['embark_town_' + c for c in encoder.categories_[0]]

m = encoder.transform(train[['species']]).todense()
train = pd.concat([
    train,
    pd.DataFrame(m, columns = cols, index=train.index)
], axis = 1).drop(columns='species')

m = encoder.transform(test[['species']]).todense()
test = pd.concat([
    test,
    pd.DataFrame(m, columns = cols, index=test.index)
], axis = 1).drop(columns='species')




In [None]:
train.head()

- Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [None]:
def drop_columns(df):
    df.drop(columns = ['species_id', 'measurement_id'], inplace = True)
    return df

def rename_columns(df):
    df.rename(columns = {'species_name':'species'}, inplace = True)
    return df

def encode_species(train, test):
    encoder = sklearn.preprocessing.OneHotEncoder().fit(train[['species']])
    cols = ['embark_town_' + c for c in encoder.categories_[0]]

    m = encoder.transform(train[['species']]).todense()
    train = pd.concat([
        train,
        pd.DataFrame(m, columns = cols, index=train.index)
    ], axis = 1).drop(columns='species')

    m = encoder.transform(test[['species']]).todense()
    test = pd.concat([
        test,
        pd.DataFrame(m, columns = cols, index=test.index)
    ], axis = 1).drop(columns='species')    
    
    return encoder, train, test
    

def prep_iris(df):
    df = drop_columns(df)
    df = rename_columns(df)
    train, test = sklearn.model_selection.train_test_split(df, random_state=123, train_size = .8)
    encoder, train, test = encode_species(train, test)
    return encoder, train, test

In [None]:
df = get_iris_data()
df.head()

In [None]:
encoder, train, test = prep_iris(df)

In [None]:
encoder

In [None]:
train.head()

## Titanic Data

- *Use the function you defined in acquire.py to load the titanic data set.*
- *Handle the missing values in the embark_town and embarked columns.*
- *Remove the deck column.*
- *Use a label encoder to transform the embarked column.*
- Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?
- *Fill the missing values in age. The way you fill these values is up to you. Consider the tradeoffs of different methods.*
- Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [1]:
df = get_titanic_data()
df.head(1)

NameError: name 'get_titanic_data' is not defined

In [4]:
def drop_columns_titanic(df):
    df.drop(columns = ['deck', 'embarked', 'class'], inplace = True)
    return df
df = drop_columns_titanic(df)

In [5]:
train, test = sklearn.model_selection.train_test_split(df, random_state=123, train_size=.8)

In [6]:
train.head(1)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
329,329,1,1,female,16.0,0,1,57.9792,Cherbourg,0


In [7]:
def impute_embark_town(train, test):
    train.embark_town = train.embark_town.fillna("Southampton")
    test.embark_town = test.embark_town.fillna("Southampton")
    return train, test

In [8]:
train, test = impute_embark_town(train, test)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embark_town,alone
329,329,1,1,female,16.0,0,1,57.9792,Cherbourg,0
749,749,0,3,male,31.0,0,0,7.75,Queenstown,1
203,203,0,3,male,45.5,0,0,7.225,Cherbourg,1
421,421,0,3,male,21.0,0,0,7.7333,Queenstown,1
97,97,1,1,male,23.0,0,1,63.3583,Cherbourg,0


In [9]:
def encode_embark_town(train, test):
    encoder = sklearn.preprocessing.OneHotEncoder().fit(train[['embark_town']])
    
    cols = ['embark_town_' + c for c in encoder.categories_[0]]

    m = encoder.transform(train[['embark_town']]).todense()
    train = pd.concat([
        train,
        pd.DataFrame(m, columns=cols, index=train.index)
    ], axis=1).drop(columns='embark_town')
    
    m = encoder.transform(test[['embark_town']]).todense()
    test = pd.concat([
        test,
        pd.DataFrame(m, columns=cols, index=test.index)
    ], axis=1).drop(columns='embark_town')
    return encoder, train, test

In [10]:
encoder, train, test = encode_embark_town(train, test)

In [11]:
train.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,alone,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
329,329,1,1,female,16.0,0,1,57.9792,0,1.0,0.0,0.0
749,749,0,3,male,31.0,0,0,7.75,1,0.0,1.0,0.0
203,203,0,3,male,45.5,0,0,7.225,1,1.0,0.0,0.0
421,421,0,3,male,21.0,0,0,7.7333,1,0.0,1.0,0.0
97,97,1,1,male,23.0,0,1,63.3583,0,1.0,0.0,0.0


In [12]:
def impute_age(train, test):
    imputer = sklearn.impute.SimpleImputer(strategy='mean')
    imputer.fit(train[['age']])
    train.age = imputer.transform(train[['age']])
    test.age = imputer.transform(test[['age']])
    return train, test

In [13]:
train, test = impute_age(train, test)
train.age.isnull().sum()

0

In [None]:
# def scale_titanic(train, test):

#     scaler = sklearn.preprocessing.MinMaxScaler(copy=True).fit(train[['age', 'fare']])

#     train_scaled = pd.DataFrame(scaler.transform(train[['age', 'fare']]), columns={'age_scaled', 'fare_scaled'})

#     test_scaled = pd.DataFrame(scaler.transform(test[['age', 'fare']]), columns={'age_scaled', 'fare_scaled'})
    
# #     train = pd.merge([train, train_scaled], )
# #     test = pd.merge([test, test_scaled])
#     train.merge(train_scaled, how='inner', on='index')
#     test.merge(test_scaled, how='inner', on='index')
#     return scaler, train, test

In [14]:
def scale_columns(train, test):
    scaler = sklearn.preprocessing.MinMaxScaler()
    train[['age','fare']] = scaler.fit_transform(train[['age','fare']])
    test[['age','fare']] = scaler.transform(test[['age','fare']])
    return scaler, train, test

In [15]:
scaler, train, test = scale_columns(train, test)

In [16]:
train

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,alone,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
329,329,1,1,female,0.195778,0,1,0.113168,0,1.0,0.0,0.0
749,749,0,3,male,0.384267,0,0,0.015127,1,0.0,1.0,0.0
203,203,0,3,male,0.566474,0,0,0.014102,1,1.0,0.0,0.0
421,421,0,3,male,0.258608,0,0,0.015094,1,0.0,1.0,0.0
97,97,1,1,male,0.283740,0,1,0.123667,0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
98,98,1,2,female,0.421965,0,1,0.044893,0,0.0,0.0,1.0
322,322,1,2,female,0.371701,0,0,0.024106,1,0.0,1.0,0.0
382,382,0,3,male,0.396833,0,0,0.015469,1,0.0,0.0,1.0
365,365,0,3,male,0.371701,0,0,0.014151,1,0.0,0.0,1.0


In [None]:
train.head()

In [None]:
def min_max_scaler(X_train, X_test):
    """Transforms features by scaling each feature to a given range.
       Takes in X_train and X_test,
       Returns the scaler and X_train_scaled and X_test_scaled within range.
       Sensitive to outliers.
    """
    scaler = (sklearn.preprocessing.MinMaxScaler(copy=True, 
                           feature_range=(0,1))
                          .fit(X_train))
    X_train_scaled = (pd.DataFrame(scaler.transform(X_train), 
                      columns=X_train.columns, 
                      index=X_train.index))
    X_test_scaled = (pd.DataFrame(scaler.transform(X_test), 
                     columns=X_test.columns,
                     index=X_test.index))
    return scaler, X_train_scaled, X_test_scaled

In [None]:
X_train = train[['age']]
X_test = test[['age']]

X_train_scaled, X_test_scaled = min_max_scaler(X_train, X_test)

In [5]:
df = get_titanic_data()

def drop_columns_titanic(df):
    df.drop(columns = ['deck', 'embarked', 'class'], inplace = True)
    return df

def impute_embark_town(train, test):
    train.embark_town = train.embark_town.fillna("Southampton")
    test.embark_town = test.embark_town.fillna("Southampton")
    return train, test

def encode_embark_town(train, test):
    encoder = sklearn.preprocessing.OneHotEncoder().fit(train[['embark_town']])
    
    cols = ['embark_town_' + c for c in encoder.categories_[0]]

    m = encoder.transform(train[['embark_town']]).todense()
    train = pd.concat([
        train,
        pd.DataFrame(m, columns=cols, index=train.index)
    ], axis=1).drop(columns='embark_town')
    
    m = encoder.transform(test[['embark_town']]).todense()
    test = pd.concat([
        test,
        pd.DataFrame(m, columns=cols, index=test.index)
    ], axis=1).drop(columns='embark_town')
    return encoder, train, test

def impute_age(train, test):
    imputer = sklearn.impute.SimpleImputer(strategy='mean')
    imputer.fit(train[['age']])
    train.age = imputer.transform(train[['age']])
    test.age = imputer.transform(test[['age']])
    return train, test

def scale_columns(train, test):
    scaler = sklearn.preprocessing.MinMaxScaler()
    train[['age','fare']] = scaler.fit_transform(train[['age','fare']])
    test[['age','fare']] = scaler.transform(test[['age','fare']])
    return scaler, train, test

def prep_titanic(df):
    df = drop_columns_titanic(df)
    train, test = sklearn.model_selection.train_test_split(df, random_state=123, train_size=.8)
    train, test = impute_embark_town(train, test)
    encoder, train, test = encode_embark_town(train, test)
    train, test = impute_age(train, test)
    scaler, train, test = scale_columns(train, test)
    return scaler, encoder, train, test
    

In [6]:
scaler, encoder, train, test = prep_titanic(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 329 to 510
Data columns (total 12 columns):
passenger_id               712 non-null int64
survived                   712 non-null int64
pclass                     712 non-null int64
sex                        712 non-null object
age                        712 non-null float64
sibsp                      712 non-null int64
parch                      712 non-null int64
fare                       712 non-null float64
alone                      712 non-null int64
embark_town_Cherbourg      712 non-null float64
embark_town_Queenstown     712 non-null float64
embark_town_Southampton    712 non-null float64
dtypes: float64(5), int64(6), object(1)
memory usage: 72.3+ KB
