#  Feature Scaling & Feature Engineering

## Assignment
Perform the feature engineering techniques learned today on the Titanic Dataset. 

### Requirements:

- Create a Logistic Regression using the titanic dataset
- Start by making a model that only uses numerical features first and predicts the "survived" column.
- Make different models utilizing the feature engineering techniques learned in class
- Compare your results between validation and test cost functions and report the percent difference between them.
- Save your model using Joblib.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression


# Cost Functions
from sklearn.metrics import accuracy_score, recall_score, precision_score

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Scalers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler

# Feature Engineering
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


# Plot style
plt.style.use("seaborn-whitegrid")

In [2]:
# Load in Data
df = sns.load_dataset('titanic').dropna()
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True


In [3]:
# drop columns (unneeded)
columns = ['class', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']
df.drop(columns, axis =1, inplace = True)
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who
1,1,1,female,38.0,1,0,71.2833,C,woman
3,1,1,female,35.0,1,0,53.1,S,woman
6,0,1,male,54.0,0,0,51.8625,S,man


In [4]:
# Split Data
train, test = train_test_split(
    df, 
    train_size = .80,
    test_size =.20,
    random_state=42
)

# Split train Data into train and val 
train, val = train_test_split(
    train,
    train_size = .80,
    test_size = .20,
    random_state=42
)

In [5]:
# Model with only numeric features
train.dtypes

survived      int64
pclass        int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
who          object
dtype: object

In [6]:
def cleaner(df):
    "Return Only Numeric Features"
    
    # Create boolean array with only columns that are float64
    columns_bool = (df.dtypes != "category").values & (df.dtypes != "object").values 

    # Return only numeric columns
    df = df.iloc[:, columns_bool].dropna()

    return df

train_numeric = cleaner(train)
val_numeric = cleaner(val)
test_numeric = cleaner(test)

In [7]:
# Create X, y train and validation sets

target = 'survived'

# feature set --> everything except for price --> X_train
# target set/column --> price --> y_train

x_train = train_numeric.drop(target, axis=1)
y_train = train_numeric[target]

x_val = val_numeric.drop(target, axis=1)
y_val = val_numeric[target]

x_test = test_numeric.drop(target, axis=1)
y_test = test_numeric[target]

In [8]:
# Logistic Regression 
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression()

In [9]:
def calc_cost(y_true, y_predict):
    
    "Calculate Cost Functions and print output"
    
    result_dict = {}
    
    acc = accuracy_score(y_true, y_predict)
    rec = recall_score(y_true, y_predict)
    pre= precision_score(y_true, y_predict)
    
    ls = [acc, rec, pre]
    ls2 = ["Accuracy", "Recall", "Precision"]
    
    for x in range(len(ls)):
        print(f"{ls2[x]}: {ls[x]}")
        result_dict[ls2[x]] = ls[x]
    
    return result_dict

# Save results to object and print results
print("\nLogistic Regression")

res1 = calc_cost(y_val, lr.predict(x_val))
res1_t = calc_cost(y_test, lr.predict(x_test))


Logistic Regression
Accuracy: 0.7241379310344828
Recall: 1.0
Precision: 0.7142857142857143
Accuracy: 0.7567567567567568
Recall: 1.0
Precision: 0.7272727272727273


## Feature Engineering


#### One Hot Encoding

In [10]:
train.describe(exclude="number")
# print(train.head(2))

Unnamed: 0,sex,embarked,who
count,116,116,116
unique,2,3,3
top,male,S,man
freq,60,76,56


In [11]:
# Create our One Hot Encoder object
one_hot = OneHotEncoder()

col_names = ['embarked']

# One Hot encode the column
one_hot_df = one_hot.fit_transform(train[col_names]).toarray()
one_hot_df_val = one_hot.transform(val[col_names]).toarray()
one_hot_df_test = one_hot.transform(test[col_names]).toarray()

In [12]:
# Look at the categories
one_hot.categories_

[array(['C', 'Q', 'S'], dtype=object)]

In [13]:
# Create column names list for one hot encoded features
column_names = []

for y in range(len(one_hot.categories_)):
    for z in range(len(one_hot.categories_[y])):
        # print(one_hot.categories_[y][z])
        column_names.append(col_names[y]+"_"+one_hot.categories_[y][z])


column_names

['embarked_C', 'embarked_Q', 'embarked_S']

In [14]:
def col_names(one_hot_model):
    
    "Create columns names list for one hot encoded feature"
    
    column_names = []
    
    col_names = ['embarked']

    for y in range(len(one_hot_model.categories_)):
        for z in range(len(one_hot_model.categories_[y])):
            # print(one_hot.categories_[y][z])
            column_names.append(col_names[y]+"_"+one_hot_model.categories_[y][z])
            
    return column_names

column_names = col_names(one_hot)

In [15]:
# Values for each row
one_hot_df

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0

In [16]:
# Cast One Hot Encoded values into a dataframe

oh_df = pd.DataFrame(
    one_hot_df,
    index=train.index,
    columns = column_names
)

oh_df_val = pd.DataFrame(
    one_hot_df_val,
    index=val.index,
    columns = column_names
)

oh_df_test = pd.DataFrame(
    one_hot_df_test,
    index=test.index,
    columns = column_names
)


print(oh_df.shape)
print(oh_df_val.shape)
print(oh_df_test.shape)

(116, 3)
(29, 3)
(37, 3)


In [17]:
# Create train_new df by merging train and one hot encoded df together and drop color and clarity cols
train_new = train.merge(oh_df, on=train.index).set_index("key_0").drop(['embarked'], axis=1)
val_new = val.merge(oh_df_val, on=val.index).set_index("key_0").drop(['embarked'], axis=1)
test_new = test.merge(oh_df_test, on=test.index).set_index("key_0").drop(['embarked'], axis=1)

print(train_new.shape)
print(val_new.shape)
print(test_new.shape)

(116, 11)
(29, 11)
(37, 11)


#### Ordinal Encoder 


In [18]:
ordinal = OrdinalEncoder()

col_names = ['sex','who']

# Ordinal encode the column
ordinal_ls = ordinal.fit_transform(train[col_names])
ordinal_ls_val = ordinal.transform(val[col_names])
ordinal_ls_test = ordinal.transform(test[col_names])

In [19]:
# Look at categories
ordinal.categories_

[array(['female', 'male'], dtype=object),
 array(['child', 'man', 'woman'], dtype=object)]

In [20]:
# Add oridnally encoded column to the data
train_new['sex'] = ordinal_ls
val_new['sex'] = ordinal_ls_val
test_new['sex'] = ordinal_ls_test

train_new['who'] = ordinal_ls
val_new['who'] = ordinal_ls_val
test_new['who'] = ordinal_ls_test

train_new.sample(5)

Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,who,embarked_C,embarked_Q,embarked_S
key_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
124,0,1,1.0,54.0,0,1,77.2875,1.0,0.0,0.0,1.0
310,1,1,0.0,24.0,0,0,83.1583,0.0,1.0,0.0,0.0
484,1,1,1.0,25.0,1,0,91.0792,1.0,1.0,0.0,0.0
434,0,1,1.0,50.0,1,0,55.9,1.0,0.0,0.0,1.0
632,1,1,1.0,32.0,0,0,30.5,1.0,1.0,0.0,0.0


#### Model Using One Hot and Ordinally Encoded Features

In [21]:
target = "survived"

x_train = train_new.drop(target, axis=1)
y_train = train_new[target]

x_val = val_new.drop(target, axis=1)
y_val = val_new[target]

x_test = test_new.drop(target, axis=1)
y_test = test_new[target]

In [22]:
lr2 = LogisticRegression()
lr2.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [23]:
# cost function 
res2 = calc_cost(y_val, lr2.predict(x_val))
res2_t = calc_cost(y_test, lr2.predict(x_test))

Accuracy: 0.7241379310344828
Recall: 0.85
Precision: 0.7727272727272727
Accuracy: 0.6216216216216216
Recall: 0.6666666666666666
Precision: 0.7272727272727273


## Percent Decrease in the Cost Function

In [24]:
delta_acc = 100*((res1['Accuracy']- res2['Accuracy'])/abs(res1['Accuracy']))
delta_rec = 100*((res1['Recall']- res2['Recall'])/abs(res1['Recall']))
delta_pre = 100*((res1['Precision']- res2['Precision'])/abs(res1['Precision']))

delta_ls = [round(delta_acc), round(delta_rec), round(delta_pre)]
delta_ls2 = ["Acc % difference", "Rec % difference", "Pre % difference"]
    
for x in range(len(delta_ls)):
    if delta_ls[x] > 0:
        print(f"{delta_ls2[x]}: {delta_ls[x] * (-1)}")
    else:
        print(f"{delta_ls2[x]}: {abs(delta_ls[x])}")
        
        

Acc % difference: 0
Rec % difference: -15
Pre % difference: 8


In [25]:
delta_acc_t = 100*((res1_t['Accuracy']- res2_t['Accuracy'])/res1_t['Accuracy'])
delta_rec_t = 100*((res1_t['Recall']- res2_t['Recall'])/res1_t['Recall'])
delta_pre_t = 100*((res1_t['Precision']- res2_t['Precision'])/res1_t['Precision'])

delta_ls_t = [round(delta_acc_t), round(delta_rec_t), round(delta_pre_t)]
delta_ls2_t = ["Acc % difference", "Rec % difference", "Pre % difference"]
    
for x in range(len(delta_ls_t)):
    if delta_ls_t[x] > 0:
        print(f"{delta_ls2_t[x]}: {delta_ls_t[x] * (-1)}")
    else:
        print(f"{delta_ls2_t[x]}: {abs(delta_ls_t[x])}")
        
        
    

Acc % difference: -18
Rec % difference: -33
Pre % difference: 0


## results 

In [26]:
print('Logistic Regression results')
print(f'val {res1}, \ntest{res1_t}\n')
print('Logistic Regression with Feature Engineering')
print(f'val {res2}, \ntest{res2_t}\n')

print('Val % difference')
for x in range(len(delta_ls)):
    if delta_ls[x] > 0:
        print(f"{delta_ls2[x]}: {delta_ls[x] * (-1)}")
    else:
        print(f"{delta_ls2[x]}: {abs(delta_ls[x])}")
    
print('\ntest % difference')    
for x in range(len(delta_ls_t)):
    if delta_ls_t[x] > 0:
        print(f"{delta_ls2_t[x]}: {delta_ls_t[x] * (-1)}")
    else:
        print(f"{delta_ls2_t[x]}: {abs(delta_ls_t[x])}")
    
    
# the test acc has decreased 18 % after applying the feature engineering 

Logistic Regression results
val {'Accuracy': 0.7241379310344828, 'Recall': 1.0, 'Precision': 0.7142857142857143}, 
test{'Accuracy': 0.7567567567567568, 'Recall': 1.0, 'Precision': 0.7272727272727273}

Logistic Regression with Feature Engineering
val {'Accuracy': 0.7241379310344828, 'Recall': 0.85, 'Precision': 0.7727272727272727}, 
test{'Accuracy': 0.6216216216216216, 'Recall': 0.6666666666666666, 'Precision': 0.7272727272727273}

Val % difference
Acc % difference: 0
Rec % difference: -15
Pre % difference: 8

test % difference
Acc % difference: -18
Rec % difference: -33
Pre % difference: 0


## Saving and Loading

[Joblib Docs](https://joblib.readthedocs.io/en/latest/)

In [28]:
# Used for saving out models
import joblib
import seaborn as sns
import datetime

In [29]:
# Getting a time stamp for today

str(datetime.datetime.today())

'2021-07-03 21:37:11.251127'

In [30]:
# Getting only the date from the time stamp

str(datetime.datetime.today())[:10]

'2021-07-03'

In [34]:
# Save the model as todays date

# Note: these will be saved in a file called models
# Note: if the file does not exist --> create it
# Todays date as string
today = str(datetime.datetime.today())[:10].replace("-", "_")

joblib.dump(one_hot, f"./models/one_hot_{today}")
joblib.dump(ordinal, f"./models/ordinal_{today}")
joblib.dump(lr2, f"./models/model_{today}")

['./models/model_2021_07_03']

In [35]:
# Load the model
jl_one_hot = joblib.load(f"./models/one_hot_{today}")
jl_ordinal = joblib.load(f"./models/ordinal_{today}")
jl_model = joblib.load(f"./models/model_{today}")