# Week 8 Homework
- One-hot encoding
- Decision trees
- Random forests

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report 

### Problem 1: Diamonds Dataset

In [None]:
# Load the tab-separated diamonds file, then append natural-log versions of
# the carat and price columns (ln_carat, ln_price) to the frame.
diamonds = pd.read_csv('diamonds.txt', sep='\t')
diamonds = diamonds.assign(
    ln_carat=np.log(diamonds['carat']),
    ln_price=np.log(diamonds['price']),
)
# Preview the first five rows (head() defaults to n=5).
diamonds.head()

In [None]:
# Pull the modelling arrays out of the diamonds frame: one numeric feature
# column, three categorical feature columns, and the log-price target.
numeric_cols = ['ln_carat']
categorical_cols = ['cut', 'color', 'clarity']

X1_num = diamonds[numeric_cols].values
X1_cat = diamonds[categorical_cols].values
y1 = diamonds['ln_price'].values

# Report the shape of each array, one per line.
print(
    f' Numerical Feature Array Shape:   {X1_num.shape}     \n'
    f' Categorical Feature Array Shape: {X1_cat.shape}     \n'
    f' Label Array Shape:               {y1.shape}'
)

In [None]:
# One-hot encode the categorical diamond features into a dense array.
#
# The encoder is left at its default (sparse) output and the result is
# densified with ``toarray()``. The ``sparse=False`` keyword used previously
# was renamed to ``sparse_output`` in scikit-learn 1.2 and removed in 1.4, so
# passing it is rejected by current releases; this form works on old and new
# versions alike. Note that ``encoder.transform`` itself now returns a sparse
# matrix, so any further use of it needs the same ``toarray()`` call.
encoder = OneHotEncoder()
X1_enc = encoder.fit_transform(X1_cat).toarray()

print(f'Encoded Feature Array Shape: {X1_enc.shape}')

In [None]:
# Build the full feature matrix by placing the numeric column first and the
# one-hot columns after it (column-wise concatenation).
X1 = np.concatenate([X1_num, X1_enc], axis=1)
print(f'Feature Array Shape: {X1.shape}')

In [None]:
# Two-stage split: keep 80% of the rows for training, then cut the 20%
# hold-out in half to obtain validation and test sets. Both splits use
# random_state=1 so the partition is reproducible.
X1_train, X1_hold, y1_train, y1_hold = train_test_split(
    X1, y1, test_size=0.2, random_state=1
)
X1_valid, X1_test, y1_valid, y1_test = train_test_split(
    X1_hold, y1_hold, test_size=0.5, random_state=1
)

print(
    f' Training Features Shape:   {X1_train.shape}\n'
    f' Validation Features Shape: {X1_valid.shape}\n'
    f' Test Features Shape:       {X1_test.shape}'
)

### Linear Regression Model with One Feature

In [None]:
# Single-feature baseline: regress the target on column 0 of the feature
# matrix only. Indexing with [0] (a list) keeps the slice two-dimensional,
# as the estimator requires.
carat_train = X1_train[:, [0]]
carat_valid = X1_valid[:, [0]]

dia_mod_1 = LinearRegression().fit(carat_train, y1_train)

mod_1_train_r2 = dia_mod_1.score(carat_train, y1_train)
mod_1_valid_r2 = dia_mod_1.score(carat_valid, y1_valid)
print(
    f' Training r-Squared:   {np.round(mod_1_train_r2, 4)}\n'
    f' Validation r-Squared: {np.round(mod_1_valid_r2, 4)}'
)

### Linear Regression Model with Several Features

In [None]:
# Multi-feature model: regress the target on every column of the feature
# matrix and report r-squared on the training and validation splits.
dia_mod_2 = LinearRegression().fit(X1_train, y1_train)

mod_2_train_r2 = dia_mod_2.score(X1_train, y1_train)
mod_2_valid_r2 = dia_mod_2.score(X1_valid, y1_valid)
print(
    f' Training r-Squared:   {np.round(mod_2_train_r2, 4)}\n'
    f' Validation r-Squared: {np.round(mod_2_valid_r2, 4)}'
)

In [None]:
# Score the multi-feature model on the test split and print it to 4 decimals.
mod_2_test_r2 = dia_mod_2.score(X1_test, y1_test)
print(f'Testing r-Squared: {np.round(mod_2_test_r2, 4)}')

### Problem 2: Census Dataset

In [None]:
# Read the tab-delimited census file and preview its first ten rows.
census = pd.read_csv('census.txt', delimiter='\t')
census.head(n=10)

In [None]:
# Display the (rows, columns) dimensions of the census frame.
census.shape

In [None]:
# Display how many rows carry each distinct value of the salary column.
census['salary'].value_counts()

### Prepare the Data

In [None]:
# Pull the modelling arrays out of the census frame: six numeric feature
# columns, eight categorical feature columns, and the salary label.
X2_num = census[['age', 'fnlwgt', 'educ_num', 'capital_gain', 'capital_loss',
                 'hrs_per_week']].values
X2_cat = census[['workclass', 'education', 'marital_status', 'occupation',
                 'relationship', 'race', 'sex', 'native_country']].values

# Select the label with single brackets so y2 is one-dimensional, shape (n,),
# rather than a column vector of shape (n, 1). scikit-learn estimators expect
# a 1-D target and emit a conversion warning for column vectors; this also
# matches how y1 is built for Problem 1. to_numpy() guarantees a plain NumPy
# array regardless of the column's pandas dtype.
y2 = census['salary'].to_numpy()

print(
    f' Numerical Feature Array Shape:   {X2_num.shape}     \n'
    f' Categorical Feature Array Shape: {X2_cat.shape}     \n'
    f' Label Array Shape:               {y2.shape}'
)

In [None]:
# One-hot encode the categorical census features into a dense array.
#
# The encoder is left at its default (sparse) output and the result is
# densified with ``toarray()``. The ``sparse=False`` keyword used previously
# was renamed to ``sparse_output`` in scikit-learn 1.2 and removed in 1.4, so
# passing it is rejected by current releases; this form works on old and new
# versions alike. Note that ``encoder.transform`` itself now returns a sparse
# matrix, so any further use of it needs the same ``toarray()`` call.
encoder = OneHotEncoder()
X2_enc = encoder.fit_transform(X2_cat).toarray()

print(f'Encoded Feature Array Shape: {X2_enc.shape}')

In [None]:
# Build the full census feature matrix: numeric columns first, one-hot
# columns after them (column-wise concatenation).
X2 = np.concatenate([X2_num, X2_enc], axis=1)
print(f'Feature Array Shape: {X2.shape}')

In [None]:
# Two-stage stratified split: keep 70% of the rows for training, then cut
# the 30% hold-out in half to obtain validation and test sets. Each split is
# stratified on the labels it is dividing and uses random_state=1 so the
# partition is reproducible.
X2_train, X2_hold, y2_train, y2_hold = train_test_split(
    X2, y2, test_size=0.3, random_state=1, stratify=y2
)
X2_valid, X2_test, y2_valid, y2_test = train_test_split(
    X2_hold, y2_hold, test_size=0.5, random_state=1, stratify=y2_hold
)

print(
    f' Training Features Shape:   {X2_train.shape}\n'
    f' Validation Features Shape: {X2_valid.shape}\n'
    f' Test Features Shape:       {X2_test.shape}'
)

### Logistic Regression Model

In [None]:
# Fit an unpenalised logistic regression on the census training split.
#
# ``penalty=None`` replaces the string ``'none'``, which scikit-learn
# deprecated in 1.2 and removed in 1.4; the string form is rejected by
# current releases.
lr_mod = LogisticRegression(solver='lbfgs', max_iter=1000, penalty=None)
lr_mod.fit(X2_train, y2_train)

# NOTE(review): this cell reports accuracy on the test split, whereas the
# decision tree and random forest cells report validation accuracy. Confirm
# that is intended before comparing the three models against each other.
print(
    f' Training Accuracy: {np.round(lr_mod.score(X2_train, y2_train), 4)}\n'
    f' Testing Accuracy:  {np.round(lr_mod.score(X2_test, y2_test), 4)}'
)

### Decision Tree Models

In [None]:
# Sweep max_depth over 2..30, recording training and validation accuracy for
# each depth. The global NumPy seed is reset to 1 before every fit so each
# tree starts from the same seed.
depth_range = list(range(2, 31))
dt_train_acc, dt_valid_acc = [], []

for depth in depth_range:
    np.random.seed(1)
    temp_tree = DecisionTreeClassifier(max_depth=depth).fit(X2_train, y2_train)
    dt_train_acc.append(temp_tree.score(X2_train, y2_train))
    dt_valid_acc.append(temp_tree.score(X2_valid, y2_valid))

# Pick the depth with the highest validation accuracy; argmax returns the
# first maximal index, so ties resolve to the shallowest depth.
dt_idx = np.argmax(dt_valid_acc)
dt_opt_depth = depth_range[dt_idx]

# Refit a tree at the chosen depth (same seed) and report its accuracy.
np.random.seed(1)
tree = DecisionTreeClassifier(max_depth=dt_opt_depth).fit(X2_train, y2_train)
print(
    f' Optimal Value for max_depth:           {dt_opt_depth} \n'
    f' Training Accuracy for Optimal Model:   {np.round(tree.score(X2_train, y2_train), 4)}\n'
    f' Validation Accuracy for Optimal Model: {np.round(tree.score(X2_valid, y2_valid), 4)}'
)

In [None]:
# Plot decision tree training and validation accuracy against max_depth,
# using matplotlib's axes-level interface on a single 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(depth_range, dt_train_acc, label='Training')
ax.plot(depth_range, dt_valid_acc, label='Validation')
ax.set_xlabel('Max Depth')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy as a Function of Depth Hyperparameter')
ax.legend()
ax.grid()
plt.show()

### Random Forest Models

In [None]:
# Sweep max_depth for a 100-tree random forest over the values in
# depth_range, recording training and validation accuracy for each depth.
# The global NumPy seed is reset to 1 before every fit so each forest starts
# from the same seed.
rf_train_acc, rf_valid_acc = [], []

for depth in depth_range:
    np.random.seed(1)
    temp_forest = RandomForestClassifier(n_estimators=100, max_depth=depth)
    temp_forest.fit(X2_train, y2_train)
    rf_train_acc.append(temp_forest.score(X2_train, y2_train))
    rf_valid_acc.append(temp_forest.score(X2_valid, y2_valid))

# Pick the depth with the highest validation accuracy; argmax returns the
# first maximal index, so ties resolve to the earliest entry in depth_range.
rf_idx = np.argmax(rf_valid_acc)
rf_opt_depth = depth_range[rf_idx]

# Refit a forest at the chosen depth (same seed) and report its accuracy.
np.random.seed(1)
forest = RandomForestClassifier(n_estimators=100, max_depth=rf_opt_depth)
forest.fit(X2_train, y2_train)

print(
    f' Optimal Value for max_depth:           {rf_opt_depth} \n'
    f' Training Accuracy for Optimal Model:   {np.round(forest.score(X2_train, y2_train), 4)}\n'
    f' Validation Accuracy for Optimal Model: {np.round(forest.score(X2_valid, y2_valid), 4)}'
)

In [None]:
# Plot random forest training and validation accuracy against max_depth,
# using matplotlib's axes-level interface on a single 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(depth_range, rf_train_acc, label='Training')
ax.plot(depth_range, rf_valid_acc, label='Validation')
ax.set_xlabel('Max Depth')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy as a Function of Depth Hyperparameter')
ax.legend()
ax.grid()
plt.show()

### Evaluate Final Model

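- **Logistic Model**
    - Training Accuracy: 0.7988
    - Testing Accuracy: 0.7943
- **Decision Tree**
    - Training Accuracy for Optimal Model: 0.8585
    - Validation Accuracy for Optimal Model: 0.8565
- **Random Forest**
    - Training Accuracy for Optimal Model: 0.9192
    - Validation Accuracy for Optimal Model: 0.8643

The random forest has the highest held-out accuracy of the three models, so it is used as the final model below.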

In [None]:
# Refit the selected random forest and report its accuracy on the training,
# validation and test splits.
#
# max_depth is taken from rf_opt_depth (the depth chosen by the random forest
# validation sweep) rather than the previously hard-coded 19, so the final
# model stays in sync with the selection if the data, split or seed changes.
np.random.seed(1)
final_model = RandomForestClassifier(n_estimators=100, max_depth=rf_opt_depth)
final_model.fit(X2_train, y2_train)

print(
    f' Training Accuracy for Final Model:   {np.round(final_model.score(X2_train, y2_train), 4)}\n'
    f' Validation Accuracy for Final Model: {np.round(final_model.score(X2_valid, y2_valid), 4)}\n'
    f' Testing Accuracy for Final Model:    {np.round(final_model.score(X2_test, y2_test), 4)}'
)

In [None]:
# Predict on the test split and tabulate the confusion matrix as a labelled
# DataFrame. The same two display labels are used for rows and columns.
# NOTE(review): the labels assume the classifier's class order is
# "<= 50K" then "> 50K" - confirm against the values in the salary column.
class_labels = ['<= 50K', '> 50K']

test_pred = final_model.predict(X2_test)
conf_matrix = pd.DataFrame(
    confusion_matrix(y2_test, test_pred),
    index=class_labels,
    columns=class_labels,
)
conf_matrix

In [None]:
# Print the per-class classification report for the test-set predictions,
# using the same two display labels as the confusion matrix.
test_report = classification_report(
    y2_test,
    test_pred,
    target_names=['<= 50K', '> 50K'],
)
print(test_report)