# Data Analysis

**Import Libraries**

In [1]:
# Import Relevant Libraries
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import scale

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
%reload_ext autoreload

In [3]:
# Import sys so the module search path can be extended below
import sys
# Add the project's python/ and python/mklearn/ folders to the module search path.
# NOTE(review): these are absolute paths on one user's machine, so this cell will
# not find the project modules anywhere else -- consider a path relative to the
# notebook or a configurable project root.
sys.path.append('/Users/Masa/Documents/Data Science/Metis/6. Projects/Project-McNulty/python')
sys.path.append('/Users/Masa/Documents/Data Science/Metis/6. Projects/Project-McNulty/python/mklearn')

**Import Project Python Modules**

In [4]:
# Import Python File
import data_visualization as dv
import data_import as di
from mklearn.mklearn import KNearestNeighbors_MK

## Import Data

**Import Labeled Data**

In [5]:
# Load the labeled data through the project's data_import helper
# (the preview below shows a 'Labels' column followed by feature columns 0-3071)
df = di.create_dataframe()

In [6]:
# Preview the first three rows of the data as loaded (not yet sorted)
df.head(n=3)

Unnamed: 0,Labels,0,1,2,3,4,5,6,7,8,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
0,6,59,43,50,68,98,119,139,145,149,...,59,58,65,59,46,57,104,140,84,72
1,9,154,126,105,102,125,155,172,180,142,...,22,42,67,101,122,133,136,139,142,144
2,9,255,253,253,253,253,253,253,253,253,...,78,83,80,69,66,72,79,83,83,84


In [7]:
# Order the rows by class label; reset_index(drop=True) discards the old row
# labels so the sorted frame is indexed 0..n-1
df_sorted = df.sort_values(by='Labels').reset_index(drop=True)

In [8]:
# Preview the first three rows after sorting by label
df_sorted.head(n=3)

Unnamed: 0,Labels,0,1,2,3,4,5,6,7,8,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
0,0,112,109,108,106,102,99,99,99,99,...,35,45,63,80,70,61,62,60,74,89
1,0,184,182,182,184,185,186,186,187,187,...,139,139,138,134,133,133,133,133,133,136
2,0,202,200,200,200,200,200,200,201,201,...,76,74,74,76,77,77,74,74,74,75


**Target Classes**

In [9]:
# Human-readable class names.
# NOTE(review): presumably position i names integer label i
# (0 = airplane ... 9 = truck) -- confirm against the data source.
classes = ['airplane','automobile','bird','cat','deer','dog','frog','horse','ship','truck']

**Define X (Features) & Y (Target)**

In [10]:
# Features: every column after the first (the preview above shows 'Labels' as the first column)
X = df_sorted.iloc[:,1:]

In [11]:
# Target: the first column, which the preview above shows to be 'Labels'
y = df_sorted.iloc[:,0]

**Create RBF Arrays**

In [12]:
# Build an array from the unsorted data with the project helper.
# NOTE(review): what 'rbf' denotes and the array layout are defined in
# data_import, which is not shown here.
rbf = di.create_rbf_array(df)

In [13]:
# Sorted counterpart of the array created in the previous cell.
# NOTE(review): this passes the unsorted `df` rather than `df_sorted`;
# presumably the helper sorts internally -- confirm in data_import.
rbf_sorted = di.create_rbf_array_sorted(df)

# Create Models

**Standardize X (Features)**

In [14]:
# Standardize the features with sklearn's scale() and wrap the result in a new
# DataFrame (which gets a default 0..n-1 row index and integer column labels).
# NOTE(review): scaling is applied to all rows before the train/test split
# below, so held-out rows contribute to the scaling statistics.
X_scale = pd.DataFrame(scale(X))

**Train Test Split**

In [15]:
# Hold out 2% of the standardized data as the test set (seeded for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X_scale, y, test_size=0.02, random_state=2
)
# Peel the first 1000 rows of the remaining data off as a validation set;
# the right-hand sides are evaluated before either name is rebound
X_val, X_train = X_train.iloc[:1000, :], X_train.iloc[1000:, :]
y_val, y_train = y_train.iloc[:1000], y_train.iloc[1000:]

## Create Random Forest Model Using scikit-learn

In [16]:
from sklearn.ensemble import RandomForestClassifier

### Iteration 1 - Preliminary Values

In [17]:
# Baseline forest: 100 trees, depth capped at 50, sqrt(n_features) candidate
# features per split; n_jobs=-1 uses every core, random_state fixes the seed
rf_1 = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    max_features='sqrt',
    n_jobs=-1,
    random_state=2,
)
# Kept as the cell's last expression so the fitted estimator is displayed
rf_1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=2,
            verbose=0, warm_start=False)

In [18]:
# Score the baseline forest on the held-out test set
y_pred_1 = rf_1.predict(X_test)
accuracy_1 = accuracy_score(y_true=y_test, y_pred=y_pred_1)
print('Accuracy of First Iteration:', accuracy_1)

Accuracy of First Iteration: 0.478


### Iteration 2 - Try Grid Search to Optimize Parameters

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
# Larger forest (400 trees) used as the base estimator for the search
rf_2 = RandomForestClassifier(
    n_estimators=400,
    max_depth=50,
    n_jobs=-1,
    random_state=2,
)
# 4 x 3 = 12 candidate settings, each scored with 3-fold cross-validation
param_grid = {
    'max_features': [50, 100, 200, 400],
    'min_samples_leaf': [1, 5, 10],
}
gridsearch = GridSearchCV(
    estimator=rf_2,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
# Long-running: the saved output below reports roughly 386 minutes for the 36 fits
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] min_samples_leaf=1, max_features=50 .............................
[CV] min_samples_leaf=1, max_features=50 .............................
[CV] min_samples_leaf=1, max_features=50 .............................
[CV] min_samples_leaf=5, max_features=50 .............................
[CV] .................... min_samples_leaf=5, max_features=50 -  13.5s
[CV] min_samples_leaf=5, max_features=50 .............................
[CV] .................... min_samples_leaf=1, max_features=50 -  25.0s
[CV] .................... min_samples_leaf=1, max_features=50 -  23.8s
[CV] .................... min_samples_leaf=1, max_features=50 -  25.5s
[CV] min_samples_leaf=5, max_features=50 .............................
[CV] min_samples_leaf=10, max_features=50 ............................
[CV] min_samples_leaf=10, max_features=50 ............................
[CV] .................... min_samples_leaf=5, max_features=50 -   9.8s
[CV] min_samples

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 385.8min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=400, n_jobs=-1, oob_score=False, random_state=2,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_samples_leaf': [1, 5, 10], 'max_features': [50, 100, 200, 400]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [37]:
# Show the parameter combination that scored best in the grid search
gridsearch.best_params_

{'max_features': 400, 'min_samples_leaf': 1}

In [32]:
# Score the grid-searched forest on the held-out test set
y_pred_2 = gridsearch.predict(X_test)
accuracy_2 = accuracy_score(y_true=y_test, y_pred=y_pred_2)
print('Accuracy of Second Iteration:', accuracy_2)

Accuracy of Second Iteration: 0.51


# Save Predictions

In [22]:
# Start with an empty frame; the prediction columns are added in the next cell
pred_df = pd.DataFrame()

In [33]:
# Row labels of the test samples, kept so each prediction can be traced back to its source row
pred_df['Indices'] = y_test.index
# Ground-truth labels; .values strips the index so the assignment is positional
# rather than aligned on row labels
pred_df['Y True'] = y_test.values
# Test-set predictions from the two random forest iterations
pred_df['Y Pred RF Iter 1'] = y_pred_1
pred_df['Y Pred RF Iter 2'] = y_pred_2

In [36]:
# Preview the first ten rows of the predictions dataframe
pred_df.head(n=10)

Unnamed: 0,Indices,Y True,Y Pred RF Iter 1,Y Pred RF Iter 2
0,23656,4,4,4
1,27442,5,5,5
2,40162,8,8,8
3,8459,1,1,1
4,8051,1,9,1
5,42404,8,5,3
6,89,0,0,0
7,1461,0,9,9
8,13519,2,3,3
9,42536,8,8,2


In [35]:
# Persist the predictions frame as a pickle file; the path is relative to the
# notebook's working directory
pred_df.to_pickle('../data/pickled_predictions/pred_rf.pkl')

# Work in Progress

## Create Random Forest Model From Scratch

In [140]:
class RandomForest_MK(object):
    """Work-in-progress building blocks for a from-scratch random forest.

    Currently provides the best-split search for a single tree node
    (``get_split``) and the weighted Gini impurity it relies on
    (``gini_impurity``).
    """

    def get_split(self, X, y, max_features):
        """Find the best single-feature threshold split of ``(X, y)``.

        A random subset of ``max_features`` columns is drawn without
        replacement. For each drawn column, every distinct value is tried as
        a threshold (rows with value <= threshold go left, the rest go right)
        and the split with the lowest weighted Gini impurity is kept.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix, one row per sample.
        y : pandas.Series or array-like
            Class labels. A Series is aligned to ``X`` by row label; any other
            array-like is taken to be in the same row order as ``X``.
        max_features : int
            Number of columns to sample as split candidates.

        Returns
        -------
        dict
            ``'Gini Impurity'`` (impurity of the best split), ``'Index'``
            (positional column index of the chosen feature) and
            ``'Split Value'`` (the chosen threshold).

        Raises
        ------
        ValueError
            Raised by ``np.random.choice`` when ``max_features`` exceeds the
            number of columns in ``X``.
        """
        n = X.shape[1]
        # List of Class Labels
        labels = list(set(y))
        # Create List of Random (positional) Column Indices
        indices = np.random.choice(n, max_features, replace=False)

        # Align the labels with X's rows once, then work on plain arrays so
        # the inner loop never depends on index labels.
        if isinstance(y, pd.Series):
            y_values = y.loc[X.index].values
        else:
            y_values = np.asarray(y)

        # Initialize Best Split
        best_gini = 1e10
        best_index = 0
        best_split = 0

        for col in indices:
            # `col` is a position, so it must be looked up with iloc, not loc
            X_temp = X.iloc[:, col]
            column_values = X_temp.values
            # Duplicate values produce identical splits; unique() keeps
            # first-appearance order, so ties resolve as a row-by-row scan would.
            for threshold in X_temp.unique():
                # Separate into Left & Right Branch
                goes_left = column_values <= threshold
                y_left = y_values[goes_left]
                y_right = y_values[~goes_left]

                # Calculate Gini Impurity
                gini = self.gini_impurity(y_left, y_right, labels)

                if gini < best_gini:
                    best_gini = gini
                    best_index = col
                    best_split = threshold

        return {'Gini Impurity': best_gini,
                'Index': best_index,
                'Split Value': best_split}

    def gini_impurity(self, y_left, y_right, class_labels):
        """Weighted Gini impurity of a two-way split.

        Each branch's impurity is ``1 - sum_k p_k**2`` where ``p_k`` is the
        share of class ``k`` within that branch; the two branch impurities are
        then averaged, weighted by branch size.

        Parameters
        ----------
        y_left, y_right : array-like
            Class labels of the samples sent to the left / right branch.
        class_labels : iterable
            The class labels to account for.

        Returns
        -------
        float
            Impurity between 0.0 (both branches pure) and ``1 - 1/K`` for K
            classes. Returns 0.0 when both branches are empty.
        """
        n_left, n_right = len(y_left), len(y_right)
        # Number of Total Samples (taken from the arguments, not a global)
        num_obs = n_left + n_right
        if num_obs == 0:
            return 0.0

        # Initialize Gini Impurity
        gini_impurity = 0.0

        for branch, branch_size in ((y_left, n_left), (y_right, n_right)):
            # An empty branch carries no weight and has no class shares
            if branch_size == 0:
                continue
            branch_labels = np.asarray(branch)
            sum_squared_shares = 0.0
            for class_label in class_labels:
                share = np.count_nonzero(branch_labels == class_label) / float(branch_size)
                sum_squared_shares += share * share
            gini_impurity += (branch_size / float(num_obs)) * (1.0 - sum_squared_shares)

        return gini_impurity