In [1]:
import pandas as pd
import numpy as np
import os#, time, random

from sklearn.model_selection import StratifiedShuffleSplit, train_test_split,cross_val_score, cross_val_predict
from sklearn.metrics import classification_report,confusion_matrix, precision_score, recall_score


#### Identify Directories

In [2]:
# Resolve the project folders relative to the current working directory.
currDir = os.getcwd()
rootDir = os.path.abspath(os.path.join(currDir, os.pardir))

# Data lives under <root>/data, split into raw / interim / final stages.
dataDir = os.path.abspath(os.path.join(rootDir, 'data'))
rawDataDir, interimDataDir, finalDataDir = (
    os.path.abspath(os.path.join(dataDir, stage))
    for stage in ('raw', 'interim', 'final')
)

#### Helper Functions

In [3]:
def evaluate_model(model, X_train, y_train, cv_folds=5):
    """Cross-validate ``model`` on the training data and print a summary.

    Parameters
    ----------
    model
        Estimator handed to ``cross_val_predict`` and ``cross_val_score``.
    X_train, y_train
        Training features and labels.
    cv_folds
        Forwarded as ``cv`` to both cross-validation calls.

    Returns
    -------
    (predictions, conf_matrix)
        ``predictions`` is the result of ``cross_val_predict``.
        ``conf_matrix`` is the text returned by ``classification_report``;
        despite its name and the printed "Confusion Matrix:" heading it is
        a per-class precision/recall/F1 report, not a confusion matrix.
    """
    # Cross-validated predictions for the training rows.
    predictions = cross_val_predict(model, X_train, y_train, cv=cv_folds)

    # Per-fold accuracy; this is a second, separate cross-validation run.
    cv_accuracies = cross_val_score(
        model, X_train, y_train, cv=cv_folds, scoring="accuracy"
    )
    mean_accuracy = np.round(np.mean(cv_accuracies), 2)

    conf_matrix = classification_report(y_train, predictions)

    #### PRINT KEY DATA ####
    print(f"{cv_folds} Cross Validations")
    print("Mean CV Accuracy", mean_accuracy)
    print("CV Accuracies:", cv_accuracies)
    print("\n")

    print("Confusion Matrix:")
    print(conf_matrix)

    #### RETURN DATA
    return predictions, conf_matrix

#### Getting Data

In [4]:
MAX_INT = 100        # operands are drawn from 0..MAX_INT inclusive

LEN = 10_000         # rows generated per operation
DIMS = (LEN, 2)      # two operand columns per row
SEED = 42            # fixed seed so the synthetic data is reproducible

cols = ["A", "B"]

# Seed NumPy's global generator once, right before the first draw, so a
# fresh "Restart & Run All" always produces the same four frames.
np.random.seed(SEED)


def make_operation_frame(operation, label):
    """Return a frame of random operands, their result and the operation name.

    Parameters
    ----------
    operation : callable
        Takes the ``A`` and ``B`` columns and returns the result column.
    label : str
        Value stored in the ``Label`` column for every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``A``, ``B`` (random integers in ``0..MAX_INT``), ``C``
        (``operation(A, B)``) and ``Label``; ``LEN`` rows.
    """
    frame = pd.DataFrame(np.random.randint(MAX_INT + 1, size=DIMS), columns=cols)
    frame['C'] = operation(frame['A'], frame['B'])
    frame['Label'] = label
    return frame


# The frames are built in a fixed order because each one consumes the next
# draws from the seeded generator.
addition = make_operation_frame(lambda a, b: a + b, 'Addition')
subtraction = make_operation_frame(lambda a, b: a - b, 'Subtraction')
multiplication = make_operation_frame(lambda a, b: a * b, 'Multiplication')

# B can be 0: pandas does not raise on division by zero, it yields inf
# (or NaN for 0/0). Those rows are filtered out when the frames are combined.
division = make_operation_frame(lambda a, b: a / b, 'Division')

In [5]:
# Stack the four per-operation frames into one table.
df_list = [addition, subtraction, multiplication, division]

df = (
    pd.concat(df_list, ignore_index=True)
    # drop rows whose result 'C' is infinite (division by zero)
    .loc[lambda frame: frame['C'] != np.inf]
    # shuffle the rows reproducibly
    .sample(frac=1, random_state=42)
    # drop rows containing NULLs
    .dropna()
    # round numeric columns to two decimals and renumber the rows 0..n-1
    .round(2)
    .reset_index(drop=True)
)

print(df.shape)

(39903, 4)


In [6]:
# Preview the first five rows of the combined, shuffled frame.
df.head()

Unnamed: 0,A,B,C,Label
0,95,48,143.0,Addition
1,83,8,10.38,Division
2,48,11,59.0,Addition
3,68,50,1.36,Division
4,20,24,-4.0,Subtraction


#### Prep Data for ML

In [7]:
# Stratified 70/30 shuffle split: stratifying on 'Label' keeps each
# operation's share the same in the train and test sets.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_idx, test_idx = next(splitter.split(df, df['Label']))

# split() yields positional indices, so rows are selected by position (.iloc)
# rather than by index label (.loc). The two only coincide while df carries a
# fresh 0..n-1 index.
strat_train_set = df.iloc[train_idx]
strat_test_set = df.iloc[test_idx]

# create train/test arrays from the stratified shuffle
feature_cols = ['A', 'B', 'C']
X_train, X_test = strat_train_set[feature_cols].values, strat_test_set[feature_cols].values
y_train, y_test = strat_train_set['Label'].values, strat_test_set['Label'].values

print("Training set size:",len(strat_train_set))
print("Test set size:",len(strat_test_set))

Training set size: 27932
Test set size: 11971


#### Multi-Class Classifier

In [8]:
#from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm


In [9]:
# Random forest: fit on the full training set, then report 5-fold
# cross-validated performance measured on that same training data.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

preds, conf_matrix = evaluate_model(rf, X_train, y_train, cv_folds=5)

5 Cross Validations
Mean CV Accuracy 0.98
CV Accuracies: [0.98389118 0.97977448 0.98496241 0.98066595 0.98227712]


Confusion Matrix:
                precision    recall  f1-score   support

      Addition       0.98      0.98      0.98      7000
      Division       0.98      0.98      0.98      6932
Multiplication       0.98      0.98      0.98      7000
   Subtraction       0.99      0.98      0.98      7000

      accuracy                           0.98     27932
     macro avg       0.98      0.98      0.98     27932
  weighted avg       0.98      0.98      0.98     27932



In [104]:
# Predict with the fitted random forest on the held-out test set.
preds = rf.predict(X_test)

# Side-by-side table of true labels and predictions.
rf_results = pd.DataFrame({'Label': y_test, 'Pred': preds})

# Accuracy = share of rows where the prediction equals the true label.
n_correct = int((rf_results['Label'] == rf_results['Pred']).sum())
acc = n_correct / len(rf_results)

print("Accuracy:",acc*100)

Accuracy: 98.48801269735193


In [10]:
# Disabled SVM experiment: the code below is wrapped in a string literal, so
# none of it runs; the cell only evaluates (and echoes) that string.
'''
# SVM
svc = svm.SVC(gamma='auto', random_state=42)
svc.fit(X_train, y_train);

preds, conf_matrix = evaluate_model(svc,X_train,y_train,cv_folds=5)
'''

"\n# SVM\nsvc = svm.SVC(gamma='auto', random_state=42)\nsvc.fit(X_train, y_train);\n\npreds, conf_matrix = evaluate_model(svc,X_train,y_train,cv_folds=5)\n"

#### Plotting Random Forest Tree Structure

#### Random Forest Hyperparameter Tuning

#### Neural Network

In [23]:
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder

In [42]:
### one-hot encoding y variables

# Learn the label -> column mapping from the training labels only, then apply
# that same mapping to the test labels. Re-fitting on the test set would
# rebuild the mapping from test data, and the columns could differ from the
# training encoding if the test labels ever differed from the training labels.
enc = OneHotEncoder(handle_unknown='ignore')
y_train_encoded = enc.fit_transform(y_train.reshape(-1,1))
y_test_encoded = enc.transform(y_test.reshape(-1,1))


In [43]:
def build_model(train_dataset, activation_fn='relu', num_of_nodes=32,
                num_classes=4, learning_rate=0.0001, loss='mae', metrics=None):
    """Build and compile a fully connected Keras classifier.

    The network has four hidden ``Dense`` layers of widths
    ``n, 2n, 2n, n`` (``n = num_of_nodes``) followed by a softmax output
    layer with ``num_classes`` units.

    Parameters
    ----------
    train_dataset
        Array-like whose last dimension (``shape[-1]``) is the number of
        input features; only its shape is read.
    activation_fn : str
        Activation used by every hidden layer.
    num_of_nodes : int
        Base width ``n`` of the hidden layers.
    num_classes : int
        Number of output units. Defaults to 4, the previously hard-coded value.
    learning_rate : float
        Learning rate for the Adam optimizer. Defaults to the previously
        hard-coded 0.0001.
    loss : str
        Loss passed to ``model.compile``. Defaults to the previously
        hard-coded ``'mae'``.
    metrics : list of str, optional
        Metrics passed to ``model.compile``. ``None`` means ``['mae', 'mse']``,
        the previously hard-coded list.

    Returns
    -------
    tf.keras.Sequential
        The compiled model.

    Notes
    -----
    NOTE(review): the defaults reproduce the original regression-style
    configuration. For one-hot targets with a softmax output,
    ``loss='categorical_crossentropy'`` with ``metrics=['accuracy']`` is the
    conventional choice and can now be passed in without editing this function.
    """
    # Avoid a mutable default argument; fall back to the original metric list.
    if metrics is None:
        metrics = ['mae', 'mse']

    n_features = train_dataset.shape[-1]

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_of_nodes, activation=activation_fn,
                              input_shape=[n_features]),
        tf.keras.layers.Dense(num_of_nodes * 2, activation=activation_fn),
        tf.keras.layers.Dense(num_of_nodes * 2, activation=activation_fn),
        tf.keras.layers.Dense(num_of_nodes, activation=activation_fn),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=list(metrics))
    return model

In [95]:
# model = build_model(X_train,activation_fn='relu',num_of_nodes=64)
# Sigmoid hidden activations with a base width of 64 nodes
# (hidden layer widths 64-128-128-64).
nn = build_model(X_train,activation_fn='sigmoid',num_of_nodes=64)

In [96]:
# Print the layer-by-layer architecture and parameter counts.
nn.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 64)                256       
_________________________________________________________________
dense_11 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_12 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_13 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_14 (Dense)             (None, 4)                 260       
Total params: 33,604
Trainable params: 33,604
Non-trainable params: 0
_________________________________________________________________


In [97]:
# Number of training epochs passed to fit().
EPOCHS = 100

# Train on the one-hot encoded labels. validation_split=0.2 sets aside 20% of
# the training rows for validation; verbose=0 silences the per-epoch log.
nn.fit(X_train,y_train_encoded,
          epochs=EPOCHS, 
          validation_split = 0.2, 
          verbose=0)

<tensorflow.python.keras.callbacks.History at 0x1bf804bde48>

In [98]:
# Network output for every test row: one softmax column per class.
preds = nn.predict(X_test)

# Column index of the highest-scoring class in each row.
flat_preds = preds.argmax(axis=1)

# Loss and metrics on the test set; kept as the last expression so the
# returned values are shown as the cell output.
nn.evaluate(X_test, y_test_encoded, verbose=2)

11971/11971 - 1s - loss: 0.0648 - mean_absolute_error: 0.0648 - mean_squared_error: 0.0648


[0.06484539245077518, 0.06484544, 0.064751096]

In [77]:
# preds = nn.predict(X_test).flatten()

In [109]:
# Label names in the column order used by the one-hot encoder.
feature_list = enc.categories_[0]

# Map each predicted column index back to its label name and pair it with
# the true label.
nn_results = pd.DataFrame({
    "Label": y_test,
    "Pred": [feature_list[idx] for idx in flat_preds],
})

# Accuracy = share of rows where the prediction equals the true label.
n_correct = int((nn_results['Label'] == nn_results['Pred']).sum())
acc = n_correct / len(nn_results)

print("Accuracy:",acc*100)

# relu accuracy: 74.3045693759919
# sigmoid accuracy: 87.03533539386852

Accuracy: 87.03533539386852


In [110]:
# Preview the first 15 true-label / prediction pairs from the neural network.
nn_results.head(15)

Unnamed: 0,Label,Pred
0,Subtraction,Subtraction
1,Multiplication,Multiplication
2,Multiplication,Multiplication
3,Subtraction,Subtraction
4,Division,Division
5,Division,Division
6,Multiplication,Multiplication
7,Addition,Addition
8,Subtraction,Subtraction
9,Multiplication,Multiplication


In [111]:
# Preview the first 30 true-label / prediction pairs from the random forest.
rf_results.head(30)

Unnamed: 0,Label,Pred
0,Subtraction,Subtraction
1,Multiplication,Multiplication
2,Multiplication,Multiplication
3,Subtraction,Subtraction
4,Division,Division
5,Division,Division
6,Multiplication,Multiplication
7,Addition,Addition
8,Subtraction,Subtraction
9,Multiplication,Multiplication
