In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score, f1_score
from sklearn import metrics
from joblib import dump, load

In [67]:
def model_maker(encoded_data, original_data, model_name, model_type):
    """
    Inputs:
    
        encoded_data: would be only text embeddings from encoders such as USE
    
        original_data: originally scrapped data that includes cat1 & cat2
    
        model_name: string, name of the model that you are going to save as
    
        model_type: type of model you are building in the hierarchy, supported 
        options are X (directly predicting cat2), X1 (predicting cat1), 
        X2X (predicting cat2 after cat1s are predicted)
    
    Output:
        X2X saves as many models as there is cat1 in the dataset
    """
    if model_type == "X":
        cat1 = original_data['cat1'].reset_index(drop=True)
        cat2 = original_data['cat2'].reset_index(drop=True)
        df1 = pd.concat([encoded_data,cat1,cat2],axis=1,ignore_index=True)
        df1.rename(columns={ df1.columns[-2]: "cat1", df1.columns[-1]: "cat2" }, inplace = True)
        X = df1.drop(['cat2','cat1'], axis=1)
        y = df1['cat2']
        x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=20)
        
        clf = LinearSVC()
        clf.fit(x_train,y_train)
        y_pred_train = clf.predict(x_train)
        y_pred_test = clf.predict(x_test)
        print("training accuracy:", round(accuracy_score(y_train,y_pred_train),4), "\n"
         "test accuracy:", round(accuracy_score(y_test,y_pred_test),4))
        print("F1 score:", round(f1_score(y_test,y_pred_test,average = 'weighted'),4))
        print(metrics.classification_report(y_test, y_pred_test, target_names=y.unique()))
        dump(clf, model_name + '.joblib')
        print('model has been saved in the same location as this notebook')
    elif model_type == "X1":
        cat1 = original_data['cat1'].reset_index(drop=True)
        cat2 = original_data['cat2'].reset_index(drop=True)
        df1 = pd.concat([encoded_data,cat1,cat2],axis=1,ignore_index=True)
        df1.rename(columns={ df1.columns[-2]: "cat1", df1.columns[-1]: "cat2" }, inplace = True)
        X = df1.drop(['cat2','cat1'], axis=1)
        y = df1['cat1']
        x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=20)
        
        clf = LinearSVC()
        clf.fit(x_train,y_train)
        y_pred_train = clf.predict(x_train)
        y_pred_test = clf.predict(x_test)
        print("training accuracy:", round(accuracy_score(y_train,y_pred_train),4), "\n"
         "test accuracy:", round(accuracy_score(y_test,y_pred_test),4))
        print("F1 score:", round(f1_score(y_test,y_pred_test,average = 'weighted'),4))
        print(metrics.classification_report(y_test, y_pred_test, target_names=y.unique()))
        dump(clf, model_name + '.joblib')
        print('model has been saved in the same location as this notebook')
    elif model_type == "X2X":
        cat1 = original_data['cat1'].reset_index(drop=True)
        cat2 = original_data['cat2'].reset_index(drop=True)
        df1 = pd.concat([encoded_data,cat1,cat2],axis=1,ignore_index=True)
        df1.rename(columns={ df1.columns[-2]: "cat1", df1.columns[-1]: "cat2" }, inplace = True)
        u = df1['cat1'].unique()
        for c in u:
            df_sub = df1[df1['cat1']==c]
            X = df_sub.drop(['cat2','cat1'], axis=1)
            y = df_sub['cat2']
            x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=20)
        
            clf = LinearSVC()
            clf.fit(x_train,y_train)
            y_pred_train = clf.predict(x_train)
            y_pred_test = clf.predict(x_test)
            print("training accuracy:", round(accuracy_score(y_train,y_pred_train),4), "\n"
             "test accuracy:", round(accuracy_score(y_test,y_pred_test),4))
            print("F1 score:", round(f1_score(y_test,y_pred_test,average = 'weighted'),4))
            print(metrics.classification_report(y_test, y_pred_test, target_names=y.unique()))
            dump(clf, model_name + '('+c+')'+'.joblib')
            print('model ' +model_name+ '('+c+')'+' has been saved in the same location as this notebook')
        
    else:
        print('Valid model types are: X, X1, X2X')

In [66]:
#read in Encoded set
df = pd.read_csv('C:/Users/Junhong/Google Drive/Spring 2020/Capstone Project/Data/Final Datasets/Home_Furniture_Appliances_clean.csv')
#y, input is the original set
#y = pd.read_csv('C:/Users/Junhong/Google Drive/Spring 2020/Capstone Project/Data/Final Datasets/electronics_clean.csv')
y = pd.read_csv('C:/Users/Junhong/Google Drive/Spring 2020/Capstone Project/Data/Final Datasets//beauty.csv')

In [63]:
model_maker(encoded_data = df, original_data=y, model_name='H', model_type='X')

training accuracy: 0.9859 
test accuracy: 0.9399
F1 score: 0.9395
                  precision    recall  f1-score   support

           Desks       0.89      1.00      0.94        39
          Chairs       1.00      0.89      0.94        47
       TV Stands       0.90      0.84      0.87        55
           Sofas       0.95      0.98      0.97        43
      Mattresses       0.95      0.91      0.93        45
            Beds       0.90      1.00      0.95        43
       Bookcases       0.93      1.00      0.96        40
         Vacuums       0.94      0.94      0.94        48
      Microwaves       0.86      0.86      0.86        43
         Fridges       0.93      0.93      0.93        42
     Dishwashers       0.94      0.92      0.93        50
 Washer & Dryers       0.91      0.89      0.90        46
Small Appliances       0.89      0.91      0.90        35
  Ovens & Ranges       1.00      0.96      0.98        48
        Wall Art       0.93      0.93      0.93        46
     

In [64]:
model_maker(encoded_data = df, original_data=y, model_name='H1', model_type='X1')

training accuracy: 0.9883 
test accuracy: 0.9649
F1 score: 0.9649
                  precision    recall  f1-score   support

      Furnitures       0.97      0.98      0.97       335
      Appliances       0.97      0.97      0.97       343
      Home Decor       0.96      0.95      0.96       307
Kitchen & Dining       0.95      0.96      0.96       296

        accuracy                           0.96      1281
       macro avg       0.96      0.96      0.96      1281
    weighted avg       0.96      0.96      0.96      1281

model has been saved in the same location as this notebook


In [65]:
model_maker(encoded_data = df, original_data=y, model_name='H2', model_type='X2X')

training accuracy: 0.9935 
test accuracy: 0.9451
F1 score: 0.9449
              precision    recall  f1-score   support

       Desks       0.98      0.98      0.98        46
      Chairs       0.93      0.98      0.95        43
   TV Stands       0.88      0.88      0.88        40
       Sofas       0.90      0.88      0.89        40
  Mattresses       1.00      0.98      0.99        58
        Beds       0.94      0.90      0.92        51
   Bookcases       0.96      1.00      0.98        50

    accuracy                           0.95       328
   macro avg       0.94      0.94      0.94       328
weighted avg       0.95      0.95      0.94       328

model H2(Furnitures) has been saved in the same location as this notebook
training accuracy: 0.9739 
test accuracy: 0.9543
F1 score: 0.9542
                  precision    recall  f1-score   support

         Vacuums       0.98      0.98      0.98        53
      Microwaves       0.98      0.96      0.97        51
         Fridges      