In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.stem import SnowballStemmer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.ensemble import HistGradientBoostingClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split
from multiprocessing import cpu_count
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping


2023-12-04 09:28:51.351039: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 09:28:51.351109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 09:28:51.388465: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-04 09:28:51.483599: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
kaggle = False

In [3]:
filepath = 'data/reviews.csv' if not kaggle else './kaggle/input/consumer-review-of-clothing-product/Consumer Review of Clothing Product/data_amazon.xlsx - Sheet1.csv'

df = pd.read_csv(filepath)

display(df)

Unnamed: 0.1,Unnamed: 0,Title,Review,Cons_rating,Cloth_class
0,2,Some major design flaws,I had such high hopes for this dress and reall...,3.0,Dresses
1,3,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5.0,Pants
2,4,Flattering shirt,This shirt is very flattering to all due to th...,5.0,Blouses
3,5,Not for the very petite,"I love tracy reese dresses, but this one is no...",2.0,Dresses
4,6,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5.0,Knits
...,...,...,...,...,...
45303,49333,Dress felt and fit great. I got lots of compl...,Loved the color!!! Dress fit great and I got ...,5.0,Dresses
45304,49334,Loved the dress but poor quality,This dress looked great and I loved the materi...,2.0,Dresses
45305,49335,"Cute dress, didn't fit",Wanted this dress to work it didn't. It is ver...,1.0,Dresses
45306,49336,Very cute!,No complaints othe than the zipper gets stuck ...,4.0,Dresses


In [4]:
models = []

In [5]:
def train_test(name, model, X, y, grid={}):
    print(name)

    scoring = make_scorer(roc_auc_score, multi_class='ovo',needs_proba=True)
    
    res_grid = None
    test_size = 0.4
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_size,random_state=42, stratify = y)
    res = GridSearchCV(model,grid, scoring = scoring, n_jobs = 8)
    res.fit(X_train,Y_train)
    res_grid = res.best_params_

    prob_train = res.predict_proba(X_train)
    prob_test = res.predict_proba(X_test)
    
    scores = [roc_auc_score(Y_train, prob_train, multi_class = 'ovo'), roc_auc_score(Y_test, prob_test, multi_class = 'ovo')]
    
    print("Train score: ", scores[0])
    print("Test score: ", scores[1])
    print("Best params: ", res_grid)
    print("\n\n\n")
    
    models.append((name,scores[0],scores[1],res_grid))
    return res,res_grid,scores
    

In [6]:
stemmer = SnowballStemmer('english')

In [7]:
word_features = df['Review']

words = "".join(word for word in word_features.values)

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
languages = {'english', 'spanish', 'portuguese', 'italian'}

stopwords = list()

for lang in languages:
    stopwords.extend(nltk.corpus.stopwords.words(lang))

In [10]:
stemmed = word_features.str.split().apply(lambda x: [stemmer.stem(y) for y in x])

stemmed = stemmed.apply(lambda x: ' '.join(x))

In [11]:
tfidf_vect = TfidfVectorizer(stop_words = stopwords, min_df = 3, max_df = 6000)

text_all = tfidf_vect.fit_transform(stemmed)

In [12]:
len(tfidf_vect.get_feature_names_out())

7603

In [13]:
target_classes = list(df['Cloth_class'].unique())

targets = df['Cloth_class']

class_map = {cloth_class: target_classes.index(cloth_class) for cloth_class in target_classes} 

In [14]:
X = text_all
y = targets.map(class_map)

In [15]:
X.shape

(45308, 7603)

In [16]:
len(np.unique(y))

11

In [17]:

name = "Logistic Regression"
logistic_grid = {'C':[0.1,1,10]}
train_test(name,LogisticRegression(max_iter = 5000), X, y, logistic_grid)

Logistic Regression
Train score:  0.9426044043803135
Test score:  0.8973464721065647
Best params:  {'C': 1}






(GridSearchCV(estimator=LogisticRegression(max_iter=5000), n_jobs=8,
              param_grid={'C': [0.1, 1, 10]},
              scoring=make_scorer(roc_auc_score, needs_proba=True, multi_class=ovo)),
 {'C': 1},
 [0.9426044043803135, 0.8973464721065647])

In [18]:
name = 'SVC'

svc_grid = {
  'rbf': {
    'C': [0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1],
    'kernel': ['rbf'],
    'probability': [True]
  },
  'poly': {
    'C': [0.1, 1, 10],
    'degree': [2, 3, 4],
    'kernel': ['poly'],
    'probability': [True]
  },
  'sigmoid': {
    'C': [0.1, 1, 10],
    'kernel': ['sigmoid'],
    'probability': [True]
  }
}

kernel = 'rbf'
t = train_test(f"{name} {kernel}", SVC(kernel=kernel), X, y, svc_grid[kernel])
print(t)

kernel = 'poly'
t = train_test(f"{name} {kernel}", SVC(kernel=kernel), X, y, svc_grid[kernel])
print(t)

kernel = 'sigmoid'
t = train_test(f"{name} {kernel}", SVC(kernel=kernel), X, y, svc_grid[kernel])
print(t)

SVC rbf
Train score:  0.9235903030731365
Test score:  0.8890468442351281
Best params:  {'C': 10, 'gamma': 0.01, 'kernel': 'rbf', 'probability': True}




(GridSearchCV(estimator=SVC(), n_jobs=8,
             param_grid={'C': [0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1],
                         'kernel': ['rbf'], 'probability': [True]},
             scoring=make_scorer(roc_auc_score, needs_proba=True, multi_class=ovo)), {'C': 10, 'gamma': 0.01, 'kernel': 'rbf', 'probability': True}, [0.9235903030731365, 0.8890468442351281])
SVC poly
Train score:  0.9881512484318783
Test score:  0.8743302774706202
Best params:  {'C': 1, 'degree': 2, 'kernel': 'poly', 'probability': True}




(GridSearchCV(estimator=SVC(kernel='poly'), n_jobs=8,
             param_grid={'C': [0.1, 1, 10], 'degree': [2, 3, 4],
                         'kernel': ['poly'], 'probability': [True]},
             scoring=make_scorer(roc_auc_score, needs_proba=True, multi_class=ovo)), {'C': 1, 'degree': 2, 'kernel': 'poly', 'probab

In [19]:
name = "Random Forest Classifier"
random_forest_grid = {'n_estimators':[300,400,500, 600], 'max_depth':[50,100, None], 'max_leaf_nodes': [600,800,1000,None]}
train_test(name,RandomForestClassifier(), X, y, random_forest_grid)

Random Forest Classifier




Train score:  0.9581990221704432
Test score:  0.8864625704991309
Best params:  {'max_depth': 100, 'max_leaf_nodes': 1000, 'n_estimators': 600}






(GridSearchCV(estimator=RandomForestClassifier(), n_jobs=8,
              param_grid={'max_depth': [50, 100, None],
                          'max_leaf_nodes': [600, 800, 1000, None],
                          'n_estimators': [300, 400, 500, 600]},
              scoring=make_scorer(roc_auc_score, needs_proba=True, multi_class=ovo)),
 {'max_depth': 100, 'max_leaf_nodes': 1000, 'n_estimators': 600},
 [0.9581990221704432, 0.8864625704991309])

In [20]:

name = "KNN"
knn_grid = {'n_neighbors':[5,7,10, 15, 20]}
train_test(name,KNeighborsClassifier(),X,y,knn_grid)

KNN
Train score:  0.9391052797915587
Test score:  0.5640452633619534
Best params:  {'n_neighbors': 10}






(GridSearchCV(estimator=KNeighborsClassifier(), n_jobs=8,
              param_grid={'n_neighbors': [5, 7, 10, 15, 20]},
              scoring=make_scorer(roc_auc_score, needs_proba=True, multi_class=ovo)),
 {'n_neighbors': 10},
 [0.9391052797915587, 0.5640452633619534])

In [21]:

name = "Naive Bayes"
train_test(name,MultinomialNB(),X,y)

Naive Bayes
Train score:  0.914849467140905
Test score:  0.8684332396182726
Best params:  {}






(GridSearchCV(estimator=MultinomialNB(), n_jobs=8, param_grid={},
              scoring=make_scorer(roc_auc_score, needs_proba=True, multi_class=ovo)),
 {},
 [0.914849467140905, 0.8684332396182726])

In [22]:

name = "Decision Tree"
decision_tree_grid = {'max_depth':[None,50,100]}
train_test(name,DecisionTreeClassifier(),X,y,decision_tree_grid)

Decision Tree
Train score:  0.9384053073251551
Test score:  0.7203832275923768
Best params:  {'max_depth': 50}






(GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=8,
              param_grid={'max_depth': [None, 50, 100]},
              scoring=make_scorer(roc_auc_score, needs_proba=True, multi_class=ovo)),
 {'max_depth': 50},
 [0.9384053073251551, 0.7203832275923768])

In [23]:

name = "LightGBM"
lightgbm_grid = {'num_leaves':[10, 20, 30], 'learning_rate': [0.06, 0.08, 0.1]}
train_test(name,lgb.LGBMClassifier(),X,y,lightgbm_grid)

LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 8.158684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91436
[LightGBM] [Info] Number of data points in the train set: 21747, number of used features: 1836
[LightGBM] [Info] Start training from score -1.644391
[LightGBM] [Info] Start training from score -2.541814
[LightGBM] [Info] Start training from score -2.088078
[LightGBM] [Info] Start training from score -2.431849
[LightGBM] [Info] Start training from score -2.211116
[LightGBM] [Info] Start training from score -2.811741
[LightGBM] [Info] Start training from score -2.277026
[LightGBM] [Info] Start training from score -2.486149
[LightGBM] [Info] Start training from score -2.707912
[LightGBM] [Info] Start training from score -3.243172
[LightGBM] [Info] Start training from score -2.897988
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 82.023764 seconds.


(GridSearchCV(estimator=LGBMClassifier(), n_jobs=8,
              param_grid={'learning_rate': [0.06, 0.08, 0.1],
                          'num_leaves': [10, 20, 30]},
              scoring=make_scorer(roc_auc_score, needs_proba=True, multi_class=ovo)),
 {'learning_rate': 0.08, 'num_leaves': 30},
 [0.9660896714985653, 0.895285645296603])

In [24]:

name = "XGBoost"
xgboost_grid = {'max_depth':[10]}
train_test(name,xgb.XGBClassifier(),X,y,xgboost_grid)

XGBoost
Train score:  0.9824808710472075
Test score:  0.887187639040006
Best params:  {'max_depth': 10}






(GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                      callbacks=None, colsample_bylevel=None,
                                      colsample_bynode=None,
                                      colsample_bytree=None, device=None,
                                      early_stopping_rounds=None,
                                      enable_categorical=False, eval_metric=None,
                                      feature_types=None, gamma=None,
                                      grow_policy=None, importance_type=None,
                                      interaction_constraints=None,
                                      learning_rate=None, max_b...
                                      max_cat_threshold=None,
                                      max_cat_to_onehot=None,
                                      max_delta_step=None, max_depth=None,
                                      max_leaves=None, min_child_weight=None,
          

In [None]:

name = "CatBoost"
catboost_grid = {'max_depth':[10,100,1000]}
train_test(name,cb.CatBoostClassifier(verbose=False),X,y,catboost_grid)


CatBoost


In [None]:

name = "HGBoost"
hgboost_grid = {'max_depth':[10,100,1000]}
train_test(name,HistGradientBoostingClassifier(),X,y,hgboost_grid)

In [None]:
#turn models into a dataframe
models_df = pd.DataFrame(models, columns = ['Model','Train Score','Test Score','Best Params'])
models_df.to_csv('models4.csv')
models_df

In [None]:
output_dim = len(np.unique(y))

def nn_3():
    model = keras.Sequential()
    model.add(layers.Dense(128, activation='relu', input_shape=(X.shape[1],)))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dense(16))
    model.add(layers.Dense(output_dim, activation='softmax'))
    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
    return model

name = "Neural Network 2x64D"

train_test(name,KerasClassifier(),X,y,{
    'epochs':[100,500],
    'batch_size':[100,1000], 
    'callbacks':[EarlyStopping(patience=10)],
    'model': [nn_3],
    'verbose':[0]
})

In [None]:
models

In [None]:

from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

In [None]:
rf = RandomForestClassifier(max_leaf_nodes = 504, n_estimators = 474, random_state = 1)
tree = DecisionTreeClassifier(max_leaf_nodes = 504)

log_r = LogisticRegression(max_iter = 8000)
log_r2 = LogisticRegression(max_iter = 8000)

reducer = TruncatedSVD(n_components = 1000)

log_r_pipe = Pipeline([
                ('reducer', reducer),
                ('logistic_regression', log_r2)
             ])

estimators = [
              ('random_forest', rf),
              ('tree', tree),
              ('logistic_regression1', log_r),    
              ('logistic_regression2', log_r_pipe)
            ]

train_test("Voting Ensamble",VotingClassifier(estimators = estimators, voting = 'soft', n_jobs = 4),X,y)

In [None]:
#turn models into a dataframe
models_df = pd.DataFrame(models, columns = ['Model','Train Score','Test Score','Best Params'])
models_df.to_csv('models3.csv')
models_df