### DO NOT PUSH CHANGES IN THIS NOTEBOOK
#### SAVE EXPERIMENTS RESULTS IN EXTERNAL FILE OR USE PICKLE TO SAVE MODEL.

In [273]:
import pandas as pd
import numpy as np
from model.text_normalizer import normalize_corpus, tokenizer, stopword_list
from model import evaluation
from model.utils import vectorizer, decoder
from scripts.build_df import build_df
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
import xgboost as xgb
from joblib import dump, load

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Features normalization
Arguments of 'normalization' function could be modified between experiments.

In [239]:
df = pd.read_csv('data/products_v1.csv', index_col=0)

'''
def normalization(input):
    output = text_normalizer.normalize_corpus(
        input,
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=text_normalizer.stopword_list
    )       
    return output

df['name'] = normalization(df['name'].apply(str))
df['description'] = normalization(df['description'].apply(str))
df['name_and_description'] = [' '.join(i) for i in zip(df['name'], df['description'])]
normalized_data = df.to_csv('data/normalized_data.csv', index=False)
'''

"\ndef normalization(input):\n    output = text_normalizer.normalize_corpus(\n        input,\n        html_stripping=True,\n        contraction_expansion=True,\n        accented_char_removal=True,\n        text_lower_case=True,\n        text_stemming=True,\n        text_lemmatization=False,\n        special_char_removal=True,\n        remove_digits=False,\n        stopword_removal=True,\n        stopwords=text_normalizer.stopword_list\n    )       \n    return output\n\ndf['name'] = normalization(df['name'].apply(str))\ndf['description'] = normalization(df['description'].apply(str))\ndf['name_and_description'] = [' '.join(i) for i in zip(df['name'], df['description'])]\nnormalized_data = df.to_csv('data/normalized_data.csv', index=False)\n"

## 2. Labels selection
'build_df' function returns a new dataset with custom leaf (label) according to the threshold of min. products selected per category.

In [317]:
y = build_df(json_path='data/products.json', threshold=30, preprocessed_csv='data/normalized_data.csv')['leaf']     

## 3. Train/test split
'X' will vary depending if we choose name, description or name_and_description as feature.

In [319]:
name = df['name'].apply(str)
description = df['description'].apply(str)
#name_and_description = df['name_and_description']
X = name

In [322]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20, 
    random_state=42,
    #stratify = y
)

## 4. Feature engineering
Try different values for max_features and ngram_range in TF-IDF. \
For Word2Vec, vector_size can also be changed.

**Use TF-IDF if X = 'name'**

In [42]:
tfid_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 1)) 
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

**Use W2V if X = 'description' or 'name_and_description'**

In [12]:
X_train_tok = [tokenizer.tokenize(doc) for doc in X_train]
X_test_tok = [tokenizer.tokenize(doc) for doc in X_test]
model_w2v = Word2Vec(sentences = X_train_tok, vector_size=100)
X_train = vectorizer(X_train_tok, model_w2v)
X_test = vectorizer(X_test_tok, model_w2v)

## 5. Modeling
Try different classifiers and compare results.

In [39]:
logreg = LogisticRegression(max_iter=7000, n_jobs=-1, multi_class='multinomial', solver='newton-cg')
svc = SVC()
lgbm = LGBMClassifier(objective='multiclass')

In [40]:
logreg.fit(X_train, y_train)

## 6. Evaluation

In [33]:
y_pred_train = logreg.predict(X_train)

In [211]:
evaluation.get_performance(predictions=y_pred_train, y_test=y_train,labels=y, model=logreg, timestamp=time.time(), average='micro')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model Performance metrics:
------------------------------
Accuracy: 0.6314260819053151
Precision: 0.6314260819053151
Recall: 0.6314260819053151
F1 Score: 0.6314260819053151

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.80      0.97      0.87       186
                              3D Printers       0.89      0.42      0.57        19
                   3D Printers & Filament       0.53      0.20      0.29        41
                          4K Ultra HD TVs       0.65      0.66      0.65        50
                     6" x 9" Car Speakers       0.67      0.38      0.48        16
                        6.5" Car Speakers       0.55      0.46      0.50        35
                             A/V Adapters       0.50      0.30      0.38        23
                  A/V Cables & Connectors       0.48      0.46      0.47        24
                  

  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
y_pred_test = logreg.predict(X_test)

In [212]:
evaluation.get_performance(y_pred_test, y_test, y, logreg, timestamp=time.time(), average='micro')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model Performance metrics:
------------------------------
Accuracy: 0.5758954501452082
Precision: 0.5758954501452082
Recall: 0.5758954501452082
F1 Score: 0.5758954501452082

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.89      0.94      0.91        51
                              3D Printers       0.00      0.00      0.00         3
                   3D Printers & Filament       0.50      0.12      0.20         8
                          4K Ultra HD TVs       0.67      0.75      0.71         8
                     6" x 9" Car Speakers       0.33      0.17      0.22         6
                        6.5" Car Speakers       0.25      0.33      0.29         9
                             A/V Adapters       0.00      0.00      0.00         6
                  A/V Cables & Connectors       0.33      0.33      0.33         6
                  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 7. Save model to file

In [43]:
dump(logreg, 'logreg.joblib') 

['logreg.joblib']

In [45]:
dump(model_w2v, 'w2v.joblib')

['w2v.joblib']

## 8. Predict on new data

In [87]:
desc = 'dog'

In [116]:
normalized_desc = normalize_corpus(
        [desc],
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list,
    )

tok_desc = [tokenizer.tokenize(doc) for doc in normalized_desc]
vec_desc = vectorizer(tok_desc, model_w2v)[0].reshape(1, -1)
preds_desc = logreg.predict(vec_desc)
label = decode_id(preds_desc[0])
class_name = label

In [280]:
class_name

'Pre-Owned Games'

## 9. Creating a Pipeline

In [256]:
from sklearn.base import BaseEstimator, TransformerMixin

class Normalizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        X = X.copy()
        normalized_desc = normalize_corpus(
        X,
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list,
    )
        return normalized_desc
        

In [323]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([('normalizer', Normalizer()), 
                 ('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 1))),
                 ('logreg', LogisticRegression(max_iter=7000, n_jobs=-1, multi_class='multinomial', solver='newton-cg'))
                ])

pipe.fit(X_train,y_train)

In [345]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

pipe_2 = Pipeline([('normalizer', Normalizer()), 
                 ('tfidf', TfidfVectorizer()),
                 ('gbc', GradientBoostingClassifier())
                ])

parameters = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'tfidf__max_df': [0.25, 0.5, 0.75, 1.0],
              'tfidf__max_features': [10, 50, 100, 250, 500, 1000, None],
              'tfidf__smooth_idf': (True, False),
              'tfidf__norm': ('l1', 'l2', None),
              'gbc__loss':['log_loss', 'exponential'],
              'gbc__n_estimators':[100, 500, 900, 1100, 1500],
              'gbc__max_depth':[2, 3, 5, 10, 15],
              'gbc__min_samples_leaf':[1, 2, 4, 6, 8],
              'gbc__min_samples_split':[2, 4, 6, 10],
              'gbc__max_features':['auto', 'sqrt', 'log2', None]
              }

grid = RandomizedSearchCV(pipe_2, parameters, cv=3, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits




KeyboardInterrupt: 

In [341]:
y_pred_test = pipe.predict(X_test)

In [342]:
evaluation.get_performance(y_pred_test, y_test, y, logreg, timestamp=time.time(), average='micro')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model Performance metrics:
------------------------------
Accuracy: 0.7463697967086157
Precision: 0.7463697967086157
Recall: 0.7463697967086157
F1 Score: 0.7463697967086157

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.92      0.90      0.91        51
                   3D Printers & Filament       0.80      0.50      0.62         8
                          4K Ultra HD TVs       0.47      0.88      0.61         8
                        6.5" Car Speakers       0.80      0.44      0.57         9
                  A/V Cables & Connectors       1.00      0.47      0.64        19
                            A/V Switchers       0.20      0.11      0.14         9
                    Acoustic Drums & Sets       0.47      1.00      0.64         7
                         Acoustic Guitars       0.80      0.50      0.62         8
                  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [343]:
new_data = 'head phone'
decoder(pipe.predict([new_data]))

array(['Cell Phone Cases & Clips'], dtype='<U24')