### DO NOT PUSH CHANGES IN THIS NOTEBOOK
#### SAVE EXPERIMENT RESULTS IN A .CSV FILE

In [46]:
import pandas as pd
import numpy as np
import text_normalizer
import evaluation
from utils import vectorizer
from scripts.build_df import build_df
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from text_normalizer import tokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Features normalization
The arguments of the `normalization` function can be modified between experiments.

In [32]:
# Load the raw product table, using the first CSV column as the row index.
df = pd.read_csv(
    'data/products_v1.csv',
    index_col=0,
)

def normalization(input):
    """Normalize a text corpus with this experiment's fixed settings.

    Thin wrapper around ``text_normalizer.normalize_corpus`` that pins the
    normalization flags in one place so they can be edited between
    experiments.

    Args:
        input: The corpus handed to ``normalize_corpus`` unchanged. In this
            notebook it is a pandas Series of strings.

    Returns:
        Whatever ``text_normalizer.normalize_corpus`` returns for the corpus.
    """
    # Flags to toggle between experiments. Stemming is enabled and
    # lemmatization disabled; digits are kept in the text.
    options = dict(
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=text_normalizer.stopword_list,
    )
    return text_normalizer.normalize_corpus(input, **options)

# Normalize both text columns in place. Each value is cast with str() first,
# so every entry reaches the normalizer as a string.
# NOTE(review): str() turns a missing value (NaN) into the literal text 'nan',
# which then survives as a token. Consider .fillna('') before the cast if that
# is not intended -- confirm the downstream vectorizers accept empty documents.
df['name'] = normalization(df['name'].apply(str))
df['description'] = normalization(df['description'].apply(str))

# Combined feature: normalized name and description joined by a single space.
df['name_and_description'] = [
    ' '.join(pair) for pair in zip(df['name'], df['description'])
]

# Persist the normalized table (without the index) so it can be read back from
# 'data/normalized_data.csv'. DataFrame.to_csv returns None when it is given a
# path, so the result is deliberately not bound to a variable: the previous
# `normalized_data` name only ever held None.
df.to_csv('data/normalized_data.csv', index=False)

## 2. Labels selection
The `build_df` function returns a new dataset with a custom leaf (label), determined by the threshold for the minimum number of products per category.

In [33]:
# Labels: the 'leaf' column of the dataset that build_df assembles from the
# product JSON and the normalized CSV, using a threshold of 10.
y = build_df(
    json_path='data/products.json',
    threshold=10,
    preprocessed_csv='data/normalized_data.csv',
)['leaf']

## 3. Train/test split
`X` will vary depending on whether we choose `name`, `description` or `name_and_description` as the feature.

In [34]:
# Candidate feature columns taken from the normalized frame.
name, description, name_and_description = (
    df[column] for column in ('name', 'description', 'name_and_description')
)

# Feature used for this experiment; swap in one of the other columns to compare.
X = name

In [38]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical across runs.
# NOTE(review): stratification (stratify=y) is currently disabled. Confirm
# every class has enough samples before enabling it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

## 4. Feature engineering
Try different values for max_features and ngram_range in TF-IDF. \
For Word2Vec, vector_size can also be changed.

**Use TF-IDF if X = 'name'**

In [42]:
# TF-IDF over unigrams, capped at 1000 features. The vocabulary is fitted on
# the training text only and then reused to transform the test text.
# NOTE: X_train / X_test are overwritten with the vectorized features, so the
# train/test split cell must be re-run before executing this cell again.
tfid_vectorizer = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1, 1),
)
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

**Use W2V if X = 'description' or 'name_and_description'**

In [None]:
# Tokenize every document, train Word2Vec (100-dimensional vectors) on the
# training tokens only, then convert both splits with the project's
# `vectorizer` helper.
# NOTE: X_train / X_test must still hold raw text here and are overwritten
# below, so re-run the train/test split cell before executing this cell.
X_train_tok = list(map(tokenizer.tokenize, X_train))
X_test_tok = list(map(tokenizer.tokenize, X_test))

model_w2v = Word2Vec(sentences=X_train_tok, vector_size=100)

X_train = vectorizer(X_train_tok, model_w2v)
X_test = vectorizer(X_test_tok, model_w2v)

## 5. Modeling
Try different classifiers and compare results.

In [40]:
# Logistic regression classifier, allowed up to 1000 solver iterations.
logreg = LogisticRegression(max_iter=1000)

In [None]:
# Support vector classifier with its default hyperparameters.
svc = SVC()

In [None]:
# LightGBM classifier configured with the multiclass objective.
lgbm = LGBMClassifier(objective='multiclass')

In [43]:
# Train the logistic regression model on the vectorized training features.
logreg.fit(X_train, y_train)

## 6. Evaluation

In [44]:
# Predict on the training features to measure how well the model fits them.
y_pred_train = logreg.predict(X_train)

In [55]:
# Report micro-averaged metrics and the per-class report for the training set.
# As the cell's last expression, the returned 4-tuple is also displayed.
# NOTE(review): `labels=y` passes the full label column, duplicates included --
# confirm get_performance de-duplicates it before building the report.
evaluation.get_performance(y_pred_train, y_train, labels=y, average='micro')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model Performance metrics:
------------------------------
Accuracy: 0.7586407203020622
Precision: 0.7586407203020622
Recall: 0.7586407203020622
F1 Score: 0.7586407203020623

Model Classification report:
------------------------------
                    precision    recall  f1-score   support

      abcat0101000       0.00      0.00      0.00         2
      abcat0101001       0.88      0.95      0.91       128
      abcat0102000       0.89      0.89      0.89         9
      abcat0102003       0.77      1.00      0.87        20
      abcat0102008       0.67      1.00      0.80         8
      abcat0106000       0.00      0.00      0.00         8
      abcat0106001       0.00      0.00      0.00        38
      abcat0106002       0.85      0.61      0.71        18
      abcat0106004       0.66      0.93      0.77       121
      abcat0106007       0.67      0.47      0.55        34
      abcat0106008       0.69      0.43      0.53        21
      abcat0106009       0.94      0.87      

  _warn_prf(average, modifier, msg_start, len(result))


(0.7586407203020622,
 0.7586407203020622,
 0.7586407203020622,
 0.7586407203020623)

In [58]:
# Predict on the held-out test features.
y_pred_test = logreg.predict(X_test)

In [59]:
# Report micro-averaged metrics and the per-class report for the test set.
# As the cell's last expression, the returned 4-tuple is also displayed.
# NOTE(review): `labels=y` passes the full label column, duplicates included --
# confirm get_performance de-duplicates it before building the report.
evaluation.get_performance(y_pred_test, y_test, labels=y, average='micro')

Model Performance metrics:
------------------------------
Accuracy: 0.7177153920619554
Precision: 0.7177153920619554
Recall: 0.7177153920619554
F1 Score: 0.7177153920619554

Model Classification report:
------------------------------
                    precision    recall  f1-score   support

      abcat0101000       0.00      0.00      0.00         1
      abcat0101001       0.88      0.84      0.86        25
      abcat0102000       0.33      0.33      0.33         3
      abcat0102003       0.80      0.80      0.80         5
      abcat0102008       0.50      1.00      0.67         5
      abcat0106000       0.00      0.00      0.00         2
      abcat0106001       0.00      0.00      0.00        14
      abcat0106002       0.83      0.50      0.62        10
      abcat0106004       0.51      0.93      0.66        29
      abcat0106007       0.67      0.25      0.36         8
      abcat0106008       0.60      0.75      0.67         4
      abcat0106009       0.82      0.82      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.7177153920619554,
 0.7177153920619554,
 0.7177153920619554,
 0.7177153920619554)