# Imports

In [1]:
import bz2
import pickle
from collections import Counter

import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Inputs

In [42]:
# Location of the product export, relative to the notebook's working directory.
PRODUCTS_CSV = "all_products.csv"

# Load the full product table into memory.
df_products = pd.read_csv(PRODUCTS_CSV)

In [43]:
# Sanity check: (number of rows, number of columns) of the loaded table.
df_products.shape

(1729, 4)

In [44]:
# Preview the first rows to eyeball the columns used below (`text`, `categories`).
df_products.head()

Unnamed: 0,title,url,text,categories
0,Dessert bébé dès 6 Mois brassés nature sans...,https://www.carrefour.fr/p/dessert-bebe-des-6-...,dessert bébé dès mois brassés nature sans sucr...,bebe-bio
1,Compote bébé Bio dès 6 mois Gourdes Multipac...,https://www.carrefour.fr/p/compotes-bebe-des-6...,compote bébé bio dès mois gourde multipack var...,bebe-bio
2,Desserts bébé bio dès 6 mois pomme coing CAR...,https://www.carrefour.fr/p/desserts-bebe-bio-d...,dessert bébé bio dès mois pomme coing baby bi...,bebe-bio
3,Plat bébé bio hachis parmentier dès 12 mois ...,https://www.carrefour.fr/p/plat-bebe-bio-hachi...,plat bébé bio hachis parmentier dès mois baby...,bebe-bio
4,Plat bébé bio polenta saumon dès 15 mois CAR...,https://www.carrefour.fr/p/plat-bebe-bio-polen...,plat bébé bio polenta saumon dès mois baby as...,bebe-bio


# TF-IDF Features

In [21]:
# Turn the `text` column into a dense TF-IDF feature matrix.
#
# `TfidfVectorizer` and `stopwords` come from the imports cell at the top of
# the notebook; `stopwords.words(...)` needs the NLTK "stopwords" corpus to be
# available locally.
#
# Vectorizer settings:
#   max_features=200  keep only the 200 most frequent terms of the corpus
#   min_df=1          no lower document-frequency cut-off. The previous value
#                     (integer 0) selected the same vocabulary but is rejected
#                     by scikit-learn >= 1.2, which requires integer min_df >= 1.
#   max_df=0.7        drop terms that occur in more than 70% of the documents
#   stop_words        French stop-word list from NLTK
#
# NOTE(review): the vectorizer is fitted on the *whole* corpus and the
# train/test split only happens afterwards, so vocabulary and IDF weights are
# influenced by test documents. Consider splitting the raw texts first and
# fitting on the training part only.
french_stop_words = stopwords.words('french')

tfidfconverter = TfidfVectorizer(
    max_features=200,
    min_df=1,
    max_df=0.7,
    stop_words=french_stop_words,
)

# `.toarray()` densifies the sparse result; downstream cells receive a NumPy array.
X = tfidfconverter.fit_transform(df_products["text"].to_list()).toarray()

# Training and Testing Sets

In [33]:
# Target labels: one category string per product, in row order.
labels = df_products["categories"].to_list()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs. (`train_test_split` is imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.3,
    random_state=0,
)

# Training Text Classification Model

In [34]:
# Hyperparameters of the random forest: 1000 trees, seeded for reproducibility.
forest_params = {"n_estimators": 1000, "random_state": 0}

classifier = RandomForestClassifier(**forest_params)

# Fit on the training split; as the last expression, the fitted estimator is
# also what the cell displays.
classifier.fit(X_train, y_train)

In [36]:
# Predict a category for every held-out row; compared against y_test below.
y_pred = classifier.predict(X_test)

In [47]:
# Per-category precision / recall / F1 on the held-out split.
per_class_report = classification_report(y_test, y_pred)
print(per_class_report)

# Overall fraction of held-out rows whose category was predicted correctly.
overall_accuracy = accuracy_score(y_test, y_pred)
print(overall_accuracy)

                                      precision    recall  f1-score   support

                            bebe-bio       0.31      0.17      0.22        24
                             boisson       1.00      0.85      0.92        13
                            cereales       1.00      1.00      1.00         8
             chauffage-climatisation       0.58      0.78      0.67        18
                    couches-culottes       1.00      1.00      1.00        22
                       cuisson-robot       0.77      1.00      0.87        17
                       entretien-sol       0.83      1.00      0.91        15
                              fruits       0.86      0.73      0.79        26
               fruits-et-legumes-bio       0.54      0.81      0.65        16
              fruits-et-legumes-secs       0.91      0.95      0.93        22
                    gouters-desserts       0.58      0.73      0.65        15
                 gros-electromenager       1.00      0.89      

# Saving the Model and TF-IDF Converter

In [54]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer as
# bz2-compressed pickles (files 'text_classifier' and 'tfidfconverter' in the
# current working directory). Both are needed at inference time: the
# vectorizer turns raw text into the features the classifier was trained on.
#
# The `with` blocks guarantee each archive is flushed and closed even if
# pickling raises, instead of leaking the handle and leaving a truncated file.
#
# NOTE: pickles can execute arbitrary code when loaded -- only load these files
# from a trusted location, and preferably with the same scikit-learn version
# that produced them.
with bz2.BZ2File('text_classifier', 'wb') as ofile:
    pickle.dump(classifier, ofile)

with bz2.BZ2File('tfidfconverter', 'wb') as ofile:
    pickle.dump(tfidfconverter, ofile)