<a href="https://colab.research.google.com/github/manju1201/Flipkart_Data_Classification_using_Description/blob/main/3_Product_Classification_Using_ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import numpy as np 
import pandas as pd 
import re 
import nltk 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from gensim.models.fasttext import FastText
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Data Loading

In [2]:
data = pd.read_csv('/content/drive/MyDrive/MIDAS/2_cleaned_flipkart_com_ecommerce_sample.csv')

In [3]:
data.head(1)

Unnamed: 0.1,Unnamed: 0,uniq_id,description,primary_category
0,0,c2d766ca982eca8304150849735ffef9,key feature alisha solid woman cycling short c...,Clothing


# PreProcessing Function

In [4]:
stemmer = WordNetLemmatizer()
def preprocess_string(text):
    text = re.sub(r'\W', ' ', str(text)) #conversion of datatype to string
    text=re.sub('[^a-z\s]+',' ',text,flags=re.IGNORECASE) #every char except alphabets is replaced
    text=re.sub('(\s+)',' ',text) #multiple spaces are replaced by single space
    text=text.lower() #converting the cleaned string to lower case
    tokens = text.split() # splitting based on space
    tokens = [stemmer.lemmatize(word) for word in tokens] # lemmatization(any form into root word) of words
    tokens = [word for word in tokens if word not in en_stop] # removing Stopwords
    tokens = [word for word in tokens if len(word) > 2] # considering words of length greater than 2
    preprocessed_text = ' '.join(tokens)  # joining all the processed tokens into sentence
    return preprocessed_text # returns the sentence

Preparing data to feed the Model

In [5]:
# primary_category processing.
# Considering the 27 labels to perform prediction
sort_level_1 = list(data.groupby('primary_category').count().sort_values(by='uniq_id',ascending=False).index) 
# Selecting only relevant columns that is primary_category and description
processed_df = data[data['primary_category'].isin(sort_level_1)][['primary_category','description']]
# # Cleaning strings
processed_df['description'] = processed_df['description'].astype('str').apply(preprocess_string)
# Using LabelEncoder Encoding target labels with value between 0 and 265
le = preprocessing.LabelEncoder()
category_encoded=le.fit_transform(processed_df['primary_category'])
processed_df['primary_category'] = category_encoded

# Train Test Split 

In [6]:
X_train, X_test, y_train, y_test = train_test_split(processed_df['description'],processed_df['primary_category'],test_size=0.2,stratify = processed_df['primary_category'])

# Multinomial Naive Bayes

In [7]:
vect = CountVectorizer(stop_words = 'english')
X_train_matrix = vect.fit_transform(X_train) 

# Defining model
clf=MultinomialNB()
# Fitting to multinomial NB model 
clf.fit(X_train_matrix, y_train)
# Scoring the trained model 
print(clf.score(X_train_matrix, y_train)*100)
# Converting the test data
X_test_matrix = vect.transform(X_test) 
# Scoring for the test data
print (clf.score(X_test_matrix, y_test)*100)
# Classification Report
predicted_result=clf.predict(X_test_matrix)
print(classification_report(y_test,predicted_result))

94.93029943114334
92.925
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       206
           1       0.83      0.62      0.71        97
           2       0.88      0.66      0.76        56
           3       0.90      0.89      0.89       144
           4       0.75      0.17      0.27        18
           5       0.98      0.98      0.98      1268
           6       0.88      0.78      0.83       116
           7       0.98      0.97      0.98       253
           8       0.97      0.97      0.97        36
           9       0.71      0.71      0.71         7
          10       1.00      0.33      0.50         9
          11       1.00      0.60      0.75         5
          12       0.90      0.98      0.94       188
          13       0.00      0.00      0.00         4
          14       0.94      0.96      0.95       142
          15       1.00      0.50      0.67        18
          16       0.88      1.00      0.93       709
  

  _warn_prf(average, modifier, msg_start, len(result))


# Decision Tree Classifier

In [8]:
vect = CountVectorizer(stop_words = 'english')
X_train_matrix = vect.fit_transform(X_train) 

# Defining model
clf1 = DecisionTreeClassifier(random_state=0)
# Fitting to DecisionTreeClassifier 
clf1.fit(X_train_matrix, y_train)
# Scoring the trained model 
print(clf1.score(X_train_matrix, y_train)*100)
# Converting the test data
X_test_matrix = vect.transform(X_test) 
# Scoring for the test data
print (clf1.score(X_test_matrix, y_test)*100)
# Classification Report
predicted_result=clf1.predict(X_test_matrix)
print(classification_report(y_test,predicted_result))

99.92498593486279
95.125
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       206
           1       0.91      0.85      0.88        97
           2       0.89      0.88      0.88        56
           3       0.93      0.91      0.92       144
           4       1.00      0.94      0.97        18
           5       0.98      0.99      0.99      1268
           6       0.90      0.89      0.89       116
           7       0.98      0.97      0.97       253
           8       0.80      0.92      0.86        36
           9       0.54      1.00      0.70         7
          10       1.00      0.56      0.71         9
          11       0.00      0.00      0.00         5
          12       0.95      0.94      0.94       188
          13       1.00      0.50      0.67         4
          14       0.95      0.91      0.93       142
          15       0.81      0.72      0.76        18
          16       0.99      0.99      0.99       709
  

  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest Classifier

In [9]:
vect = CountVectorizer(stop_words = 'english')
X_train_matrix = vect.fit_transform(X_train) 

# Defining model
clf1 = RandomForestClassifier(max_depth=200, random_state=0)
# Fitting to RandomForest
clf1.fit(X_train_matrix, y_train)
# Scoring the trained model 
print(clf1.score(X_train_matrix, y_train)*100)
# Converting the test data
X_test_matrix = vect.transform(X_test) 
# Scoring for the test data
print (clf1.score(X_test_matrix, y_test)*100)
# Classification Report
predicted_result=clf1.predict(X_test_matrix)
print(classification_report(y_test,predicted_result))

99.92498593486279
96.25
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       206
           1       0.97      0.64      0.77        97
           2       0.98      0.89      0.93        56
           3       1.00      0.91      0.95       144
           4       1.00      0.83      0.91        18
           5       0.96      1.00      0.98      1268
           6       0.96      0.91      0.93       116
           7       1.00      0.98      0.99       253
           8       1.00      0.97      0.99        36
           9       0.78      1.00      0.88         7
          10       1.00      0.78      0.88         9
          11       0.00      0.00      0.00         5
          12       0.91      0.98      0.94       188
          13       1.00      0.25      0.40         4
          14       0.97      0.96      0.96       142
          15       0.94      0.89      0.91        18
          16       0.99      1.00      1.00       709
   

  _warn_prf(average, modifier, msg_start, len(result))
