In [1]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [2]:
train.columns

Index(['Unnamed: 0', 'product_name', 'product_category_tree', 'description',
       'brand', 'product_specifications', 'Label', 'Label_1st_category',
       'des_preprocess', 'all_features_preprocess'],
      dtype='object')

# Main category as label

In [26]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, auc
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

In [27]:
categories = list(set(train['Label_1st_category']))

In [10]:
# Naive Bayes Classifer
from sklearn.naive_bayes import MultinomialNB
def nb_classifier(X_train, X_test, y_train, y_test):
  
    nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),
                ])
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)
    
    return accuracy_score(y_pred, y_test), classification_report(y_test, y_pred,target_names=categories)

In [6]:
# Linear SVM

from sklearn.linear_model import SGDClassifier
def linear_svm(X_train, X_test, y_train, y_test):
  
    sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                 ])
    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)
 
    return accuracy_score(y_pred, y_test), classification_report(y_test, y_pred,target_names=categories)

In [7]:
# Logistic Regrassion

from sklearn.linear_model import LogisticRegression
def logisticreg(X_train, X_test, y_train, y_test):

    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5,max_iter=500)),
                 ])
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    return accuracy_score(y_pred, y_test), classification_report(y_test, y_pred,target_names=categories)

In [8]:
# Random Forest 

from sklearn.ensemble import RandomForestClassifier
def randomforest(X_train, X_test, y_train, y_test):
    
    ranfor = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                 ])
    ranfor.fit(X_train, y_train)

    y_pred = ranfor.predict(X_test)

    return accuracy_score(y_pred, y_test), classification_report(y_test, y_pred,target_names=categories)

In [9]:
# MLP CLassifier

def mlpclassifier(X_train, X_test, y_train, y_test):
  
    from sklearn.neural_network import MLPClassifier
    
    mlp = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),
                 ])
    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_test)

    return accuracy_score(y_pred, y_test), classification_report(y_test, y_pred,target_names=categories)

In [11]:
from sklearn.model_selection import train_test_split

print("\033[1mUsing Description as feature\033[0m")
X_train, X_test, y_train, y_test = train_test_split(list(train['des_preprocess']),list(train['Label_1st_category']) , test_size=0.2, random_state = 42) 

print("NB Classifier: ")
acc,mat=nb_classifier(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)
print(mat)
print('-'*75)

print("Linear SVM: ")
acc,mat=linear_svm(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)
print(mat)
print('-'*79)

print("Logistic Reg: ")
acc,mat=logisticreg(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)
print(mat)
print('-'*75)

print("Random Forest: ")
acc,mat=randomforest(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)
print(mat)
print('-'*75)

print("MLP Classifier: ")
acc,mat=mlpclassifier(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)
print(mat)
print('-'*75)


print("\033[1m Using product_name, description, brand, product_specifications as feature \033[0m")
X_train, X_test, y_train, y_test = train_test_split(list(train['all_features_preprocess']),list(train['Label_1st_category']) , test_size=0.2, random_state = 42) 

print("NB Classifier: ")
acc,mat=nb_classifier(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)
print(mat)
print('-'*75)

print("Linear SVM: ")
acc,mat=linear_svm(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)
print(mat)
print('-'*75)

print("Logistic Reg: ")
acc,mat=logisticreg(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)
print(mat)
print('-'*75)

print("Random Forest: ")
acc,mat=randomforest(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)
print(mat)
print('-'*75)

print("MLP Classifier: ")
acc,mat=mlpclassifier(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)
print(mat)

[1mUsing Description as feature[0m
NB Classifier: 
Accuracy:  0.8661485319516408
                            precision    recall  f1-score   support

  Beauty and Personal Care       0.96      0.96      0.96       187
           Home Furnishing       0.97      0.38      0.54        77
          Tools & Hardware       0.67      0.04      0.08        46
                  Footwear       1.00      0.58      0.74       132
    Toys & School Supplies       0.86      1.00      0.92      1114
                 Baby Care       1.00      0.72      0.84       108
                 Computers       1.00      0.91      0.95       197
     Bags, Wallets & Belts       1.00      0.64      0.78        22
Home Decor & Festive Needs       0.91      0.82      0.86       182
     Mobiles & Accessories       0.95      0.84      0.89       113
                   Watches       0.00      0.00      0.00         3
         Pens & Stationery       0.73      1.00      0.84       635
          Sports & Fitness      

# Label category based on most common categories in the dataset as label

In [18]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, auc
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

In [19]:
# Naive Bayes Classifer
from sklearn.naive_bayes import MultinomialNB
def nb_classifier(X_train, X_test, y_trai, y_test):
  
    nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),
                ])
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)
    
    return accuracy_score(y_pred, y_test)

In [20]:
# Linear SVM

from sklearn.linear_model import SGDClassifier
def linear_svm(X_train, X_test, y_train, y_test):
  
    sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                 ])
    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)
 
    return accuracy_score(y_pred, y_test)

In [21]:
# Logistic Regrassion

from sklearn.linear_model import LogisticRegression
def logisticreg(X_train, X_test, y_train, y_test):

    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5,max_iter=500)),
                 ])
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)
    print(len(set(y_pred)),len(set(y_test)),len(set(y_train)))
    return accuracy_score(y_pred, y_test)

In [22]:
# Random Forest 

from sklearn.ensemble import RandomForestClassifier
def randomforest(X_train, X_test, y_train, y_test):
    
    ranfor = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                 ])
    ranfor.fit(X_train, y_train)

    y_pred = ranfor.predict(X_test)

    return accuracy_score(y_pred, y_test)

In [24]:
# MLP CLassifier

def mlpclassifier(X_train, X_test, y_train, y_test):
  
    from sklearn.neural_network import MLPClassifier
    
    
    mlp = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),
                 ])
    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_test)

    return accuracy_score(y_pred, y_test)

In [25]:
from sklearn.model_selection import train_test_split

print("\033[1mUsing Description as feature\033[0m")
X_train, X_test, y_train, y_test = train_test_split(list(train['des_preprocess']),list(train['Label']) , test_size=0.2, random_state = 42) 

print("NB Classifier: ")
acc=nb_classifier(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)


print("Linear SVM: ")
acc=linear_svm(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)


print("Logistic Reg: ")
acc=logisticreg(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)


print("Random Forest: ")
acc=randomforest(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)


print("MLP Classifier: ")
acc=mlpclassifier(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)

print('-'*75)


print("\033[1m Using product_name, description, brand, product_specifications as feature \033[0m")
X_train, X_test, y_train, y_test = train_test_split(list(train['all_features_preprocess']),list(train['Label']) , test_size=0.2, random_state = 42) 

print("NB Classifier: ")
acc=nb_classifier(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)


print("Linear SVM: ")
acc=linear_svm(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)


print("Logistic Reg: ")
acc=logisticreg(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)


print("Random Forest: ")
acc=randomforest(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)


print("MLP Classifier: ")
acc=mlpclassifier(X_train, X_test, y_train, y_test)
print('Accuracy: ',acc)

[1mUsing Description as feature[0m
NB Classifier: 
Accuracy:  0.6776050662061025
Linear SVM: 
Accuracy:  0.8799654576856649
Logistic Reg: 
89 90 95
Accuracy:  0.947035118019574
Random Forest: 
Accuracy:  0.9234312032239493
MLP Classifier: 
Accuracy:  0.8986758779504893
---------------------------------------------------------------------------
[1m Using product_name, description, brand, product_specifications as feature [0m
NB Classifier: 
Accuracy:  0.717328727691422
Linear SVM: 
Accuracy:  0.8831318364997122
Logistic Reg: 
88 90 95
Accuracy:  0.9533678756476683
Random Forest: 
Accuracy:  0.9214162348877375
MLP Classifier: 
Accuracy:  0.9159470351180196


# Testing the best model for Main category on test dataset

In [28]:
# Logistic Regrassion

from sklearn.linear_model import LogisticRegression
def logisticreg(X_train, X_test, y_train, y_test):

    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5,max_iter=500)),
                 ])
    logreg.fit(X_train, y_train)

    #y_pred = logreg.predict(X_test)

    return logreg

X_train, X_test, y_train, y_test = train_test_split(list(train['des_preprocess']),list(train['Label_1st_category']) , test_size=0.2, random_state = 42) 
model=logisticreg(X_train, X_test, y_train, y_test)

In [29]:
label=[]
for i in range(len(test)):
    label.append(test['product_category_tree'][i][1:-2].replace("'","").split(", ")[0])
#test['Label_1st_category'] = label
print(len(label))

2000


In [31]:
y_pred = model.predict(test['des_preprocess'])
accuracy_score(y_pred,label)

0.934

Test accuracy is 93.4 %