In [None]:
# Fetch the NLTK stopword corpus; it is read later in the notebook through
# stopwords.words('english').
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Word2vec function

In [None]:
# gensim is imported here because this cell runs before the notebook's main
# import cell; without it a fresh "Restart & Run All" fails with a NameError.
import gensim

# Location of the pre-trained GoogleNews word2vec vectors (gzip-compressed binary).
# NOTE(review): this is a remote URL, so the archive is presumably fetched again
# every time the cell runs - consider downloading it once and loading the local copy.
word2vec_path = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'

# Load the vectors in binary word2vec format as a gensim KeyedVectors object.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Return the element-wise mean of the embeddings of ``tokens_list``.

    Every token is looked up in ``vector``. A token that is not in ``vector``
    is replaced by a length-``k`` stand-in: values from ``np.random.rand``
    when ``generate_missing`` is truthy, zeros otherwise. An empty
    ``tokens_list`` yields ``np.zeros(k)``.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    # Factory for the stand-in vector used for out-of-vocabulary tokens.
    make_placeholder = np.random.rand if generate_missing else np.zeros

    token_vectors = [
        vector[token] if token in vector else make_placeholder(k)
        for token in tokens_list
    ]
    return np.sum(token_vectors, axis=0) / len(token_vectors)

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Build one averaged word2vec embedding per entry of ``clean_comments``.

    ``clean_comments.apply`` runs ``get_average_word2vec`` on every entry
    (forwarding ``vectors`` and ``generate_missing``); the results are
    returned as a plain list.
    """
    def embed(tokens):
        return get_average_word2vec(tokens, vectors, generate_missing=generate_missing)

    return list(clean_comments.apply(embed))

In [None]:
import pandas as pd

# Load the training CSV into a DataFrame.
# NOTE(review): absolute path, presumably a Colab Google Drive mount - adjust
# when running elsewhere.
data = pd.read_csv('/content/drive/MyDrive/midas/train.csv')
# Show the column names as the cell output.
data.columns

Index(['Unnamed: 0', 'product_name', 'product_category_tree', 'description',
       'brand', 'product_specifications', 'Label', 'Label_1st_category',
       'des_preprocess', 'all_features_preprocess'],
      dtype='object')

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Add
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model
from keras.callbacks import EarlyStopping
import gensim
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import codecs
import matplotlib.pyplot as plt

# 1st category of product category tree as label

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, auc
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Class names passed to classification_report as `target_names`.
# scikit-learn pairs `target_names` positionally with the labels in sorted
# order, so the names must be sorted too. A bare list(set(...)) has an
# arbitrary order and attaches the wrong class name to each report row.
categories = sorted(set(data['Label_1st_category']))

In [None]:
# Linear SVM

from sklearn.linear_model import SGDClassifier
def linear_svm(X_train, X_test, y_train, y_test):
    """Fit a linear SVM (SGD with hinge loss) and evaluate it on the test split.

    The per-class report uses the module-level ``categories`` list, which is
    expected to hold the label values that occur in ``y_train`` / ``y_test``.

    Returns:
        tuple: ``(accuracy, report)`` where ``report`` is the text produced by
        ``classification_report``.
    """
    sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)
    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)

    # Pass `labels` together with `target_names` so each report row is tied to
    # its own class. With `target_names` alone the names are matched
    # positionally against the sorted labels found in the data, which
    # mislabels rows when `categories` is in a different order and fails when
    # a class is absent from the test split.
    report = classification_report(
        y_test, y_pred, labels=categories, target_names=[str(name) for name in categories]
    )
    # accuracy_score expects (y_true, y_pred).
    return accuracy_score(y_test, y_pred), report

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
def logisticreg(X_train, X_test, y_train, y_test):
    """Fit a logistic regression model and evaluate it on the test split.

    The per-class report uses the module-level ``categories`` list, which is
    expected to hold the label values that occur in ``y_train`` / ``y_test``.

    Returns:
        tuple: ``(accuracy, report)`` where ``report`` is the text produced by
        ``classification_report``.
    """
    # C=1e5 makes the regularisation very weak.
    logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter=500)
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    # Pass `labels` together with `target_names` so each report row is tied to
    # its own class. With `target_names` alone the names are matched
    # positionally against the sorted labels found in the data, which
    # mislabels rows when `categories` is in a different order and fails when
    # a class is absent from the test split.
    report = classification_report(
        y_test, y_pred, labels=categories, target_names=[str(name) for name in categories]
    )
    # accuracy_score expects (y_true, y_pred).
    return accuracy_score(y_test, y_pred), report

In [None]:
# Random Forest 

from sklearn.ensemble import RandomForestClassifier
def randomforest(X_train, X_test, y_train, y_test):
    """Fit a 1000-tree random forest and evaluate it on the test split.

    The per-class report uses the module-level ``categories`` list, which is
    expected to hold the label values that occur in ``y_train`` / ``y_test``.

    Returns:
        tuple: ``(accuracy, report)`` where ``report`` is the text produced by
        ``classification_report``.
    """
    ranfor = RandomForestClassifier(n_estimators=1000, random_state=42)
    ranfor.fit(X_train, y_train)

    y_pred = ranfor.predict(X_test)

    # Pass `labels` together with `target_names` so each report row is tied to
    # its own class. With `target_names` alone the names are matched
    # positionally against the sorted labels found in the data, which
    # mislabels rows when `categories` is in a different order and fails when
    # a class is absent from the test split.
    report = classification_report(
        y_test, y_pred, labels=categories, target_names=[str(name) for name in categories]
    )
    # accuracy_score expects (y_true, y_pred).
    return accuracy_score(y_test, y_pred), report

In [None]:
# MLP Classifier

def mlpclassifier(X_train, X_test, y_train, y_test):
    """Fit a three-hidden-layer MLP and evaluate it on the test split.

    The per-class report uses the module-level ``categories`` list, which is
    expected to hold the label values that occur in ``y_train`` / ``y_test``.

    Returns:
        tuple: ``(accuracy, report)`` where ``report`` is the text produced by
        ``classification_report``.
    """
    from sklearn.neural_network import MLPClassifier

    # Fixed random_state (same seed as the other models in this notebook) so
    # weight initialisation, and therefore the reported scores, are reproducible.
    mlp = MLPClassifier(hidden_layer_sizes=(30, 30, 30), random_state=42)
    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_test)

    # Pass `labels` together with `target_names` so each report row is tied to
    # its own class. With `target_names` alone the names are matched
    # positionally against the sorted labels found in the data, which
    # mislabels rows when `categories` is in a different order and fails when
    # a class is absent from the test split.
    report = classification_report(
        y_test, y_pred, labels=categories, target_names=[str(name) for name in categories]
    )
    # accuracy_score expects (y_true, y_pred).
    return accuracy_score(y_test, y_pred), report

In [None]:
# Make sure the NLTK stopword corpus is available before stopwords.words('english') is called.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from sklearn.model_selection import train_test_split

# Evaluate every classifier on averaged word2vec embeddings of each text
# feature, with `Label_1st_category` as the target.

# (heading printed in bold via ANSI escapes, DataFrame column holding the text)
feature_sets = [
    ("\033[1mUsing Description as feature\033[0m", 'des_preprocess'),
    ("\033[1mUsing product_name, description, brand, product_specifications as feature \033[0m",
     'all_features_preprocess'),
]
# (title printed before the results, function returning (accuracy, report))
classifiers = [
    ("Linear SVM: ", linear_svm),
    ("Logistic Reg: ", logisticreg),
    ("Random Forest: ", randomforest),
    ("MLP Classifier: ", mlpclassifier),
]

# The tokenizer and the stopword set do not depend on the feature: build them once.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

for heading, feature_column in feature_sets:
    print(heading)
    tokenized_x = data[feature_column].apply(tokenizer.tokenize)
    # delete Stop Words
    tokenized_x = tokenized_x.apply(lambda vec: [word for word in vec if word not in stop_words])

    # generate_missing=True fills out-of-vocabulary words with np.random.rand
    # vectors; seed NumPy's global generator so the embeddings (and the scores
    # below) are reproducible from run to run.
    np.random.seed(42)
    training_embeddings = get_word2vec_embeddings(word2vec, tokenized_x, generate_missing=True)

    X_train, X_test, y_train, y_test = train_test_split(
        training_embeddings, list(data['Label_1st_category']), test_size=0.2, random_state=42
    )

    for model_title, evaluate in classifiers:
        print(model_title)
        acc, mat = evaluate(X_train, X_test, y_train, y_test)
        print('Accuracy: ', acc)
        print(mat)
        # One separator width after every model (the copy-pasted version
        # mixed widths 79 and 75 and omitted the last one).
        print('-'*75)

[1mUsing Description as feature[0m
Linear SVM: 
Accuracy:  0.9018422567645366
                            precision    recall  f1-score   support

Home Decor & Festive Needs       0.89      0.95      0.92       187
                 Furniture       0.81      0.44      0.57        77
  Beauty and Personal Care       0.76      0.48      0.59        46
     Bags, Wallets & Belts       0.87      0.89      0.88       132
                 Jewellery       0.94      1.00      0.97      1114
          Home Improvement       0.78      0.77      0.78       108
         Pens & Stationery       0.84      0.95      0.89       197
                  Footwear       1.00      0.77      0.87        22
                 Baby Care       0.88      0.76      0.82       182
          Sports & Fitness       0.90      0.92      0.91       113
                  Clothing       0.00      0.00      0.00         3
           Home Furnishing       0.91      0.99      0.95       635
                Automotive       0.

# category based on most common categories in the dataset as label

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, auc
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Linear SVM

from sklearn.linear_model import SGDClassifier
def linear_svm(X_train, X_test, y_train, y_test):
    """Train an SGD-based linear SVM (hinge loss) and return its test accuracy."""
    classifier = SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )
    # fit() returns the estimator itself, so prediction can be chained.
    predictions = classifier.fit(X_train, y_train).predict(X_test)
    return accuracy_score(predictions, y_test)

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
def logisticreg(X_train, X_test, y_train, y_test):
    """Train a logistic regression model and return its test accuracy."""
    classifier = LogisticRegression(
        n_jobs=1,
        C=1e5,
        max_iter=500,
    )
    # fit() returns the estimator itself, so prediction can be chained.
    predictions = classifier.fit(X_train, y_train).predict(X_test)
    return accuracy_score(predictions, y_test)

In [None]:
# MLP Classifier

def mlpclassifier(X_train, X_test, y_train, y_test):
    """Train a three-hidden-layer MLP and return its accuracy on the test split."""
    from sklearn.neural_network import MLPClassifier

    # Fixed random_state (same seed as the SGD model in this notebook) so
    # weight initialisation, and therefore the reported accuracy, is reproducible.
    mlp = MLPClassifier(hidden_layer_sizes=(30, 30, 30), random_state=42)
    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_test)

    # accuracy_score expects (y_true, y_pred).
    return accuracy_score(y_test, y_pred)

In [None]:
from sklearn.model_selection import train_test_split

# Evaluate every classifier on averaged word2vec embeddings of each text
# feature, with the `Label` column as the target.

# (heading printed in bold via ANSI escapes, DataFrame column holding the text)
feature_sets = [
    ("\033[1mUsing Description as feature\033[0m", 'des_preprocess'),
    ("\033[1mUsing product_name, description, brand, product_specifications as feature \033[0m",
     'all_features_preprocess'),
]
# (title printed before the result, function returning the accuracy)
classifiers = [
    ("Linear SVM: ", linear_svm),
    ("Logistic Reg: ", logisticreg),
    ("MLP Classifier: ", mlpclassifier),
]

# The tokenizer and the stopword set do not depend on the feature: build them once.
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))

for position, (heading, feature_column) in enumerate(feature_sets):
    if position > 0:
        # Separator between consecutive feature sets.
        print('-'*70)
    print(heading)
    tokenized_x = data[feature_column].apply(tokenizer.tokenize)
    # delete Stop Words
    tokenized_x = tokenized_x.apply(lambda vec: [word for word in vec if word not in stop_words])

    # generate_missing=True fills out-of-vocabulary words with np.random.rand
    # vectors; seed NumPy's global generator so the embeddings (and the scores
    # below) are reproducible from run to run.
    np.random.seed(42)
    training_embeddings = get_word2vec_embeddings(word2vec, tokenized_x, generate_missing=True)

    X_train, X_test, y_train, y_test = train_test_split(
        training_embeddings, list(data['Label']), test_size=0.2, random_state=42
    )

    for model_title, evaluate in classifiers:
        print(model_title)
        acc = evaluate(X_train, X_test, y_train, y_test)
        print('Accuracy: ', acc)

[1mUsing Description as feature[0m
Linear SVM: 
Accuracy:  0.7748992515831894
Logistic Reg: 
Accuracy:  0.902993667242372
MLP Classifier: 
Accuracy:  0.8316062176165803
----------------------------------------------------------------------
[1mUsing product_name, description, brand, product_specifications as feature [0m
Linear SVM: 
Accuracy:  0.7711571675302246
Logistic Reg: 
Accuracy:  0.91335636154289
MLP Classifier: 
Accuracy:  0.8347725964306275
