# Word2Vec


## Import packages

In [1]:
import numpy as np
import os
import nltk

import torch
import torch.nn
import torchtext.vocab as vocab
from sklearn import metrics
import pandas as pd
import warnings
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.linear_model import LinearRegression
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
warnings.filterwarnings("ignore")


ModuleNotFoundError: No module named 'torch'

## Function definition

In [6]:
def load_data():
    column = ['product_name', 'cat1']
    df = pd.read_csv("../data_cleaning/unique_product.csv", encoding='utf_8_sig', usecols=column)
    # print(df.columns)
    #df.drop(df[df.cat== 'Seafood'].index, inplace= True )
    df.columns = ['name', 'cat']
    texts = df['name'].values
    df = df.fillna(-1)
    # labels = np.argmax(df[df.columns[-4:]].values, axis=1)
    labels = df['cat'].values
    label2id = {l: i for i, l in enumerate(set(labels))}
    id2label = {i: l for l, i in label2id.items()}
    labels = [label2id[l] for l in labels]
    return texts, labels, id2label

In [7]:
def text_preprocess(text):
    text = str(text)
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '\'', '/']
    text = "".join([(a if a not in english_punctuations else " ") for a in text])
    text = " ".join(nltk.tokenize.word_tokenize(text.lower()))
    return text

In [8]:
def load_embeddings():

    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)

    word2vec = vocab.Vectors(name=r'./word2vec_300dim.txt', cache=cache)

    return word2vec

In [9]:
def encode_text_to_features(vector, text):
    vectors = vector.get_vecs_by_tokens(text.split())
    sentence_vector = torch.mean(vectors, dim=0)
    return sentence_vector.tolist()

In [10]:
def evaluation(predictions, labels, id2label, model_name=None):
    acc = accuracy_score(labels, predictions)
    recall = recall_score(labels, predictions, average="macro")
    f1 = f1_score(labels, predictions, average="macro")
    report = metrics.classification_report(labels, predictions,
                                           target_names=[id2label[i] for i in range(len(id2label))])
    info = "acc:%s, recall:%s, f1 score:%s" % (acc, recall, f1)
    if model_name is not None:
        info = "%s: %s" % (model_name, info)
    print(info)
    print(report)



## Read data

In [11]:
col_list = ['product_name', 'cat1']
data = pd.read_csv('../data_cleaning/unique_product.csv', encoding='utf_8_sig', usecols=col_list)
data.columns = ['name', 'cat']
data.head()

Unnamed: 0,name,cat
0,Moccona Strong Cappuccino Sachets,Drinks
1,Nescafe Dolce Gusto Americano Capsules,Drinks
2,Starbucks Caffe Varona Capsules,Drinks
3,Robert Harris Italian Roast Coffee Capsules,Drinks
4,Moccona Latte Sachets,Drinks


In [12]:
data = data.apply([text_preprocess])

In [13]:
data.columns = ['name', 'cat']
data.head()

Unnamed: 0,name,cat
0,moccona strong cappuccino sachets,drinks
1,nescafe dolce gusto americano capsules,drinks
2,starbucks caffe varona capsules,drinks
3,robert harris italian roast coffee capsules,drinks
4,moccona latte sachets,drinks


## Prepare wordvec

In [14]:
corpus = open('./corpus.txt', 'w', encoding='utf_8_sig')
for i in range(len(data['name'])):
    newline = data['name'][i] + ' ' + data['cat'][i] + '\n'
    corpus.write(newline)

In [16]:
#https://mp.weixin.qq.com/s/UFiZsPfXwg8lQUsGXcgHjA

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing

In [17]:
readlines = open('./corpus.txt', 'r', encoding='utf_8_sig')

model = Word2Vec(LineSentence(readlines),
                 size=300, window=5, min_count=10, sample=1e-5,
                 workers=multiprocessing.cpu_count(), iter=5,)

model.wv.save_word2vec_format(r'./word2vec_300dim.txt', binary=False)

In [18]:
texts, labels, id2label = load_data()
texts = [text_preprocess(t) for t in texts]
vector = load_embeddings()
features = [encode_text_to_features(vector, text) for text in texts]
print("data len:", len(texts))
print("id2label",id2label)

data len: 390811
id2label {0: 'Pantry', 1: 'Beer, Cider & Wine', 2: 'Drinks', 3: 'Pets', 4: 'Fresh Foods & Bakery', 5: 'Baby, Toddler & Kids', 6: 'Chilled, Frozen & Desserts', 7: 'Personal Care', 8: 'Kitchen, Dining & Household'}


## Start Training

In [20]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25,
                                                                            random_state=3)

### SVM

In [None]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
model = OneVsRestClassifier(SVC())#C=1, gamma=20, decision_function_shape='ovr'))
model.fit(x_train, y_train)
predict_labels = model.predict(x_test)
evaluation(predict_labels, y_test, id2label, "svm")

### KNN

In [24]:
# knn
k = len(id2label)
model = KNeighborsClassifier(n_neighbors=k)
model.fit(x_train, y_train)
predict_labels = model.predict(x_test)
evaluation(predict_labels, y_test, id2label, "knn")

knn: acc:0.9794069782913524, recall:0.9805184480036127, f1 score:0.9808807908923433
                             precision    recall  f1-score   support

                     Pantry       0.97      0.98      0.98     30136
         Beer, Cider & Wine       0.98      0.99      0.99      6259
                     Drinks       0.98      0.98      0.98      7027
                       Pets       0.99      0.99      0.99      4094
       Fresh Foods & Bakery       0.97      0.95      0.96     10664
       Baby, Toddler & Kids       0.99      1.00      0.99      3067
 Chilled, Frozen & Desserts       0.98      0.97      0.97     11187
              Personal Care       0.99      0.99      0.99     14761
Kitchen, Dining & Household       0.98      0.98      0.98     10508

                   accuracy                           0.98     97703
                  macro avg       0.98      0.98      0.98     97703
               weighted avg       0.98      0.98      0.98     97703



### Logistic Rregression

In [21]:
# logistic
model = LinearRegression()
model.fit(x_train, y_train)
predict_labels = model.predict(x_test)
# print(predict_labels)
predict_labels = [np.round(p) for p in predict_labels]
max_label = len(id2label) - 1
predict_labels = [l if max_label >= l else max_label for l in predict_labels]
min_label = 0
predict_labels = [l if min_label <= l else min_label for l in predict_labels]
evaluation(predict_labels, y_test,id2label, "logistic")

logistic: acc:0.3544415217547056, recall:0.37425695116724866, f1 score:0.3271159870445225
                             precision    recall  f1-score   support

                     Pantry       0.87      0.37      0.52     30136
         Beer, Cider & Wine       0.16      0.35      0.22      6259
                     Drinks       0.20      0.38      0.26      7027
                       Pets       0.17      0.53      0.26      4094
       Fresh Foods & Bakery       0.33      0.32      0.32     10664
       Baby, Toddler & Kids       0.15      0.46      0.23      3067
 Chilled, Frozen & Desserts       0.25      0.21      0.23     11187
              Personal Care       0.54      0.35      0.42     14761
Kitchen, Dining & Household       0.59      0.41      0.48     10508

                   accuracy                           0.35     97703
                  macro avg       0.36      0.37      0.33     97703
               weighted avg       0.51      0.35      0.39     97703



### GDBT

In [25]:
# GDBT
import lightgbm as lgb

def lgb_model(x_train, x_test, y_train, y_test, verbose):
    params = {'num_leaves': 60,
    'min_data_in_leaf': 30,
    'objective': 'multiclass',
    'num_class': 33,
    'max_depth': -1,
    'learning_rate': 0.03,
    "min_sum_hessian_in_leaf": 6,
    "boosting": "gbdt",
    "feature_fraction": 0.9,
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": 11,
    "lambda_l1": 0.1,
    "verbosity": -1,
    "nthread": 15,
    'metric': 'multi_error',
    "random_state": 2020
          }

    model = lgb.train(params
                      , lgb.Dataset(x_train, y_train)
                      , num_boost_round = 100000
                      , valid_sets = [lgb.Dataset(x_test, y_test)]
                      , verbose_eval = verbose
                      , early_stopping_rounds = 500
                     )

    print('Predicting...')
    y_prob = model.predict(x_test, num_iteration=model.best_iteration)
    y_pred = [list(x).index(max(x)) for x in y_prob]
    print("AUC score: {:<8.5f}".format(metrics.accuracy_score(y_pred, y_test)))
    report = metrics.classification_report(y_test, y_pred, target_names=[id2label[i] for i in range(len(id2label))])
    print(report)

    return True




In [None]:
final = lgb_model(x_train, x_test, y_train, y_test, verbose=False)

### SVM

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
model = OneVsRestClassifier(SVC())
model.fit(x_train, y_train)
predict_labels = model.predict(x_test)
evaluation(predict_labels, y_test, id2label, "svm")

In [23]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

In [None]:
train_sizes,train_score,test_score = learning_curve(SVC(),features,labels,train_sizes=[0.1,0.2,0.4,0.6,0.8,1],cv=5,scoring='accuracy')
train_error =  1- np.mean(train_score,axis=1)
test_error = 1- np.mean(test_score,axis=1)
plt.plot(train_sizes,train_error,'o-',color = 'r',label = 'training')
plt.plot(train_sizes,test_error,'o-',color = 'g',label = 'testing')
plt.legend(loc='best')
plt.xlabel('traing examples')
plt.ylabel('error')
plt.show()

In [None]:
# from sklearn.naive_bayes import MultinomialNB