In [41]:
import logging
import os
from gensim.models import word2vec
import pandas as pd
from helper import *

# Show INFO-level log records, each prefixed with a timestamp and its level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

## Step 1. Define a function to train a word2vec model

In [42]:
def train_Word2Vec(df):
    """Train a skip-gram Word2Vec model on whitespace-segmented product titles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Product_Name_s' column holding titles whose tokens are
        separated by whitespace. Missing (NaN/None) titles are skipped.

    Returns
    -------
    The model built by ``word2vec.Word2Vec`` with ``min_count=1`` (no token is
    dropped for being rare), ``sg=1`` (skip-gram) and ``window`` set to the
    token count of the longest title.

    Raises
    ------
    ValueError
        If the column contains no non-missing titles.
    """
    # Empty CSV cells are read back as NaN, which has no string methods, so
    # drop them before tokenizing.
    titles = df['Product_Name_s'].dropna()

    # str.split() without arguments already ignores leading/trailing whitespace.
    sentences = [title.split() for title in titles]
    if not sentences:
        raise ValueError("'Product_Name_s' contains no titles to train on")

    # Size the window from the sentences already tokenized above (instead of
    # re-splitting every row) so the context can reach across a whole title.
    window = max(len(tokens) for tokens in sentences)

    return word2vec.Word2Vec(sentences, min_count=1, window=window, sg=1)

## Step 2. Read the data

In [43]:
# Load the three per-category training sets (female fashion, mobile & gadget,
# male fashion) from UTF-8 encoded CSV files, in that order.
ff_train, mg_train, mf_train = (
    pd.read_csv(csv_path, encoding = 'utf-8')
    for csv_path in ('ff_train.csv', 'mg_train.csv', 'mf_train.csv')
)


## Step 3. Model training

#### 3.1 For titles of 'male fashion' products

In [44]:
# Train on the male-fashion titles, then save the model to 'model_mf.model'
# (a path relative to the working directory).
model_mf = train_Word2Vec(mf_train)
model_mf.save('model_mf.model')

2018-04-03 19:04:11,262 : INFO : collecting all words and their counts
2018-04-03 19:04:11,263 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-04-03 19:04:11,281 : INFO : collected 6604 word types from a corpus of 41241 raw words and 3070 sentences
2018-04-03 19:04:11,284 : INFO : Loading a fresh vocabulary
2018-04-03 19:04:11,311 : INFO : min_count=1 retains 6604 unique words (100% of original 6604, drops 0)
2018-04-03 19:04:11,314 : INFO : min_count=1 leaves 41241 word corpus (100% of original 41241, drops 0)
2018-04-03 19:04:11,356 : INFO : deleting the raw counts dictionary of 6604 items
2018-04-03 19:04:11,359 : INFO : sample=0.001 downsamples 54 most-common words
2018-04-03 19:04:11,362 : INFO : downsampling leaves estimated 37124 word corpus (90.0% of prior 41241)
2018-04-03 19:04:11,397 : INFO : estimated required memory for 6604 words and 100 dimensions: 8585200 bytes
2018-04-03 19:04:11,401 : INFO : resetting layer weights
2018-04-03 19:04:11,5

#### 3.2 For titles of 'mobile & gadget' products

In [45]:
# Train on the mobile & gadget titles, then save the model to 'model_mg.model'
# (a path relative to the working directory).
model_mg = train_Word2Vec(mg_train)
model_mg.save('model_mg.model')

2018-04-03 19:04:15,040 : INFO : collecting all words and their counts
2018-04-03 19:04:15,043 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-04-03 19:04:15,077 : INFO : collected 7418 word types from a corpus of 56878 raw words and 4169 sentences
2018-04-03 19:04:15,079 : INFO : Loading a fresh vocabulary
2018-04-03 19:04:15,214 : INFO : min_count=1 retains 7418 unique words (100% of original 7418, drops 0)
2018-04-03 19:04:15,219 : INFO : min_count=1 leaves 56878 word corpus (100% of original 56878, drops 0)
2018-04-03 19:04:15,287 : INFO : deleting the raw counts dictionary of 7418 items
2018-04-03 19:04:15,290 : INFO : sample=0.001 downsamples 59 most-common words
2018-04-03 19:04:15,292 : INFO : downsampling leaves estimated 50337 word corpus (88.5% of prior 56878)
2018-04-03 19:04:15,350 : INFO : estimated required memory for 7418 words and 100 dimensions: 9643400 bytes
2018-04-03 19:04:15,357 : INFO : resetting layer weights
2018-04-03 19:04:15,5

#### 3.3 For titles of 'female fashion' products

In [46]:
# Train on the female-fashion titles, then save the model to 'model_ff.model'
# (a path relative to the working directory).
model_ff = train_Word2Vec(ff_train)
model_ff.save('model_ff.model')

2018-04-03 19:04:19,813 : INFO : collecting all words and their counts
2018-04-03 19:04:19,815 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-04-03 19:04:19,838 : INFO : collected 6244 word types from a corpus of 45502 raw words and 3064 sentences
2018-04-03 19:04:19,843 : INFO : Loading a fresh vocabulary
2018-04-03 19:04:19,878 : INFO : min_count=1 retains 6244 unique words (100% of original 6244, drops 0)
2018-04-03 19:04:19,881 : INFO : min_count=1 leaves 45502 word corpus (100% of original 45502, drops 0)
2018-04-03 19:04:19,933 : INFO : deleting the raw counts dictionary of 6244 items
2018-04-03 19:04:19,936 : INFO : sample=0.001 downsamples 57 most-common words
2018-04-03 19:04:19,950 : INFO : downsampling leaves estimated 39259 word corpus (86.3% of prior 45502)
2018-04-03 19:04:19,984 : INFO : estimated required memory for 6244 words and 100 dimensions: 8117200 bytes
2018-04-03 19:04:19,987 : INFO : resetting layer weights
2018-04-03 19:04:20,2

## Step 4. Reviewing the trained models

In [47]:
# Sanity check on the female-fashion model: similarity between pairs of
# garment terms. Queries go through `.wv` (the model's KeyedVectors) because
# calling `similarity` on the model itself is deprecated in gensim 3.x and
# removed in gensim 4.0.
print(model_ff.wv.similarity('短裤','长裤'))
print(model_ff.wv.similarity('短裤','t恤'))

0.8605663887251421
0.7665001355901444


  """Entry point for launching an IPython kernel.
  


In [55]:
# Nearest neighbours of 't恤' in the female-fashion model. Uses `.wv` because
# `most_similar` on the model object is deprecated in gensim 3.x and removed
# in gensim 4.0.
model_ff.wv.most_similar(['t恤'])

  """Entry point for launching an IPython kernel.


[('素t', 0.9832006692886353),
 ('短袖上衣', 0.9797725677490234),
 ('圆领', 0.977285623550415),
 ('短袖t恤', 0.9683908224105835),
 ('短袖', 0.9616423845291138),
 ('宽松短袖', 0.9612372517585754),
 ('衣服', 0.960029125213623),
 ('打底衫', 0.9598938822746277),
 ('韩范', 0.9583677649497986),
 ('字母', 0.9565817713737488)]

In [49]:
# Nearest neighbours of '手机壳' in the mobile & gadget model. Uses `.wv`
# because `most_similar` on the model object is deprecated in gensim 3.x and
# removed in gensim 4.0.
model_mg.wv.most_similar(['手机壳'])

  """Entry point for launching an IPython kernel.
2018-04-03 19:04:23,949 : INFO : precomputing L2-norms of word weight vectors


[('软壳', 0.9073449373245239),
 ('保护壳', 0.8957708477973938),
 ('防摔壳', 0.8705962896347046),
 ('防摔', 0.8630738258361816),
 ('加厚', 0.845794677734375),
 ('英文', 0.8428493142127991),
 ('辛普森', 0.842496395111084),
 ('少女', 0.8423134088516235),
 ('原宿', 0.8413721323013306),
 ('渐层', 0.8407720923423767)]

In [50]:
# Nearest neighbours of '沙滩裤' in the male-fashion model. Uses `.wv` because
# `most_similar` on the model object is deprecated in gensim 3.x and removed
# in gensim 4.0.
model_mf.wv.most_similar(['沙滩裤'])

  """Entry point for launching an IPython kernel.
2018-04-03 19:04:24,020 : INFO : precomputing L2-norms of word weight vectors


[('海滩裤', 0.9694491028785706),
 ('五分裤', 0.9669184684753418),
 ('休闲', 0.9435919523239136),
 ('休閒短裤', 0.9359419345855713),
 ('松紧', 0.9226898550987244),
 ('中裤', 0.9216972589492798),
 ('时尚', 0.920397162437439),
 ('绑带', 0.9201370477676392),
 ('韩系', 0.9176990985870361),
 ('五分短裤', 0.9129565954208374)]

In [56]:
# Raw embedding vector for '沙滩裤'. Index the KeyedVectors (`.wv`) rather than
# the model: indexing the model directly is deprecated in gensim 3.x and no
# longer supported in gensim 4.0.
model_mf.wv['沙滩裤']

  """Entry point for launching an IPython kernel.


array([-0.19924697,  0.6276916 , -0.08995506,  0.32426256, -0.04493741,
        0.3257724 , -0.17402256, -0.27104753, -0.17894614,  0.10198706,
       -0.27319348,  0.02146268, -0.16099626, -0.47675055, -0.15807314,
       -0.47139677, -0.5648258 ,  0.4939753 ,  0.10391641,  0.13731194,
       -0.21126764,  0.3107965 , -0.03591251,  0.76509726,  0.558321  ,
       -0.0936106 ,  0.07412483,  0.23064597, -0.33411935,  0.08496652,
        0.04709025,  0.13435403, -0.25183725, -0.46391034,  0.39573026,
        0.29214576,  0.33173665, -0.4364207 , -0.12067068,  0.6887042 ,
        0.3636579 ,  0.10469751,  0.2157699 ,  0.1632134 ,  0.12865828,
       -0.01313473, -0.4635576 , -0.18016255, -0.11433658,  0.15676758,
        0.6672143 , -0.0509435 , -0.24562328, -0.02731759,  0.09797992,
        0.33072612, -0.08965809,  0.1328719 ,  0.37181646, -0.07301752,
        0.13087064, -0.03873652,  0.255346  ,  0.09856891,  0.20357797,
       -0.14227699, -0.36370927,  0.34647298,  0.05255154, -0.09