In [None]:
##### Step 1 : Import library


import numpy as np
from numpy import log, min
import pandas as pd
import re
import os
from collections import defaultdict,Counter
from tqdm import tqdm, tqdm_notebook


from datetime import datetime, timedelta
import time
import pytz

from bs4 import BeautifulSoup

import gensim
import gensim.corpora as corpora
from gensim.models.word2vec import Word2Vec, LineSentence
from gensim.models.fasttext import FastText
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors, LsiModel
from gensim.similarities import Similarity,SparseMatrixSimilarity,MatrixSimilarity
from multiprocessing import cpu_count
from gensim.models.doc2vec import TaggedDocument


import warnings
# Silence UserWarning messages that originate from gensim modules.
warnings.filterwarnings(action = 'ignore', category = UserWarning, module = 'gensim')  # ignore warnings
from google.colab import files, drive
# IPython shell escape: create a local `drive` directory if it does not exist yet.
!mkdir -p drive
# Mount Google Drive at /content/gdrive; the data paths in this notebook live under it.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).




```
# 此內容會顯示為程式碼
```

# define file path

In [None]:
##### Step 2 : define time variable

# Set the TZ environment variable to Hong Kong time.
# NOTE(review): time.tzset() is never called, so this alone may not change the
# process-local zone; the timestamp below therefore asks for the zone explicitly.
os.environ['TZ'] = 'Asia/Hong_Kong'

# Current Hong Kong wall-clock time as a naive datetime (tzinfo dropped on purpose).
# datetime.now(tz=...) does not depend on the host machine's local time zone.
HK_now_datetime=datetime.now(tz=pytz.timezone('Asia/Hong_Kong')).replace(tzinfo=None)
HK_now_str=HK_now_datetime.strftime('%Y-%m-%d %H:%M:%S')  # 'YYYY-MM-DD HH:MM:SS'
HK_today_str=HK_now_str[:10]                              # 'YYYY-MM-DD'
HK_year_str=HK_now_str[:4]                                # 'YYYY'

# File-name friendly variants with '-', ':' and whitespace removed.
# Raw-string patterns avoid the invalid-escape warnings of '\-|\:|\s'.
HK_today_file=re.sub(r'[-:\s]','',HK_today_str)  # 'YYYYMMDD'
HK_now_file=re.sub(r'[-:\s]','',HK_now_str)      # 'YYYYMMDDHHMMSS'

In [None]:
##### step 3:  define project path

# Encoding name for the project's text files.
# NOTE(review): the open() calls visible in this notebook hard-code 'utf-8'
# instead of reading this variable.
encoding='utf-8'

# Root directory of the word2vec training project on the mounted Google Drive.
master_dir='/content/gdrive/My Drive/word2vec_training'



In [None]:
##### step 4:  define load and clean training data functions

def load_tokens(token_file):
  """Read a UTF-8 text file and return its lines.

  Args:
    token_file: path of the text file to read.

  Returns:
    A list with one string per line of the file; line terminators are kept.
  """
  with open(token_file,mode='r',encoding='utf-8') as handle:
    lines=list(handle)  # iterating a text file yields its lines

  return lines


def cleaning_tokens(token_doc):
  """Turn space-separated lines into lists of cleaned tokens.

  Each line is split on single spaces, every newline character is removed
  from each piece, and only pieces longer than one character are kept.

  Args:
    token_doc: iterable of strings, one document per string.

  Returns:
    A list with one token list per input string, in the same order.
  """
  return [
    [
      word
      for word in (piece.replace('\n','') for piece in line.split(' '))
      if len(word)>1  # also drops empty pieces left by repeated spaces
    ]
    for line in token_doc
  ]

In [None]:
#### step 4 : import training data

## 4.1. load trainingdata : THUCNews
THUCNews_data_dir=os.path.join(master_dir,r'tokens', 'THUCNews',)


def _load_thuc_category(category):
  """Load and clean the THUCNews token file of one category.

  Args:
    category: sub-directory name under THUCNews_data_dir (e.g. 'finance');
      the file read is '<category>/THUCNews_<category>_Clean.txt'.

  Returns:
    A tuple (token_file, token_doc, tokens): the file path, the raw lines
    returned by load_tokens, and the token lists returned by cleaning_tokens.
  """
  token_file=os.path.join(THUCNews_data_dir,category,'THUCNews_{}_Clean.txt'.format(category))
  token_doc=load_tokens(token_file)
  return token_file,token_doc,cleaning_tokens(token_doc)


# One call per category replaces three copy-pasted load/clean sequences;
# every module-level name of the original cell is still assigned.
THUC_financial_token_file,THUC_financial_doc,THUC_financial_token=_load_thuc_category('finance')
THUC_fashion_token_file,THUC_fashion_doc,THUC_fashion_token=_load_thuc_category('fashion')
THUC_game_token_file,THUC_game_doc,THUC_game_token=_load_thuc_category('game')

# Show the first cleaned document of each category as a sanity check.
print(THUC_financial_token[:1])
print(THUC_fashion_token[:1])
print(THUC_game_token[:1])

In [None]:
## 4.2. load trainingdata : lifestyle/DIVA

# Location of the lifestyle token file on the mounted Google Drive.
lifestyle_token_dir='/content/gdrive/My Drive/ETNET/news_folders/DIVA/Finance/article_tokens'
lifestyle_token_filename='tokens_w2v_20191121.txt'
lifestyle_token_file=os.path.join(lifestyle_token_dir,lifestyle_token_filename)

# Use the shared load_tokens helper instead of re-implementing the file read,
# so every corpus is loaded the same way.
lifestyle_token_doc=load_tokens(lifestyle_token_file)

lifestyle_token=cleaning_tokens(lifestyle_token_doc)
print(len(lifestyle_token))  # number of documents loaded



In [None]:
## 4.3. load trainingdata : news
newstoken_holder='/content/gdrive/My Drive/ETNET/NER/news/code/PeijiYang/predict_data/corpus'
newstoken_filename='full.txt'

newstoken_file=os.path.join(newstoken_holder,newstoken_filename)

# Fix: the original opened `newtoken_file`, a name that is never defined
# (NameError on a fresh kernel); the path built above is `newstoken_file`.
newtoken_doc=load_tokens(newstoken_file)

newtoken=cleaning_tokens(newtoken_doc)
print(newtoken[:1])  # first cleaned document as a sanity check

In [None]:
## 4.4 combine datasets

# Concatenate the token lists of all corpora into one list of documents.
total_token=THUC_financial_token +THUC_fashion_token+ THUC_game_token+  lifestyle_token + newtoken

# The vocabulary-building and training cells read a variable named `sentences`
# that no cell defines, so a fresh kernel fails with NameError. Bind it here.
# NOTE(review): assumes the combined corpus is the intended training input and
# that no stopword filtering was applied — confirm against the original run.
sentences=total_token

print(len(total_token))
print(total_token[:5])

In [None]:
## 4.5. load training data :stopwords
stopwords_filename='stopwords.txt'
stopwords_file=os.path.join(newstoken_holder,stopwords_filename)

# One stopword per line, with surrounding whitespace (newline included) removed.
# NOTE(review): `stopwords` is not referenced by any other cell visible here.
with open(stopwords_file,mode='r',encoding='utf-8') as g:
  stopwords=[raw_line.strip() for raw_line in g]



In [None]:
#### step 5 :training and saving

### 5.1. define hyper-parameters
# All values except `iter` are forwarded to the Word2Vec constructor in step 5.2.
min_count=1
size=300
window=5
# NOTE(review): `iter` shadows the built-in iter() for the rest of the notebook and
# is NOT passed to Word2Vec (the `iter=iter` argument is commented out in step 5.2),
# so training runs for w2v_model.epochs rather than this value — confirm intent.
iter=30
sg=1
hs=1


In [None]:
## 5.2.  Build vocab/dictionary for word2vec model
start_build_vocab=time.time()
# Create an untrained model from the step 5.1 hyper-parameters; the `iter`
# setting is left disabled here, exactly as in the original call.
w2v_model= Word2Vec( min_count=min_count, size=size, workers=cpu_count(), window=window, sg=sg,hs=hs) #iter=iter,
# Scan the corpus once to build the vocabulary, logging progress every 300000 sentences.
w2v_model.build_vocab(sentences, progress_per=300000, trim_rule=None)
print(w2v_model.corpus_total_words)
finish_build_vocab=time.time()
# Convert seconds to minutes first, then round; the original rounded the
# seconds and divided afterwards, so the printed minutes were not rounded.
print('build_vocab time:',round((finish_build_vocab-start_build_vocab)/60,2),'mins')

18893136
build_vocab time: 1.7215 mins


In [None]:
## 5.3 train word2vec models
start_train=time.time()
# Train on the same corpus used for build_vocab; example and epoch counts are
# taken from the model itself.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
finish_train=time.time()
# Convert seconds to minutes first, then round; the original rounded the
# seconds and divided afterwards, printing values such as 37.46033333333333.
print('Gensim Word2Vec model training time:',round((finish_train-start_train)/60,2),'mins')

Gensim Word2Vec model training time: 37.46033333333333 mins


In [None]:
### 5.4 test word2vec model

# Word whose nearest neighbours are inspected.
# Alternatives tried: TikTok, 菅義偉 (Yoshihide Suga), 甲骨文 (Oracle), 沃爾瑪 (Walmart)
test_word='陳志全'

# Look the word up as typed first, then fall back to its lower-cased form.
if test_word in w2v_model.wv.vocab:
  query_word=test_word
elif test_word.lower() in w2v_model.wv.vocab:
  # Fix: the original assigned `test_word` here, so the later lookup used a
  # word that is known to be missing from the vocabulary.
  query_word=test_word.lower()
else:
  # Fix: the original raised a plain string, which itself fails with
  # "TypeError: exceptions must derive from BaseException".
  raise KeyError(f'OOV error: {test_word} is not in w2v_model.wv.vocab')

# Print the 30 most similar words as (word, similarity) pairs.
for w in w2v_model.wv.most_similar([query_word], topn=30):
  print(w)

('議會陣線', 0.7391514778137207)
('麥嘉晉', 0.7254851460456848)
('灑潑', 0.7130780220031738)
('朱凱廸', 0.7068368196487427)
('鄒家成', 0.6879165172576904)
('李偲嫣', 0.6834797859191895)
('廖添誠', 0.6802979707717896)
('立法會議員', 0.676720142364502)
('朱凱迪', 0.6760485172271729)
('陳克勤', 0.6726042032241821)
('郭家麒', 0.6643840074539185)
('陳恆鑌', 0.6597760915756226)
('公民黨', 0.6561070084571838)
('侯志強', 0.6550020575523376)
('人民力量', 0.6525052785873413)
('黃潤達', 0.6483956575393677)
('毛孟靜', 0.6458508372306824)
('鄭松泰', 0.6438637375831604)
('楊岳橋', 0.6401040554046631)
('鄭達鴻', 0.6396377682685852)
('會議廳', 0.6363130211830139)
('葛珮帆', 0.6345359086990356)
('扮無知', 0.6327601075172424)
('民主派', 0.6317592859268188)
('林卓廷', 0.6315789818763733)
('旅館業修訂條例草案', 0.6291408538818359)
('陳云根', 0.6289880275726318)
('165717', 0.6288633346557617)
('區諾軒', 0.6275712251663208)
('朱韶洪', 0.6263978481292725)


  if np.issubdtype(vec.dtype, np.int):


# 新增區段

In [None]:
### 5.5. Save word2vec model

# Output directory on the mounted Google Drive and the file-name prefix.
w2v_model_dir=os.path.join('/content/gdrive/My Drive/ETNET/NER/news/code/PeijiYang/predict_data','w2v_model')
news_type='etnet'

# '<news_type>_w2v.model' -> full gensim model
w2v_model_filename='{}_w2v.model'.format(news_type)
w2v_model_file=os.path.join(w2v_model_dir,w2v_model_filename)

# '<news_type>_w2v.bin' -> word vectors only, word2vec C binary format
w2v_bin_filename='{}_w2v.bin'.format(news_type)
w2v_bin_file=os.path.join(w2v_model_dir,w2v_bin_filename)

# Create the output directory if needed so the saves below do not fail
# on a Drive that does not have it yet.
os.makedirs(w2v_model_dir,exist_ok=True)

w2v_model.save(w2v_model_file)
# C binary format; per the original author's note it takes about half the
# disk space of the save above.
w2v_model.wv.save_word2vec_format(w2v_bin_file, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
