In [None]:
import pandas as pd
import glob
import codecs
import os
from tqdm import tqdm_notebook as tqdm

In [None]:
# Preprocessing: category sub-directories of the livedoor news corpus that are read below.
dirlist = [
    "dokujo-tsushin",
    "it-life-hack",
    "kaden-channel",
    "livedoor-homme",
    "movie-enter",
    "peachy",
    "smax",
    "sports-watch",
    "topic-news",
]

In [None]:
# Read every article of every category into one DataFrame with the columns
# "class" (category directory name) and "news" (article text).
records = []
for i in tqdm(dirlist):
    path = "../japanese-dataset/livedoor-news-corpus/"+i+"/*.txt"
    # glob returns files in arbitrary order, so the per-directory licence file is
    # excluded by name instead of popping whatever happens to be last; sorting
    # makes the row order (and therefore the later train/test split) reproducible.
    # NOTE(review): assumes the non-article file is named LICENSE.txt — confirm
    # against the corpus layout.
    files = [p for p in sorted(glob.glob(path)) if os.path.basename(p) != "LICENSE.txt"]
    for j in tqdm(files):
        # `with` closes the handle even if reading/decoding fails.
        with codecs.open(j, 'r', 'utf-8') as f:
            data = f.read()
        # The first three lines of each file are dropped (presumably header
        # metadata — verify); the remaining lines are joined without separator.
        records.append((i, "".join(data.split("\n")[3:])))

# Build the frame once instead of appending row by row (quadratic, and
# DataFrame.append no longer exists in recent pandas).
df = pd.DataFrame(records, columns=["class","news"])

In [None]:
# Display the rows whose category is "livedoor-homme".
df[df["class"]=="livedoor-homme"]

In [None]:
# Persist the combined corpus as a single CSV file next to the category directories.
df.to_csv("../japanese-dataset/livedoor-news-corpus/all.csv")

In [None]:
## create word2vec
import logging
import numpy as np
from gensim.models import Word2Vec
import MeCab
import time
from sklearn.preprocessing import normalize
import sys
import re
 
%matplotlib inline
import matplotlib.pyplot as plt 
from sklearn.manifold import TSNE
import numpy as np

In [None]:
# Tokenise every article with MeCab, train a skip-gram Word2Vec model on the
# resulting token lists and save it to disk.
start = time.time()
tokenizer =  MeCab.Tagger("-Owakati")
sentences = []
skipped_articles = 0
print ("Parsing sentences from training set...")

# Loop over each news article.
for review in tqdm(df["news"]):
    try:
        # Space-separated MeCab output with ideographic spaces and newlines removed.
        result = tokenizer.parse(review).replace("\u3000","").replace("\n","")
        # Strip digits, punctuation and symbols.
        result = re.sub(r'[0123456789０１２３４５６７８９！＠＃＄％＾＆\-|\\＊\“（）＿■×※⇒—●(：〜＋=)／*&^%$#@!~`){}…\[\]\"\'\”:;<>?＜＞？、。・,./『』【】「」→←○]+', "", result)
        h = result.split(" ")
        h = list(filter(("").__ne__, h))
        sentences.append(h)
    except Exception:
        # Best effort: an article that cannot be tokenised is skipped, but it is
        # counted, and KeyboardInterrupt/SystemExit are no longer swallowed.
        skipped_articles += 1
        continue

if skipped_articles:
    print ("Articles skipped because tokenisation failed : ", skipped_articles)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

num_features = 200     # Word vector dimensionality
min_word_count = 20   # Minimum word count
num_workers = 40       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

print ("Training Word2Vec model...")
# Train Word2Vec model (skip-gram, negative sampling).
# NOTE(review): a fixed seed with several worker threads may still not give
# bit-identical vectors between runs — confirm against the gensim documentation.
model = Word2Vec(sentences, workers=num_workers, hs = 0, sg = 1, negative = 10, iter = 25,\
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, seed=1)

model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" + str(context) + "context_len2alldata"
model.init_sims(replace=True)
# Save Word2Vec model.
print ("Saving Word2Vec model...")
model.save("../japanese-dataset/livedoor-news-corpus/model/"+model_name)
endmodeltime = time.time()

print ("time : ", endmodeltime-start)

In [None]:
# plain word2vec t-SNE Visualization
word2vec_model=model

skip=0
limit=241 

# Stack the vector of every vocabulary word into one (n_words, n_dims) matrix.
vocab = word2vec_model.wv.vocab
emb_tuple = tuple([word2vec_model.wv[v] for v in vocab])
X = np.vstack(emb_tuple)
 
# The t-SNE estimator gets its own name. Binding it to `model` would replace the
# Word2Vec model, which get_probability_word_vectors() reads as the global
# `model`; its lookups would then fail silently and produce all-zero vectors.
# (The commented-out plotly cell below should read `tsne_model.embedding_`.)
tsne_model = TSNE(n_components=2, random_state=0,verbose=2)
np.set_printoptions(suppress=True)
tsne_model.fit_transform(X) 

In [None]:
# import plotly
# import plotly.plotly as py
# import plotly.graph_objs as go
# plotly.offline.init_notebook_mode(connected=False)
# # Create a trace
# trace = go.Scatter(
#     x =model.embedding_[skip:limit, 0],
#     y = model.embedding_[skip:limit, 1],
#     text=list(vocab.keys()),
#     mode = 'markers+text'
# )

# data = [trace]

# # Plot and embed in ipython notebook!
# plotly.offline.iplot(data)

In [None]:
## create gwbowv
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer
import pickle
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
def drange(start, stop, step):
    """Yield start, start+step, start+2*step, ... for as long as the value is below stop.

    Works like range() but accepts non-integer steps; values are produced by
    repeated addition of `step`.
    """
    current = start
    while True:
        if not (current < stop):
            return
        yield current
        current += step

def cluster_GMM(num_clusters, word_vectors, random_state=None):
    """Fit a Gaussian mixture on the word vectors and save the cluster assignments.

    Parameters
    ----------
    num_clusters : number of mixture components.
    word_vectors : matrix with one row per vocabulary word.
    random_state : optional seed passed to GaussianMixture; the default (None)
        keeps the previous unseeded behaviour.

    Returns
    -------
    (idx, idx_proba) : the hard cluster index of every word and the per-cluster
        membership probabilities. Both are also pickled into the model directory.
    """
    # Time the clustering itself instead of relying on a global `start` that is
    # only defined when the Word2Vec training cell has been run in this session.
    clustering_started = time.time()
    # Initialize a GMM object and use it for clustering.
    clf =  GaussianMixture(n_components=num_clusters,
                    covariance_type="tied", init_params='kmeans', max_iter=50,
                    random_state=random_state)
    # Get cluster assignments.
    clf.fit(word_vectors)
    idx = clf.predict(word_vectors)
    print ("Clustering Done...", time.time()-clustering_started, "seconds")
    # Get probabilities of cluster assignments.
    idx_proba = clf.predict_proba(word_vectors)
    # Dump cluster assignments and probability of cluster assignments;
    # `with` guarantees the files are flushed and closed.
    with open('../japanese-dataset/livedoor-news-corpus/model/gmm_latestclusmodel_len2alldata.pkl',"wb") as fh:
        pickle.dump(idx, fh)
    print ("Cluster Assignments Saved...")

    with open( '../japanese-dataset/livedoor-news-corpus/model/gmm_prob_latestclusmodel_len2alldata.pkl',"wb") as fh:
        pickle.dump(idx_proba, fh)
    print ("Probabilities of Cluster Assignments Saved...")
    return (idx, idx_proba)

def read_GMM(idx_name, idx_proba_name, model_dir="../japanese-dataset/livedoor-news-corpus/model/"):
    """Load pickled cluster assignments and assignment probabilities.

    Parameters
    ----------
    idx_name : file name of the pickled cluster assignments; a falsy value
        selects the name written by cluster_GMM().
    idx_proba_name : file name of the pickled assignment probabilities; a falsy
        value selects the name written by cluster_GMM().
    model_dir : directory the two files are read from.

    Returns
    -------
    (idx, idx_proba) as stored in the two files.

    Previously both name arguments were ignored and the default files were
    always read. Only load pickles produced by this notebook: unpickling
    untrusted files can execute arbitrary code.
    """
    idx_name = idx_name or "gmm_latestclusmodel_len2alldata.pkl"
    idx_proba_name = idx_proba_name or "gmm_prob_latestclusmodel_len2alldata.pkl"
    with open(os.path.join(model_dir, idx_name), "rb") as fh:
        idx = pickle.load(fh)
    with open(os.path.join(model_dir, idx_proba_name), "rb") as fh:
        idx_proba = pickle.load(fh)
    print ("Cluster Model Loaded...")
    return (idx, idx_proba)

def get_probability_word_vectors(featurenames, word_centroid_map, num_clusters, word_idf_dict,
                                 vectors=None, centroid_prob_map=None, vector_size=None):
    """Compute the idf-weighted probability word-cluster vector of every clustered word.

    For each word in `word_centroid_map` the result holds a float32 vector of
    length num_clusters * vector_size whose k-th slice is
    word_vector * P(cluster k | word) * idf(word).

    Parameters
    ----------
    featurenames : unused; kept for call compatibility.
    word_centroid_map : mapping whose keys are the words to process.
    num_clusters : number of clusters (slices per word).
    word_idf_dict : mapping word -> idf weight.
    vectors : mapping word -> embedding; defaults to the global `word2vec_model.wv`.
    centroid_prob_map : mapping word -> per-cluster probabilities; defaults to
        the global `word_centroid_prob_map`.
    vector_size : embedding length; defaults to the global `num_features`.

    Returns
    -------
    dict word -> float32 vector. Words without an idf weight, probabilities or
    embedding keep an all-zero vector, as before.
    """
    # The global `model` is rebound to a TSNE estimator by the visualisation cell,
    # so the embeddings are taken from `word2vec_model` instead.
    if vectors is None:
        vectors = word2vec_model.wv
    if centroid_prob_map is None:
        centroid_prob_map = word_centroid_prob_map
    if vector_size is None:
        vector_size = num_features

    prob_wordvecs = {}
    for word in word_centroid_map:
        weighted = np.zeros( num_clusters * vector_size, dtype="float32" )
        prob_wordvecs[word] = weighted
        try:
            idf_weight = word_idf_dict[word]
            cluster_probs = centroid_prob_map[word]
            embedding = vectors[word]
        except KeyError:
            # Only a missing entry is tolerated; any other error now surfaces
            # instead of silently producing zero vectors.
            continue
        for index in range(0, num_clusters):
            weighted[index*vector_size:(index+1)*vector_size] = embedding * cluster_probs[index] * idf_weight
    return prob_wordvecs

def create_cluster_vector_and_gwbowv(prob_wordvecs, wordlist, word_centroid_map, word_centroid_prob_map, dimension, word_idf_dict, featurenames, num_centroids, train=False):
    """Build the L2-normalised document vector of one tokenised document.

    The vector is the sum of `prob_wordvecs[word]` over every word of `wordlist`
    that is a key of `word_centroid_map`, divided by its Euclidean norm (left
    untouched when the norm is zero).

    Parameters
    ----------
    prob_wordvecs : mapping word -> vector of length num_centroids * dimension.
    wordlist : iterable of tokens of one document.
    word_centroid_map : mapping whose keys are the known words.
    dimension, num_centroids : determine the output length.
    train : when True, the minimum and maximum of the result are added to the
        module-level accumulators `min_no` / `max_no`.
    word_centroid_prob_map, word_idf_dict, featurenames : unused; kept for call
        compatibility.

    Returns
    -------
    float32 ndarray of length num_centroids * dimension.
    """
    global min_no
    global max_no

    bag_of_centroids = np.zeros( num_centroids * dimension, dtype="float32" )

    for word in wordlist:
        # Explicit membership test instead of a bare try/except around a lookup.
        if word not in word_centroid_map:
            continue
        bag_of_centroids += prob_wordvecs[word]

    norm = np.sqrt(np.einsum('...i,...i', bag_of_centroids, bag_of_centroids))
    if norm != 0:
        bag_of_centroids /= norm

    # To make feature vector sparse, make note of minimum and maximum values.
    if train:
        # ndarray reductions instead of the element-by-element Python builtins.
        min_no += bag_of_centroids.min()
        max_no += bag_of_centroids.max()

    return bag_of_centroids

In [None]:
# Hyper-parameters; they repeat the values used in the Word2Vec training cell so
# that `model_name` below resolves to the file that cell saved.
num_features = 200     # Word vector dimensionality
min_word_count = 20   # Minimum word count
num_workers = 40       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" + str(context) + "context_len2alldata"
# Load the trained Word2Vec model.
word2vec_model = Word2Vec.load("../japanese-dataset/livedoor-news-corpus/model/"+model_name)
# Get wordvectors for all words in vocabulary.
word_vectors = word2vec_model.wv.syn0

# Split the corpus 70/30 into train and test with a fixed seed.
train,test = train_test_split(df,test_size=0.3,random_state=40)
# NOTE(review): `all` shadows the Python builtin of the same name for the rest of
# the notebook; later cells read this variable, so renaming needs a notebook-wide change.
all = df

# Set number of clusters.
#num_clusters = 60
# NOTE(review): with a single cluster every word receives the same assignment —
# confirm that 1 (rather than the commented-out 60) is intended.
num_clusters = 1
# Fit new clusters (comment this line out when loading saved ones below).
idx, idx_proba = cluster_GMM(num_clusters, word_vectors)

# Uncomment below lines for loading saved cluster assignments and probability of cluster assignments.
# idx_name = "gmm_latestclusmodel_len2alldata.pkl"
# idx_proba_name = "gmm_prob_latestclusmodel_len2alldata.pkl"
# idx, idx_proba = read_GMM(idx_name, idx_proba_name)

# Create a Word / Index dictionary, mapping each vocabulary word to
# a cluster number
word_centroid_map = dict(zip( word2vec_model.wv.index2word, idx ))
# Create a Word / Probability of cluster assignment dictionary, mapping each vocabulary word to
# list of probabilities of cluster assignments.
word_centroid_prob_map = dict(zip( word2vec_model.wv.index2word, idx_proba ))

In [None]:
# Computing tf-idf values: every article is tokenised and re-joined with single
# spaces so that TfidfVectorizer can split it again.
traindata = []
for review in all["news"]:
    result = tokenizer.parse(review).replace("\u3000","").replace("\n","")
    result = re.sub(r'[0123456789０１２３４５６７８９！＠＃＄％＾＆\-|\\＊\“（）＿■×※⇒—●(：〜＋=)／*&^%$#@!~`){}…\[\]\"\'\”:;<>?＜＞？、。・,./『』【】「」→←○]+', "", result)
    h = result.split(" ")
    h = filter(("").__ne__, h)
    traindata.append(" ".join(h))

# NOTE(review): TfidfVectorizer is used with its default tokenisation settings;
# check whether they drop tokens (e.g. single-character words) that have a
# Word2Vec vector, because such words get no idf weight below.
tfv = TfidfVectorizer(dtype=np.float32)
tfidfmatrix_traindata = tfv.fit_transform(traindata)
# get_feature_names() no longer exists in recent scikit-learn releases; prefer
# its replacement when available and fall back to the old name otherwise.
if hasattr(tfv, "get_feature_names_out"):
    featurenames = tfv.get_feature_names_out()
else:
    featurenames = tfv.get_feature_names()
# Public accessor instead of reaching into the private `_tfidf` attribute.
idf = tfv.idf_

# Creating a dictionary with word mapped to its idf value 
print ("Creating word-idf dictionary for Training set...")

word_idf_dict = dict(zip(featurenames, idf))
    
# Pre-computing probability word-cluster vectors.
prob_wordvecs = get_probability_word_vectors(featurenames, word_centroid_map, num_clusters, word_idf_dict)

In [None]:
# Save the probability word-cluster vectors; `with` closes (and flushes) the file.
with open("../japanese-dataset/livedoor-news-corpus/model/prob_wordvecs.pkl","wb") as fh:
    pickle.dump(prob_wordvecs, fh)

In [None]:
# Build the normalised SCDV document matrices for the train and test split,
# sparsify them with a threshold derived from the training set and save both.

def _news_to_words(review):
    """Tokenise one article with the notebook's MeCab tokenizer and strip digits/punctuation."""
    result = tokenizer.parse(review).replace("\u3000","").replace("\n","")
    result = re.sub(r'[0123456789０１２３４５６７８９！＠＃＄％＾＆\-|\\＊\“（）＿■×※⇒—●(：〜＋=)／*&^%$#@!~`){}…\[\]\"\'\”:;<>?＜＞？、。・,./『』【】「」→←○]+', "", result)
    return [w for w in result.split(" ") if w != ""]

def _build_gwbowv(news, progress_label, train=False):
    """Return one document vector per article in `news` as a float32 matrix."""
    matrix = np.zeros( (news.size, num_clusters*(num_features)), dtype="float32")
    for counter, review in enumerate(news, start=1):
        matrix[counter-1] = create_cluster_vector_and_gwbowv(prob_wordvecs, _news_to_words(review), word_centroid_map, word_centroid_prob_map, num_features, word_idf_dict, featurenames, num_clusters, train=train)
        if counter % 1000 == 0:
            print (progress_label,counter)
    return matrix

# Accumulators updated by create_cluster_vector_and_gwbowv(..., train=True).
min_no = 0
max_no = 0

# gwbowv is a matrix which contains normalised document vectors.
gwbowv = _build_gwbowv(train["news"], "Train News Covered : ", train=True)
gwbowv_name = "SDV_" + str(num_clusters) + "cluster_" + str(num_features) + "feature_matrix_gmm_sparse.npy"

gwbowv_test = _build_gwbowv(test["news"], "Test News Covered : ")
test_gwbowv_name = "TEST_SDV_" + str(num_clusters) + "cluster_" + str(num_features) + "feature_matrix_gmm_sparse.npy"

print ("Making sparse...")
# Set the threshold percentage for making it sparse. 
percentage = 0.04
# Turn the accumulated sums into per-document averages over the training set.
min_no = min_no*1.0/len(train["news"])
max_no = max_no*1.0/len(train["news"])
print ("Average min: ", min_no)
print ("Average max: ", max_no)
thres = (abs(max_no) + abs(min_no))/2
thres = thres*percentage

# Make values of matrices which are less than threshold to zero.
temp = abs(gwbowv) < thres
gwbowv[temp] = 0

temp = abs(gwbowv_test) < thres
gwbowv_test[temp] = 0

#saving gwbowv train and test matrices
np.save("../japanese-dataset/livedoor-news-corpus/model/"+gwbowv_name, gwbowv)
np.save("../japanese-dataset/livedoor-news-corpus/model/"+test_gwbowv_name, gwbowv_test)

In [None]:
# SCDV document matrix for the complete corpus ("all" version). It reuses the
# averaged min_no / max_no computed by the train/test cell for the threshold.
all_gwbowv_name = "ALL_SDV_" + str(num_clusters) + "cluster_" + str(num_features) + "feature_matrix_gmm_sparse.npy"
gwbowv_all = np.zeros( (all["news"].size, num_clusters*(num_features)), dtype="float32")

counter = 0
for counter, review in enumerate(all["news"], start=1):
    # Tokenise the article and drop digits, punctuation and empty tokens.
    wakati = tokenizer.parse(review)
    wakati = wakati.replace("\u3000","").replace("\n","")
    cleaned = re.sub(r'[0123456789０１２３４５６７８９！＠＃＄％＾＆\-|\\＊\“（）＿■×※⇒—●(：〜＋=)／*&^%$#@!~`){}…\[\]\"\'\”:;<>?＜＞？、。・,./『』【】「」→←○]+', "", wakati)
    words = [token for token in cleaned.split(" ") if token != ""]
    gwbowv_all[counter - 1] = create_cluster_vector_and_gwbowv(prob_wordvecs, words, word_centroid_map, word_centroid_prob_map, num_features, word_idf_dict, featurenames, num_clusters)
    if counter % 1000 == 0:
        print ("All News Covered : ",counter)

print ("Making sparse...")
# Threshold: a fixed percentage of the mean absolute extreme value.
percentage = 0.04
print ("Average min: ", min_no)
print ("Average max: ", max_no)
thres = percentage * ((abs(max_no) + abs(min_no))/2)

# Zero every entry whose magnitude is below the threshold.
temp = abs(gwbowv_all) < thres
gwbowv_all[temp] = 0

np.save("../japanese-dataset/livedoor-news-corpus/model/"+all_gwbowv_name,gwbowv_all)

In [None]:
import pickle
# Reload the saved probability word-cluster vectors; `with` closes the file.
# Only unpickle files written by this notebook (pickle can execute code).
with open("../japanese-dataset/livedoor-news-corpus/model/prob_wordvecs.pkl","rb") as fh:
    prob_wordvecs = pickle.load(fh)

In [None]:
# Display the length of one word's probability word-cluster vector as a sanity check.
len(prob_wordvecs["独身"])

In [None]:
import numpy as np

In [None]:
## plot modified word vector representation
skip=0
limit=241 

# Take the words in positions [skip, limit). Slicing, unlike indexing with
# range(limit), does not raise IndexError when fewer than `limit` words exist.
vocab = list(prob_wordvecs.keys())
tsne_target = [prob_wordvecs[word] for word in vocab[skip:limit]]
X = np.vstack(tsne_target)
 
tsne_model_scdv = TSNE(n_components=2, random_state=0,verbose=2)
np.set_printoptions(suppress=True)
tsne_model_scdv.fit_transform(X)
# Save the fitted estimator; `with` closes (and flushes) the file.
with open("../japanese-dataset/livedoor-news-corpus/model/tsne_scdv.pkl","wb") as fh:
    pickle.dump(tsne_model_scdv, fh)

In [None]:
# Reload the saved train/test SCDV matrices; the file names are rebuilt from the
# same cluster and feature counts that were used when saving.
num_features = 200
#num_clusters = 60
num_clusters = 1
gwbowv_name = "".join(("SDV_", str(num_clusters), "cluster_", str(num_features), "feature_matrix_gmm_sparse.npy"))
test_gwbowv_name = "".join(("TEST_SDV_", str(num_clusters), "cluster_", str(num_features), "feature_matrix_gmm_sparse.npy"))
gwbowv = np.load("../japanese-dataset/livedoor-news-corpus/model/"+gwbowv_name)
gwbowv_test = np.load("../japanese-dataset/livedoor-news-corpus/model/"+test_gwbowv_name)

In [None]:
## test lgb
from sklearn.metrics import classification_report
import lightgbm as lgb

# Train a LightGBM multiclass classifier on the SCDV train matrix and report
# its quality on the test matrix.
strt = time.time()
clf = lgb.LGBMClassifier(objective="multiclass",random_state=40)
clf.fit(gwbowv, train["class"])
Y_true = test["class"]
Y_pred = clf.predict(gwbowv_test)
print ("Report")
print (classification_report(Y_true, Y_pred, digits=6))
print ("Accuracy: ",clf.score(gwbowv_test,Y_true))
print ("Time taken:", time.time() - strt, "\n")

In [None]:
#document vector visualization
## plain word2vec　or fasttext
def plain_word2vec_document_vector(sentence,word2vec_model,num_features):
    """Average the word vectors of a tokenised document.

    Parameters
    ----------
    sentence : sequence of tokens (must support len()).
    word2vec_model : object indexable by token that raises KeyError for tokens
        it does not know.
    num_features : length of the word vectors.

    Returns
    -------
    float32 ndarray of length `num_features`: the sum of the vectors of the known
    tokens divided by the total number of tokens (unknown tokens count in the
    denominator). An empty document yields the zero vector instead of NaNs.
    """
    bag_of_centroids = np.zeros(num_features, dtype="float32")

    # Guard against division by zero for documents with no tokens left.
    if len(sentence) == 0:
        return bag_of_centroids

    for word in sentence:
        try:
            temp = word2vec_model[word]
        except KeyError:
            # Out-of-vocabulary token: skip it. Other errors are not hidden.
            continue
        bag_of_centroids += temp

    return bag_of_centroids / len(sentence)

In [None]:
# Averaged plain-word2vec document vector for every article of the corpus,
# keyed by the article's position (0-based).
num_features = 200
plainDocVec_all = {}

counter = 0
for counter, review in enumerate(all["news"], start=1):
    # Tokenise the article and drop digits, punctuation and empty tokens.
    wakati = tokenizer.parse(review)
    wakati = wakati.replace("\u3000","").replace("\n","")
    cleaned = re.sub(r'[0123456789０１２３４５６７８９！＠＃＄％＾＆\-|\\＊\“（）＿■×※⇒—●(：〜＋=)／*&^%$#@!~`){}…\[\]\"\'\”:;<>?＜＞？、。・,./『』【】「」→←○]+', "", wakati)
    words = [token for token in cleaned.split(" ") if token != ""]
    plainDocVec_all[counter - 1] = plain_word2vec_document_vector(words,word2vec_model,num_features)
    if counter % 1000 == 0:
        print ("All News Covered : ",counter)

In [None]:
# Averaged plain-word2vec document vectors for the train and the test split.
# The two formerly copy-pasted loops share one helper.

def _plain_doc_vectors(news, progress_label):
    """Return {position: averaged word2vec vector} for every article in `news`."""
    doc_vectors = {}
    for position, review in enumerate(news):
        # Tokenise the article and drop digits, punctuation and empty tokens.
        result = tokenizer.parse(review).replace("\u3000","").replace("\n","")
        result = re.sub(r'[0123456789０１２３４５６７８９！＠＃＄％＾＆\-|\\＊\“（）＿■×※⇒—●(：〜＋=)／*&^%$#@!~`){}…\[\]\"\'\”:;<>?＜＞？、。・,./『』【】「」→←○]+', "", result)
        words = [w for w in result.split(" ") if w != ""]
        doc_vectors[position] = plain_word2vec_document_vector(words,word2vec_model,num_features)
        if (position + 1) % 1000 == 0:
            print (progress_label,position + 1)
    return doc_vectors

plainDocVec_train = _plain_doc_vectors(train["news"], "Train News Covered : ")
plainDocVec_test = _plain_doc_vectors(test["news"], "Test News Covered : ")

In [None]:
# Save the three plain-word2vec document-vector dictionaries; each `with` block
# closes (and flushes) its file.
with open("../japanese-dataset/livedoor-news-corpus/model/plaindocvec_all.pkl","wb") as fh:
    pickle.dump(plainDocVec_all, fh)
with open("../japanese-dataset/livedoor-news-corpus/model/plaindocvec_train.pkl","wb") as fh:
    pickle.dump(plainDocVec_train, fh)
with open("../japanese-dataset/livedoor-news-corpus/model/plaindocvec_test.pkl","wb") as fh:
    pickle.dump(plainDocVec_test, fh)

In [None]:
## visualize all document vector
# Stack the plain-word2vec document vectors (in insertion order) into one
# matrix and fit a 2-D t-SNE embedding on it.
emb_tuple = tuple(plainDocVec_all[key] for key in plainDocVec_all)
X = np.vstack(emb_tuple)

np.set_printoptions(suppress=True)
plain_w2v_doc = TSNE(n_components=2, random_state=0, verbose=2)
plain_w2v_doc.fit(X)

In [None]:
#alldoc_plainw2v_tsne

In [None]:
import seaborn as sns

# Put the 2-D t-SNE coordinates of every document next to its category and draw
# a scatter plot coloured by category.
alldoc_plainw2v_tsne = pd.DataFrame({"x": plain_w2v_doc.embedding_[:, 0]})
alldoc_plainw2v_tsne["y"] = plain_w2v_doc.embedding_[:, 1]
alldoc_plainw2v_tsne["class"] = [label for label in all["class"]]

# NOTE(review): newer seaborn releases name the `size` argument `height` — confirm
# against the installed version.
sns.lmplot(x="x", y="y", hue="class", data=alldoc_plainw2v_tsne, fit_reg=False, size=8)

In [None]:
## test lgb plain
from sklearn.metrics import classification_report
import lightgbm as lgb

# Train a LightGBM multiclass classifier on the plain-word2vec document vectors
# and report its quality on the test split.
strt = time.time()
clf = lgb.LGBMClassifier(objective="multiclass",random_state=40)
clf.fit(np.array([vec for vec in plainDocVec_train.values()]), train["class"])
Y_true = test["class"]
Y_pred = clf.predict(np.array([vec for vec in plainDocVec_test.values()]))
print ("Report")
print (classification_report(Y_true, Y_pred, digits=6))
print ("Accuracy: ",clf.score(np.array([vec for vec in plainDocVec_test.values()]),Y_true))
print ("Time taken:", time.time() - strt, "\n")

In [None]:
## visualize all document vector SCDV
# Fit a 2-D t-SNE embedding on the SCDV vectors of the whole corpus.
np.set_printoptions(suppress=True)
X = np.vstack(gwbowv_all)

scdv_doc = TSNE(n_components=2, random_state=0, verbose=2)
scdv_doc.fit(X)

In [None]:
# Put the 2-D t-SNE coordinates of every SCDV document vector next to its
# category, save the table and draw a scatter plot coloured by category.
alldoc_scdv_tsne = pd.DataFrame({"x": scdv_doc.embedding_[:, 0]})
alldoc_scdv_tsne["y"] = scdv_doc.embedding_[:, 1]
alldoc_scdv_tsne["class"] = [label for label in all["class"]]
alldoc_scdv_tsne.to_csv("alldoc_scdv_tsne.csv")
# NOTE(review): newer seaborn releases name the `size` argument `height` — confirm
# against the installed version.
sns.lmplot(x="x", y="y", hue="class", data=alldoc_scdv_tsne, fit_reg=False, size=8)

## FastText

In [None]:
# create txt file for fasttext
def create_txt_fasttext(all):
    """Return the whole corpus as one space-separated token string.

    Every entry of all["news"] is tokenised with the notebook's MeCab tokenizer,
    stripped of digits/punctuation and empty tokens, and all tokens of all
    documents are joined with single spaces.
    """
    def _clean_tokens(doc):
        wakati = tokenizer.parse(doc)
        wakati = wakati.replace("\u3000","").replace("\n","")
        cleaned = re.sub(r'[0123456789０１２３４５６７８９！＠＃＄％＾＆\-|\\＊\“（）＿■×※⇒—●(：〜＋=)／*&^%$#@!~`){}…\[\]\"\'\”:;<>?＜＞？、。・,./『』【】「」→←○]+', "", wakati)
        return [token for token in cleaned.split(" ") if token != ""]

    per_document = [" ".join(_clean_tokens(doc)) for doc in all["news"]]
    return " ".join(per_document)

In [None]:
# Write the tokenised corpus to the text file fastText is trained on; `with`
# closes the file even if the write fails.
all_words = create_txt_fasttext(all)
with codecs.open("../japanese-dataset/livedoor-news-corpus/for-fasttext/corpus.txt", 'w', 'utf-8') as f:
    f.write(all_words)

In [None]:
import fasttext
# Train an unsupervised skip-gram fastText model on the corpus text file and save
# it as "model.bin" in the current working directory.
# NOTE(review): no vector dimension is passed, so it comes from fastText's
# default; document vectors built from this model must use the same size.
fasttext_model = fasttext.train_unsupervised('../japanese-dataset/livedoor-news-corpus/for-fasttext/corpus.txt', model='skipgram')
fasttext_model.save_model("model.bin")

In [None]:
# Averaged fastText document vectors for the train and the test split.
# The two formerly copy-pasted loops share one helper.

# Ask the model for its vector size instead of hard-coding 100, so the
# accumulator always matches the vectors the model returns.
num_features_fasttext = fasttext_model.get_dimension()

def _fasttext_doc_vectors(news, progress_label):
    """Return {position: averaged fastText vector} for every article in `news`."""
    doc_vectors = {}
    for position, review in enumerate(news):
        # Tokenise the article and drop digits, punctuation and empty tokens.
        result = tokenizer.parse(review).replace("\u3000","").replace("\n","")
        result = re.sub(r'[0123456789０１２３４５６７８９！＠＃＄％＾＆\-|\\＊\“（）＿■×※⇒—●(：〜＋=)／*&^%$#@!~`){}…\[\]\"\'\”:;<>?＜＞？、。・,./『』【】「」→←○]+', "", result)
        words = [w for w in result.split(" ") if w != ""]
        doc_vectors[position] = plain_word2vec_document_vector(words,fasttext_model,num_features_fasttext)
        if (position + 1) % 1000 == 0:
            print (progress_label,position + 1)
    return doc_vectors

fasttextDocVec_train = _fasttext_doc_vectors(train["news"], "Train News Covered : ")
fasttextDocVec_test = _fasttext_doc_vectors(test["news"], "Test News Covered : ")

In [None]:
#fasttext
# Train a LightGBM multiclass classifier on the fastText document vectors and
# report its quality on the test split.
strt = time.time()
clf = lgb.LGBMClassifier(objective="multiclass",random_state=40)
clf.fit(np.array([vec for vec in fasttextDocVec_train.values()]), train["class"])
Y_true = test["class"]
Y_pred = clf.predict(np.array([vec for vec in fasttextDocVec_test.values()]))
print ("Report")
print (classification_report(Y_true, Y_pred, digits=6))
print ("Accuracy: ",clf.score(np.array([vec for vec in fasttextDocVec_test.values()]),Y_true))
print ("Time taken:", time.time() - strt, "\n")

## Doc2Vec

In [None]:
from gensim import models
from gensim.models.doc2vec import LabeledSentence

In [None]:
def doc_to_sentence(doc, name):
    """Tokenise one document and wrap it as a LabeledSentence tagged with `name`.

    The text is split by the notebook's MeCab tokenizer; ideographic spaces,
    newlines, digits, punctuation and empty tokens are removed.
    """
    wakati = tokenizer.parse(doc)
    for unwanted in ("\u3000", "\n"):
        wakati = wakati.replace(unwanted, "")
    cleaned = re.sub(r'[0123456789０１２３４５６７８９！＠＃＄％＾＆\-|\\＊\“（）＿■×※⇒—●(：〜＋=)／*&^%$#@!~`){}…\[\]\"\'\”:;<>?＜＞？、。・,./『』【】「」→←○]+', "", wakati)
    tokens = [token for token in cleaned.split(" ") if token != ""]
    return LabeledSentence(words=tokens, tags=[name])

def corpus_to_sentences(df):
    """Convert every article of df["news"] into a tagged sentence.

    Each document is tagged with its index label in `df`. For a frame with the
    default 0..n-1 index this equals the previous positional tag; for any other
    index (e.g. a shuffled subset) the former `docs[i]` label lookup with
    `range(len(docs))` would fail or pick the wrong rows. Tagging by label also
    matches the later `docvecs[i] for i in <frame>.index` lookups.

    Returns a list with one element per row, in row order.
    """
    return [doc_to_sentence(doc, label) for label, doc in df["news"].items()]

In [None]:
# Tagged documents of the whole corpus for Doc2Vec. This rebinds `sentences`,
# which earlier held the token lists used for Word2Vec training.
sentences = corpus_to_sentences(all)

In [None]:
# Create the Doc2Vec model (dm=0, 300-dimensional vectors, window 15, every word
# kept via min_count=1) and build its vocabulary from the tagged corpus.
# alpha and min_alpha start out equal, i.e. the learning rate is constant within
# one train() call; the training cell adjusts it by hand between calls.
doc2_model = models.Doc2Vec(dm=0, size=300, window=15, alpha=.025,
        min_alpha=.025, min_count=1, sample=1e-6)
doc2_model.build_vocab(sentences)

In [None]:
# Run ten outer training rounds, lowering the learning rate by 0.002 after each.
# NOTE(review): every round trains for `word2vec_model.iter` epochs — a value
# taken from the Word2Vec model, not from doc2_model — so the total number of
# passes is 10 times that value; confirm this is intended.
for epoch in tqdm(range(10)):
    doc2_model.train(sentences,total_examples=len(all),epochs=word2vec_model.iter)
    doc2_model.alpha -= 0.002  # decrease the learning rate
    doc2_model.min_alpha = doc2_model.alpha  # fix the learning rate, no decay
# Persist the trained Doc2Vec model.
doc2_model.save("../japanese-dataset/livedoor-news-corpus/model/doc2vec.model")

In [None]:
# Document vectors of the training rows, looked up by each row's index label.
doc2_train = [doc2_model.docvecs[label] for label in train.index]

In [None]:
# Document vectors of the test rows, looked up by each row's index label.
doc2_test = [doc2_model.docvecs[label] for label in test.index]

In [None]:
## test lgb doc2vec
# Train a LightGBM multiclass classifier on the Doc2Vec document vectors and
# report its quality on the test split.
strt = time.time()
clf = lgb.LGBMClassifier(objective="multiclass",random_state=40)
clf.fit(np.array(doc2_train), train["class"])
Y_true = test["class"]
Y_pred = clf.predict(np.array(doc2_test))
print ("Report")
print (classification_report(Y_true, Y_pred, digits=6))
print ("Accuracy: ",clf.score(np.array(doc2_test),Y_true))
print ("Time taken:", time.time() - strt, "\n")