In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import Word2Vec



In [2]:
# Extract the key words of each article in X
def key_extract(X,Y,weights,spilt_factor=0.1,print_detail = 0):
    """Keep, for every article, the words whose weight exceeds a threshold.

    Parameters
    ----------
    X : sequence of sequences
        One entry per article. Every word must be convertible with ``int()``
        because it is used as the first index into ``weights``.
    Y : sequence
        ``Y[i]`` is used as the second index into ``weights`` for article ``i``.
    weights : indexable
        Looked up as ``weights[int(word)][Y[i]]``.
    spilt_factor : float, optional
        Threshold; a word is kept only when its weight is strictly greater.
        (The parameter name is kept as-is so keyword callers keep working.)
    print_detail : int, optional
        Statistics and the kept words are printed for the first
        ``print_detail`` articles.

    Returns
    -------
    list of list
        For each article, the kept words in their original order, with
        repetitions preserved.
    """
    ret_X = []                          # kept words of every article
    for i, words in enumerate(X):
        # Repetitions are kept on purpose: a word occurring twice is stored twice.
        temp_X = [word for word in words if weights[int(word)][Y[i]] > spilt_factor]
        if i<print_detail:
            # An article with no words would divide by zero here; report 0.0 instead.
            percentage = len(temp_X)*1./len(words) if len(words) else 0.0
            print('Line'+str(i)+': total word: '+str(len(words))+' key word(repition): '+str(len(temp_X))+' percentage: '+str(percentage))
            print(temp_X)
        ret_X.append(temp_X)
    return ret_X

In [3]:
def word_count(X):
    """Collect the distinct words occurring in any article of ``X``.

    Parameters
    ----------
    X : iterable of iterables
        One inner iterable of (hashable) words per article.

    Returns
    -------
    tuple
        ``(number_of_distinct_words, set_of_distinct_words)``.
    """
    vocabulary = {word for article in X for word in article}
    return len(vocabulary),vocabulary

In [4]:
def article_count(X,key_word,print_detail = 0):
    """Restrict every article to the given key words and count uncovered articles.

    Parameters
    ----------
    X : sequence of sequences
        One entry per article, each a sequence of words.
    key_word : container
        Words to keep; only membership (``word in key_word``) is used.
    print_detail : int, optional
        Per-article statistics are printed for the first ``print_detail``
        articles.

    Returns
    -------
    tuple
        ``(no_word_article, X_key_word)`` where ``no_word_article`` is the
        number of articles containing none of the key words and
        ``X_key_word`` holds, per article, the key words found (original
        order, repetitions preserved).
    """
    X_key_word = []        # key words (with repetitions) of every article
    no_word_article = 0    # number of articles without any key word
    for i, article in enumerate(X):
        x_key_word = [word for word in article if word in key_word]
        dif_word = set(x_key_word)    # distinct key words of this article
        if not dif_word:
            no_word_article +=1
        X_key_word.append(x_key_word)
        if i<print_detail:
            # An article with no words would divide by zero here; report 0.0 instead.
            total = len(article)
            existing_ratio = len(x_key_word)*1./total if total else 0.0
            different_ratio = len(dif_word)*1./total if total else 0.0
            print('line '+str(i)+' existing word: '+str(len(x_key_word))+' '+str(existing_ratio)+' different word: '+str(len(dif_word))+' '+str(different_ratio))
    return no_word_article,X_key_word

In [5]:
def choose_weights(X,Y,weights1,weights1_split_factor,weights2,weights2_split_factor):
    """Run the two-stage key-word filter once and summarise the outcome.

    ``X`` is first filtered with ``weights1`` / ``weights1_split_factor``; the
    result is filtered again with ``weights2`` / ``weights2_split_factor``.

    Returns
    -------
    dict
        ``num_of_key_word_after_weights1``: distinct words left after stage one.
        ``num_of_key_word_after_weights2``: distinct words left after stage two.
        ``no_word_article``: articles of the original ``X`` that contain none
        of the stage-two key words.
    """
    # Stage one: filter the raw articles with weights1.
    stage1_X = key_extract(X,Y,weights1,weights1_split_factor)
    stage1_size,_ = word_count(stage1_X)

    # Stage two: filter the stage-one result with weights2.
    stage2_X = key_extract(stage1_X,Y,weights2,weights2_split_factor)
    stage2_size,key_word_set = word_count(stage2_X)

    # Coverage is measured against the original articles, not the filtered ones.
    uncovered,_ = article_count(X,key_word_set,print_detail = 0)

    return {
        'num_of_key_word_after_weights1': stage1_size,
        'num_of_key_word_after_weights2': stage2_size,
        'no_word_article': uncovered,
    }

In [10]:
#generator version:
def choose_weights_generator(X,Y,weights1,weights1_split_factor,weights2):
    """Yield one summary per ``weights2`` threshold for a fixed ``weights1`` threshold.

    The ``weights1`` filter is computed once and reused; the ``weights2``
    threshold then sweeps ``0.0, 0.1, ..., 1.0`` (``j/10.`` for ``j`` in
    ``range(11)``).

    Yields
    ------
    dict
        A new dict on every step with the keys
        ``num_of_key_word_after_weights1``, ``num_of_key_word_after_weights2``
        and ``no_word_article``. Because each step gets its own dict, the
        results can safely be collected, e.g. ``list(choose_weights_generator(...))``.
    """
    weights1_X = key_extract(X,Y,weights1,weights1_split_factor)    # stage one, shared by every step
    num_after_weights1,_ = word_count(weights1_X)
    for j in range(11):
        weights2_X = key_extract(weights1_X,Y,weights2,j/10.)       # stage two at threshold j/10
        num_after_weights2,key_word_set = word_count(weights2_X)

        # Coverage is measured against the original articles.
        no_word_article,_ = article_count(X,key_word_set,print_detail = 0)

        # Yield a fresh dict: reusing one mutable dict would make every
        # previously yielded result change when the next step is computed.
        yield {
            'num_of_key_word_after_weights1': num_after_weights1,
            'num_of_key_word_after_weights2': num_after_weights2,
            'no_word_article': no_word_article,
        }

In [11]:
# Load the training table and the two precomputed weight arrays from ../data.
train_set = pd.read_csv('./../data/train_set.csv')
# Both arrays are later passed to key_extract, which indexes them as
# weights[int(word)][label].
# NOTE(review): 'word_weigt.npy' is presumably the file's real name on disk — confirm before renaming.
weights1 = np.load('./../data/word_weigt.npy')
weights2 = np.load('./../data/word_article.npy')

In [12]:
# Turn each 'word_seg' string into a list of whitespace-separated tokens;
# Y holds the matching 'class' value of every row.
pre_X = train_set['word_seg'].values
Y = train_set['class'].values
X = [x.split() for x in pre_X]

In [8]:
# Stage one on the full data: keep the words whose weights1 entry for the
# article's class is greater than 0.4.
ret_orig_X = key_extract(X,Y,weights1,0.4)
# Number of articles (one list per article is returned).
print(len(ret_orig_X))
# Number of distinct words that survived the filter.
word_num,_ = word_count(ret_orig_X)
print(word_num)

102277
761917


In [9]:
# Stage two: filter the stage-one result again, now with weights2 and a 0.1
# threshold. Note that ret_orig_X is overwritten with the stage-two result.
ret_orig_X = key_extract(ret_orig_X,Y,weights2,0.1)
# Number of articles (unchanged by filtering).
print(len(ret_orig_X))
# Distinct words left after both filters; key_word is the set itself.
word_num,key_word = word_count(ret_orig_X)
print(word_num)

102277
2041


In [10]:
# Same two-stage filter (0.4 for weights1, 0.1 for weights2) through the
# helper, which also reports how many articles contain no key word.
print(choose_weights(X,Y,weights1,0.4,weights2,0.1))

{'num_of_key_word_after_weights1': 761917, 'num_of_key_word_after_weights2': 2041, 'no_word_article': 2285}


In [14]:
# Grid search over both thresholds (0.0 .. 1.0 in steps of 0.1).
# choose_weights is the function that accepts both split factors and returns
# the summary dict directly; choose_weights_generator takes only the weights1
# threshold, so calling it with six arguments raises TypeError.
# Stage one is recomputed for every (i, j) pair here, which is slow.
for i in range(11):
    for j in range(11):
        log = choose_weights(X,Y,weights1,i/10.,weights2,j/10.)
        print('('+str(i/10.)+','+str(j/10.)+'):',end = '')
        print(log)

(0.0,0.0):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 875129, 'no_word_article': 0}
(0.0,0.1):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 5637, 'no_word_article': 0}
(0.0,0.2):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 2691, 'no_word_article': 0}
(0.0,0.3):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 1642, 'no_word_article': 0}
(0.0,0.4):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 1126, 'no_word_article': 0}
(0.0,0.5):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 821, 'no_word_article': 0}


KeyboardInterrupt: 

In [15]:
# Grid search over both thresholds using the generator: for each weights1
# threshold the generator yields one summary per weights2 threshold, in the
# order 0.0, 0.1, ..., 1.0, which j tracks for the printed label.
for i in range(11):
    weights1_threshold = i/10.
    j = 0
    for log in choose_weights_generator(X,Y,weights1,weights1_threshold,weights2):
        print('('+str(weights1_threshold)+','+str(j/10.)+'):'+str(log))
        j+=1

(0.0,0.0):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 875129, 'no_word_article': 0}
(0.0,0.1):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 5637, 'no_word_article': 0}
(0.0,0.2):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 2691, 'no_word_article': 0}
(0.0,0.3):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 1642, 'no_word_article': 0}
(0.0,0.4):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 1126, 'no_word_article': 0}
(0.0,0.5):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 821, 'no_word_article': 0}
(0.0,0.6):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 632, 'no_word_article': 0}
(0.0,0.7):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weights2': 505, 'no_word_article': 0}
(0.0,0.8):{'num_of_key_word_after_weights1': 875129, 'num_of_key_word_after_weigh

(0.6,0.5):{'num_of_key_word_after_weights1': 643767, 'num_of_key_word_after_weights2': 110, 'no_word_article': 29648}
(0.6,0.6):{'num_of_key_word_after_weights1': 643767, 'num_of_key_word_after_weights2': 78, 'no_word_article': 33992}
(0.6,0.7):{'num_of_key_word_after_weights1': 643767, 'num_of_key_word_after_weights2': 67, 'no_word_article': 35581}
(0.6,0.8):{'num_of_key_word_after_weights1': 643767, 'num_of_key_word_after_weights2': 55, 'no_word_article': 38103}
(0.6,0.9):{'num_of_key_word_after_weights1': 643767, 'num_of_key_word_after_weights2': 44, 'no_word_article': 42556}
(0.6,1.0):{'num_of_key_word_after_weights1': 643767, 'num_of_key_word_after_weights2': 38, 'no_word_article': 46029}
(0.7,0.0):{'num_of_key_word_after_weights1': 603205, 'num_of_key_word_after_weights2': 603205, 'no_word_article': 3937}
(0.7,0.1):{'num_of_key_word_after_weights1': 603205, 'num_of_key_word_after_weights2': 750, 'no_word_article': 24982}
(0.7,0.2):{'num_of_key_word_after_weights1': 603205, 'num_o