In [230]:
def is_valid_char(c):
    ## modify it to be faster
    # h = '{:06X}'.format(ord(c))
    # if ord('a') <= ord(c.lower()) and ord(c) <= ord('z'):  
    oc = ord(c)
    if 97 <= oc and oc <= 122:
        # latin character
        # print(c, 'is latin lower')
        return True
    if 65 <= oc and oc <= 90:
        # latin character
        # print(c, 'is latin upper')
        return True
    # if h.startswith('000E'):
    if (oc & 0xFFFF00) == 0x000E00:
        # is thai character
        return True
    return False

def valid_words(word):
    result = []
    buffer = None
    for c in word:
        if is_valid_char(c):
            if buffer is None:
                buffer = c
                result.append(buffer)
            else:
                buffer += c
                result[-1] = buffer
        else:
            if buffer is not None:
                buffer = None
    return result

ss = ['sinthorn summary ', '11', '/', '10', '/', '2560', ' ', ':', ' ', '‡∏ï‡πà‡∏≤‡∏á', '‡∏ä‡∏≤‡∏ï‡∏¥', '‡∏ã‡∏∑‡πâ‡∏≠', '‡∏™‡∏∏‡∏ó‡∏ò‡∏¥', ' ', '1', ',', '600', '.', '57', ' ', '‡∏•‡∏ö.']
# valid_words('‡∏≠‡∏≤‡∏Å‡∏≤‡∏® üò≠_üò≠_üò≠ ‡∏£‡πâ‡∏≠‡∏ô_‡∏Å‡∏±‡∏ö ADVANC ad')
valid_words(' '.join(ss))

['sinthorn', 'summary', '‡∏ï‡πà‡∏≤‡∏á', '‡∏ä‡∏≤‡∏ï‡∏¥', '‡∏ã‡∏∑‡πâ‡∏≠', '‡∏™‡∏∏‡∏ó‡∏ò‡∏¥', '‡∏•‡∏ö']

In [246]:
def file_sentences(path):
    import ujson
    
    
    with open(path) as f:
        try:
            topic = ujson.loads(f.read())
            attributes = [topic['message'], topic['title']] + [comment['text'] for comment in topic['comments']]
            for sentences in attributes:
                for sentence in sentences:
                    sentence = [word.lower() for word in sentence]
                    sentence2 = []
                    for word in sentence:
                        words = valid_words(word)
                        for valid_word in words:
                            sentence2.append(valid_word)

                    if len(sentence2) > 0:
                        yield(sentence2)
        except ValueError as e:
            pass

def corpus_sentences(size_limit = 10000, seed=None):
    import glob
    import itertools
    import random
    if seed is not None:
        random.seed(seed)
    paths = glob.glob('tokenized/*/*/*/*.json')
    random.shuffle(paths)
    sample_paths = paths
    total_size = 0
    for path in sample_paths:
        for sentence in file_sentences(path):
            total_size += sum([len(word) for word in sentence])
            yield(sentence)
        if total_size > size_limit:
            break
            

for sentence in corpus_sentences(seed=10):
    print(sentence)

['‡∏ï‡∏≠‡∏ô', '‡∏•‡∏á', '‡∏à‡∏∞', '‡∏•‡∏á', '‡πÅ‡∏ö‡∏ö', '‡∏•‡∏¥‡∏ü‡∏ï‡πå‡πÑ‡∏°‡πä‡πÄ‡∏ô‡∏µ‡πà‡∏¢']
['paf', '‡∏°‡∏±‡∏ô', '‡πÄ‡∏•‡πà‡∏ô', '‡∏≠‡∏∞‡πÑ‡∏£', '‡∏Å‡∏±‡∏ô']
['‡πÑ‡∏°‡πà', '‡∏£‡∏π‡πâ', '‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô', '‡∏Å‡∏±‡∏ô', '‡∏Ñ‡∏£‡∏±‡∏ö']
['‡∏´‡∏∏‡πâ‡∏ô', '‡∏Ç‡∏≠‡∏á', '‡∏Ñ‡∏ô', '‡∏´‡∏ß‡∏±‡∏á', '‡∏£‡∏ß‡∏¢', '‡∏Å‡πà‡∏≠‡∏ô', '‡∏õ‡∏µ', '‡πÉ‡∏´‡∏°‡πà', '‡∏•‡∏∞‡∏°‡∏±‡πà‡∏á', '‡∏Ñ‡∏£‡∏±‡∏ö']
['paf', '‡∏ö‡∏ß‡∏Å', '‡∏™‡∏ß‡∏ô', '‡∏ï‡∏•‡∏≤‡∏î‡∏Ø', '‡∏Å‡∏ß‡πà‡∏≤', '‡πÇ‡∏ö‡∏£‡∏Å', '‡∏Ø', '‡πÅ‡∏ô‡∏∞', '‡∏Ç‡∏≤‡∏¢', '‡∏ó‡∏µ‡πà', '‡πÅ‡∏ô‡∏ß‡∏ï‡πâ‡∏≤‡∏ô', '‡∏ö']
['‡∏™‡∏≥‡∏ô‡∏±‡∏Å‡∏Ç‡πà‡∏≤‡∏ß', '‡∏≠‡∏µ‡πÑ‡∏ü‡πÅ‡∏ô‡∏ô‡∏ã‡πå', '‡πÑ‡∏ó‡∏¢', '‡∏ò', '‡∏Ñ', '‡∏ô']
['‡∏ú‡∏π‡πâ', '‡∏™‡∏∑‡πà‡∏≠', '‡∏Ç‡πà‡∏≤‡∏ß', '‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô', '‡∏Ñ‡∏ß‡∏≤‡∏°', '‡πÄ‡∏Ñ‡∏•‡∏∑‡πà‡∏≠‡∏ô‡πÑ‡∏´‡∏ß', '‡∏£‡∏≤‡∏Ñ‡∏≤', '‡∏´‡∏∏‡πâ‡∏ô', '‡∏ö‡∏£‡∏¥‡∏©‡∏±‡∏ó', '‡πÅ‡∏û‡∏ô‡πÄ‡∏≠‡πÄ‡∏ã‡∏µ‡∏¢‡∏ü‡∏∏‡∏ï‡πÅ‡∏ß‡∏£‡πå', '‡∏à‡∏≥‡∏Å‡∏±‡∏î', '‡∏°‡∏´‡∏≤‡∏ä‡∏ô', '‡∏´‡∏£‡∏∑‡∏≠', 'paf', '‡∏õ‡∏£‡∏±‡∏ö', '‡∏ï‡∏±‡∏ß', '‡πÄ‡∏û‡∏¥‡πà‡∏°', '‡∏Ç‡∏∂‡πâ‡∏ô'

In [6]:
import random
import sys
seed = random.randrange(sys.maxsize)
size_limit = 50000000

In [7]:
%%time
with open('corpus.cache.txt','w') as f:
    for sentence in corpus_sentences(seed=seed, size_limit=size_limit):
        f.write(' '.join(sentence))
        f.write("\n")

CPU times: user 1min 17s, sys: 1.62 s, total: 1min 18s
Wall time: 2min 7s


In [8]:
%%time

def cached_sentences():
    with open('corpus.cache.txt') as f:
        for line in f:
            sentence = line.split()
            if len(sentence) > 0:
                yield(sentence)
                
for s in cached_sentences():
    pass

CPU times: user 1.76 s, sys: 96 ms, total: 1.85 s
Wall time: 13.8 s


In [9]:
%%time
import gensim
# biphrases = gensim.models.phrases.Phrases(corpus_sentences(size_limit=size_limit, seed=seed))
biphrases = gensim.models.phrases.Phrases(cached_sentences())
bigram_transformer = gensim.models.phrases.Phraser(biphrases)
biphrases = None

triphrases = gensim.models.phrases.Phrases(bigram_transformer[cached_sentences()])
trigram_transformer = gensim.models.phrases.Phraser(triphrases)
triphrases = None

CPU times: user 2min 28s, sys: 496 ms, total: 2min 29s
Wall time: 2min 30s


In [10]:
bigram_transformer.save('bigram.transformer.bin')
trigram_transformer.save('trigram.transformer.bin')

In [None]:
# calculation time log
# trigram Wall time: 4min 12s
# word vector Wall time: 3min 38s

In [171]:
def trisentences():
    return trigram_transformer[bigram_transformer[cached_sentences()]]

In [172]:
%%time
with open('corpus.3gram.cache.txt','w') as f:
    for sentence in trisentences():
        f.write(' '.join(sentence))
        f.write("\n")

CPU times: user 1min 23s, sys: 448 ms, total: 1min 23s
Wall time: 1min 25s


In [258]:
def tri_cached_sentences():
    with open('corpus.3gram.cache.txt') as f:
        for line in f:
            sentence = line.split()
            if len(sentence) > 0:
                yield(sentence)
                
for s in tri_cached_sentences():
    pass

In [259]:
# make it iterator, not just generator
class TriCachedCorpus(object):
    def __iter__(self):
        for sentence in tri_cached_sentences():
            yield(sentence)

In [328]:
bigram_transformer = None
trigram_transformer = None

In [260]:
import itertools
for s in itertools.islice(TriCachedCorpus(), 100):
    print(s)

['eva_air', '‡∏´‡∏±‡∏Å', '‡∏à‡∏≤‡∏Å', '‡∏ö‡∏±‡∏ï‡∏£_‡πÄ‡∏î‡∏ö‡∏¥‡∏ï', '‡πÑ‡∏î‡πâ', '‡πÑ‡∏´‡∏°', '‡∏Ñ‡πà‡∏∞']
['‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô', '‡πÉ‡∏ä‡πâ', '‡∏ö‡∏±‡∏ï‡∏£', '‡∏Ç‡∏≠‡∏á', '‡∏Å‡∏™‡∏¥‡∏Å‡∏£', '‡∏°‡∏µ', '‡∏™‡πà‡∏ß‡∏ô', '‡∏•‡∏î', '‡∏î‡πâ‡∏ß‡∏¢', '‡∏Ñ‡∏£‡∏±‡∏ö']
['‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ', '‡∏Ñ‡∏£‡∏±‡∏ö', '‡∏ú‡∏≠‡∏á_‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏ô_‡πÄ‡∏´‡∏•‡πà‡∏≤', '‡∏ô‡∏±‡∏Å_‡∏£‡∏ö_‡∏Å‡∏≠‡∏á_‡∏ó‡∏∏‡∏ô', '‡πÅ‡∏•‡∏∞', '‡∏û‡∏£‡∏≤‡∏ô_‡∏´‡∏∏‡πâ‡∏ô_‡∏ó‡∏∏‡∏Å_‡∏ó‡πà‡∏≤‡∏ô', '‡πÄ‡∏°‡∏∑‡πà‡∏≠_‡∏ß‡∏≤‡∏ô', '‡∏õ‡∏π‡πà‡πÄ‡∏ã‡πá‡∏ï', '‡∏•‡∏á‡πÅ‡∏î‡∏á', '‡∏î‡∏±‡∏ä‡∏ô‡∏µ', '‡∏õ‡∏¥‡∏î', '‡∏ó‡∏µ‡πà', '‡∏à‡∏∏‡∏î', '‡∏´‡∏£‡∏∑‡∏≠', '‡πÄ‡∏´‡∏ï‡∏∏', '‡∏à‡∏≤‡∏Å', '‡∏Ñ‡∏ß‡∏≤‡∏°', '‡πÑ‡∏°‡πà', '‡πÅ‡∏ô‡πà‡∏ô‡∏≠‡∏ô', '‡∏î‡πâ‡∏≤‡∏ô', '‡∏ô‡πÇ‡∏¢‡∏ö‡∏≤‡∏¢', '‡∏á‡∏ö_‡∏õ‡∏£‡∏∞‡∏°‡∏≤‡∏ì', '‡∏ù‡∏±‡πà‡∏á_‡∏™‡∏´‡∏£‡∏±‡∏ê', '‡πÇ‡∏ô‡πà‡∏ô', '‡∏Å‡∏£‡∏∞‡∏ó‡∏ö', '‡πÑ‡∏õ', '‡∏ó‡∏±‡πà‡∏ß_‡πÇ‡∏•‡∏Å', '‡∏ó‡∏≥', '‡πÉ‡∏´‡πâ', '‡πÇ‡∏õ‡∏£‡πÄ‡∏à‡∏Ñ_ddt', '‡πÑ‡∏î‡πâ', '‡∏™‡πà‡∏ß‡∏ô', '‡∏•‡∏î', '‡πÑ‡∏õ', '‡∏û‡∏≠‡∏™‡∏°‡∏Ñ‡∏ß‡∏£', '‡∏î‡∏µ‡πÉ‡∏à', '‡∏°‡∏±‡πä‡

In [270]:
%%time
import datetime
print(datetime.datetime.now())
wvmodel = gensim.models.Word2Vec(TriCachedCorpus(), size=100, window=10, sg=1, iter=10)

wvmodel.train(TriCachedCorpus(), total_examples=wvmodel.corpus_count, epochs=wvmodel.epochs)

2018-08-15 16:23:50.819916
CPU times: user 30min 19s, sys: 3.57 s, total: 30min 22s
Wall time: 30min 42s


In [262]:
pairs = [
    ('‡∏Ñ‡∏±‡∏ó', 'cut'),
    ('‡∏î‡∏≠‡∏¢', '‡∏ï‡∏¥‡∏î_‡∏î‡∏≠‡∏¢'),
    ('‡∏î‡∏≠‡∏¢', '‡∏ñ‡∏±‡∏ß'),
    ('‡∏Ç‡∏≤‡∏î‡∏ó‡∏∏‡∏ô', '‡∏î‡∏≠‡∏¢'),
    ('‡∏Ç‡∏≤‡∏î‡∏ó‡∏∏‡∏ô', '‡∏Å‡∏≥‡πÑ‡∏£'),
    ('floor', '‡∏î‡∏≠‡∏¢'),
    ('floor', '‡∏Ç‡∏≤‡∏î‡∏ó‡∏∏‡∏ô'),
    ('floor', '‡∏Ñ‡∏±‡∏ó'),
    ('set', '‡∏õ‡∏π‡πà'),
]

for pair in pairs:
    sim = wvmodel.wv.similarity(pair[0], pair[1])
    print('{:5.2f}% {} {}'.format(sim * 100, pair[0], pair[1]))

76.60% ‡∏Ñ‡∏±‡∏ó cut
81.96% ‡∏î‡∏≠‡∏¢ ‡∏ï‡∏¥‡∏î_‡∏î‡∏≠‡∏¢
73.12% ‡∏î‡∏≠‡∏¢ ‡∏ñ‡∏±‡∏ß
40.12% ‡∏Ç‡∏≤‡∏î‡∏ó‡∏∏‡∏ô ‡∏î‡∏≠‡∏¢
53.43% ‡∏Ç‡∏≤‡∏î‡∏ó‡∏∏‡∏ô ‡∏Å‡∏≥‡πÑ‡∏£
46.96% floor ‡∏î‡∏≠‡∏¢
37.01% floor ‡∏Ç‡∏≤‡∏î‡∏ó‡∏∏‡∏ô
47.82% floor ‡∏Ñ‡∏±‡∏ó
55.03% set ‡∏õ‡∏π‡πà


  if np.issubdtype(vec.dtype, np.int):


In [359]:
wvmodel.wv.most_similar('‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß')

  if np.issubdtype(vec.dtype, np.int):


[('‡πÅ‡∏î‡∏á', 0.9008665680885315),
 ('‡∏û‡∏µ‡πà‡∏´‡∏°‡∏≤‡∏Å', 0.7620635628700256),
 ('‡πÄ‡∏î‡πâ‡∏á_‡∏î‡∏∂‡πã‡∏á', 0.7315344214439392),
 ('‡∏û‡∏µ‡πàwe_love_thailand', 0.7221149206161499),
 ('‡πÅ‡∏î‡∏á_‡πÅ‡∏à‡πã', 0.7144074440002441),
 ('‡∏•‡∏á‡πÅ‡∏î‡∏á', 0.7134711146354675),
 ('‡πÄ‡∏≠‡πÄ‡∏ä‡∏µ‡∏¢‡πÅ‡∏î‡∏á', 0.7097564339637756),
 ('‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß_‡∏≠‡πà‡∏≠‡∏ô', 0.707740306854248),
 ('‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß_‡πÄ‡∏Ç‡πâ‡∏°', 0.7052264213562012),
 ('‡∏≠‡∏≠‡∏Å‡πÅ‡∏î‡∏á', 0.6860056519508362)]

In [264]:
wvmodel.wv.most_similar('‡∏ï‡∏¥‡∏î‡∏î‡∏≠‡∏¢')

  if np.issubdtype(vec.dtype, np.int):


[('‡∏ï‡∏¥‡∏î_‡∏î‡∏≠‡∏¢', 0.8950231075286865),
 ('‡∏î‡∏≠‡∏¢', 0.8065218925476074),
 ('‡∏Ç‡∏≤‡∏¢_‡∏´‡∏°‡∏π', 0.782018780708313),
 ('‡∏Ñ‡∏±‡∏ó', 0.7539661526679993),
 ('‡∏ñ‡∏±‡∏ß', 0.749467134475708),
 ('‡∏ó‡∏ô_‡∏ñ‡∏∑‡∏≠', 0.7403892278671265),
 ('vi_‡∏à‡∏≥‡πÄ‡∏õ‡πá‡∏ô', 0.7181094884872437),
 ('‡∏´‡∏°‡∏π_‡∏´‡∏Å', 0.7087310552597046),
 ('‡πÄ‡∏°‡πà‡∏≤', 0.7008231282234192),
 ('‡∏î‡∏≠‡∏¢‡∏´‡∏ô‡∏±‡∏Å', 0.6987157464027405)]

In [265]:
wvmodel.wv.most_similar('‡∏õ‡∏π‡πà')

  if np.issubdtype(vec.dtype, np.int):


[('dow_‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß', 0.6977851390838623),
 ('estprev_close', 0.691551685333252),
 ('‡∏î‡∏≤‡∏ß‡πÇ‡∏à‡∏£', 0.6877351999282837),
 ('‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß_‡πÅ‡∏™‡∏ö_‡∏ï‡∏≤', 0.671231746673584),
 ('‡∏õ‡∏π‡πã', 0.6710683107376099),
 ('p_m_previous', 0.6679481267929077),
 ('‡∏õ‡∏π‡πà_set', 0.6670135259628296),
 ('close_day_low_day', 0.6635088920593262),
 ('‡πÅ‡∏î‡∏á_‡πÅ‡∏à‡πã', 0.6617985367774963),
 ('‡πÄ‡∏ô‡∏µ‡πà‡∏¢‡∏¢‡∏¢‡∏¢‡∏¢', 0.65919029712677)]

In [268]:
wvmodel.wv.most_similar('‡∏Å‡∏≥‡πÑ‡∏£')

  if np.issubdtype(vec.dtype, np.int):


[('‡∏Ç‡∏≤‡∏î_‡∏ó‡∏∏‡∏ô', 0.7407156825065613),
 ('q_q_y_y', 0.6918238401412964),
 ('‡∏™‡∏∏‡∏ó‡∏ò‡∏¥‡∏ô‡∏¥‡∏ß‡πÑ‡∏Æ', 0.6777358651161194),
 ('‡∏´‡∏•‡∏±‡∏á‡∏Ñ‡πà‡∏≤', 0.6692806482315063),
 ('‡∏™‡∏ï‡πä‡∏≠‡∏Å‡∏ô‡πâ‡∏≥‡∏°‡∏±‡∏ô', 0.6682887077331543),
 ('‡∏ú‡∏•_‡∏õ‡∏£‡∏∞‡∏Å‡∏≠‡∏ö_‡∏Å‡∏≤‡∏£', 0.6618397235870361),
 ('‡∏ô‡∏¥‡∏ß‡πÑ‡∏Æ‡∏ï‡πà‡∏≠‡πÄ‡∏ô‡∏∑‡πà‡∏≠‡∏á', 0.6616955995559692),
 ('‡∏Ñ‡∏∏‡∏ì‡∏Ç‡∏≤‡∏î_‡∏ó‡∏∏‡∏ô', 0.655654788017273),
 ('‡∏Å‡∏≥‡πÑ‡∏£_‡∏™‡∏∏‡∏ó‡∏ò‡∏¥', 0.649263322353363),
 ('‡∏à‡πà‡∏≠_‡∏ö‡∏∏‡πä‡∏Å_‡∏Å‡∏≥‡πÑ‡∏£', 0.6485243439674377)]

In [266]:
wvmodel.wv.most_similar('‡∏ñ‡∏±‡∏ß')

  if np.issubdtype(vec.dtype, np.int):


[('‡∏Ñ‡∏±‡∏ó', 0.7897443771362305),
 ('‡∏ñ‡∏±‡∏ß_‡∏à‡∏ô', 0.7791475653648376),
 ('‡∏ã‡∏∑‡πâ‡∏≠‡∏ñ‡∏±‡∏ß', 0.7631266117095947),
 ('‡∏ñ‡∏∑‡∏≠‡∏Å‡∏¥‡∏ô_‡∏õ‡∏±‡∏ô_‡∏ú‡∏•', 0.7622551918029785),
 ('‡∏ï‡∏¥‡∏î‡∏î‡∏≠‡∏¢', 0.7494672536849976),
 ('‡πÑ‡∏°‡πâ_‡πÅ‡∏£‡∏Å', 0.7355560064315796),
 ('‡∏ï‡∏¥‡∏î_‡∏î‡∏≠‡∏¢', 0.7347210645675659),
 ('‡∏î‡∏≠‡∏¢', 0.7312475442886353),
 ('‡∏Ñ‡∏±‡∏ï', 0.7222053408622742),
 ('‡∏ó‡∏ô_‡∏ñ‡∏∑‡∏≠', 0.7122272849082947)]

In [410]:
wv.most_similar('‡∏ó‡∏ô_‡∏£‡∏ß‡∏¢')

  if np.issubdtype(vec.dtype, np.int):


[('‡∏ó‡∏ô_‡∏ñ‡∏∑‡∏≠', 0.6762784719467163),
 ('‡∏ï‡∏¥‡∏î‡∏î‡∏≠‡∏¢', 0.6643658876419067),
 ('‡∏õ‡∏¥‡∏î_‡πÅ‡∏Å‡∏õ', 0.6591575145721436),
 ('‡∏ï‡∏¥‡∏î_‡∏î‡∏≠‡∏¢', 0.6524772047996521),
 ('‡∏´‡∏ô‡∏≤‡∏¢‡∏¢‡∏¢', 0.6524191498756409),
 ('‡∏Ç‡∏≤_put', 0.6433411836624146),
 ('‡∏Ç‡∏≤_‡∏ä‡πâ‡∏≠‡∏ï', 0.640041708946228),
 ('‡∏î‡∏≠‡∏¢', 0.6352077722549438),
 ('‡πÄ‡∏û‡πà‡∏Å‡∏≠‡∏á', 0.6343613862991333),
 ('‡∏Æ‡πä‡∏≤‡∏ü', 0.6329175233840942)]

In [409]:
wvmodel.wv.most_similar('‡∏Å‡∏£‡∏∞‡∏î‡∏¥‡∏Å_‡∏ï‡∏µ‡∏ô')

  if np.issubdtype(vec.dtype, np.int):


[('‡πÄ‡∏¢‡∏µ‡πà‡∏¢‡∏ß‡πÄ‡∏´‡∏ô‡∏µ‡∏¢‡∏ß', 0.673949658870697),
 ('‡∏°‡∏µ‡πÄ‡∏ã‡∏µ‡∏¢‡∏ô', 0.6632639169692993),
 ('‡∏ã‡∏±‡∏Å‡∏ï‡∏±‡∏ß', 0.6515215635299683),
 ('‡∏´‡∏•‡∏±‡∏ö_‡∏ï‡∏≤_‡∏à‡∏¥‡πâ‡∏°', 0.6429361701011658),
 ('‡πÅ‡∏ñ‡πÑ‡∏õ', 0.6382111310958862),
 ('‡∏ô‡∏ô‡∏ô‡∏ô‡∏ô‡∏ô‡∏ô‡∏ô‡∏ô‡∏ô‡∏ô', 0.6377637982368469),
 ('‡∏Ñ‡∏∏‡∏ì‡∏û‡∏£‡∏∞‡∏ä‡πà‡∏ß‡∏¢', 0.6287404298782349),
 ('‡πÄ‡∏ó‡∏´‡∏°‡∏î', 0.627719521522522),
 ('‡∏Å‡∏≠‡∏á‡πÄ‡∏Ñ‡πâ‡∏≤', 0.6269105672836304),
 ('‡πÅ‡∏°‡πà‡∏ô‡∏à‡∏£‡∏¥‡∏á', 0.6259508728981018)]

In [406]:
wvmodel.wv.most_similar('‡∏õ‡∏¥‡∏î‡∏à‡∏≠')

  if np.issubdtype(vec.dtype, np.int):


[('‡πÄ‡∏Å‡∏≤‡πÑ‡∏Ç‡πà', 0.6946024298667908),
 ('‡∏î‡∏µ_‡∏Å‡πà‡∏≤', 0.6813105344772339),
 ('‡∏á‡∏µ‡∏ö', 0.6799486875534058),
 ('‡πÄ‡∏´‡∏ô‡∏µ‡πà‡∏≠‡∏¢', 0.6754287481307983),
 ('‡∏ï‡∏Å‡πÄ‡∏´‡∏ß', 0.6748765707015991),
 ('‡∏õ‡∏¥‡∏î_‡∏à‡∏≠', 0.6742295026779175),
 ('‡∏ó‡∏¥‡∏ï‡∏¢‡πå', 0.6655632257461548),
 ('‡πÄ‡∏ô‡∏µ‡πà‡∏¢‡∏¢‡∏¢‡∏¢', 0.662006139755249),
 ('‡∏ô‡∏≠‡∏ô', 0.6580524444580078),
 ('‡πÄ‡∏•‡πà‡∏ô_‡πÄ‡∏î‡∏¢‡πå', 0.6518649458885193)]

In [267]:
wvmodel.wv.most_similar('‡∏•‡πâ‡∏≤‡∏á_‡∏û‡∏≠‡∏£‡πå‡∏ó')

  if np.issubdtype(vec.dtype, np.int):


[('‡∏•‡πâ‡∏≤‡∏á_port', 0.6879936456680298),
 ('‡∏•‡πâ‡∏≤‡∏á_‡∏õ‡∏≠‡∏î', 0.683260440826416),
 ('‡∏Ñ‡∏±‡∏ó', 0.6706336736679077),
 ('‡∏ñ‡∏±‡∏ß_‡∏à‡∏ô', 0.6682713627815247),
 ('‡∏ó‡∏±‡∏ô‡∏Ñ‡∏£‡∏±‡∏ö', 0.6624603271484375),
 ('‡∏£‡∏¥‡∏ô‡∏Ç‡∏≤‡∏¢', 0.6522741317749023),
 ('‡∏Å‡∏£‡∏∞‡πÇ‡∏î‡∏î_‡∏´‡∏ô‡∏µ', 0.6517815589904785),
 ('‡πÄ‡∏•‡πà‡∏ô_‡∏õ‡∏±‡πà‡∏ô_‡πÅ‡∏õ‡∏∞', 0.6509268879890442),
 ('‡∏Ñ‡∏±‡∏ó‡∏•‡∏≠‡∏™', 0.650353729724884),
 ('‡∏™‡∏ß‡∏ô‡πÄ‡∏ó‡∏£‡∏ô', 0.6500637531280518)]

In [119]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]="9"

In [271]:
wvmodel.wv.save('word-vector.window10.bin')

# PCA Time

In [343]:
from gensim.models import KeyedVectors
wv = KeyedVectors.load('word-vector.window10.bin')

In [344]:
bad_sentiment_words = ['‡∏î‡∏≠‡∏¢', '‡∏Ç‡∏≤‡∏î‡∏ó‡∏∏‡∏ô', '‡πÅ‡∏î‡∏á', '‡∏•‡πâ‡∏≤‡∏á_‡∏õ‡∏≠‡∏î', '‡∏Ñ‡∏±‡∏ó', '‡∏ï‡∏¥‡∏î‡∏î‡∏≠‡∏¢', 'new_low', '‡∏à‡∏ô']
good_sentiment_words = ['‡∏Å‡∏≥‡πÑ‡∏£', '‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß', '‡∏•‡∏¥‡πà‡∏á', '‡∏£‡∏ß‡∏¢', '‡∏Å‡∏¥‡∏ô_‡πÄ‡∏´‡∏•‡∏≤', 'new_high']

In [345]:
bad_sentiment_words = ['‡∏î‡∏≠‡∏¢', '‡∏•‡πâ‡∏≤‡∏á_‡∏õ‡∏≠‡∏î', '‡∏Ñ‡∏±‡∏ó', '‡∏ï‡∏¥‡∏î‡∏î‡∏≠‡∏¢', '‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î']
good_sentiment_words = ['‡∏•‡∏¥‡πà‡∏á', '‡∏£‡∏ß‡∏¢', '‡∏Å‡∏¥‡∏ô_‡πÄ‡∏´‡∏•‡∏≤']

In [346]:
X = [
    wv[word]
    for word
    in bad_sentiment_words + good_sentiment_words
]

In [347]:
from sklearn import decomposition
pca = decomposition.PCA(n_components=1)
pca.fit(X)
X1 = pca.transform(X)

In [348]:
allwords = bad_sentiment_words + good_sentiment_words
data = [
    {
        'word' : allwords[i],
        'score' : X1[i][0],
    }
    for i
    in range(len(X1))
]
data.sort(key=lambda item: item['score'])

In [349]:
for item in data:
    print('{:5.2f} {}'.format(item['score'], item['word']))

-1.02 ‡∏ï‡∏¥‡∏î‡∏î‡∏≠‡∏¢
-0.81 ‡∏Ñ‡∏±‡∏ó
-0.73 ‡∏î‡∏≠‡∏¢
-0.67 ‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î
-0.27 ‡∏•‡∏¥‡πà‡∏á
-0.11 ‡∏•‡πâ‡∏≤‡∏á_‡∏õ‡∏≠‡∏î
 0.50 ‡∏£‡∏ß‡∏¢
 3.11 ‡∏Å‡∏¥‡∏ô_‡πÄ‡∏´‡∏•‡∏≤


In [358]:
bad_sentiment_words = ['‡∏î‡∏≠‡∏¢', '‡∏Ç‡∏≤‡∏î‡∏ó‡∏∏‡∏ô', '‡πÅ‡∏î‡∏á', '‡∏•‡πâ‡∏≤‡∏á_‡∏õ‡∏≠‡∏î', '‡∏•‡πâ‡∏≤‡∏á_‡∏û‡∏≠‡∏£‡πå‡∏ó', '‡∏Ñ‡∏±‡∏ó', '‡∏ï‡∏¥‡∏î‡∏î‡∏≠‡∏¢', 'new_low', '‡∏à‡∏ô', '‡∏ó‡∏∏‡∏ö']
good_sentiment_words = ['‡∏Å‡∏≥‡πÑ‡∏£', '‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß', '‡∏•‡∏¥‡πà‡∏á', '‡∏£‡∏ß‡∏¢', '‡∏Å‡∏¥‡∏ô_‡πÄ‡∏´‡∏•‡∏≤', 'new_high', '‡πÄ‡∏î‡πâ‡∏á', '‡∏™‡∏π‡∏á‡∏™‡∏∏‡∏î', '‡πÇ‡∏•‡∏†', '‡πÄ‡∏õ‡∏¥‡∏î_port']
allwords = bad_sentiment_words + good_sentiment_words
X = [
    wv[word]
    for word
    in bad_sentiment_words + good_sentiment_words
]
X1 = pca.transform(X)
data = [
    {
        'word' : allwords[i],
        'score' : X1[i][0],
    }
    for i
    in range(len(X1))
]
data.sort(key=lambda item: item['score'])
for item in data:
    print('{:5.2f} {}'.format(item['score'], item['word']))

-1.02 ‡∏ï‡∏¥‡∏î‡∏î‡∏≠‡∏¢
-0.81 ‡∏Ñ‡∏±‡∏ó
-0.73 ‡∏î‡∏≠‡∏¢
-0.52 ‡πÇ‡∏•‡∏†
-0.38 ‡∏Ç‡∏≤‡∏î‡∏ó‡∏∏‡∏ô
-0.28 new_low
-0.27 ‡∏•‡∏¥‡πà‡∏á
-0.19 ‡∏™‡∏π‡∏á‡∏™‡∏∏‡∏î
-0.12 ‡πÅ‡∏î‡∏á
-0.11 ‡∏•‡πâ‡∏≤‡∏á_‡∏õ‡∏≠‡∏î
-0.05 ‡πÄ‡∏õ‡∏¥‡∏î_port
-0.04 ‡∏ó‡∏∏‡∏ö
-0.04 ‡∏à‡∏ô
 0.03 ‡∏Å‡∏≥‡πÑ‡∏£
 0.05 ‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß
 0.12 ‡πÄ‡∏î‡πâ‡∏á
 0.21 ‡∏•‡πâ‡∏≤‡∏á_‡∏û‡∏≠‡∏£‡πå‡∏ó
 0.50 ‡∏£‡∏ß‡∏¢
 0.56 new_high
 3.11 ‡∏Å‡∏¥‡∏ô_‡πÄ‡∏´‡∏•‡∏≤


# Good Day Bad Day Sample

In [102]:
## BADDD

import glob
import ujson

words = []
days = ['2016/1/7','2013/8/28', '2013/9/4']
for day in days:
    pattern = 'tokenized/{}/*json'.format(day)
    for path in glob.glob(pattern):
        sentences = file_sentences(path)
        sentences = trigram_transformer[bigram_transformer[sentences]]
        for sentence in sentences:
            for word in sentence:
                words.append(word)

In [103]:
bad_words = words

In [104]:
import collections
bow = collections.Counter(words)
for item in bow.most_common(1000):
    word = item[0]
    if word in wv:
        sent = pca.transform([wv[word]])[0][0]
        if sent < -1 or 1 < sent:
            print('{:7.4f}% {:5.2f} {}'.format(item[1]/len(words) * 100, sent, item[0]))

 0.6738% -1.06 ‡πÄ‡∏á‡∏¥‡∏ô
 0.1178% -1.29 ‡∏´‡∏≤
 0.0620% -1.19 ‡∏Ç‡∏≤‡∏î_‡∏ó‡∏∏‡∏ô
 0.0362% -1.01 ‡∏≠‡∏±‡∏ï‡∏£‡∏≤
 0.0358% -1.07 ‡∏û‡∏∑‡πâ‡∏ô‡∏ê‡∏≤‡∏ô
 0.0347% -1.16 ‡∏î‡∏≠‡∏¢
 0.0271% -1.04 vi
 0.0245% -1.05 e_b_e_b
 0.0228%  1.97 ‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß
 0.0217% -1.02 ‡∏ö‡∏£‡∏¥‡∏´‡∏≤‡∏£
 0.0192%  1.51 ‡πÅ‡∏î‡∏á
 0.0181% -1.02 ‡∏≠‡∏≤‡∏ä‡∏µ‡∏û
 0.0181% -1.16 ‡∏£‡∏π‡πâ‡∏à‡∏±‡∏Å
 0.0162% -1.14 ‡∏≠‡∏≠‡∏°
 0.0153% -1.03 ‡∏ï‡πâ‡∏ô‡∏ó‡∏∏‡∏ô
 0.0149% -1.02 ‡∏û‡∏±‡∏ô_‡∏•‡πâ‡∏≤‡∏ô_‡∏ö‡∏≤‡∏ó
 0.0149% -1.05 ‡∏ú‡∏•_‡∏õ‡∏£‡∏∞‡πÇ‡∏¢‡∏ä‡∏ô‡πå
 0.0147% -1.15 ‡∏ò‡∏õ‡∏ó
 0.0147% -1.02 ‡∏î‡∏π‡πÅ‡∏•
 0.0136% -1.19 ‡πÇ‡∏ó‡∏©
 0.0136% -1.06 ‡∏ä‡∏≤‡∏ß_‡∏ô‡∏≤
 0.0132% -1.20 ‡∏õ‡∏£‡∏∞‡∏™‡∏ö‡∏Å‡∏≤‡∏£‡∏ì‡πå
 0.0130% -1.07 ‡∏à‡πâ‡∏≤‡∏á
 0.0128% -2.53 link_http_goo_gl
 0.0119% -1.19 ‡∏ê‡∏≤‡∏ô‡∏∞
 0.0117% -1.03 ‡∏Ñ‡∏≠‡∏¢
 0.0109% -1.00 ‡∏´‡∏ô‡∏µ‡πâ‡∏™‡∏¥‡∏ô
 0.0102% -1.38 ‡∏£‡∏≤‡∏Ñ‡∏≤_‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢
 0.0096% -1.33 ‡∏Å‡∏¢‡∏®


In [105]:
## GOOD

import glob
import ujson

words = []
days = ['2013/5/20','2013/5/28', '2013/3/15', '2013/5/21']
for day in days:
    pattern = 'tokenized/{}/*json'.format(day)
    for path in glob.glob(pattern):
        sentences = file_sentences(path)
        sentences = trigram_transformer[bigram_transformer[sentences]]
        for sentence in sentences:
            for word in sentence:
                words.append(word)

In [106]:
good_words = words

In [107]:
import collections
bow = collections.Counter(words)
for item in bow.most_common():
    word = item[0]
    if word in wv:
        sent = pca.transform([wv[word]])[0][0]
        if sent < -1 or 1 < sent:
            print('{:7.4f}% {:5.2f} {}'.format(item[1]/len(words) * 100, sent, item[0]))

 0.7248% -1.06 ‡πÄ‡∏á‡∏¥‡∏ô
 0.1199% -1.29 ‡∏´‡∏≤
 0.0579% -1.19 ‡∏Ç‡∏≤‡∏î_‡∏ó‡∏∏‡∏ô
 0.0435% -1.07 ‡∏û‡∏∑‡πâ‡∏ô‡∏ê‡∏≤‡∏ô
 0.0418% -1.01 ‡∏≠‡∏±‡∏ï‡∏£‡∏≤
 0.0338% -1.16 ‡∏î‡∏≠‡∏¢
 0.0314% -1.02 ‡∏ö‡∏£‡∏¥‡∏´‡∏≤‡∏£
 0.0294% -1.02 ‡∏î‡∏π‡πÅ‡∏•
 0.0275% -1.00 ‡∏ö‡∏£‡∏¥‡∏©‡∏±‡∏ó‡∏Ø
 0.0210%  1.97 ‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß
 0.0195% -1.15 ‡∏ò‡∏õ‡∏ó
 0.0188% -1.16 ‡∏£‡∏π‡πâ‡∏à‡∏±‡∏Å
 0.0186% -1.02 ‡∏û‡∏±‡∏ô_‡∏•‡πâ‡∏≤‡∏ô_‡∏ö‡∏≤‡∏ó
 0.0176%  1.51 ‡πÅ‡∏î‡∏á
 0.0174% -1.20 ‡∏õ‡∏£‡∏∞‡∏™‡∏ö‡∏Å‡∏≤‡∏£‡∏ì‡πå
 0.0166% -1.14 ‡∏≠‡∏≠‡∏°
 0.0162% -1.10 ‡∏≠‡∏≤‡∏à‡∏≤‡∏£‡∏¢‡πå
 0.0138% -1.02 ‡∏Ñ‡∏£‡∏≠‡∏ö‡∏Ñ‡∏£‡∏±‡∏ß
 0.0135% -1.02 ‡∏ï‡∏≥‡πÅ‡∏´‡∏ô‡πà‡∏á
 0.0135% -1.03 ‡∏ï‡πâ‡∏ô‡∏ó‡∏∏‡∏ô
 0.0116% -1.26 ‡∏•‡∏≤_‡∏≠‡∏≠‡∏Å
 0.0116%  1.20 ‡∏à‡∏∏‡∏î_‡πÄ‡∏û‡∏¥‡πà‡∏°_‡∏Ç‡∏∂‡πâ‡∏ô
 0.0111% -1.05 e_b_e_b
 0.0109%  1.13 ‡πÑ‡∏î‡πâ_‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï_‡∏Ñ‡∏∏‡∏ì‡∏™‡∏ö‡∏≤‡∏¢
 0.0109% -1.19 ‡∏ê‡∏≤‡∏ô‡∏∞
 0.0108% -1.02 ‡∏≠‡∏≤‡∏ä‡∏µ‡∏û
 0.0104% -2.53 link_http_goo_gl
 0.0104% -1.30 ‡∏´‡∏∏‡πâ‡∏ô_forex
 0.0104% -1.11 ‡πÄ‡∏™‡∏µ‡∏¢_‡∏†‡∏≤‡∏©‡∏µ
 0.

 0.0003% -1.06 ‡∏´‡∏•‡∏ß‡∏°_‡∏ï‡∏±‡∏ß
 0.0003% -1.02 ‡∏Ñ‡πà‡∏≤_‡πÇ‡∏ó‡∏£‡∏®‡∏±‡∏û‡∏ó‡πå
 0.0003% -1.17 ‡∏ß‡∏±‡∏¢_‡πÄ‡∏î‡πá‡∏Å
 0.0003% -1.33 ‡∏ô‡∏û
 0.0003% -1.28 ‡πÇ‡∏Å‡∏¢‡πÄ‡∏ö‡∏µ‡πâ‡∏¢
 0.0003% -1.18 ‡∏Å‡∏£‡∏∞‡∏ó‡∏π‡πâ_‡πÄ‡∏Å‡πà‡∏≤
 0.0003% -1.14 ‡∏ö‡∏ß‡∏ä
 0.0003% -1.16 ‡∏Å‡∏£‡∏£‡∏°‡∏Å‡∏≤‡∏£_‡∏ú‡∏π‡πâ_‡∏ö‡∏£‡∏¥‡∏´‡∏≤‡∏£
 0.0003% -1.07 ‡∏à‡πà‡∏≠_‡∏ö‡∏∏‡πä‡∏Ñ_‡∏£‡∏≤‡∏¢
 0.0003% -1.19 ‡∏¢‡∏Å‡πÄ‡∏•‡∏¥‡∏Å_‡∏™‡∏±‡∏ç‡∏ç‡∏≤
 0.0003% -1.28 ‡∏°‡∏∑‡∏≠_‡πÄ‡∏Å‡πã‡∏≤
 0.0003% -1.25 ‡πÄ‡∏Å‡πâ‡∏≤
 0.0003%  1.53 ‡πÅ‡∏ó‡πà‡∏á_‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß
 0.0003% -1.18 ‡∏Ñ‡∏ß‡∏ö‡∏Ñ‡∏∏‡∏°_‡∏Ñ‡∏∏‡∏ì‡∏†‡∏≤‡∏û
 0.0003% -1.03 capital_gain
 0.0003%  1.16 ‡∏•‡∏á‡πÅ‡∏î‡∏á
 0.0003% -1.05 ‡∏Å‡∏≤‡∏£‡∏á‡∏≤‡∏ô
 0.0003% -1.22 ‡∏Å‡∏≤‡∏£‡∏ö‡∏¥‡∏ô‡πÑ‡∏ó‡∏¢
 0.0003% -1.03 ‡∏ß‡∏≤‡∏á_‡∏û‡∏¥‡∏Å‡∏±‡∏î
 0.0003% -1.06 ‡πÉ‡∏à‡∏Å‡∏•‡πâ‡∏≤
 0.0003% -1.07 ‡∏≠‡∏î‡∏ó‡∏ô_‡∏£‡∏≠
 0.0003% -1.07 ‡∏´‡∏ô‡πâ‡∏≤‡∏ï‡∏±‡∏Å
 0.0003% -1.21 money_management
 0.0003% -1.42 ‡πÉ‡∏ù‡πà
 0.0003% -1.08 ‡πÄ‡∏ó‡∏µ‡πà‡∏¢‡∏ß_‡∏ï‡πà‡∏≤‡∏á_‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®
 0.0003

In [122]:
goodcount = {}
for w in good_words:
    goodcount[w] = goodcount.get(w, 0) + 1
    
badcount = {}
for w in bad_words:
    badcount[w] = badcount.get(w, 0) + 1

    
items = []
allwords = list(set(good_words + bad_words))
for k in allwords:
    item = {
        'word' : k,
        'count' : [
            goodcount.get(k, 0)/len(good_words),
            badcount.get(k, 0)/len(bad_words),
        ]
    }
    item['score'] = max(item['count'][0], item['count'][1]) / max(0.0001, min(item['count']))
    item['sent'] = 1 if item['count'][0] > item['count'][1] else -1
    items.append(item)
    
items.sort(key=lambda item: -item['score'])
items

[{'count': [2.2196951846274155e-05, 0.0007284965705278405],
  'score': 7.284965705278404,
  'sent': -1,
  'word': '‡∏¢‡∏≤‡∏á'},
 {'count': [0.0006778607602285261, 4.260213862735909e-05],
  'score': 6.778607602285261,
  'sent': 1,
  'word': '‡∏û_‡∏Ñ'},
 {'count': [1.7074578343287814e-05, 0.000513355770459677],
  'score': 5.13355770459677,
  'sent': -1,
  'word': '‡∏ã‡∏µ‡πÄ‡∏£‡∏µ‡∏¢'},
 {'count': [3.927153018956197e-05, 0.0004260213862735909],
  'score': 4.260213862735909,
  'sent': -1,
  'word': '‡∏õ‡∏•‡∏π‡∏Å'},
 {'count': [0.00010756984356271323, 0.0004473224555872705],
  'score': 4.158437353555148,
  'sent': -1,
  'word': '‡∏Ç‡πâ‡∏≤‡∏ß'},
 {'count': [6.829831337315125e-06, 0.00038767946150896775],
  'score': 3.8767946150896773,
  'sent': -1,
  'word': '‡∏Å_‡∏¢'},
 {'count': [0.0006522488927135945, 0.0001746687683721723],
  'score': 3.7342044533332204,
  'sent': 1,
  'word': '‡∏ß‡∏¥‡πà‡∏á'},
 {'count': [0.0003722258078836743, 4.260213862735909e-05],
  'score': 3.722258078836743,
  'sen

In [156]:
import csv
with open('word-polarity.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=['word','polar','size','positive','negative'])
    writer.writeheader()
    for item in items:
        row = {
            'word' : item['word'],
            'polar' : item['sent'],
            'size' : item['score'],
            'positive' : item['count'][0],
            'negative' : item['count'][1],
        }
        writer.writerow(row)

In [168]:
goodwords = ['‡∏ó‡∏∞‡∏¢‡∏≤‡∏ô', '‡∏ß‡∏¥‡πà‡∏á','‡πÅ‡∏ï‡∏∞','‡πÅ‡∏Ç‡πá‡∏á','‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß','‡∏Ç‡∏≤_‡∏Ç‡∏∂‡πâ‡∏ô','‡∏™‡∏π‡∏á_‡∏™‡∏∏‡∏î', '‡∏Å‡∏≥‡πÑ‡∏£', '‡πÅ‡∏Ç‡πá‡∏á‡πÅ‡∏Å‡∏£‡πà‡∏á', '‡∏õ‡∏±‡πà‡∏ô', '‡∏Ç‡∏≤‡∏¢_‡∏´‡∏°‡∏π', 'call', 'long', '‡∏Ç‡∏≤‡∏î_‡∏ó‡∏∏‡∏ô', '‡πÄ‡∏õ‡∏¥‡∏î_l', 'l', '‡∏•‡∏≤‡∏Å']
badwords = ['‡∏î‡∏≠‡∏¢', '‡∏´‡∏•‡∏∏‡∏î','‡∏£‡πà‡∏ß‡∏á','‡∏ï‡∏≤‡∏¢','‡πÅ‡∏î‡∏á',  '‡∏Ç‡∏≤_‡∏•‡∏á', '‡∏ï‡πà‡∏≥_‡∏™‡∏∏‡∏î', '‡∏Ñ‡∏±‡∏ó', '‡∏´‡∏ô‡∏±‡∏Å', '‡∏•‡∏ö', '‡∏ï‡∏Å', '‡∏£‡πà‡∏ß‡∏á_‡∏•‡∏á', '‡πÄ‡∏•‡∏¥‡∏Å', '‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î', '‡∏£‡∏≠‡∏î', '‡∏ó‡∏∏‡∏ö', '‡∏Å‡∏±‡∏á‡∏ß‡∏•', 'put', 'short', 's', '‡∏≠‡πà‡∏≠‡∏ô', '‡πÑ‡∏™‡πâ_‡πÅ‡∏ï‡∏Å', '‡πÄ‡∏•‡∏∑‡∏≠‡∏î‡∏™‡∏≤‡∏î', '‡∏ï‡∏ö']

In [169]:
sentwords = []
for w in goodwords:
    item = {'word' : w, 'sent' : 1}
    sentwords.append(item)
for w in badwords:
    item = {'word' : w, 'sent' : -1}
    sentwords.append(item)

In [236]:
sent_cache = {}

In [244]:
sent_cache

{'‡∏™‡∏°‡∏≤‡∏Ñ‡∏°‡∏Ñ‡πâ‡∏≤_‡∏ó‡∏≠‡∏á‡∏Ñ‡∏≥': {'similarity': 0.36978504,
  'word': {'sent': -1, 'word': '‡∏£‡πà‡∏ß‡∏á_‡∏•‡∏á'}},
 '‡πÅ‡∏õ‡πä‡∏õ': {'similarity': 0.52990305, 'word': {'sent': -1, 'word': '‡∏ï‡∏ö'}},
 '‡∏≠‡∏≠‡∏ô‡πÑ‡∏•‡∏î‡πå': {'similarity': 0.32509127,
  'word': {'sent': 1, 'word': '‡πÅ‡∏Ç‡πá‡∏á‡πÅ‡∏Å‡∏£‡πà‡∏á'}},
 '‡∏ï‡πâ‡∏≠‡∏á_‡πÄ‡∏ú‡∏ä‡∏¥‡∏ç_‡∏Å‡∏±‡∏ö': {'similarity': 0.4500733,
  'word': {'sent': -1, 'word': '‡∏Å‡∏±‡∏á‡∏ß‡∏•'}},
 '‡∏ô‡∏¥‡∏Ñ‡∏°_‡∏≠‡∏∏‡∏ï‡∏™‡∏≤‡∏´‡∏Å‡∏£‡∏£‡∏°': {'similarity': 0.2927236,
  'word': {'sent': 1, 'word': '‡πÅ‡∏Ç‡πá‡∏á‡πÅ‡∏Å‡∏£‡πà‡∏á'}},
 '‡∏•‡∏≠‡∏™': {'similarity': 0.5785567, 'word': {'sent': -1, 'word': '‡πÄ‡∏•‡∏∑‡∏≠‡∏î‡∏™‡∏≤‡∏î'}},
 '‡∏õ‡∏£‡∏∞‡∏™‡∏á‡∏Ñ‡πå': {'similarity': 0.27067077, 'word': {'sent': -1, 'word': '‡∏Å‡∏±‡∏á‡∏ß‡∏•'}},
 '‡∏Ñ‡∏ß‡∏≤‡∏°_‡∏£‡∏π‡πâ‡∏™‡∏∂‡∏Å': {'similarity': 0.54989415,
  'word': {'sent': -1, 'word': '‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î'}},
 '‡∏ö‡∏≤‡∏ó_gf_m': {'similarity': 0.47313878,
  'word': {'sent': -1, 'word': '‡∏£‡πà‡∏ß‡∏á_‡∏

In [243]:
%%time


def best_sent_cache(word):
    if word in sent_cache:
        return sent_cache[word]
    
    items = [
        {
            'word': sentword,
            'similarity': wv.similarity(word, sentword['word'])
        }
        for sentword
        in sentwords
    ]
    bestsent = max(items, key=lambda item:item['similarity'])
    
    sent_cache[word] = bestsent
    return bestsent

def sent_for_day(day):
    pattern = 'tokenized/{}/*.json'.format(day)
    positive = 0
    negative = 0
    sent = 0
    
    for path in glob.glob(pattern):
        sentences = file_sentences(path)
        sentences = trigram_transformer[bigram_transformer[sentences]]
        
        for sentence in sentences:
            for word in sentence:
                if word in wv:
                    bestsent = best_sent_cache(word)
                    if bestsent['similarity'] > 0.7:
                        # print(word, bestsent)
                        sent += bestsent['word']['sent']
               
    
    return sent

score = sent_for_day('2013/5/20')
print(score)

341
CPU times: user 2.04 s, sys: 12 ms, total: 2.06 s
Wall time: 2.07 s


# Time to calculate

In [247]:
import glob
import random
import datetime
dirs = glob.glob('tokenized/*/*/*')
random.shuffle(dirs)
for i in range(len(dirs)):
    basedir = dirs[i]

    import os
    import re

    print(datetime.datetime.now(), i, len(dirs), basedir)
    sent_dir = basedir.replace('sentiment', 'trigram')
    sent_path = '/'.join([sent_dir, 'trigram.json'])
    if not os.path.exists(sent_path):
        day = re.findall(r'\d{4}\/\d{1,2}\/\d{1,2}', sent_dir)[0]
        sent = sent_for_day(day)
        if not os.path.exists(sent_dir):
            os.makedirs(sent_dir)
        with open(sent_path, 'w') as f:
            f.write('{}'.format(sent))
        print('sentimental score', sent)

2018-08-13 15:51:20.325756 0 2033 tokenized/2015/5/7
2018-08-13 15:51:20.326409 1 2033 tokenized/2015/10/17


  if np.issubdtype(vec.dtype, np.int):


sentimental score -55
2018-08-13 15:51:21.290517 2 2033 tokenized/2015/7/28
2018-08-13 15:51:21.290905 3 2033 tokenized/2014/5/13
sentimental score -61
2018-08-13 15:51:22.674507 4 2033 tokenized/2017/3/3
sentimental score 83
2018-08-13 15:51:23.734512 5 2033 tokenized/2018/6/19
sentimental score -259
2018-08-13 15:51:25.156644 6 2033 tokenized/2013/4/13
sentimental score -167
2018-08-13 15:51:26.212322 7 2033 tokenized/2014/5/17
sentimental score -41
2018-08-13 15:51:27.658259 8 2033 tokenized/2013/4/1
2018-08-13 15:51:27.658734 9 2033 tokenized/2015/11/22
2018-08-13 15:51:27.659119 10 2033 tokenized/2014/3/12
2018-08-13 15:51:27.659522 11 2033 tokenized/2017/6/27
sentimental score 104
2018-08-13 15:51:29.606714 12 2033 tokenized/2015/3/13
sentimental score 90
2018-08-13 15:51:31.336714 13 2033 tokenized/2018/4/28
sentimental score -19
2018-08-13 15:51:31.969503 14 2033 tokenized/2017/5/2
2018-08-13 15:51:31.970036 15 2033 tokenized/2017/2/10
sentimental score -58
2018-08-13 15:51:32.

sentimental score 64
2018-08-13 15:52:56.911682 124 2033 tokenized/2014/1/7
sentimental score 135
2018-08-13 15:52:59.380210 125 2033 tokenized/2013/5/4
sentimental score 100
2018-08-13 15:53:00.874458 126 2033 tokenized/2014/10/22
2018-08-13 15:53:00.874864 127 2033 tokenized/2016/5/16
2018-08-13 15:53:00.875315 128 2033 tokenized/2017/9/25
sentimental score -30
2018-08-13 15:53:01.609809 129 2033 tokenized/2015/1/23
sentimental score 141
2018-08-13 15:53:03.448584 130 2033 tokenized/2016/9/18
sentimental score 27
2018-08-13 15:53:04.440990 131 2033 tokenized/2014/1/27
sentimental score -142
2018-08-13 15:53:06.672171 132 2033 tokenized/2013/2/24
sentimental score -17
2018-08-13 15:53:07.920697 133 2033 tokenized/2015/1/29
sentimental score 153
2018-08-13 15:53:09.624836 134 2033 tokenized/2017/11/10
sentimental score 25
2018-08-13 15:53:10.775940 135 2033 tokenized/2014/12/20
sentimental score 51
2018-08-13 15:53:11.663528 136 2033 tokenized/2017/10/6
sentimental score 40
2018-08-13 

sentimental score 77
2018-08-13 15:54:35.487764 249 2033 tokenized/2017/11/4
sentimental score -46
2018-08-13 15:54:36.136118 250 2033 tokenized/2014/4/23
2018-08-13 15:54:36.136494 251 2033 tokenized/2017/7/7
sentimental score 110
2018-08-13 15:54:37.252155 252 2033 tokenized/2014/1/21
sentimental score -7
2018-08-13 15:54:39.226965 253 2033 tokenized/2014/5/23
sentimental score -112
2018-08-13 15:54:40.758096 254 2033 tokenized/2013/5/15
sentimental score 307
2018-08-13 15:54:43.418021 255 2033 tokenized/2014/1/8
2018-08-13 15:54:43.418417 256 2033 tokenized/2013/11/25
sentimental score -139
2018-08-13 15:54:45.413926 257 2033 tokenized/2013/4/22
sentimental score 82
2018-08-13 15:54:47.681159 258 2033 tokenized/2013/11/6
sentimental score 39
2018-08-13 15:54:49.947291 259 2033 tokenized/2014/5/29
sentimental score 95
2018-08-13 15:54:51.584296 260 2033 tokenized/2017/5/26
sentimental score 138
2018-08-13 15:54:52.616121 261 2033 tokenized/2014/12/13
2018-08-13 15:54:52.616546 262 20

sentimental score -165
2018-08-13 15:56:43.289175 366 2033 tokenized/2017/1/4
sentimental score 11
2018-08-13 15:56:44.256738 367 2033 tokenized/2016/11/30
2018-08-13 15:56:44.257173 368 2033 tokenized/2017/6/4
sentimental score 19
2018-08-13 15:56:45.053456 369 2033 tokenized/2017/4/28
sentimental score 58
2018-08-13 15:56:46.145897 370 2033 tokenized/2017/11/28
sentimental score -8
2018-08-13 15:56:46.900493 371 2033 tokenized/2013/4/26
sentimental score 282
2018-08-13 15:56:49.378710 372 2033 tokenized/2017/12/25
2018-08-13 15:56:49.380008 373 2033 tokenized/2013/7/15
2018-08-13 15:56:49.380503 374 2033 tokenized/2015/6/19
sentimental score -25
2018-08-13 15:56:50.765619 375 2033 tokenized/2016/9/27
sentimental score -71
2018-08-13 15:56:52.163067 376 2033 tokenized/2014/4/7
2018-08-13 15:56:52.163723 377 2033 tokenized/2016/10/17
sentimental score 10
2018-08-13 15:56:53.099271 378 2033 tokenized/2018/5/17
2018-08-13 15:56:53.099758 379 2033 tokenized/2013/6/6
sentimental score -573

sentimental score 173
2018-08-13 15:58:26.225317 485 2033 tokenized/2017/12/27
2018-08-13 15:58:26.225789 486 2033 tokenized/2014/1/31
sentimental score -132
2018-08-13 15:58:27.734176 487 2033 tokenized/2016/12/16
2018-08-13 15:58:27.736739 488 2033 tokenized/2015/1/10
sentimental score -35
2018-08-13 15:58:28.683189 489 2033 tokenized/2018/6/4
sentimental score -46
2018-08-13 15:58:29.684910 490 2033 tokenized/2017/6/23
sentimental score 133
2018-08-13 15:58:30.992022 491 2033 tokenized/2013/10/12
2018-08-13 15:58:30.992442 492 2033 tokenized/2013/9/5
sentimental score 17
2018-08-13 15:58:33.877944 493 2033 tokenized/2013/2/15
sentimental score 202
2018-08-13 15:58:36.000999 494 2033 tokenized/2015/11/20
sentimental score 26
2018-08-13 15:58:36.918703 495 2033 tokenized/2013/1/2
sentimental score 225
2018-08-13 15:58:38.737809 496 2033 tokenized/2013/4/18
sentimental score -49
2018-08-13 15:58:40.802787 497 2033 tokenized/2017/6/30
sentimental score -15
2018-08-13 15:58:41.911473 498

sentimental score 137
2018-08-13 16:00:21.568738 602 2033 tokenized/2013/5/19
sentimental score 133
2018-08-13 16:00:22.580665 603 2033 tokenized/2013/8/5
sentimental score 27
2018-08-13 16:00:24.769849 604 2033 tokenized/2014/2/5
2018-08-13 16:00:24.770577 605 2033 tokenized/2018/1/27
2018-08-13 16:00:24.771198 606 2033 tokenized/2015/4/7
sentimental score -9
2018-08-13 16:00:26.156132 607 2033 tokenized/2016/7/20
sentimental score -5
2018-08-13 16:00:27.426958 608 2033 tokenized/2018/3/22
sentimental score 53
2018-08-13 16:00:28.462621 609 2033 tokenized/2018/6/27
2018-08-13 16:00:28.463021 610 2033 tokenized/2016/1/14
2018-08-13 16:00:28.463480 611 2033 tokenized/2013/10/21
sentimental score 110
2018-08-13 16:00:30.503054 612 2033 tokenized/2014/5/19
sentimental score 15
2018-08-13 16:00:31.785881 613 2033 tokenized/2017/2/9
2018-08-13 16:00:31.786339 614 2033 tokenized/2017/10/12
sentimental score 185
2018-08-13 16:00:32.890930 615 2033 tokenized/2014/2/22
2018-08-13 16:00:32.89141

sentimental score 91
2018-08-13 16:01:55.803451 724 2033 tokenized/2013/2/21
sentimental score 141
2018-08-13 16:01:58.121345 725 2033 tokenized/2016/9/17
2018-08-13 16:01:58.121719 726 2033 tokenized/2018/3/27
sentimental score 47
2018-08-13 16:01:59.406452 727 2033 tokenized/2016/10/7
sentimental score -14
2018-08-13 16:02:00.759010 728 2033 tokenized/2013/2/11
sentimental score 37
2018-08-13 16:02:02.582442 729 2033 tokenized/2017/7/12
2018-08-13 16:02:02.582915 730 2033 tokenized/2014/7/14
sentimental score 192
2018-08-13 16:02:04.068033 731 2033 tokenized/2015/10/24
sentimental score -11
2018-08-13 16:02:05.069386 732 2033 tokenized/2015/1/25
2018-08-13 16:02:05.069892 733 2033 tokenized/2015/4/30
2018-08-13 16:02:05.071025 734 2033 tokenized/2014/5/10
sentimental score -129
2018-08-13 16:02:06.039181 735 2033 tokenized/2015/12/28
2018-08-13 16:02:06.039589 736 2033 tokenized/2015/5/12
sentimental score 5
2018-08-13 16:02:08.163924 737 2033 tokenized/2013/2/1
2018-08-13 16:02:08.1

sentimental score -13
2018-08-13 16:03:35.399868 846 2033 tokenized/2016/5/12
sentimental score 164
2018-08-13 16:03:37.062107 847 2033 tokenized/2014/2/25
sentimental score 155
2018-08-13 16:03:38.434582 848 2033 tokenized/2017/1/25
2018-08-13 16:03:38.434984 849 2033 tokenized/2016/11/13
2018-08-13 16:03:38.435323 850 2033 tokenized/2017/3/25
2018-08-13 16:03:38.435806 851 2033 tokenized/2016/6/6
2018-08-13 16:03:38.436189 852 2033 tokenized/2014/1/26
2018-08-13 16:03:38.436623 853 2033 tokenized/2015/1/15
2018-08-13 16:03:38.437018 854 2033 tokenized/2016/12/30
sentimental score 95
2018-08-13 16:03:39.173553 855 2033 tokenized/2014/12/21
sentimental score -57
2018-08-13 16:03:40.267064 856 2033 tokenized/2015/6/12
2018-08-13 16:03:40.267491 857 2033 tokenized/2015/3/8
2018-08-13 16:03:40.268006 858 2033 tokenized/2013/6/14
sentimental score -220
2018-08-13 16:03:42.873271 859 2033 tokenized/2015/11/27
2018-08-13 16:03:42.873784 860 2033 tokenized/2014/7/27
2018-08-13 16:03:42.874161

sentimental score 144
2018-08-13 16:05:26.473382 964 2033 tokenized/2017/11/14
2018-08-13 16:05:26.473761 965 2033 tokenized/2014/11/17
2018-08-13 16:05:26.475617 966 2033 tokenized/2014/4/17
sentimental score 118
2018-08-13 16:05:28.019934 967 2033 tokenized/2013/3/21
2018-08-13 16:05:28.020330 968 2033 tokenized/2015/5/14
2018-08-13 16:05:28.020787 969 2033 tokenized/2018/5/4
sentimental score 64
2018-08-13 16:05:29.067086 970 2033 tokenized/2017/9/27
2018-08-13 16:05:29.067468 971 2033 tokenized/2016/5/10
sentimental score -43
2018-08-13 16:05:30.560234 972 2033 tokenized/2013/7/23
sentimental score 251
2018-08-13 16:05:32.476733 973 2033 tokenized/2014/10/7
sentimental score 45
2018-08-13 16:05:34.607368 974 2033 tokenized/2018/3/25
2018-08-13 16:05:34.607834 975 2033 tokenized/2014/4/8
sentimental score 105
2018-08-13 16:05:36.230333 976 2033 tokenized/2012/12/27
sentimental score 226
2018-08-13 16:05:37.770557 977 2033 tokenized/2014/8/7
2018-08-13 16:05:37.770983 978 2033 tokeni

sentimental score 85
2018-08-13 16:07:06.208303 1083 2033 tokenized/2014/12/11
2018-08-13 16:07:06.209747 1084 2033 tokenized/2016/6/14
sentimental score -6
2018-08-13 16:07:07.627966 1085 2033 tokenized/2015/7/1
sentimental score -79
2018-08-13 16:07:08.945086 1086 2033 tokenized/2018/4/29
2018-08-13 16:07:08.945530 1087 2033 tokenized/2016/1/5
sentimental score -319
2018-08-13 16:07:10.492172 1088 2033 tokenized/2014/1/18
sentimental score -37
2018-08-13 16:07:11.636100 1089 2033 tokenized/2012/12/22
sentimental score 71
2018-08-13 16:07:12.314239 1090 2033 tokenized/2017/6/3
sentimental score -8
2018-08-13 16:07:13.067081 1091 2033 tokenized/2017/9/2
sentimental score -20
2018-08-13 16:07:13.836204 1092 2033 tokenized/2014/3/1
sentimental score -26
2018-08-13 16:07:14.718387 1093 2033 tokenized/2014/6/3
sentimental score 123
2018-08-13 16:07:16.565032 1094 2033 tokenized/2015/9/2
sentimental score -61
2018-08-13 16:07:17.831387 1095 2033 tokenized/2018/2/12
sentimental score -12
201

sentimental score 360
2018-08-13 16:08:37.500678 1201 2033 tokenized/2013/4/12
sentimental score 118
2018-08-13 16:08:39.677550 1202 2033 tokenized/2016/1/31
sentimental score -12
2018-08-13 16:08:40.567374 1203 2033 tokenized/2014/2/14
2018-08-13 16:08:40.567814 1204 2033 tokenized/2015/9/10
2018-08-13 16:08:40.568200 1205 2033 tokenized/2017/5/21
sentimental score 97
2018-08-13 16:08:41.409535 1206 2033 tokenized/2017/2/5
sentimental score 24
2018-08-13 16:08:42.307797 1207 2033 tokenized/2016/9/22
sentimental score -17
2018-08-13 16:08:43.622370 1208 2033 tokenized/2016/4/22
sentimental score -23
2018-08-13 16:08:45.126257 1209 2033 tokenized/2016/3/10
sentimental score 64
2018-08-13 16:08:46.454922 1210 2033 tokenized/2015/4/22
sentimental score 49
2018-08-13 16:08:48.307689 1211 2033 tokenized/2017/12/16
2018-08-13 16:08:48.308211 1212 2033 tokenized/2014/2/3
2018-08-13 16:08:48.308654 1213 2033 tokenized/2013/2/16
2018-08-13 16:08:48.309056 1214 2033 tokenized/2016/8/7
2018-08-13

sentimental score -85
2018-08-13 16:10:09.140353 1320 2033 tokenized/2014/5/12
sentimental score -15
2018-08-13 16:10:10.360196 1321 2033 tokenized/2017/3/16
2018-08-13 16:10:10.362641 1322 2033 tokenized/2017/8/14
2018-08-13 16:10:10.363143 1323 2033 tokenized/2018/1/11
sentimental score 99
2018-08-13 16:10:11.684118 1324 2033 tokenized/2017/4/22
sentimental score -58
2018-08-13 16:10:12.515170 1325 2033 tokenized/2014/2/6
sentimental score 130
2018-08-13 16:10:14.739556 1326 2033 tokenized/2017/3/20
2018-08-13 16:10:14.740039 1327 2033 tokenized/2016/4/7
sentimental score -76
2018-08-13 16:10:16.198296 1328 2033 tokenized/2014/8/28
2018-08-13 16:10:16.200043 1329 2033 tokenized/2017/10/10
sentimental score 77
2018-08-13 16:10:17.294214 1330 2033 tokenized/2014/7/4
sentimental score 229
2018-08-13 16:10:19.143958 1331 2033 tokenized/2016/11/21
sentimental score -108
2018-08-13 16:10:20.327079 1332 2033 tokenized/2015/6/10
2018-08-13 16:10:20.327489 1333 2033 tokenized/2013/11/18
senti

sentimental score 24
2018-08-13 16:11:44.741688 1436 2033 tokenized/2014/7/25
2018-08-13 16:11:44.742147 1437 2033 tokenized/2016/2/25
2018-08-13 16:11:44.742499 1438 2033 tokenized/2017/10/16
sentimental score 19
2018-08-13 16:11:45.908993 1439 2033 tokenized/2017/12/15
sentimental score 42
2018-08-13 16:11:46.938401 1440 2033 tokenized/2017/11/30
sentimental score 18
2018-08-13 16:11:48.200417 1441 2033 tokenized/2013/8/7
2018-08-13 16:11:48.200881 1442 2033 tokenized/2013/9/4
2018-08-13 16:11:48.201355 1443 2033 tokenized/2018/5/16
sentimental score 154
2018-08-13 16:11:49.405182 1444 2033 tokenized/2017/1/31
2018-08-13 16:11:49.405610 1445 2033 tokenized/2013/4/11
sentimental score 29
2018-08-13 16:11:51.641773 1446 2033 tokenized/2015/12/10
sentimental score -94
2018-08-13 16:11:52.517484 1447 2033 tokenized/2013/10/28
sentimental score 99
2018-08-13 16:11:54.724807 1448 2033 tokenized/2014/8/25
sentimental score 59
2018-08-13 16:11:56.850533 1449 2033 tokenized/2017/11/26
sentime

sentimental score 95
2018-08-13 16:13:33.125901 1552 2033 tokenized/2016/9/4
2018-08-13 16:13:33.127098 1553 2033 tokenized/2013/3/31
sentimental score -33
2018-08-13 16:13:34.491586 1554 2033 tokenized/2013/12/4
sentimental score 156
2018-08-13 16:13:36.367338 1555 2033 tokenized/2014/1/24
sentimental score 20
2018-08-13 16:13:38.342420 1556 2033 tokenized/2015/9/23
2018-08-13 16:13:38.342869 1557 2033 tokenized/2014/8/22
sentimental score 258
2018-08-13 16:13:40.002762 1558 2033 tokenized/2012/12/31
sentimental score 7
2018-08-13 16:13:40.631733 1559 2033 tokenized/2014/1/20
2018-08-13 16:13:40.632228 1560 2033 tokenized/2016/6/28
sentimental score -33
2018-08-13 16:13:42.234615 1561 2033 tokenized/2017/8/24
sentimental score 103
2018-08-13 16:13:43.470902 1562 2033 tokenized/2016/12/18
2018-08-13 16:13:43.471302 1563 2033 tokenized/2015/3/12
2018-08-13 16:13:43.471655 1564 2033 tokenized/2016/5/15
sentimental score 29
2018-08-13 16:13:44.380908 1565 2033 tokenized/2018/3/16
sentimen

sentimental score -9
2018-08-13 16:15:22.679718 1668 2033 tokenized/2016/2/9
2018-08-13 16:15:22.680344 1669 2033 tokenized/2015/7/21
sentimental score -184
2018-08-13 16:15:24.347039 1670 2033 tokenized/2017/9/28
2018-08-13 16:15:24.347501 1671 2033 tokenized/2013/10/15
sentimental score 293
2018-08-13 16:15:26.876207 1672 2033 tokenized/2016/1/29
sentimental score -97
2018-08-13 16:15:28.526794 1673 2033 tokenized/2013/12/30
2018-08-13 16:15:28.527169 1674 2033 tokenized/2017/10/5
2018-08-13 16:15:28.527625 1675 2033 tokenized/2013/11/28
sentimental score 8
2018-08-13 16:15:30.450349 1676 2033 tokenized/2018/3/31
sentimental score -36
2018-08-13 16:15:31.022381 1677 2033 tokenized/2017/7/3
2018-08-13 16:15:31.022863 1678 2033 tokenized/2018/4/12
2018-08-13 16:15:31.023381 1679 2033 tokenized/2016/12/8
sentimental score 67
2018-08-13 16:15:32.016305 1680 2033 tokenized/2018/4/1
sentimental score -42
2018-08-13 16:15:33.092427 1681 2033 tokenized/2014/7/20
sentimental score 106
2018-08

sentimental score -45
2018-08-13 16:17:07.709784 1786 2033 tokenized/2014/9/1
2018-08-13 16:17:07.710177 1787 2033 tokenized/2018/7/7
2018-08-13 16:17:07.710587 1788 2033 tokenized/2017/11/29
2018-08-13 16:17:07.710972 1789 2033 tokenized/2015/2/23
2018-08-13 16:17:07.712789 1790 2033 tokenized/2016/10/29
2018-08-13 16:17:07.713329 1791 2033 tokenized/2014/10/11
2018-08-13 16:17:07.713731 1792 2033 tokenized/2013/2/26
sentimental score 150
2018-08-13 16:17:09.864097 1793 2033 tokenized/2016/11/2
2018-08-13 16:17:09.864603 1794 2033 tokenized/2017/1/5
sentimental score 130
2018-08-13 16:17:11.025326 1795 2033 tokenized/2016/6/2
sentimental score 54
2018-08-13 16:17:12.249357 1796 2033 tokenized/2014/7/12
sentimental score 110
2018-08-13 16:17:13.046072 1797 2033 tokenized/2018/6/24
2018-08-13 16:17:13.046446 1798 2033 tokenized/2015/2/18
sentimental score 5
2018-08-13 16:17:14.806839 1799 2033 tokenized/2018/1/19
sentimental score 87
2018-08-13 16:17:15.992101 1800 2033 tokenized/2015/1

sentimental score -33
2018-08-13 16:18:38.702334 1903 2033 tokenized/2018/7/3
sentimental score 81
2018-08-13 16:18:39.829603 1904 2033 tokenized/2014/10/20
2018-08-13 16:18:39.830024 1905 2033 tokenized/2018/5/14
sentimental score 179
2018-08-13 16:18:41.043453 1906 2033 tokenized/2014/7/15
sentimental score 205
2018-08-13 16:18:42.561817 1907 2033 tokenized/2013/1/29
2018-08-13 16:18:42.562198 1908 2033 tokenized/2014/10/25
sentimental score 31
2018-08-13 16:18:43.386847 1909 2033 tokenized/2013/7/2
sentimental score -39
2018-08-13 16:18:45.384681 1910 2033 tokenized/2016/10/23
sentimental score -45
2018-08-13 16:18:46.064955 1911 2033 tokenized/2016/3/4
sentimental score 61
2018-08-13 16:18:47.441582 1912 2033 tokenized/2013/11/1
sentimental score -42
2018-08-13 16:18:49.875636 1913 2033 tokenized/2013/12/16
2018-08-13 16:18:49.876737 1914 2033 tokenized/2018/4/24
sentimental score 121
2018-08-13 16:18:51.099015 1915 2033 tokenized/2017/10/21
sentimental score -26
2018-08-13 16:18:5

sentimental score 2
2018-08-13 16:20:31.221072 2017 2033 tokenized/2017/11/24
2018-08-13 16:20:31.223268 2018 2033 tokenized/2016/5/4
sentimental score -21
2018-08-13 16:20:32.401102 2019 2033 tokenized/2015/4/9
2018-08-13 16:20:32.404298 2020 2033 tokenized/2016/8/6
2018-08-13 16:20:32.404806 2021 2033 tokenized/2014/3/4
sentimental score 161
2018-08-13 16:20:33.985671 2022 2033 tokenized/2016/7/30
sentimental score 11
2018-08-13 16:20:34.825943 2023 2033 tokenized/2014/12/8
sentimental score 0
2018-08-13 16:20:36.508883 2024 2033 tokenized/2017/3/5
2018-08-13 16:20:36.509794 2025 2033 tokenized/2016/12/3
2018-08-13 16:20:36.510337 2026 2033 tokenized/2017/5/1
2018-08-13 16:20:36.510718 2027 2033 tokenized/2015/9/21
sentimental score -74
2018-08-13 16:20:37.861321 2028 2033 tokenized/2016/12/4
sentimental score 85
2018-08-13 16:20:38.470419 2029 2033 tokenized/2015/10/20
2018-08-13 16:20:38.470802 2030 2033 tokenized/2018/2/25
2018-08-13 16:20:38.471211 2031 2033 tokenized/2015/1/4
20

In [188]:
import re
re.findall(r'\d{4}\/\d{1,2}\/\d{1,2}', 'tokenized/2018/22/3')

['2018/22/3']

In [256]:
%%time
## create a csv
import glob
ds = glob.glob('tokenized/*/*/*')
dparts = [d.split('/') for d in ds]
dates = [(int(dpart[1]), int(dpart[2]), int(dpart[3])) for dpart in dparts]
dates.sort()


import csv
with open('sentiment.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=['date', 'sentiment'])
    writer.writeheader()
    for d in dates:
        path = 'tokenized/{}/{}/{}/trigram.json'.format(d[0], d[1], d[2])
        with open(path) as fr:
            score = int(fr.read())
        writer.writerow({'date' : '{}-{}-{}'.format(d[2], d[1], d[0]), 'sentiment' : score})

CPU times: user 40 ms, sys: 24 ms, total: 64 ms
Wall time: 66.2 ms


# SVR

In [404]:
goodwords = ['‡∏ó‡∏∞‡∏¢‡∏≤‡∏ô', '‡∏ß‡∏¥‡πà‡∏á','‡πÅ‡∏ï‡∏∞','‡πÅ‡∏Ç‡πá‡∏á','‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß','‡∏Ç‡∏≤_‡∏Ç‡∏∂‡πâ‡∏ô','‡∏™‡∏π‡∏á_‡∏™‡∏∏‡∏î', '‡∏Å‡∏≥‡πÑ‡∏£', '‡πÅ‡∏Ç‡πá‡∏á‡πÅ‡∏Å‡∏£‡πà‡∏á', '‡∏õ‡∏±‡πà‡∏ô', '‡∏Ç‡∏≤‡∏¢_‡∏´‡∏°‡∏π', 'call', 'long', '‡∏Ç‡∏≤‡∏î_‡∏ó‡∏∏‡∏ô', '‡πÄ‡∏õ‡∏¥‡∏î_l', 'l', '‡∏•‡∏≤‡∏Å']
badwords = ['‡∏î‡∏≠‡∏¢', '‡∏´‡∏•‡∏∏‡∏î','‡∏£‡πà‡∏ß‡∏á','‡∏ï‡∏≤‡∏¢','‡πÅ‡∏î‡∏á',  '‡∏Ç‡∏≤_‡∏•‡∏á', '‡∏ï‡πà‡∏≥_‡∏™‡∏∏‡∏î', '‡∏Ñ‡∏±‡∏ó', '‡∏´‡∏ô‡∏±‡∏Å', '‡∏•‡∏ö', '‡∏ï‡∏Å', '‡∏£‡πà‡∏ß‡∏á_‡∏•‡∏á', '‡πÄ‡∏•‡∏¥‡∏Å', '‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î', '‡∏£‡∏≠‡∏î', '‡∏ó‡∏∏‡∏ö', '‡∏Å‡∏±‡∏á‡∏ß‡∏•', 'put', 'short', 's', '‡∏≠‡πà‡∏≠‡∏ô', '‡πÑ‡∏™‡πâ_‡πÅ‡∏ï‡∏Å', '‡πÄ‡∏•‡∏∑‡∏≠‡∏î‡∏™‡∏≤‡∏î', '‡∏ï‡∏ö', '‡∏õ‡∏¥‡∏î‡∏à‡∏≠']

In [403]:
wv['‡∏õ‡∏¥‡∏î_‡∏à‡∏≠']

array([-0.17933123,  0.17893074, -0.15398802,  0.46605676, -0.94262594,
        0.516209  , -0.20863159, -0.27593794, -0.0063017 ,  0.09871096,
       -0.515251  ,  0.11251242,  0.48029622,  0.2006212 , -0.09685012,
       -0.4007059 , -0.46740475,  0.37575847, -0.04712586,  0.32536408,
       -0.3418107 ,  0.38607928,  0.2588204 ,  0.32190233,  0.19018143,
        0.5888957 ,  0.56994855,  0.8572202 , -0.35887432,  0.00228762,
        0.905201  , -0.0871786 ,  0.2032396 , -0.33234677,  0.05526485,
       -0.14519966, -0.5481803 ,  0.02682955, -0.04847681,  0.5166316 ,
        0.22186606, -0.60882   ,  0.99551165,  0.5313718 ,  0.35484073,
       -0.08905195, -0.27311736, -0.05452759,  0.24729437,  0.20385322,
       -0.09257133,  0.56201583, -0.21015047,  0.5776392 ,  0.27558258,
       -0.05963071,  0.18523173,  0.20140018, -0.0115049 , -0.34479457,
       -0.42153275, -0.01703904, -0.21173456, -0.3314934 ,  0.127348  ,
       -0.2568982 , -0.03460529, -0.59299254,  0.05060816, -0.09

In [405]:
allwords = goodwords + badwords
trainX = [
    wv[word]
    for word
    in allwords
]
trainY = [1 for word in goodwords] + [0 for word in badwords]


from sklearn import svm
model = svm.SVR()
model.fit(trainX, trainY)

Y = model.predict(trainX)
    
indice = list(range(len(allwords)))
indice.sort(key=lambda i: Y[i])

for i in indice:
    word = allwords[i]
    print('{:5.2f} {}'.format(Y[i], word))

 0.03 ‡∏´‡∏ô‡∏±‡∏Å
 0.10 ‡πÑ‡∏™‡πâ_‡πÅ‡∏ï‡∏Å
 0.10 ‡∏ï‡∏Å
 0.10 ‡∏Å‡∏±‡∏á‡∏ß‡∏•
 0.10 ‡∏ó‡∏∏‡∏ö
 0.10 ‡∏ï‡∏≤‡∏¢
 0.10 ‡∏´‡∏•‡∏∏‡∏î
 0.10 ‡∏î‡∏≠‡∏¢
 0.10 ‡∏õ‡∏¥‡∏î‡∏à‡∏≠
 0.10 ‡∏Ñ‡∏±‡∏ó
 0.10 ‡∏£‡πà‡∏ß‡∏á_‡∏•‡∏á
 0.10 ‡πÅ‡∏î‡∏á
 0.10 ‡∏£‡πà‡∏ß‡∏á
 0.10 ‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î
 0.10 ‡πÄ‡∏•‡∏∑‡∏≠‡∏î‡∏™‡∏≤‡∏î
 0.11 ‡πÄ‡∏•‡∏¥‡∏Å
 0.11 ‡∏ï‡∏ö
 0.13 ‡∏•‡∏ö
 0.13 ‡∏£‡∏≠‡∏î
 0.18 ‡∏ï‡πà‡∏≥_‡∏™‡∏∏‡∏î
 0.18 s
 0.19 put
 0.20 ‡∏≠‡πà‡∏≠‡∏ô
 0.20 ‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß
 0.22 ‡πÄ‡∏õ‡∏¥‡∏î_l
 0.22 ‡∏Ç‡∏≤_‡∏•‡∏á
 0.24 ‡∏Ç‡∏≤‡∏¢_‡∏´‡∏°‡∏π
 0.24 short
 0.25 l
 0.25 ‡∏•‡∏≤‡∏Å
 0.25 long
 0.30 ‡∏Ç‡∏≤‡∏î_‡∏ó‡∏∏‡∏ô
 0.31 call
 0.31 ‡∏õ‡∏±‡πà‡∏ô
 0.31 ‡πÅ‡∏Ç‡πá‡∏á
 0.37 ‡πÅ‡∏ï‡∏∞
 0.37 ‡∏ß‡∏¥‡πà‡∏á
 0.40 ‡∏™‡∏π‡∏á_‡∏™‡∏∏‡∏î
 0.41 ‡∏Ç‡∏≤_‡∏Ç‡∏∂‡πâ‡∏ô
 0.46 ‡πÅ‡∏Ç‡πá‡∏á‡πÅ‡∏Å‡∏£‡πà‡∏á
 0.48 ‡∏Å‡∏≥‡πÑ‡∏£
 0.51 ‡∏ó‡∏∞‡∏¢‡∏≤‡∏ô


In [483]:

def svr_sentiment(word):
    X = [wv[word]]
    Y = model.predict(X)
    return Y[0] / 0.5 * 2 - 1

svr_sentiment('‡πÑ‡∏•‡πà‡∏£‡∏≤‡∏Ñ‡∏≤')

KeyError: "word '‡πÑ‡∏•‡πà‡∏£‡∏≤‡∏Ñ‡∏≤' not in vocabulary"

In [484]:
svr_words = list(allwords)
svr_words.sort(key=lambda word: svr_sentiment(word))
for word in svr_words:
    print('{:5.2f} {}'.format(svr_sentiment(word), word))

-0.87 ‡∏´‡∏ô‡∏±‡∏Å
-0.60 ‡πÑ‡∏™‡πâ_‡πÅ‡∏ï‡∏Å
-0.60 ‡∏ï‡∏Å
-0.60 ‡∏Å‡∏±‡∏á‡∏ß‡∏•
-0.60 ‡∏ó‡∏∏‡∏ö
-0.60 ‡∏ï‡∏≤‡∏¢
-0.60 ‡∏´‡∏•‡∏∏‡∏î
-0.60 ‡∏î‡∏≠‡∏¢
-0.60 ‡∏õ‡∏¥‡∏î‡∏à‡∏≠
-0.60 ‡∏Ñ‡∏±‡∏ó
-0.60 ‡∏£‡πà‡∏ß‡∏á_‡∏•‡∏á
-0.60 ‡πÅ‡∏î‡∏á
-0.60 ‡∏£‡πà‡∏ß‡∏á
-0.60 ‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î
-0.60 ‡πÄ‡∏•‡∏∑‡∏≠‡∏î‡∏™‡∏≤‡∏î
-0.55 ‡πÄ‡∏•‡∏¥‡∏Å
-0.54 ‡∏ï‡∏ö
-0.50 ‡∏•‡∏ö
-0.46 ‡∏£‡∏≠‡∏î
-0.30 ‡∏ï‡πà‡∏≥_‡∏™‡∏∏‡∏î
-0.28 s
-0.22 put
-0.21 ‡∏≠‡πà‡∏≠‡∏ô
-0.18 ‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ß
-0.14 ‡πÄ‡∏õ‡∏¥‡∏î_l
-0.11 ‡∏Ç‡∏≤_‡∏•‡∏á
-0.05 ‡∏Ç‡∏≤‡∏¢_‡∏´‡∏°‡∏π
-0.03 short
-0.01 l
 0.00 ‡∏•‡∏≤‡∏Å
 0.00 long
 0.19 ‡∏Ç‡∏≤‡∏î_‡∏ó‡∏∏‡∏ô
 0.22 call
 0.23 ‡∏õ‡∏±‡πà‡∏ô
 0.25 ‡πÅ‡∏Ç‡πá‡∏á
 0.49 ‡πÅ‡∏ï‡∏∞
 0.50 ‡∏ß‡∏¥‡πà‡∏á
 0.60 ‡∏™‡∏π‡∏á_‡∏™‡∏∏‡∏î
 0.63 ‡∏Ç‡∏≤_‡∏Ç‡∏∂‡πâ‡∏ô
 0.83 ‡πÅ‡∏Ç‡πá‡∏á‡πÅ‡∏Å‡∏£‡πà‡∏á
 0.94 ‡∏Å‡∏≥‡πÑ‡∏£
 1.06 ‡∏ó‡∏∞‡∏¢‡∏≤‡∏ô
