In [4]:
%matplotlib inline

# FOR TRANSLATED VERSOIN
import os
import sys
import nltk
import sklearn
import csv
import re
import collections
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import json
import numpy as np


# Downloads the NLTK stopword corpus if not already downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

# sklearn modules for data processing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# sklearn modules for LSA
from sklearn.decomposition import TruncatedSVD

# sklearn modules for classification
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# sklearn modules for clustering
from sklearn.cluster import KMeans

from textblob import TextBlob
from googletrans import Translator
from mtranslate import translate

# from translate import Translator

from emoji import UNICODE_EMOJI
import re
from many_stop_words import get_stop_words
# import nagisa

#################################### TOKENIZER CLASS #######################################
class TinySegmenter():

    def __init__(self):
        self._patterns = {
            u"[一二三四五六七八九十百千万億兆]":u"M",
            u"[一-龠々〆ヵヶ]":u"H",
            u"[ぁ-ん]":u"I",
            u"[ァ-ヴーｱ-ﾝﾞｰ]":u"K",
            u"[a-zA-Zａ-ｚＡ-Ｚ]":u"A",
            u"[0-9０-９]":u"N",
            u"(?:w+)":u"A",
            u"(?:😤)":u"N",
            r"[^0-9a-zA-Z\s]":r"N"
            #r'(?:\w)|(?:😊)| ?:[^0-9a-zA-Z\s] '
            }
        self._chartype = []
        for pat in self._patterns:
            regexp = re.compile(pat)
            self._chartype.append([regexp, self._patterns[pat]])

        self._BIAS = -332
        self._BC1 = {u"HH":6,u"II":2461,u"KH":406,u"OH":-1378}
        self._BC2 = {u"AA":-3267,u"AI":2744,u"AN":-878,u"HH":-4070,u"HM":-1711,u"HN":4012,u"HO":3761,u"IA":1327,u"IH":-1184,u"II":-1332,u"IK":1721,u"IO":5492,u"KI":3831,u"KK":-8741,u"MH":-3132,u"MK":3334,u"OO":-2920}
        self._BC3 = {u"HH":996,u"HI":626,u"HK":-721,u"HN":-1307,u"HO":-836,u"IH":-301,u"KK":2762,u"MK":1079,u"MM":4034,u"OA":-1652,u"OH":266}
        self._BP1 = {u"BB":295,u"OB":304,u"OO":-125,u"UB":352}
        self._BP2 = {u"BO":60,u"OO":-1762}
        self._BQ1 = {u"BHH":1150,u"BHM":1521,u"BII":-1158,u"BIM":886,u"BMH":1208,u"BNH":449,u"BOH":-91,u"BOO":-2597,u"OHI":451,u"OIH":-296,u"OKA":1851,u"OKH":-1020,u"OKK":904,u"OOO":2965}
        self._BQ2 = {u"BHH":118,u"BHI":-1159,u"BHM":466,u"BIH":-919,u"BKK":-1720,u"BKO":864,u"OHH":-1139,u"OHM":-181,u"OIH":153,u"UHI":-1146}
        self._BQ3 = {u"BHH":-792,u"BHI":2664,u"BII":-299,u"BKI":419,u"BMH":937,u"BMM":8335,u"BNN":998,u"BOH":775,u"OHH":2174,u"OHM":439,u"OII":280,u"OKH":1798,u"OKI":-793,u"OKO":-2242,u"OMH":-2402,u"OOO":11699}
        self._BQ4 = {u"BHH":-3895,u"BIH":3761,u"BII":-4654,u"BIK":1348,u"BKK":-1806,u"BMI":-3385,u"BOO":-12396,u"OAH":926,u"OHH":266,u"OHK":-2036,u"ONN":-973}
        self._BW1 = {u",と":660,u",同":727,u"B1あ":1404,u"B1同":542,u"、と":660,u"、同":727,u"」と":1682,u"あっ":1505,u"いう":1743,u"いっ":-2055,u"いる":672,u"うし":-4817,u"うん":665,u"から":3472,u"がら":600,u"こう":-790,u"こと":2083,u"こん":-1262,u"さら":-4143,u"さん":4573,u"した":2641,u"して":1104,u"すで":-3399,u"そこ":1977,u"それ":-871,u"たち":1122,u"ため":601,u"った":3463,u"つい":-802,u"てい":805,u"てき":1249,u"でき":1127,u"です":3445,u"では":844,u"とい":-4915,u"とみ":1922,u"どこ":3887,u"ない":5713,u"なっ":3015,u"など":7379,u"なん":-1113,u"にし":2468,u"には":1498,u"にも":1671,u"に対":-912,u"の一":-501,u"の中":741,u"ませ":2448,u"まで":1711,u"まま":2600,u"まる":-2155,u"やむ":-1947,u"よっ":-2565,u"れた":2369,u"れで":-913,u"をし":1860,u"を見":731,u"亡く":-1886,u"京都":2558,u"取り":-2784,u"大き":-2604,u"大阪":1497,u"平方":-2314,u"引き":-1336,u"日本":-195,u"本当":-2423,u"毎日":-2113,u"目指":-724,u"Ｂ１あ":1404,u"Ｂ１同":542,u"｣と":1682}
        self._BW2 = {u"..":-11822,u"11":-669,u"――":-5730,u"−−":-13175,u"いう":-1609,u"うか":2490,u"かし":-1350,u"かも":-602,u"から":-7194,u"かれ":4612,u"がい":853,u"がら":-3198,u"きた":1941,u"くな":-1597,u"こと":-8392,u"この":-4193,u"させ":4533,u"され":13168,u"さん":-3977,u"しい":-1819,u"しか":-545,u"した":5078,u"して":972,u"しな":939,u"その":-3744,u"たい":-1253,u"たた":-662,u"ただ":-3857,u"たち":-786,u"たと":1224,u"たは":-939,u"った":4589,u"って":1647,u"っと":-2094,u"てい":6144,u"てき":3640,u"てく":2551,u"ては":-3110,u"ても":-3065,u"でい":2666,u"でき":-1528,u"でし":-3828,u"です":-4761,u"でも":-4203,u"とい":1890,u"とこ":-1746,u"とと":-2279,u"との":720,u"とみ":5168,u"とも":-3941,u"ない":-2488,u"なが":-1313,u"など":-6509,u"なの":2614,u"なん":3099,u"にお":-1615,u"にし":2748,u"にな":2454,u"によ":-7236,u"に対":-14943,u"に従":-4688,u"に関":-11388,u"のか":2093,u"ので":-7059,u"のに":-6041,u"のの":-6125,u"はい":1073,u"はが":-1033,u"はず":-2532,u"ばれ":1813,u"まし":-1316,u"まで":-6621,u"まれ":5409,u"めて":-3153,u"もい":2230,u"もの":-10713,u"らか":-944,u"らし":-1611,u"らに":-1897,u"りし":651,u"りま":1620,u"れた":4270,u"れて":849,u"れば":4114,u"ろう":6067,u"われ":7901,u"を通":-11877,u"んだ":728,u"んな":-4115,u"一人":602,u"一方":-1375,u"一日":970,u"一部":-1051,u"上が":-4479,u"会社":-1116,u"出て":2163,u"分の":-7758,u"同党":970,u"同日":-913,u"大阪":-2471,u"委員":-1250,u"少な":-1050,u"年度":-8669,u"年間":-1626,u"府県":-2363,u"手権":-1982,u"新聞":-4066,u"日新":-722,u"日本":-7068,u"日米":3372,u"曜日":-601,u"朝鮮":-2355,u"本人":-2697,u"東京":-1543,u"然と":-1384,u"社会":-1276,u"立て":-990,u"第に":-1612,u"米国":-4268,u"１１":-669};
        self._BW3 = {u"あた":-2194,u"あり":719,u"ある":3846,u"い.":-1185,u"い。":-1185,u"いい":5308,u"いえ":2079,u"いく":3029,u"いた":2056,u"いっ":1883,u"いる":5600,u"いわ":1527,u"うち":1117,u"うと":4798,u"えと":1454,u"か.":2857,u"か。":2857,u"かけ":-743,u"かっ":-4098,u"かに":-669,u"から":6520,u"かり":-2670,u"が,u":1816,u"が、":1816,u"がき":-4855,u"がけ":-1127,u"がっ":-913,u"がら":-4977,u"がり":-2064,u"きた":1645,u"けど":1374,u"こと":7397,u"この":1542,u"ころ":-2757,u"さい":-714,u"さを":976,u"し,u":1557,u"し、":1557,u"しい":-3714,u"した":3562,u"して":1449,u"しな":2608,u"しま":1200,u"す.":-1310,u"す。":-1310,u"する":6521,u"ず,u":3426,u"ず、":3426,u"ずに":841,u"そう":428,u"た.":8875,u"た。":8875,u"たい":-594,u"たの":812,u"たり":-1183,u"たる":-853,u"だ.":4098,u"だ。":4098,u"だっ":1004,u"った":-4748,u"って":300,u"てい":6240,u"てお":855,u"ても":302,u"です":1437,u"でに":-1482,u"では":2295,u"とう":-1387,u"とし":2266,u"との":541,u"とも":-3543,u"どう":4664,u"ない":1796,u"なく":-903,u"など":2135,u"に,u":-1021,u"に、":-1021,u"にし":1771,u"にな":1906,u"には":2644,u"の,u":-724,u"の、":-724,u"の子":-1000,u"は,u":1337,u"は、":1337,u"べき":2181,u"まし":1113,u"ます":6943,u"まっ":-1549,u"まで":6154,u"まれ":-793,u"らし":1479,u"られ":6820,u"るる":3818,u"れ,u":854,u"れ、":854,u"れた":1850,u"れて":1375,u"れば":-3246,u"れる":1091,u"われ":-605,u"んだ":606,u"んで":798,u"カ月":990,u"会議":860,u"入り":1232,u"大会":2217,u"始め":1681,u"市":965,u"新聞":-5055,u"日,u":974,u"日、":974,u"社会":2024,u"ｶ月":990};
        self._TC1 = {u"AAA":1093,u"HHH":1029,u"HHM":580,u"HII":998,u"HOH":-390,u"HOM":-331,u"IHI":1169,u"IOH":-142,u"IOI":-1015,u"IOM":467,u"MMH":187,u"OOI":-1832};
        self._TC2 = {u"HHO":2088,u"HII":-1023,u"HMM":-1154,u"IHI":-1965,u"KKH":703,u"OII":-2649};
        self._TC3 = {u"AAA":-294,u"HHH":346,u"HHI":-341,u"HII":-1088,u"HIK":731,u"HOH":-1486,u"IHH":128,u"IHI":-3041,u"IHO":-1935,u"IIH":-825,u"IIM":-1035,u"IOI":-542,u"KHH":-1216,u"KKA":491,u"KKH":-1217,u"KOK":-1009,u"MHH":-2694,u"MHM":-457,u"MHO":123,u"MMH":-471,u"NNH":-1689,u"NNO":662,u"OHO":-3393};
        self._TC4 = {u"HHH":-203,u"HHI":1344,u"HHK":365,u"HHM":-122,u"HHN":182,u"HHO":669,u"HIH":804,u"HII":679,u"HOH":446,u"IHH":695,u"IHO":-2324,u"IIH":321,u"III":1497,u"IIO":656,u"IOO":54,u"KAK":4845,u"KKA":3386,u"KKK":3065,u"MHH":-405,u"MHI":201,u"MMH":-241,u"MMM":661,u"MOM":841};
        self._TQ1 = {u"BHHH":-227,u"BHHI":316,u"BHIH":-132,u"BIHH":60,u"BIII":1595,u"BNHH":-744,u"BOHH":225,u"BOOO":-908,u"OAKK":482,u"OHHH":281,u"OHIH":249,u"OIHI":200,u"OIIH":-68};
        self._TQ2 = {u"BIHH":-1401,u"BIII":-1033,u"BKAK":-543,u"BOOO":-5591};
        self._TQ3 = {u"BHHH":478,u"BHHM":-1073,u"BHIH":222,u"BHII":-504,u"BIIH":-116,u"BIII":-105,u"BMHI":-863,u"BMHM":-464,u"BOMH":620,u"OHHH":346,u"OHHI":1729,u"OHII":997,u"OHMH":481,u"OIHH":623,u"OIIH":1344,u"OKAK":2792,u"OKHH":587,u"OKKA":679,u"OOHH":110,u"OOII":-685};
        self._TQ4 = {u"BHHH":-721,u"BHHM":-3604,u"BHII":-966,u"BIIH":-607,u"BIII":-2181,u"OAAA":-2763,u"OAKK":180,u"OHHH":-294,u"OHHI":2446,u"OHHO":480,u"OHIH":-1573,u"OIHH":1935,u"OIHI":-493,u"OIIH":626,u"OIII":-4007,u"OKAK":-8156};
        self._TW1 = {u"につい":-4681,u"東京都":2026};
        self._TW2 = {u"ある程":-2049,u"いった":-1256,u"ころが":-2434,u"しょう":3873,u"その後":-4430,u"だって":-1049,u"ていた":1833,u"として":-4657,u"ともに":-4517,u"もので":1882,u"一気に":-792,u"初めて":-1512,u"同時に":-8097,u"大きな":-1255,u"対して":-2721,u"社会党":-3216};
        self._TW3 = {u"いただ":-1734,u"してい":1314,u"として":-4314,u"につい":-5483,u"にとっ":-5989,u"に当た":-6247,u"ので,u":-727,u"ので、":-727,u"のもの":-600,u"れから":-3752,u"十二月":-2287};
        self._TW4 = {u"いう.":8576,u"いう。":8576,u"からな":-2348,u"してい":2958,u"たが,u":1516,u"たが、":1516,u"ている":1538,u"という":1349,u"ました":5543,u"ません":1097,u"ようと":-4258,u"よると":5865};
        self._UC1 = {u"A":484,u"K":93,u"M":645,u"O":-505};
        self._UC2 = {u"A":819,u"H":1059,u"I":409,u"M":3987,u"N":5775,u"O":646};
        self._UC3 = {u"A":-1370,u"I":2311};
        self._UC4 = {u"A":-2643,u"H":1809,u"I":-1032,u"K":-3450,u"M":3565,u"N":3876,u"O":6646};
        self._UC5 = {u"H":313,u"I":-1238,u"K":-799,u"M":539,u"O":-831};
        self._UC6 = {u"H":-506,u"I":-253,u"K":87,u"M":247,u"O":-387};
        self._UP1 = {u"O":-214};
        self._UP2 = {u"B":69,u"O":935};
        self._UP3 = {u"B":189};
        self._UQ1 = {u"BH":21,u"BI":-12,u"BK":-99,u"BN":142,u"BO":-56,u"OH":-95,u"OI":477,u"OK":410,u"OO":-2422};
        self._UQ2 = {u"BH":216,u"BI":113,u"OK":1759};
        self._UQ3 = {u"BA":-479,u"BH":42,u"BI":1913,u"BK":-7198,u"BM":3160,u"BN":6427,u"BO":14761,u"OI":-827,u"ON":-3212};
        self._UW1 = {u",u":156,u"、":156,u"「":-463,u"あ":-941,u"う":-127,u"が":-553,u"き":121,u"こ":505,u"で":-201,u"と":-547,u"ど":-123,u"に":-789,u"の":-185,u"は":-847,u"も":-466,u"や":-470,u"よ":182,u"ら":-292,u"り":208,u"れ":169,u"を":-446,u"ん":-137,u"・":-135,u"主":-402,u"京":-268,u"区":-912,u"午":871,u"国":-460,u"大":561,u"委":729,u"市":-411,u"日":-141,u"理":361,u"生":-408,u"県":-386,u"都":-718,u"｢":-463,u"･":-135};
        self._UW2 = {u",u":-829,u"、":-829,u"〇":892,u"「":-645,u"」":3145,u"あ":-538,u"い":505,u"う":134,u"お":-502,u"か":1454,u"が":-856,u"く":-412,u"こ":1141,u"さ":878,u"ざ":540,u"し":1529,u"す":-675,u"せ":300,u"そ":-1011,u"た":188,u"だ":1837,u"つ":-949,u"て":-291,u"で":-268,u"と":-981,u"ど":1273,u"な":1063,u"に":-1764,u"の":130,u"は":-409,u"ひ":-1273,u"べ":1261,u"ま":600,u"も":-1263,u"や":-402,u"よ":1639,u"り":-579,u"る":-694,u"れ":571,u"を":-2516,u"ん":2095,u"ア":-587,u"カ":306,u"キ":568,u"ッ":831,u"三":-758,u"不":-2150,u"世":-302,u"中":-968,u"主":-861,u"事":492,u"人":-123,u"会":978,u"保":362,u"入":548,u"初":-3025,u"副":-1566,u"北":-3414,u"区":-422,u"大":-1769,u"天":-865,u"太":-483,u"子":-1519,u"学":760,u"実":1023,u"小":-2009,u"市":-813,u"年":-1060,u"強":1067,u"手":-1519,u"揺":-1033,u"政":1522,u"文":-1355,u"新":-1682,u"日":-1815,u"明":-1462,u"最":-630,u"朝":-1843,u"本":-1650,u"東":-931,u"果":-665,u"次":-2378,u"民":-180,u"気":-1740,u"理":752,u"発":529,u"目":-1584,u"相":-242,u"県":-1165,u"立":-763,u"第":810,u"米":509,u"自":-1353,u"行":838,u"西":-744,u"見":-3874,u"調":1010,u"議":1198,u"込":3041,u"開":1758,u"間":-1257,u"｢":-645,u"｣":3145,u"ｯ":831,u"ｱ":-587,u"ｶ":306,u"ｷ":568};
        self._UW3 = {u",u":4889,u"1":-800,u"−":-1723,u"、":4889,u"々":-2311,u"〇":5827,u"」":2670,u"〓":-3573,u"あ":-2696,u"い":1006,u"う":2342,u"え":1983,u"お":-4864,u"か":-1163,u"が":3271,u"く":1004,u"け":388,u"げ":401,u"こ":-3552,u"ご":-3116,u"さ":-1058,u"し":-395,u"す":584,u"せ":3685,u"そ":-5228,u"た":842,u"ち":-521,u"っ":-1444,u"つ":-1081,u"て":6167,u"で":2318,u"と":1691,u"ど":-899,u"な":-2788,u"に":2745,u"の":4056,u"は":4555,u"ひ":-2171,u"ふ":-1798,u"へ":1199,u"ほ":-5516,u"ま":-4384,u"み":-120,u"め":1205,u"も":2323,u"や":-788,u"よ":-202,u"ら":727,u"り":649,u"る":5905,u"れ":2773,u"わ":-1207,u"を":6620,u"ん":-518,u"ア":551,u"グ":1319,u"ス":874,u"ッ":-1350,u"ト":521,u"ム":1109,u"ル":1591,u"ロ":2201,u"ン":278,u"・":-3794,u"一":-1619,u"下":-1759,u"世":-2087,u"両":3815,u"中":653,u"主":-758,u"予":-1193,u"二":974,u"人":2742,u"今":792,u"他":1889,u"以":-1368,u"低":811,u"何":4265,u"作":-361,u"保":-2439,u"元":4858,u"党":3593,u"全":1574,u"公":-3030,u"六":755,u"共":-1880,u"円":5807,u"再":3095,u"分":457,u"初":2475,u"別":1129,u"前":2286,u"副":4437,u"力":365,u"動":-949,u"務":-1872,u"化":1327,u"北":-1038,u"区":4646,u"千":-2309,u"午":-783,u"協":-1006,u"口":483,u"右":1233,u"各":3588,u"合":-241,u"同":3906,u"和":-837,u"員":4513,u"国":642,u"型":1389,u"場":1219,u"外":-241,u"妻":2016,u"学":-1356,u"安":-423,u"実":-1008,u"家":1078,u"小":-513,u"少":-3102,u"州":1155,u"市":3197,u"平":-1804,u"年":2416,u"広":-1030,u"府":1605,u"度":1452,u"建":-2352,u"当":-3885,u"得":1905,u"思":-1291,u"性":1822,u"戸":-488,u"指":-3973,u"政":-2013,u"教":-1479,u"数":3222,u"文":-1489,u"新":1764,u"日":2099,u"旧":5792,u"昨":-661,u"時":-1248,u"曜":-951,u"最":-937,u"月":4125,u"期":360,u"李":3094,u"村":364,u"東":-805,u"核":5156,u"森":2438,u"業":484,u"氏":2613,u"民":-1694,u"決":-1073,u"法":1868,u"海":-495,u"無":979,u"物":461,u"特":-3850,u"生":-273,u"用":914,u"町":1215,u"的":7313,u"直":-1835,u"省":792,u"県":6293,u"知":-1528,u"私":4231,u"税":401,u"立":-960,u"第":1201,u"米":7767,u"系":3066,u"約":3663,u"級":1384,u"統":-4229,u"総":1163,u"線":1255,u"者":6457,u"能":725,u"自":-2869,u"英":785,u"見":1044,u"調":-562,u"財":-733,u"費":1777,u"車":1835,u"軍":1375,u"込":-1504,u"通":-1136,u"選":-681,u"郎":1026,u"郡":4404,u"部":1200,u"金":2163,u"長":421,u"開":-1432,u"間":1302,u"関":-1282,u"雨":2009,u"電":-1045,u"非":2066,u"駅":1620,u"１":-800,u"｣":2670,u"･":-3794,u"ｯ":-1350,u"ｱ":551,u"ｸﾞ":1319,u"ｽ":874,u"ﾄ":521,u"ﾑ":1109,u"ﾙ":1591,u"ﾛ":2201,u"ﾝ":278};
        self._UW4 = {u",u":3930,u".":3508,u"―":-4841,u"、":3930,u"。":3508,u"〇":4999,u"「":1895,u"」":3798,u"〓":-5156,u"あ":4752,u"い":-3435,u"う":-640,u"え":-2514,u"お":2405,u"か":530,u"が":6006,u"き":-4482,u"ぎ":-3821,u"く":-3788,u"け":-4376,u"げ":-4734,u"こ":2255,u"ご":1979,u"さ":2864,u"し":-843,u"じ":-2506,u"す":-731,u"ず":1251,u"せ":181,u"そ":4091,u"た":5034,u"だ":5408,u"ち":-3654,u"っ":-5882,u"つ":-1659,u"て":3994,u"で":7410,u"と":4547,u"な":5433,u"に":6499,u"ぬ":1853,u"ね":1413,u"の":7396,u"は":8578,u"ば":1940,u"ひ":4249,u"び":-4134,u"ふ":1345,u"へ":6665,u"べ":-744,u"ほ":1464,u"ま":1051,u"み":-2082,u"む":-882,u"め":-5046,u"も":4169,u"ゃ":-2666,u"や":2795,u"ょ":-1544,u"よ":3351,u"ら":-2922,u"り":-9726,u"る":-14896,u"れ":-2613,u"ろ":-4570,u"わ":-1783,u"を":13150,u"ん":-2352,u"カ":2145,u"コ":1789,u"セ":1287,u"ッ":-724,u"ト":-403,u"メ":-1635,u"ラ":-881,u"リ":-541,u"ル":-856,u"ン":-3637,u"・":-4371,u"ー":-11870,u"一":-2069,u"中":2210,u"予":782,u"事":-190,u"井":-1768,u"人":1036,u"以":544,u"会":950,u"体":-1286,u"作":530,u"側":4292,u"先":601,u"党":-2006,u"共":-1212,u"内":584,u"円":788,u"初":1347,u"前":1623,u"副":3879,u"力":-302,u"動":-740,u"務":-2715,u"化":776,u"区":4517,u"協":1013,u"参":1555,u"合":-1834,u"和":-681,u"員":-910,u"器":-851,u"回":1500,u"国":-619,u"園":-1200,u"地":866,u"場":-1410,u"塁":-2094,u"士":-1413,u"多":1067,u"大":571,u"子":-4802,u"学":-1397,u"定":-1057,u"寺":-809,u"小":1910,u"屋":-1328,u"山":-1500,u"島":-2056,u"川":-2667,u"市":2771,u"年":374,u"庁":-4556,u"後":456,u"性":553,u"感":916,u"所":-1566,u"支":856,u"改":787,u"政":2182,u"教":704,u"文":522,u"方":-856,u"日":1798,u"時":1829,u"最":845,u"月":-9066,u"木":-485,u"来":-442,u"校":-360,u"業":-1043,u"氏":5388,u"民":-2716,u"気":-910,u"沢":-939,u"済":-543,u"物":-735,u"率":672,u"球":-1267,u"生":-1286,u"産":-1101,u"田":-2900,u"町":1826,u"的":2586,u"目":922,u"省":-3485,u"県":2997,u"空":-867,u"立":-2112,u"第":788,u"米":2937,u"系":786,u"約":2171,u"経":1146,u"統":-1169,u"総":940,u"線":-994,u"署":749,u"者":2145,u"能":-730,u"般":-852,u"行":-792,u"規":792,u"警":-1184,u"議":-244,u"谷":-1000,u"賞":730,u"車":-1481,u"軍":1158,u"輪":-1433,u"込":-3370,u"近":929,u"道":-1291,u"選":2596,u"郎":-4866,u"都":1192,u"野":-1100,u"銀":-2213,u"長":357,u"間":-2344,u"院":-2297,u"際":-2604,u"電":-878,u"領":-1659,u"題":-792,u"館":-1984,u"首":1749,u"高":2120,u"｢":1895,u"｣":3798,u"･":-4371,u"ｯ":-724,u"ｰ":-11870,u"ｶ":2145,u"ｺ":1789,u"ｾ":1287,u"ﾄ":-403,u"ﾒ":-1635,u"ﾗ":-881,u"ﾘ":-541,u"ﾙ":-856,u"ﾝ":-3637};
        self._UW5 = {u",u":465,u".":-299,u"1":-514,u"E2":-32768,u"]":-2762,u"、":465,u"。":-299,u"「":363,u"あ":1655,u"い":331,u"う":-503,u"え":1199,u"お":527,u"か":647,u"が":-421,u"き":1624,u"ぎ":1971,u"く":312,u"げ":-983,u"さ":-1537,u"し":-1371,u"す":-852,u"だ":-1186,u"ち":1093,u"っ":52,u"つ":921,u"て":-18,u"で":-850,u"と":-127,u"ど":1682,u"な":-787,u"に":-1224,u"の":-635,u"は":-578,u"べ":1001,u"み":502,u"め":865,u"ゃ":3350,u"ょ":854,u"り":-208,u"る":429,u"れ":504,u"わ":419,u"を":-1264,u"ん":327,u"イ":241,u"ル":451,u"ン":-343,u"中":-871,u"京":722,u"会":-1153,u"党":-654,u"務":3519,u"区":-901,u"告":848,u"員":2104,u"大":-1296,u"学":-548,u"定":1785,u"嵐":-1304,u"市":-2991,u"席":921,u"年":1763,u"思":872,u"所":-814,u"挙":1618,u"新":-1682,u"日":218,u"月":-4353,u"査":932,u"格":1356,u"機":-1508,u"氏":-1347,u"田":240,u"町":-3912,u"的":-3149,u"相":1319,u"省":-1052,u"県":-4003,u"研":-997,u"社":-278,u"空":-813,u"統":1955,u"者":-2233,u"表":663,u"語":-1073,u"議":1219,u"選":-1018,u"郎":-368,u"長":786,u"間":1191,u"題":2368,u"館":-689,u"１":-514,u"Ｅ２":-32768,u"｢":363,u"ｲ":241,u"ﾙ":451,u"ﾝ":-343};
        self._UW6 = {u",u":227,u".":808,u"1":-270,u"E1":306,u"、":227,u"。":808,u"あ":-307,u"う":189,u"か":241,u"が":-73,u"く":-121,u"こ":-200,u"じ":1782,u"す":383,u"た":-428,u"っ":573,u"て":-1014,u"で":101,u"と":-105,u"な":-253,u"に":-149,u"の":-417,u"は":-236,u"も":-206,u"り":187,u"る":-135,u"を":195,u"ル":-673,u"ン":-496,u"一":-277,u"中":201,u"件":-800,u"会":624,u"前":302,u"区":1792,u"員":-1212,u"委":798,u"学":-960,u"市":887,u"広":-695,u"後":535,u"業":-697,u"相":753,u"社":-507,u"福":974,u"空":-822,u"者":1811,u"連":463,u"郎":1082,u"１":-270,u"Ｅ１":306,u"ﾙ":-673,u"ﾝ":-496};

    def _ts(self, dict, key):
        if not key in dict:
            return 0
        else:
            return dict[key]

    def _ctype(self, str):
        for elem in self._chartype:
            match = re.match(elem[0], str)
            if match:
                return elem[1]
        return u"O"

    def tokenize(self, text):
        if text == "":
            return []

        result = []
        seg = [u"B3",u"B2",u"B1"]
        ctype = [u"O",u"O",u"O"]
        o = list(text)
        for i in range(0, len(o)):
            seg.append(o[i])
            ctype.append(self._ctype(o[i]))

        seg.append(u"E1")
        seg.append(u"E2")
        seg.append(u"E3")
        ctype.append(u"O")
        ctype.append(u"O")
        ctype.append(u"O")

        word = seg[3]
        p1 = u"U"
        p2 = u"U"
        p3 = u"U"
        for i in range(4, len(seg) - 3):

            score = self._BIAS
            w1 = seg[i-3]
            w2 = seg[i-2]
            w3 = seg[i-1]
            w4 = seg[i]
            w5 = seg[i+1]
            w6 = seg[i+2]
            c1 = ctype[i-3]
            c2 = ctype[i-2]
            c3 = ctype[i-1]
            c4 = ctype[i]
            c5 = ctype[i+1]
            c6 = ctype[i+2]

            score += self._ts(self._UP1, p1)
            score += self._ts(self._UP2, p2)
            score += self._ts(self._UP3, p3)
            score += self._ts(self._BP1, p1 + p2)
            score += self._ts(self._BP2, p2 + p3)
            score += self._ts(self._UW1, w1)
            score += self._ts(self._UW2, w2)
            score += self._ts(self._UW3, w3)
            score += self._ts(self._UW4, w4)
            score += self._ts(self._UW5, w5)
            score += self._ts(self._UW6, w6)
            score += self._ts(self._BW1, w2 + w3)
            score += self._ts(self._BW2, w3 + w4)
            score += self._ts(self._BW3, w4 + w5)
            score += self._ts(self._TW1, w1 + w2 + w3)
            score += self._ts(self._TW2, w2 + w3 + w4)
            score += self._ts(self._TW3, w3 + w4 + w5)
            score += self._ts(self._TW4, w4 + w5 + w6)
            score += self._ts(self._UC1, c1)
            score += self._ts(self._UC2, c2)
            score += self._ts(self._UC3, c3)
            score += self._ts(self._UC4, c4)
            score += self._ts(self._UC5, c5)
            score += self._ts(self._UC6, c6)
            score += self._ts(self._BC1, c2 + c3)
            score += self._ts(self._BC2, c3 + c4)
            score += self._ts(self._BC3, c4 + c5)
            score += self._ts(self._TC1, c1 + c2 + c3)
            score += self._ts(self._TC2, c2 + c3 + c4)
            score += self._ts(self._TC3, c3 + c4 + c5)
            score += self._ts(self._TC4, c4 + c5 + c6)
            #  score += self._ts(self._TC5, c4 + c5 + c6)
            score += self._ts(self._UQ1, p1 + c1)
            score += self._ts(self._UQ2, p2 + c2)
            score += self._ts(self._UQ1, p3 + c3)
            score += self._ts(self._BQ1, p2 + c2 + c3)
            score += self._ts(self._BQ2, p2 + c3 + c4)
            score += self._ts(self._BQ3, p3 + c2 + c3)
            score += self._ts(self._BQ4, p3 + c3 + c4)
            score += self._ts(self._TQ1, p2 + c1 + c2 + c3)
            score += self._ts(self._TQ2, p2 + c2 + c3 + c4)
            score += self._ts(self._TQ3, p3 + c1 + c2 + c3)
            score += self._ts(self._TQ4, p3 + c2 + c3 + c4)
            p = u"O"
            if score > 0:
                result.append(word)
                word = ""
                p = u"B"

            p1 = p2
            p2 = p3
            p3 = p
            word += seg[i]

        result.append(word)

        return result
#################################### TOKENIZER CLASS #######################################


###### Here is tokenizing and filter by stop word.
def process_document(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r'[^\u3040-\u309f\u30a0-\u30ff\uff00-\uff9f\u4e00-\u9faf\u3400-\u4dbf😤]','',text)
    text = re.sub(r'\[\[(?:[^\]|]*\|)?([^\]|]*)\]\]', r'\1', text)
    
    ##### METHOD 1: TRANSLATE FROM BEGINNING #####
    text = text.replace("😤", "sleepymoji")


     ##### METHOD 2: USE JAPANESE TOKENIZER CLASS #####
    segmenter = TinySegmenter()
    text = segmenter.tokenize(text)
    #tokenizer = RegexpTokenizer(r'(?:\w)|(?:😊)| ?:[^0-9a-zA-Z\s] ')
    
    #tokenized = tokenizer.tokenize(text)
    new_stop_words = []
    stop_words = get_stop_words('ja')
    
    for x in stop_words:
        new_stop_words.append(x)
    new_stop_words.append("て")
    new_stop_words.append("なっ")
    new_stop_words.append("き")
    new_stop_words.append("な")
    new_stop_words.append("ので")
    new_stop_words.append("よ")
    new_stop_words.append("ね")
    new_stop_words.append("ず")
    new_stop_words.append("か")
    new_stop_words.append("い")
    new_stop_words.append("して")
    new_stop_words.append("だ")
    new_stop_words.append("み")
    new_stop_words.append("そう")
    new_stop_words.append("いる")
    new_stop_words.append("ます")
    new_stop_words.append("なり")
    new_stop_words.append("あっ")
    new_stop_words.append("でき")
    new_stop_words.append("ちゃん")
    new_stop_words.append("さん")
    new_stop_words.append("くん")
    new_stop_words.append("ある")
    new_stop_words.append("へ")
    new_stop_words.append("たく")
    new_stop_words.append("そし")
    new_stop_words.append("なる")
    new_stop_words.append("する")
    new_stop_words.append("たい")
    new_stop_words.append("的")
    new_stop_words.append("けど")#start from crying
    new_stop_words.append("られ")
    new_stop_words.append("たし")
    new_stop_words.append("たけど")
    new_stop_words.append("ん") # start from angry
    new_stop_words.append("ご")
    new_stop_words.append("やか")
    new_stop_words.append("ゆる")
    new_stop_words.append("す") #start from sleepy
    new_stop_words.append("まい")
    new_stop_words.append("こん")
    new_stop_words.append("ばん")
    
    filtered = []

    bad_punct = ["（", "！", "？", "）", "、", "～", "；", "｀", "）・", "・", "＊",
                "ゞ", "：", "＼", "／", "一", "＆"]
    for word in text:
        if word not in new_stop_words:
            if word not in bad_punct:
                filtered.append(word)
    return filtered

###### Here is tokenizing and filter by stop word. here is reading data into a corpus.
def read_data(data_dir):
    corpus = []
    with open(data_dir, errors='ignore', encoding='utf-8') as words_file:
        csv_reader = csv.reader(words_file, delimiter = ',')
        data = []
        for row in csv_reader:
            data.append(row[0])
    
    return data


###### Making the vocabulary.
def construct_vocab(corpus):
    """
        Input: A list of list of string. Each string represents a word token.
        Output: A tuple of dicts: (vocab, inverse_vocab)
                vocab : A dict mapping str -> int. This will be your vocabulary.
                inverse_vocab: Inverse mapping int -> str
    """
    vocab = {}
    inverse_vocab = {}
    id_count = 0

    for sentence in corpus:
        for word in sentence:
            if word not in vocab:
                vocab[word] = id_count
                inverse_vocab[id_count] = word
                id_count += 1
    return (vocab, inverse_vocab)

###### Count num words
def word_counts(corpus):
    """ Given a corpus (such as returned by load_corpus), return a dictionary
        of word frequencies. Maps string token to integer count.
    """
    return collections.Counter(w for s in corpus for w in s)

###### Truncate the vocabulary
def trunc_vocab(corpus, counts):
    """ Limit the vocabulary to the 10k most-frequent words. Remove rare words from
         the original corpus.
        Input: A list of list of string. Each string represents a word token.
        Output: A tuple (new_corpus, new_counts)
                new_corpus: A corpus (list of list of string) with only the 10k most-frequent words
                new_counts: Counts of the 10k most-frequent words

        Hint: Sort the keys of counts by their values
    """
    new_counts = {}
    new_corpus = []
    new_counts = collections.Counter(counts).most_common(200)
    top_words = set(dict(new_counts).keys())

    for sentence in corpus:
        sent = []
        for word in sentence:
            if word in top_words and word != "https" and word != "amp" and word != "co":
                sent.append(word)
        new_corpus.append(sent)

    new_counts = dict(new_counts)
    
    
    return new_corpus, new_counts

###### Constructing word vectors
def word_vectors(corpus, vocab):
    """
        Input: A corpus (list of list of string) and a vocab (word-to-id mapping)
        Output: A lookup table that maps [word id] -> [word vector]
    """

    # each word vector is [count of word id 1, count of word id 2 ... , count of word id n]
    table = {}

    # construct a table where every word ID maps to a list of 0's
    for word in vocab:
        word_id = vocab[word]
        table[word_id] = len(vocab) * [0]

    for sentence in corpus:
        length = len(sentence)
        for i in range(length):
            curr_word = sentence[i]
            for word in range( i - 4, i):
                if word >= 0 and word != i:
                    table[vocab[curr_word]][vocab[sentence[word]]] += 1
            for word in range(i + 1, i + 4 + 1):
                if word < length and word != i:
                    table[vocab[curr_word]][vocab[sentence[word]]] += 1
    return table

# iris's function
def get_emoji_list(data):
    empty = []
    for i in range(len(data)):
        tweet = re.sub(r'^https?:\/\/.*[\r\n]*', '', data[i], flags=re.MULTILINE)
        a = process_document(tweet)
        a = ' '.join(a)
        data[i] = a
        if a == '':
            empty.append(i)
    data  = list(np.delete(list(data), empty, 0))

###### Return most similar words
def most_similar(lookup_table, wordvec, vocab):
    """ Helper function (optional).

        Given a lookup table and word vector, find the top most-similar word ids to the given
        word vector. You can limit this to the first NUM_CLOSEST results.
    """

    most_similar = {}
    for word in lookup_table:
        vector = lookup_table[word]
        pair = []
        pair.append(vector)
        pair.append(wordvec)
        distance = pdist(pair, 'cosine')
        most_similar[word] = distance

    sorted_most_similar = sorted(most_similar.items(), key=lambda x: x[1])
    sorted_most_similar = sorted_most_similar[1:100 + 1]
    most_similar_word = []
    for word in sorted_most_similar:
        most_similar_word.append(word[0])

    return most_similar_word

def get_wordvec_dictionary(lookup_table, wordvec, inverse_vocab):
    """ Helper function (optional).

        Given a lookup table and word vector, 
        returns a dictionary of the words and their distance from the given word
    """

    most_similar = {}
    for word in lookup_table:
        vector = lookup_table[word]
        pair = []
        pair.append(vector)
        pair.append(wordvec)
        distance = pdist(pair, 'cosine')
        most_similar[word] = distance

    sorted_most_similar = sorted(most_similar.items(), key=lambda x: x[1])
    sorted_most_similar = sorted_most_similar[1:100]
    
    word_distance = {}
    for word in sorted_most_similar:
        num = word[1][0]
        word_distance[inverse_vocab[word[0]]] = num
    return word_distance

###### TSNE plot
def plot_with_labels(low_dim_embs, labels):
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(
            label,
            xy=(x, y),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom')
        
        
def main(data_dir):

    corpus = read_data(data_dir) # returns the data, one tweet per list
    # call  translate here if necessary
    new_corpus = []
    for tweet in corpus:
        new_corpus.append(process_document(tweet)) # returns a list of tweets processed
    
    corpus_2d = [] # list of lists
    for tweet in new_corpus:
        corpus_2d.append(tweet)
    
    counts = word_counts(corpus_2d)
    new_corpus, new_counts = trunc_vocab(corpus_2d, counts)
               
    vocab, inverse_vocab = construct_vocab(new_corpus)

    lookup_table= word_vectors(new_corpus, vocab)
    
    vectors = []
    for wid in lookup_table:
        vectors.append(lookup_table[wid])

    D = pdist(vectors, 'cosine')
    D = squareform(D)
    
    dictionary_data = get_wordvec_dictionary(lookup_table, 
    lookup_table[vocab['sleepymoji']], inverse_vocab)

    print(dictionary_data)

#     tsne = TSNE(
#       perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
#     plot_only = 12
#     low_dim_embs = tsne.fit_transform(D[:plot_only, :])
#     labels = [inverse_vocab[i] for i in range(plot_only)]
#     plot_with_labels(low_dim_embs, labels)


# This may take a little bit of time (~30-60 seconds) to run.
if __name__ == '__main__':
    data_dir = 'data/angry_face_ja.csv'
    main(data_dir)

{'た': 0.2527580849980917, 'だっ': 0.2995193400908692, 'ない': 0.31077909743139054, 'くれ': 0.3309248056496411, '今日': 0.36229829080827236, 'まし': 0.3707184636216043, 'う': 0.3904742334449455, 'じゃ': 0.39358085620327254, 'でい': 0.39661672989656593, 'ところ': 0.39895317559760735, 'あり': 0.4052464089862048, 'れ': 0.4083202761736887, 'よねー': 0.4160153097808289, '最後': 0.4201919125312181, '見': 0.4208375328508489, 'なく': 0.4232426621498516, 'sleepymojisleepymojisleepymojisleepymojisleepymoji': 0.433216163537686, '目': 0.43408881045471925, 'とう': 0.43671059102102205, 'でし': 0.43992109105234567, 'こ': 0.4429996483596318, 'よね': 0.4453525264176009, 'お': 0.4464753490194492, 'という': 0.44946616149302876, '顔': 0.44958330881609276, 'でも': 0.4516208022983046, '誰': 0.4538821561790992, '気': 0.45515451303868326, 'さ': 0.4556496589172012, '漫画': 0.46401378924943637, '絶対': 0.4668973532154892, '昨日': 0.4679467448719582, '笑': 0.4724550445235385, 'なかっ': 0.47762044041974083, 'せ': 0.4808734557500227, '手': 0.481603633126786, '応援': 0.482073