# Cluster

In [65]:
import pandas as pd
import pymysql
import re
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
import numpy
import scipy.cluster.hierarchy as hcluster
import scipy.cluster.vq as vq
from collections import defaultdict, Counter
import itertools

## get data from db

In [66]:
conn = pymysql.connect(host='localhost',
                       user='memento',
                       password='memento@0x100_',
                       db='memento',
                       charset='utf8')
cur = conn.cursor()

In [67]:
columns = ['keyword', 'title', 'content', 'published_time', 'reply_count']

In [68]:
query_time_start = '2016/12/01'
query_time_end = '2016/12/31'

In [69]:
sql = "SELECT " + ",".join(columns) + " FROM articles where published_time between \'" +\
        query_time_start + "\' and \'" + query_time_end +"\'"

In [70]:
result = cur.execute(sql)

In [71]:
db_data = []
for c in cur:
    db_data.append(c)

In [72]:
frame = pd.DataFrame(db_data, columns=columns)

In [73]:
frame.max_len = max(frame.content.apply(lambda x : len(x))) / 5
frame['per'] = frame.content.apply(lambda x : int(round(frame.max_len / len(x))))

In [74]:
cur.close()
conn.close()

## tokenize and stemize

In [75]:
pat_small_quot = re.compile(u"\'(.+?)\'")
pat_double_quot = re.compile(u"\"(.+?)\"")
def find_quotation(text):
    mat_small = pat_small_quot.finditer(text)
    mat_double = pat_double_quot.finditer(text)
    return list(mat_small) + list(mat_double)

In [76]:
from konlpy.tag import Twitter
tagger = Twitter()

def tokenize(text):
    return tagger.morphs(text)

stop_words = []
def tokenize_stop(text, stop=stop_words):
    return [ token for token in tokenize(text) if token not in stop ]

pos_tags = ['Noun']
neg_tags = ['KoreanParticle', 'PreEomi', 'Punctuation', 'Eomi', 'Number', 'Foreign', 'URL']
def stemize_pos(text, tags=pos_tags):
    return [ word for word, tag in tagger.pos(text) if tag in tags]

def stemize_neg(text, tags=neg_tags):
    return [ word for word, tag in tagger.pos(text) if not tag in tags]

def tagging(text, neg_tags=[]):
    return [ "{}/{}".format(word, tag) for word, tag in tagger.pos(text) if not tag in neg_tags]

In [77]:
zip_tags = ['Noun', 'Alpha']
token_key = '**//*//**//**/*/**//*//**/**'
def stemize_tagging(text, zip_tags=zip_tags, neg_tags=neg_tags):
    match_str = []
    c = 0
    for match in find_quotation(text):
        text = text[:match.start() + c] + " " + token_key + " " + text[match.end() + c:]
        c += len(token_key) - len(match.group()) + 2
        match_str.append(match.group()[1:-1])
    ret = []
    for tokens in [ tagger.pos(word) for word in text.split() ]:
        if len(tokens) < 2:
            ret.append(tokens[0][0])
            continue
        zipper = []
        for word, pos in tokens:
            if pos in zip_tags: 
                zipper.append(word)
            else:
                if zipper: ret.append("".join(zipper))
                if pos not in neg_tags: ret.append(word)
                zipper[:] = []
        if zipper: ret.append("".join(zipper))
    return [ r == token_key and match_str.pop() or r for r in ret ]

In [78]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

In [79]:
rate_content = 1
rate_title = 20
rate_date = 20
rate_keyword = 25

In [80]:
class LabeledLineSentence(object):
    def __init__(self, frame):
        self.frame = frame
        self.s = []
        for idx, row in self.frame.iterrows():
            title, content, date, keyword = self.parser(row)
            self.s.append({'title':title,'content':content,'date':date,'keyword':keyword,'per':row.per})
    
    def time_range(self, date, pat, day=10):
        dates = []
        for d in range(day):
            delta = timedelta(days=d)
            dates.append((date + delta).strftime("%Y-%m-%d"))
            dates.append((date - delta).strftime("%Y-%m-%d"))
        return dates
    
    def parser(self, row):
        content = stemize_tagging(row.content)
        title = stemize_tagging(row.title)
        date = self.time_range(row.published_time.to_pydatetime(), "%Y-%m-%d")
        keyword = row.keyword
        return title, content, date, keyword
    
    def lines(self):
        ret = []
        for idx, s in enumerate(self.s):
            ret.append(LabeledSentence(s['content'] + s['title'] + s['date'] + [s['keyword']], ['line_%s' % idx]))
        return ret
    
    def __iter__(self):
        for idx, s in enumerate(self.s):
            l = len(s['content'])
            q = s['title']*int(l/int(100/rate_title))+\
                s['content']+\
                s['date']*int(l/int(100/rate_date))+\
                [s['keyword']]*int(l/int(100/rate_keyword))
            yield LabeledSentence(q, ['line_%s' % idx])

In [81]:
np.unique(frame.keyword.values)

array(['공지철', '김세정', '김태희', '박근혜', '정지훈', '최서원'], dtype=object)

In [82]:
keyword = np.unique(frame.keyword.values)[6]

IndexError: index 6 is out of bounds for axis 0 with size 6

In [None]:
key_frame = frame.loc[frame.keyword == keyword]

In [None]:
key_frame.shape

In [None]:
sentences = LabeledLineSentence(key_frame)

In [None]:
model = Doc2Vec(alpha=0.025, min_alpha=0.025)

In [None]:
model.build_vocab(sentences.lines())

In [None]:
for epoch in range(10):
    model.train(sentences)
    model.alpha *= 0.98
    model.min_alpha = model.alpha

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
from matplotlib import colors as mcolors
colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)

In [None]:
tsne = TSNE(n_components=2)
twoDimVecs = tsne.fit_transform(model.docvecs)

In [None]:
thresh = 12

In [None]:
clusters = hcluster.fclusterdata(model.docvecs, thresh, criterion="distance")

In [None]:
counter = Counter(clusters)

In [None]:
plt.bar(list(counter.keys()), list(counter.values()), align='center')
plt.show()

In [None]:
fig, ax = plt.subplots()
for c, twoDimVec in zip(clusters, twoDimVecs):
    ax.scatter(twoDimVec[0], twoDimVec[1], color=list(colors.values())[c % len(colors)])
plt.show()

In [None]:
xy = np.array(model.docvecs)[:,[0,1]]
#xy = model.docvecs

In [None]:
# plotting
plt.scatter(*numpy.transpose(xy), c=clusters)
plt.axis("equal")
title = "threshold: %f, number of clusters: %d" % (thresh, len(set(clusters)))
plt.title(title)
plt.show()

In [None]:
s = defaultdict(int)
for x in clusters:
    s[x] += 1

In [None]:
print (clusters)
print (len(np.unique(clusters)))

In [None]:
key_frame = key_frame.assign(cluster = clusters)

In [None]:
def get_keywords(titles, contents, size=10):
    title_tag = stemize_pos(" ".join(titles))
    content_tag = stemize_pos(" ".join(contents))
    dic = defaultdict(int)
    for tag in title_tag + content_tag:
        dic[tag] += 1
    tags = sorted([(k, v) for k, v in dic.items()], key=lambda x: -x[1])
    return tags[:size]

In [None]:
def get_memento_rate(frame):
    replys = frame.reply_count.sum()
    r, c = frame.shape
    return r * 100 + replys

In [None]:
key_frame.to_csv('../back/mytest.csv')

In [None]:
for cluster in np.unique(clusters):
    i_frame = key_frame.loc[key_frame.cluster == cluster]
    memento_rate = get_memento_rate(i_frame)
    event_title = i_frame.title.values[0]
    event_keywords = get_keywords(i_frame.title.values, i_frame.content.values)
    print(memento_rate)
    print (event_title)
    print (event_keywords)

In [None]:
for keyword in np.unique(frame.keyword.values):
    key_frame = frame.loc[frame.keyword == keyword]
    setences = LabeledLineSentence(key_frame)
    
    model = Doc2Vec(alpha=0.025, min_alpha=0.025)
    model.build_vocab(setences.lines())
    print(keyword, 'vocab builded')
    
    for epoch in range(11):
        if not epoch % 10: print (epoch, model.alpha)
        model.train(setences)
        model.alpha *= 0.99
        model.min_alpha = model.alpha
    break

In [None]:
int((date_end - date_start) / date_jump)

In [None]:
x=datetime(1989,  2, 2)

In [None]:
y=datetime(1989, 3,2)

In [None]:
(y-x) / timedelta(10)

In [None]:
setences = LabeledLineSentence(frame)

In [None]:
model = Doc2Vec(alpha=0.025, min_alpha=0.025)

In [None]:
model.build_vocab(setences.lines())

In [None]:
for epoch in range(11):
    if not epoch % 10: print (epoch, model.alpha)
    model.train(sentences)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

## write to txt

In [None]:
lines = []
lmax = 0
for idx, content in frame.iterrows():
    text = " ".join(tagging(content.content))
    title = " ".join(tagging(content.title))
    dates = []
    date_origin = datetime.strptime(content.date[:10].replace('-', '.'), "%Y.%m.%d")
    for idx in range(5):
        dates.append(date_origin + timedelta(days=idx))
        dates.append(date_origin + timedelta(days=-idx))
    date = " ".join([ date.strftime("%Y%m%d") for date in dates ])
    line = text * 1 + title * int(len(text)/5) + date * int(len(text)/3)
    lines.append(line)
    if lmax < len(line): lmax = len(line)

In [None]:
normalize = []
for line in lines:
    l = len(line)
    normalize.append(line * int(lmax / l))

In [None]:
with open('../data/test.txt', 'w') as f:
    for line in normalize:
        f.write(line + "\n")

In [None]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

In [None]:
class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        for uid, line in enumerate(open(self.filename)):
            yield LabeledSentence(words=line.split(), tags=['TXT_%s' % uid])

In [None]:
sentences = LabeledLineSentence('../data/test.txt')

In [None]:
model = Doc2Vec(alpha=0.025, min_alpha=0.001, workers=8)

In [None]:
model.build_vocab(sentences)

In [None]:
for epoch in range(201):
    try:
        model.train(sentences)
        model.alpha *= 0.99
        model.min_alpha = model.alpha
        if not epoch % 100: print ('epoch %d' % (epoch), model.alpha)
    except (KeyboardInterrupt, SystemExit):
        break

In [None]:
model.most_similar(['김태희/Noun'])

In [None]:
vecs = np.array(model.docvecs)

In [None]:
import matplotlib.pyplot as plt
import numpy
import scipy.cluster.hierarchy as hcluster
import scipy.cluster.vq as vq

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
thresh = 32

In [None]:
k = 8

In [None]:
ii, clusters = vq.kmeans2(vecs, k)

In [None]:
clusters = hcluster.fclusterdata(vecs, thresh, criterion="distance")

In [None]:
keywords = ["MC몽 군기피", "아이오아이 해체", "지드래곤 설리 열애", "김태희 비 결혼", "김태희 비 열애", "김태희 아이리스 출연", "박근혜 당선"]
keyindex = [0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,1,1,2,1,2,2,2,2,2,2,2,2,3,2,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,6,6,6,5,5,6,6,5,6,6,5,6,6,6]

In [None]:
l = [[] for _ in range(len(keywords))]
x = [[] for _ in range(len(keywords))]
for i, e in enumerate(clusters):
    l[keyindex[i]].append(int(e))
    x[keyindex[i]].append(i)

In [None]:
for label, idx in enumerate(zip(l,x)):
    print (keywords[label], idx)

In [None]:
print (clusters)
print (len(np.unique(clusters)))

In [None]:
def find_center(array):
    mean = np.mean(array)
    idx = int((np.abs(array - mean)).argmin() / 100)
    return idx, array[idx]

In [None]:
idx, ar = find_center(vecs[x[2]])

In [None]:
topic = frame.iloc[x[2][idx]]
topic.content

In [None]:
for y in x[2]:
    print (y)
    t = frame.iloc[y]
    print (t.content)
    print ()
    print ()

In [None]:
frame['cluster'] = pd.Series(int)

In [None]:
for i, v in enumerate(clusters):
    frame = frame.set_value(i, 'cluster', v)

In [None]:
c_each = [ frame.loc[frame['cluster'] == i].shape[0] for i in range(len(np.unique(clusters)))  ]

In [None]:
c = frame.loc[frame['cluster'] == 579]
c.head(2)

In [None]:
for idx, content in c.iterrows():
    print ()
    print (content.href_origin)
    print (content['title'])
    print (content['content'])

In [None]:
xy = np.array(vecs)[:,[0,1]]
#xy = model.docvecs

In [None]:
# plotting
plt.scatter(*numpy.transpose(xy), c=clusters)
plt.axis("equal")
title = "threshold: %f, number of clusters: %d" % (thresh, len(set(clusters)))
plt.title(title)
plt.show()

In [None]:
for v in range(1,len(model.docvecs[0])):
    n = np.array(model.docvecs)[:,[v-1, v]]
    plt.scatter(*numpy.transpose(n), c=clusters)
    plt.axis("equal")
    title = "threshold: %f, number of clusters: %d" % (thresh, len(set(clusters)))
    plt.title(title)
    plt.show()

In [None]:
# plotting
plt.scatter(*numpy.transpose(n), c=clusters)
plt.axis("equal")
title = "threshold: %f, number of clusters: %d" % (thresh, len(set(clusters)))
plt.title(title)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy
import scipy.cluster.hierarchy as hcluster

In [None]:
# generate 3 clusters of each around 100 points and an orphan vector
N=100
data = numpy.random.randn(3*N,2)
data[:N] += 5
data[-N:] += 10
data[-1:] -= 20

In [None]:
# clustering
thresh = 1.5
clusters = hcluster.fclusterdata(data, thresh, criterion="distance")

In [None]:
print (clusters)

In [None]:
clusters.shape

In [None]:
# plotting
plt.scatter(*numpy.transpose(data), c=clusters)
plt.axis("equal")
title = "threshold: %f, number of clusters: %d" % (thresh, len(set(clusters)))
plt.title(title)
plt.show()

In [None]:
from konlpy.tag import Twitter
tagger = Twitter()

def tokenize(text):
    return tagger.morphs(text)

stoptags = ['KoreanParticle', 'PreEomi', 'Punctuation', 'Eomi', 'Number', 'Foreign', 'URL']
def stemize(text):
    return [ word for word, tag in tagger.pos(text) if not tag in stoptags]

def tagging(text):
    return [ "{}/{}".format(word, tag) for word, tag in tagger.pos(text) if not tag in stoptags]

In [None]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

In [None]:
class LabeledLineSentence(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        for uid, line in enumerate(open(self.filename)):
            yield LabeledSentence(words=line.split(), tags=['TXT_%s' % uid])

In [None]:
sentences = LabeledLineSentence('test.txt')

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
model = Doc2Vec(alpha=0.025, min_alpha=0.025, size=2, workers=8, sample=1e-5)

model.build_vocab(sentences)

for epoch in range(11):
    try:
        if not epoch % 10: print ('epoch %d' % (epoch))
        model.train(sentences)
        model.alpha *= 0.99
        model.min_alpha = model.alpha
    except (KeyboardInterrupt, SystemExit):
        break

In [None]:
from sklearn.cluster import KMeans

num_clusters = 10

km = KMeans(n_clusters=num_clusters)

%time km.fit(model.docvecs)

clusters = km.labels_.tolist()

In [None]:
films = { 'title': titles, 'synopsis': texts, 'cluster': clusters }
frame = pd.DataFrame(films, index = [clusters] , columns = ['title', 'synopsis','cluster'])

In [None]:
frame.head(3)

In [None]:
frame['cluster'].value_counts()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(model.docvecs)
print (dist)

In [None]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

MDS()

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]

In [None]:
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e',
                  5: '#1b9e77', 6: '#d95f02', 7: '#7570b3', 8: '#e7298a', 9: '#66a61e'}

#set up cluster names using a dict
cluster_names = {0:'x',1:'x',2:'x',3:'x',4:'x',5:'x',6:'x',7:'x',8:'x',9:'x'}

In [None]:
#some ipython magic to show the matplotlib plots inline
%matplotlib inline 

#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) 

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)

In [None]:
string_int ='123'
string_float = '123.123'
string_string = '나는숫자가아닙니다.'
num_int = 123
num_float = 123.123

print (string_int, type(string_int))
print (string_float, type(string_float))
print (string_string, type(string_string))
print ()
print (num_int, type(num_int))
print (num_float, type(num_float))
print ()
print (int(string_int), type(int(string_int)))
print (int(string_float), type(int(string_float)))
print (int(string_string), type(string_string)))

In [None]:
plt.close()

In [None]:
#define custom toolbar location
import mpld3
class TopToolbar(mpld3.plugins.PluginBase):
    """Plugin for moving toolbar to top of figure"""

    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        self.dict_ = {"type": "toptoolbar"}

In [None]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles)) 

#group by cluster
groups = df.groupby('label')

#define custom css to format the font and to remove the axis labeling
css = """
text.mpld3-text, div.mpld3-tooltip {
  font-family:Arial, Helvetica, sans-serif;
}

g.mpld3-xaxis, g.mpld3-yaxis {
display: none; }

svg.mpld3-figure {
margin-left: -200px;}
"""

# Plot 
fig, ax = plt.subplots(figsize=(14,6)) #set plot size
ax.margins(0.03) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18, 
                     label=cluster_names[name], mec='none', 
                     color=cluster_colors[name])
    ax.set_aspect('auto')
    labels = [i for i in group.title]
    
    #set tooltip using points, labels and the already defined 'css'
    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                       voffset=10, hoffset=10, css=css)
    #connect tooltip to fig
    mpld3.plugins.connect(fig, tooltip, TopToolbar())    
    
    #set tick marks as blank
    ax.axes.get_xaxis().set_ticks([])
    ax.axes.get_yaxis().set_ticks([])
    
    #set axis as blank
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)

    
ax.legend(numpoints=1)

mpld3.display()

## TODO
- 수집된 기사를 형태소 단위로 나눔
- 기사의 형태소들로 부터 키워드를 추출
- 각 뉴스에서 많이 사용되는 키워드를 모음
- 뉴스의 키워드를 통해 뉴스를 군집화

---

- 군집화된 뉴스에서 요약 키워드 추출
- 군집화된 뉴스에서 대표 뉴스 선택
- 군집화된 뉴스에서 대중의 반응 추출
- 군집화된 뉴스에서 대중의 관심 파악
- 군집화된 뉴스에서 관련 인물 추출
- 군집화된 뉴스에서 메멘토 등급 측정

## visualize by pyplt

## TODO
- too sparse......