In [1]:
import sys
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import re
import nltk
from nltk.stem import PorterStemmer

import warnings
warnings.filterwarnings('ignore')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['figure.titlesize'] = 16
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['legend.fontsize'] = 10

In [2]:
ps = PorterStemmer()
np.set_printoptions(threshold=sys.maxsize)

#stopwords from txt
my_file = open("stopwords.txt", "r")
stopwords = my_file.read()
stopwords = [stopwords.translate({ ord(c): None for c in '" ' }).split(",") ][0]#delete punctuation
#stopwords

In [3]:
length_all_doc = 1095 #實際1095

In [4]:
def tokenize(text): #順序很有差！以下都用空白取代
    text = re.sub(r'\|\|\|', r' ', text) #照數據格式切成一段一段
    text = re.sub(r'http\S+', r' ', text) #刪除網址
    text = re.sub('\S+com$', ' ', text) #刪除網站
    text = re.sub('\d+',' ', text) #刪除所有數字
    text = re.sub(r'[^\w\s]', ' ', text) #刪除所有標點符號 避免如ideas/thought -> ideasthought
    #print(text)
    #text = [i.translate({ ord(c): None for c in '"1234567890' }) for i in text] #delete punctuation
    #text = [i.translate({ ord(c): ' ' for c in "#$%&'()*+,-./:;!<=>?@[\]^_`{|~} " }) for i in text]
    text = re.sub("[^a-zA-Z]", " ", text) #只保留文字
    text = re.sub(' +', ' ', text) # Remove spaces > 1
    
    text = [i.strip() for i in text.split(' ')]  
    text = [i.lower() for i in text]
    text = [ps.stem(i) for i in text] #porter's algo
    text = [i for i in text if i not in stopwords] #stopwords delete
    text = list(filter(None, text)) #去除空字元
    stringtext = ' '.join([str(item) for item in text])
    #print(stringtext)
    return stringtext

In [5]:
Allwords = [] #一個 總體Dictionary
def buildDict(text): #只算有無出現, 弄成list
    for word in str(text).split(' '):
        #print(word)
        if word not in Allwords:
            Allwords.append(word)

for i in range(1, length_all_doc+1):
    with open(f'./data/{i}.txt') as f:
        data=f.read()
        text = tokenize(data)
        #print(text)
        buildDict(text)

Allwords_dict = sorted(Allwords) #照字母排
AllwordSize = len(Allwords_dict) #總term數
AllwordSize

11957

`Allwords_dict`為總共有哪些字的字典，AllwordsSize為字典大小<br>
### 計算每個字出現在多少文件裡面（`IDF`) -> wd_appear

In [6]:
wd_appear = {k: 0 for k in Allwords_dict}
for i in range(1, length_all_doc+1):
    tmp_appear = {k: 0 for k in Allwords_dict} #看每篇文章有沒有出現字，最高1
    with open(f'./data/{i}.txt') as f:
        data=f.read()
        text = tokenize(data)
        for wd in text.split(' '):
            tmp_appear[wd] = 1
        for wd, num in tmp_appear.items():
            wd_appear[wd] += tmp_appear[wd]
for wd, num in wd_appear.items():
    wd_appear[wd] = math.log(length_all_doc+1/wd_appear[wd]+1) #算IDF值對於字典每個字

### 計算tf在每個文件的每個字 `tf` : wd_cnt<br>
- 同時結合 idf 後標準化成此文章對於每個字的陣列

In [7]:
allVector = [] #保存所有文章的vector，總共有文章數*字典字數 個參數
for i in range(1, length_all_doc+1):  
    wd_cnt = {k: 0 for k in Allwords_dict}
    with open(f'./data/{i}.txt') as f:
        data=f.read()
        text = tokenize(data)
        for wd in text.split(' '):
            wd_cnt[wd] += 1
        
    arr = []
    for wd, cnt in wd_cnt.items():
        arr.append(cnt*wd_appear[wd]) #tf*idf
    normal_V = arr/np.linalg.norm(arr)
    allVector.append(normal_V) #標準化成每個文章對於每個字的vector
#allVector

In [8]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
vMatrix = 1-pairwise_distances(allVector, metric="cosine")
#vMatrix

### 先從０，輸出時再加一回來

In [11]:
#使用complete link clustering
def calculateResult(vecMatrix, kclusters=3, doclength=10):
    mergeStep=[] 
    alive_person = [i for i in range(doclength)] #initial alive 活著會在此顯示1
    dead_person  = [] #initial dead 死了則會在此顯示1
    clusters     = [[i] for i in range(doclength)] #initial cluster

    for i in range(doclength-kclusters): #要合併幾次
        maxCosine = 0
        maxpair = []
        for first in alive_person:
            for last in alive_person:
                if last<=first: continue #之後要把last合併到first
                if vecMatrix[first][last]>=maxCosine:
                    maxCosine = vecMatrix[first][last]
                    maxpair = [first, last]
        #決定這次要合併誰：last合併到first
        #print(maxpair)
        mergeStep.append((maxpair[0],maxpair[1]))
        alive_person.remove(maxpair[1])
        dead_person.append(maxpair[1])
        for j in alive_person: #更新群間距離
            vecMatrix[maxpair[0]][j] = min(vecMatrix[maxpair[0]][j], vecMatrix[maxpair[1]][j])
            vecMatrix[j][maxpair[0]] = min(vecMatrix[maxpair[0]][j], vecMatrix[maxpair[1]][j])
        for doc in clusters[maxpair[1]]: #把那類全部合併過來
            clusters[maxpair[0]].append(doc)
        clusters[maxpair[1]] = [] #刪除該類別
        
#         print(f"mergeStep: {mergeStep}")
#         print(f"clusters: {clusters}")
#         print(vecMatrix)
#         print()
        
    return clusters                

In [12]:
for k in [8, 13, 20]:
    res = calculateResult(vMatrix, kclusters=k, doclength=length_all_doc)
    res = [ele for ele in res if ele != []]
    with open(f'./output/{k}.txt', 'w') as wf:
        for num in res:
            for item in sorted(num):            
                wf.write(f'{item+1}\n')
            wf.write('\n')    

[847, 848]
[791, 795]
[731, 732]
[704, 705]
[661, 662]
[620, 621]
[526, 528]
[475, 476]
[228, 229]
[211, 212]
[210, 211]
[194, 228]
[47, 48]
[7, 8]
[942, 943]
[304, 308]
[925, 927]
[594, 595]
[564, 594]
[563, 564]
[100, 105]
[190, 191]
[154, 157]
[1084, 1087]
[499, 500]
[329, 332]
[1033, 1035]
[506, 507]
[895, 896]
[242, 243]
[854, 855]
[582, 583]
[99, 101]
[831, 837]
[838, 839]
[197, 199]
[604, 606]
[680, 683]
[498, 499]
[735, 736]
[490, 492]
[821, 823]
[371, 380]
[999, 1007]
[519, 522]
[968, 969]
[53, 54]
[824, 827]
[675, 676]
[196, 201]
[87, 88]
[840, 841]
[898, 905]
[545, 546]
[530, 531]
[93, 97]
[108, 109]
[953, 954]
[55, 56]
[301, 312]
[820, 821]
[910, 911]
[230, 234]
[968, 970]
[884, 885]
[129, 130]
[501, 502]
[244, 249]
[454, 455]
[1083, 1084]
[326, 327]
[838, 840]
[33, 34]
[614, 615]
[995, 996]
[967, 968]
[718, 721]
[921, 923]
[196, 198]
[1040, 1041]
[506, 511]
[497, 498]
[597, 601]
[94, 104]
[573, 574]
[282, 284]
[764, 767]
[318, 321]
[955, 956]
[974, 981]
[280, 281]
[138, 14

[890, 892]
[158, 162]
[818, 1062]
[856, 863]
[1005, 1026]
[139, 205]
[569, 717]
[812, 819]
[20, 60]
[431, 692]
[909, 940]
[787, 901]
[484, 586]
[753, 790]
[314, 354]
[27, 31]
[825, 831]
[871, 880]
[176, 690]
[237, 252]
[290, 341]
[142, 144]
[517, 727]
[519, 538]
[718, 719]
[214, 286]
[43, 304]
[315, 481]
[534, 541]
[266, 877]
[707, 899]
[1034, 1052]
[254, 516]
[238, 458]
[342, 351]
[834, 843]
[1015, 1074]
[149, 366]
[922, 1043]
[618, 625]
[852, 886]
[881, 987]
[132, 140]
[272, 487]
[141, 777]
[526, 536]
[1027, 1046]
[0, 35]
[5, 64]
[19, 121]
[572, 580]
[156, 760]
[90, 270]
[18, 434]
[856, 942]
[765, 804]
[816, 817]
[419, 563]
[519, 544]
[520, 900]
[1, 2]
[1022, 1086]
[971, 979]
[82, 394]
[720, 815]
[20, 110]
[390, 438]
[232, 710]
[240, 338]
[151, 223]
[953, 1023]
[138, 163]
[534, 570]
[884, 909]
[367, 617]
[1001, 1009]
[314, 339]
[820, 856]
[994, 1005]
[818, 858]
[784, 874]
[82, 432]
[309, 462]
[219, 468]
[753, 963]
[125, 711]
[814, 838]
[30, 374]
[411, 417]
[129, 138]
[916, 1048]
[66,

[932, 999]
[1056, 1058]
[9, 32]
[1005, 1039]
[566, 568]
[952, 974]
[2, 4]
[388, 399]
[315, 356]
[45, 49]
[785, 794]
[354, 359]
[82, 85]
[639, 1075]
[134, 154]
[65, 67]
[342, 347]
[851, 864]
[237, 409]
[37, 39]
[730, 779]
[272, 548]
[392, 518]
[75, 115]
[107, 663]
[80, 89]
[627, 685]
[625, 628]
[656, 675]
[666, 670]
[749, 786]
[915, 925]
[3, 12]
[114, 430]
[426, 782]
[143, 386]
[608, 613]
[254, 513]
[73, 98]
[570, 610]
[445, 446]
[854, 868]
[989, 992]
[30, 69]
[947, 977]
[526, 552]
[383, 385]
[838, 860]
[532, 694]
[756, 944]
[525, 535]
[189, 222]
[5, 92]
[156, 485]
[603, 608]
[11, 29]
[276, 289]
[718, 723]
[765, 805]
[639, 740]
[961, 962]
[818, 1010]
[41, 128]
[184, 250]
[174, 210]
[722, 725]
[314, 325]
[761, 768]
[899, 920]
[169, 170]
[336, 444]
[90, 103]
[322, 414]
[572, 587]
[618, 638]
[495, 651]
[362, 482]
[1046, 1065]
[349, 471]
[797, 879]
[1022, 1042]
[932, 965]
[28, 810]
[1026, 1030]
[645, 793]
[112, 277]
[232, 248]
[971, 982]
[358, 365]
[240, 253]
[701, 703]
[1038, 1071]
[799, 8

[457, 461]
[55, 70]
[730, 731]
[820, 824]
[303, 307]
[244, 259]
[801, 802]
[283, 288]
[916, 928]
[32, 33]
[600, 605]
[1034, 1036]
[1038, 1040]
[1018, 1019]
[396, 400]
[571, 576]
[319, 326]
[338, 346]
[320, 324]
[91, 99]
[446, 451]
[925, 937]
[475, 478]
[575, 597]
[226, 230]
[550, 551]
[994, 997]
[132, 133]
[832, 842]
[453, 480]
[553, 562]
[437, 441]
[421, 428]
[665, 671]
[129, 136]
[554, 555]
[19, 25]
[73, 83]
[107, 116]
[1017, 1020]
[53, 55]
[442, 479]
[573, 575]
[23, 24]
[117, 124]
[381, 403]
[704, 708]
[609, 614]
[775, 776]
[419, 429]
[387, 401]
[835, 912]
[240, 244]
[694, 700]
[139, 150]
[349, 352]
[482, 506]
[279, 280]
[666, 669]
[391, 407]
[158, 209]
[548, 560]
[350, 370]
[672, 674]
[901, 903]
[453, 501]
[315, 320]
[275, 282]
[184, 200]
[222, 291]
[666, 667]
[174, 197]
[167, 187]
[949, 988]
[87, 93]
[190, 202]
[490, 497]
[489, 491]
[319, 333]
[538, 545]
[17, 22]
[573, 577]
[786, 898]
[412, 437]
[723, 726]
[934, 936]
[965, 993]
[362, 415]
[153, 159]
[376, 422]
[533, 596]
[659, 668

[171, 203]
[355, 862]
[18, 28]
[730, 854]
[1014, 1047]
[953, 1028]
[750, 822]
[134, 241]
[5, 7]
[1013, 1091]
[1053, 1085]
[10, 424]
[677, 733]
[645, 800]
[745, 761]
[1088, 1093]
[941, 1000]
[90, 148]
[151, 216]
[139, 214]
[208, 265]
[295, 396]
[149, 287]
[118, 630]
[881, 904]
[112, 316]
[763, 765]
[951, 989]
[272, 334]
[137, 166]
[300, 340]
[355, 715]
[260, 398]
[829, 857]
[261, 532]
[484, 640]
[915, 924]
[165, 219]
[520, 787]
[1022, 1029]
[51, 106]
[290, 772]
[833, 1068]
[798, 1031]
[160, 181]
[618, 791]
[756, 957]
[825, 1044]
[584, 983]
[15, 71]
[193, 297]
[314, 375]
[519, 526]
[1034, 1053]
[3, 687]
[852, 889]
[171, 174]
[547, 569]
[534, 618]
[266, 309]
[19, 38]
[1054, 1088]
[132, 142]
[1, 5]
[20, 62]
[655, 757]
[237, 384]
[852, 959]
[300, 315]
[707, 916]
[0, 27]
[342, 445]
[129, 193]
[272, 627]
[812, 834]
[534, 572]
[517, 784]
[1022, 1067]
[143, 355]
[15, 118]
[1014, 1015]
[51, 120]
[345, 405]
[425, 549]
[112, 300]
[273, 311]
[158, 236]
[176, 644]
[797, 884]
[484, 519]
[10, 18]
[43,