## Import Libraries

In [7]:
import sys
!{sys.executable} -m pip install rouge

Collecting rouge
  Using cached rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [8]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.datasets import fetch_20newsgroups
from scipy.special import rel_entr
from os.path import isfile, join
from scipy.sparse import vstack
from collections import Counter
from scipy.stats import entropy
from rouge import Rouge
from os import listdir
import pandas as pd
import numpy as np 
import nltk
import re

## Import 20NG

In [19]:
news_train = fetch_20newsgroups(subset="train", shuffle=True, random_state=42)
news_test = fetch_20newsgroups(subset="test", shuffle=True, random_state=42)

tfidf = TfidfVectorizer(stop_words='english', min_df=5)
x_train_tfidf = tfidf.fit_transform(news_train.data)
vocabulary = np.array(tfidf.get_feature_names())
x_test_tfidf = tfidf.transform(news_test.data)

y_train = news_train.target
y_test = news_test.target

x = vstack((x_train_tfidf, x_test_tfidf), format='csr').toarray()
y = np.append(y_train, y_test)[:5000]

comp = [10, 20, 50]

print(x.shape)  
print(y.shape)



(18846, 25636)
(5000,)


## LDA

In [17]:
for i in comp:
    print("K = " + str(i))
    lda = LatentDirichletAllocation(n_components = i).fit(x)
    feat = lda.components_
    
    for j in range(len(feat)):
        proba = {}
        print("Top words for " + str(j) + ": ")
        
        sort_feat = np.argsort(-feat[j])
        sort_val = sorted(-feat[j])
        top_feat = []
        
        for k in range(20):
            proba[vocabulary[sort_feat[k]]] = -sort_val[k]
        print(proba)
    print(feat)

K = 10
Top words for 0: 
{'edu': 191.8291354459874, 'windows': 140.58515893168263, 'university': 100.82866679931946, 'lines': 98.88556213108767, 'subject': 97.82273517320569, 'organization': 96.56795539692746, 'thanks': 94.5651395343809, 'ca': 92.42516153960031, 'drive': 87.71742934030935, 'com': 84.67979604163811, 'host': 83.85303467694709, 'posting': 83.2365197023339, 'nntp': 82.57104668153114, 'card': 80.28405963818157, 'dos': 74.47501580840832, 'file': 73.5823725174852, 'pc': 70.95254555016552, 'mac': 70.83623305663608, 'software': 66.47992666600912, 'help': 65.5766857841836}
Top words for 1: 
{'stratus': 42.594598785471526, 'pitt': 24.520065579562637, 'geb': 22.094140494358836, 'sw': 21.86779567405946, 'banks': 20.79094932107271, 'gordon': 20.71763723622693, 'edu': 18.490794196178417, 'cdt': 18.2270538165214, 'atf': 17.31418808178465, 'irvine': 16.197836367132947, 'intercon': 13.639286911264561, 'amanda': 12.341041702670562, 'fbi': 11.924330587660654, 'com': 11.8766648340892, 'ran

{'ati': 12.916543418546913, 'mitre': 10.637957943166938, 'centris': 8.558447746134561, 'colostate': 7.773469032048144, 'cookson': 7.633389687265875, 'ufl': 7.443459019383943, 'se': 7.0750909999879, '610': 6.893248780688162, 'triumf': 6.813295232154128, 'kth': 6.566122550927642, 'hammerl': 6.4974962377860175, 'nada': 5.91544934527798, 'acsu': 5.518859666293389, 'lance': 5.4720249063439885, 'mcgill': 5.359096243233888, 'penev': 4.952148365314959, 'csuchico': 4.811431874481159, 'ecst': 4.811431874481159, 'svoboda': 4.7805437457883855, 'chalmers': 4.713925397795556}
Top words for 15: 
{'stratus': 42.22772855881798, 'sw': 21.09666049941087, 'cdt': 20.479454984886523, 'atf': 14.332984090889433, 'handheld': 12.555001584883689, 'br': 12.055539728013931, 'fbi': 11.708810647513928, 'isc': 11.041767088716968, 'ranch': 11.039891866100586, 'stafford': 10.82959541392929, 'survivors': 10.642910026348895, 'dividian': 10.163692027883792, 'roby': 9.760293471091103, 'burns': 9.687547441607101, 'irvine': 

{'cleveland': 28.26170848612717, 'cwru': 21.54447396251823, 'freenet': 15.733447505899152, 'buffalo': 15.562427378433604, 'reserve': 13.95174687752841, 'rit': 13.403595888892442, 'ins': 13.074096033102462, 'western': 10.912320462835337, 'acsu': 10.487976440496997, 'isc': 10.328771396274249, 'umcc': 9.258437144172154, 'edu': 8.837385153866263, 'po': 8.049402131910705, 'ranck': 7.7095713590093515, 'hela': 7.397975752307285, 'vt': 7.197178921467844, 'brian': 6.935099841898502, 'ritvax': 6.861286991582171, 'hammerl': 6.467496240496271, 'phillies': 6.271777321485273}
Top words for 40: 
{'stratus': 10.422058175241734, 'ncr': 8.565318522209383, 'harris': 7.6711983185238966, 'atlantaga': 7.234653497917033, 'mcguire': 5.992141723412098, 'ssd': 5.855464160805544, 'csd': 5.693367674255843, 'wilson': 5.199019988483827, 'clarkson': 5.1496545920146515, 'ncratl': 4.724837463511885, 'sw': 4.688430871369652, 'ellison': 4.3662857266889485, 'ellisun': 4.314772441510387, 'cme': 4.212837787942115, 'tuinstr

## NMF

In [20]:
x_new = x[:5000]

for i in comp:
    print("K = " + str(i))
    nmf = NMF(n_components = i).fit(x_new)
    feat = nmf.components_
    
    for j in range(len(feat)):
        proba = {}
        print("Top words for " + str(j) + ": ")
        
        sort_feat = np.argsort(-feat[j])
        sort_val = sorted(-feat[j])
        top_feat = []
        
        for k in range(20):
            proba[vocabulary[sort_feat[k]]] = -sort_val[k]
        print(proba)
    print(feat)

K = 10




Top words for 0: 
{'windows': 1.0555051882571271, 'window': 0.49202309431380886, 'dos': 0.4842506671555535, 'file': 0.45793773297580104, 'files': 0.37190799352928866, 'card': 0.35394738701097006, 'use': 0.34527446873553236, 'program': 0.33776974952127703, 'ac': 0.3106044998145729, 'uk': 0.3093080912177744, 'thanks': 0.3068061280464428, 'version': 0.2868195699540487, 'com': 0.2862288107932537, 'help': 0.285475964280815, 'graphics': 0.27807755543554474, 'using': 0.2673676501256219, 'screen': 0.2590637310735119, 'ms': 0.2583138684218185, 'video': 0.24591434141780447, 'pc': 0.23934692336781083}
Top words for 1: 
{'god': 1.0253198812832591, 'jesus': 0.4322040838767434, 'bible': 0.36809866793456475, 'christian': 0.34570012914640363, 'people': 0.3194228176872578, 'christians': 0.3105780625120573, 'faith': 0.3015123264729859, 'believe': 0.2781261817547765, 'truth': 0.21192989972797552, 'christ': 0.20203997286467554, 'christianity': 0.19095234748827006, 'say': 0.1859081399934785, 'think': 0.176



Top words for 0: 
{'car': 1.2690031106464923, 'bike': 0.6115083991374664, 'cars': 0.4787635246105138, 'just': 0.42273865115358217, 'good': 0.4182660764186529, 'don': 0.39008325776140745, 'insurance': 0.38362639225623646, 'like': 0.37602935371258533, 'speed': 0.3589993708813706, 'drive': 0.30288717601662546, 've': 0.3012685340048426, 'engine': 0.27822530151053815, 'really': 0.2696768073440591, 'time': 0.26444926109122024, 'dod': 0.26181121716742567, 'right': 0.25906558583279726, 'left': 0.2582768152061746, 'org': 0.2457162471501937, 'road': 0.24324558050003411, 'going': 0.2369665629662383}
Top words for 1: 
{'god': 1.1219962380373074, 'jesus': 0.4751501279176489, 'bible': 0.40253247047914503, 'christian': 0.3730131285511405, 'christians': 0.3358757350040534, 'faith': 0.32731772265174547, 'believe': 0.28957101851712513, 'people': 0.2872484091998446, 'truth': 0.2250056069818657, 'christ': 0.22189694306406818, 'christianity': 0.2079608826607681, 'say': 0.18964881262221445, 'life': 0.177303



Top words for 0: 
{'people': 2.722197217852451, 'don': 1.863499667335687, 'think': 1.8282548495917907, 'just': 1.3193161642591626, 'like': 1.300644851561846, 'know': 1.1265914330202922, 'really': 1.051680978964761, 'time': 1.0139420930975072, 'good': 0.9730447247755393, 'things': 0.9464412110423256, 'going': 0.9242591447514172, 'want': 0.879892708933572, 'make': 0.8755106635992552, 'did': 0.8487374779923429, 'said': 0.8184493468188658, 'way': 0.7969041079142831, 'say': 0.7862086991587889, 've': 0.7621561662329732, 'thing': 0.7326709335375244, 'right': 0.7037988632586538}
Top words for 1: 
{'god': 1.5048970576830965, 'jesus': 0.59419453446231, 'bible': 0.49516556973937104, 'faith': 0.4182836817478084, 'christians': 0.41239132162024, 'christian': 0.41000155982843167, 'believe': 0.3292650221695053, 'christ': 0.280834275065962, 'truth': 0.2707097324895731, 'christianity': 0.22566342504646766, 'sin': 0.1886719763864585, 'belief': 0.1862556042894937, 'life': 0.18261214099026105, 'hell': 0.18

{'cs': 1.954512344215312, 'nyx': 0.6572425334062627, 'du': 0.5663318795800817, 'edu': 0.48611692798045786, 'dept': 0.4599837463784803, 'math': 0.32898043178614683, 'denver': 0.3136946656158245, 'science': 0.2953035182694694, 'colorado': 0.2724805221332402, 'computer': 0.24533683659339217, 'cornell': 0.21016664112927527, 'rochester': 0.16680167925626535, 'colostate': 0.15380438260032914, 'unix': 0.15362639319004984, 'sci': 0.15346370946455448, 'dal': 0.1508574224675014, 'comp': 0.148040023045051, 'public': 0.14366864624704107, 'university': 0.141540008079696, 'arizona': 0.1397088654074069}
Top words for 34: 
{'msg': 1.660700222218923, 'food': 0.6964551098365667, 'sensitivity': 0.33294079285773626, 'superstition': 0.31455392854817116, 'chinese': 0.2714963464670538, 'berkeley': 0.21729830339116446, 'reaction': 0.21079737320859204, 'effects': 0.15301685097102719, 'sdf': 0.14725756933634687, 'restaurant': 0.1445261040405505, 'foods': 0.14375082133732844, 'flavor': 0.14286552729065438, 'extr

## Import DUC 2001

In [29]:
def load_duc(path):
  files = [f for f in listdir(path) if isfile(join(path, f))]
  files.remove('annotations.txt')
  files.remove('notes.txt')
  data = []
  for filename in files:
    text = []
    with open(path + filename, 'r', encoding='mac_roman', newline='') as f:
      lines = f.readlines()
    flag = 0
    for line in lines:
      indicator = line.strip().split(' ')[0]
      if indicator == '<TEXT>':
        flag = 1
      if indicator == '</TEXT>':
        flag = 0
      if flag == 1 and indicator != '<TEXT>':
        text.append(line.strip())    
    text = ''.join(text)
    data.append(text)
  return data

path = 'DUC2001/'
data = np.array(load_duc(path))

In [30]:
vectorizer = TfidfVectorizer(stop_words='english', min_df=5)
train_X = vectorizer.fit_transform(data)
vocabulary = np.array(vectorizer.get_feature_names())



## LDA

In [31]:
for i in comp:
    print("K = " + str(i))
    lda = LatentDirichletAllocation(n_components = i).fit(train_X)
    feat = lda.components_
    
    for j in range(len(feat)):
        proba = {}
        print("Top words for " + str(j) + ": ")
        
        sort_feat = np.argsort(-feat[j])
        sort_val = sorted(-feat[j])
        top_feat = []
        
        for k in range(20):
            proba[vocabulary[sort_feat[k]]] = -sort_val[k]
        print(proba)
    print(feat)

K = 10
Top words for 0: 
{'said': 17.05668843291329, 'police': 8.142493402446835, 'johnson': 6.128476321996715, 'eclipse': 6.074842200518187, 'oil': 5.956152342228015, 'slovenia': 5.619229729961428, 'exxon': 5.5935690255885335, '000': 5.010920529864154, 'thomas': 4.7558225496719295, 'forest': 4.687863523302773, 'drought': 4.400550506747777, 'bank': 4.3064051615408685, 'million': 4.259886895945976, 'city': 4.221435724756816, 'officials': 4.050426832124764, 'fires': 3.959014611988849, 'census': 3.940188064792524, 'area': 3.870599536385655, 'department': 3.814024441441717, 'world': 3.7512485862018456}
Top words for 1: 
{'mercury': 2.917527152816133, 'jose': 2.525647669849946, 'pubyear': 2.1866637762176366, 'limlen': 2.1866637762176366, 'ct': 2.1866637762176366, 'edition': 2.1866637762176366, 'pubdate': 2.1866637762176366, 'pg': 2.1866637762176366, 'copyrght': 2.1866637762176366, 'dateline': 2.1866637762176366, 'byline': 2.1866637762176366, 'code': 2.163458023866576, 'language': 2.04151449

Top words for 0: 
{'forest': 4.007463544448693, 'thomas': 3.811884027291799, 'acres': 2.5257375266888067, 'court': 2.4651558694067925, 'fires': 1.9988916682985254, 'firefighters': 1.892703332664279, 'bureau': 1.4580943530948722, 'sen': 1.2769359306896488, 'amendment': 1.2213095261194586, 'clarence': 1.2101531808935333, 'blaze': 1.1414349887988418, 'judge': 1.129442251966638, 'counting': 1.116866591788319, 'acre': 1.050280179721582, 'contained': 0.961010920785508, 'burned': 0.956090381334283, 'crews': 0.8722574033188059, 'nomination': 0.8172112142883683, 'chairman': 0.7746495160153776, 'constitution': 0.7736314820316266}
Top words for 1: 
{'tactical': 0.5348778460619392, 'hale': 0.49812139368441455, 'boulder': 0.09968689672109873, 'examined': 0.09368345785389053, 'assigned': 0.0894446928252897, 'teams': 0.08764155383255286, 'al': 0.08764155383255286, 'personnel': 0.08465132491206151, 'headquarters': 0.07906794860132024, 'richest': 0.058600061639099726, 'selected': 0.020000000000000007, 

## NMF

In [32]:
x_new = train_X[:5000]

for i in comp:
    print("K = " + str(i))
    nmf = NMF(n_components = i).fit(x_new)
    feat = nmf.components_
    
    for j in range(len(feat)):
        proba = {}
        print("Top words for " + str(j) + ": ")
        
        sort_feat = np.argsort(-feat[j])
        sort_val = sorted(-feat[j])
        top_feat = []
        
        for k in range(20):
            proba[vocabulary[sort_feat[k]]] = -sort_val[k]
        print(proba)
    print(feat)

K = 10




Top words for 0: 
{'nafta': 1.200324436531846, 'mr': 0.9104350894797033, 'welfare': 0.7100920113081322, 'clinton': 0.385207135000005, 'bank': 0.338553233227378, 'president': 0.300980462885836, 'term': 0.29656705946316797, 'trade': 0.2874254989450966, 'pact': 0.2536929476738555, 'says': 0.2534979250719473, 'reform': 0.25160209838538267, 'house': 0.23691379444527977, 'limits': 0.22804005371866023, 'congressman': 0.22210882622852984, 'congress': 0.21954390787090258, 'cent': 0.20716852777296685, 'administration': 0.20603936263058137, 'mexico': 0.19968201117346177, 'thomas': 0.1989331214468594, 'poverty': 0.19576979401568287}
Top words for 1: 
{'exxon': 0.7791538622005963, 'oil': 0.7766949788541968, 'spill': 0.5168450900720232, 'valdez': 0.47500659799391126, 'tanker': 0.2278107315796535, 'said': 0.22752860768045516, 'cleanup': 0.22297380988734744, 'alaska': 0.2181505465420072, 'sound': 0.16921698405743288, 'ship': 0.1617621603052385, 'guard': 0.14707897863533928, 'coast': 0.1390237254662788

{'nafta': 1.1975586175922321, 'mr': 0.5491109411444364, 'trade': 0.27363170804642256, 'clinton': 0.27109928024356533, 'pact': 0.25737704941608125, 'congressman': 0.21438489461434576, 'mexico': 0.18238929363403933, 'president': 0.16708169769071288, 'vote': 0.16693425463889383, 'house': 0.16643438833367727, 'perot': 0.16511418537633257, 'agreement': 0.16369986133431508, 'opposition': 0.15782234487245111, 'anti': 0.1522345723952043, 'labour': 0.14255425464067714, 'administration': 0.14123258140923803, 'pro': 0.14075594511243353, 'campaign': 0.13756448802416554, 'congressmen': 0.1332224446715034, 'bank': 0.13264035074151503}
Top words for 11: 
{'diamond': 0.8146322704668522, 'beers': 0.717957817518521, 'diamonds': 0.6171681190793793, 'cso': 0.23915904712422542, 'market': 0.19192252986953365, 'botswana': 0.18924193989584653, 'carats': 0.18905843416605547, 'rough': 0.16382352878758133, 'cartel': 0.16360020739438016, 'mr': 0.1614966419272582, 'sales': 0.15503484685160268, 'world': 0.151397597



Top words for 0: 
{'shining': 5.602563854231529, 'path': 5.4107486932475135, 'guzman': 3.8709771420391466, 'fujimori': 1.8700522103911916, 'peace': 1.7844376491523772, 'war': 1.7527022679358573, 'abimael': 1.2237091697772675, 'agreement': 1.1742941020423412, 'lima': 1.0581053214420402, 'party': 0.9292710550361797, 'faction': 0.9268799730772772, 'october': 0.8788187293536557, 'talks': 0.8540304956775108, 'garcia': 0.8085080335027551, 'armed': 0.8021862758868474, 'actions': 0.7556045578165796, 'document': 0.7261634511638163, 'government': 0.705859813128711, 'committee': 0.6560936623375305, 'movement': 0.6453947870874314}
Top words for 1: 
{'exxon': 0.7620471275020119, 'oil': 0.7580415031053719, 'spill': 0.5070921540391159, 'valdez': 0.4656781218826082, 'tanker': 0.22320112039953274, 'cleanup': 0.2186059647229076, 'alaska': 0.21281007140613858, 'said': 0.2085269707234138, 'sound': 0.1653154979537378, 'ship': 0.15543245402409908, 'guard': 0.1409785937065523, 'coast': 0.13181434099834063, '

