## STEP 1: Tokenization with NLTK

In [1]:
import nltk
import string
from collections import Counter

In [2]:
def get_tokens():
    with open('./article/art1.dat','r') as article:
        text = article.read()
        lowers = text.lower()
        no_punctuation = lowers.translate(string.punctuation)
        tokens = nltk.word_tokenize(no_punctuation)
        return tokens

In [3]:
tokens = get_tokens()
count = Counter(tokens)
print(count.most_common)

<bound method Counter.most_common of Counter({'the': 9, ',': 6, 'to': 4, 'a': 4, 'ringgit': 3, 'in': 3, 'on': 3, '.': 3, '%': 3, 'us': 2, 'death-cross': 2, 'pattern': 2, 'has': 2, 'this': 2, 'previous': 2, 'took': 2, 'dollar': 2, 'of': 2, '3': 2, 'from': 2, 'trading': 2, 'strengthen': 1, 'against': 1, 'dollar.+the': 1, 'dollar-ringgit': 1, 'exchange': 1, 'rate': 1, 'is': 1, 'forming': 1, 'which': 1, 'past': 1, 'led': 1, 'decline': 1, 'currency': 1, 'pair': 1, 'based': 1, 'technical': 1, 'charts': 1, '+bloomberg': 1, 'reported': 1, 'wednesday': 1, 'that': 1, 'occurs': 1, 'when': 1, '50-day': 1, 'moving': 1, 'average': 1, 'drops': 1, 'below': 1, '100-day': 1, 'gauge': 1, '+it': 1, 'said': 1, 'three': 1, 'occasions': 1, 'move': 1, 'place': 1, 'posted': 1, 'additional': 1, 'losses': 1, 'and': 1, '7': 1, 'before': 1, 'finding': 1, 'bottom': 1, '+the': 1, 'underperformed': 1, 'asian': 1, 'currencies': 1, 'since': 1, 'policy': 1, 'makers': 1, 'steps': 1, 'november': 1, 'deter': 1, 'foreign': 

## STEP 2: Stop words Removal

In [4]:
from nltk.corpus import stopwords

In [5]:
tokens = get_tokens()
filtered = [w for w in tokens if not w in stopwords.words('english')]
count = Counter(filtered)
print(count.most_common())

[(',', 6), ('ringgit', 3), ('.', 3), ('%', 3), ('us', 2), ('death-cross', 2), ('pattern', 2), ('previous', 2), ('took', 2), ('dollar', 2), ('3', 2), ('trading', 2), ('strengthen', 1), ('dollar.+the', 1), ('dollar-ringgit', 1), ('exchange', 1), ('rate', 1), ('forming', 1), ('past', 1), ('led', 1), ('decline', 1), ('currency', 1), ('pair', 1), ('based', 1), ('technical', 1), ('charts', 1), ('+bloomberg', 1), ('reported', 1), ('wednesday', 1), ('occurs', 1), ('50-day', 1), ('moving', 1), ('average', 1), ('drops', 1), ('100-day', 1), ('gauge', 1), ('+it', 1), ('said', 1), ('three', 1), ('occasions', 1), ('move', 1), ('place', 1), ('posted', 1), ('additional', 1), ('losses', 1), ('7', 1), ('finding', 1), ('bottom', 1), ('+the', 1), ('underperformed', 1), ('asian', 1), ('currencies', 1), ('since', 1), ('policy', 1), ('makers', 1), ('steps', 1), ('november', 1), ('deter', 1), ('foreign', 1), ('banks', 1), ('non-deliverable', 1), ('forwards', 1), ('wire', 1), ('report', 1), ('said.+at', 1), ('

## STEP 3: Stemming with Porter Stemmer

In [6]:
from nltk.stem.porter import *

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)
count = Counter(stemmed)
print(count.most_common())

[(',', 6), ('ringgit', 3), ('.', 3), ('%', 3), ('us', 2), ('death-cross', 2), ('pattern', 2), ('currenc', 2), ('report', 2), ('move', 2), ('previou', 2), ('took', 2), ('dollar', 2), ('3', 2), ('trade', 2), ('strengthen', 1), ('dollar.+th', 1), ('dollar-ringgit', 1), ('exchang', 1), ('rate', 1), ('form', 1), ('past', 1), ('led', 1), ('declin', 1), ('pair', 1), ('base', 1), ('technic', 1), ('chart', 1), ('+bloomberg', 1), ('wednesday', 1), ('occur', 1), ('50-day', 1), ('averag', 1), ('drop', 1), ('100-day', 1), ('gaug', 1), ('+it', 1), ('said', 1), ('three', 1), ('occas', 1), ('place', 1), ('post', 1), ('addit', 1), ('loss', 1), ('7', 1), ('find', 1), ('bottom', 1), ('+the', 1), ('underperform', 1), ('asian', 1), ('sinc', 1), ('polici', 1), ('maker', 1), ('step', 1), ('novemb', 1), ('deter', 1), ('foreign', 1), ('bank', 1), ('non-deliver', 1), ('forward', 1), ('wire', 1), ('said.+at', 1), ('1.45pm', 1), ('4.4305', 1), ('close', 1), ('4.4312.+', 1)]


## STEP 4: tf-idf with Scikit-learn (combined)

In [7]:
import nltk
import string
import math
import os
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from collections import defaultdict 

In [8]:
path = '/home/muhdlaziem/Workspace/NLP/Week8/article'
token_dict ={}
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [9]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems  = stem_tokens(tokens, stemmer)
    return stems

for subdir, dirs, files in os.walk(path):
    for file in files:
        file_path =  subdir + os.path.sep + file
        print(file_path)
        article = open(file_path,'r')
        text = article.read()
        lowers = text.lower()
        no_punctuation = lowers.translate(string.punctuation)
        token_dict[file] = no_punctuation

/home/muhdlaziem/Workspace/NLP/Week8/article/art1.dat
/home/muhdlaziem/Workspace/NLP/Week8/article/art3.dat
/home/muhdlaziem/Workspace/NLP/Week8/article/art4.dat
/home/muhdlaziem/Workspace/NLP/Week8/article/art2.dat
/home/muhdlaziem/Workspace/NLP/Week8/article/art6.dat
/home/muhdlaziem/Workspace/NLP/Week8/article/art5.dat


In [41]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=['english'])
tfs = tfidf.fit_transform((token_dict.values()))

In [43]:
feature_names = tfidf.get_feature_names()
corpus_index = [n for n in token_dict]
import pandas as pd
df = pd.DataFrame(tfs.T.todense(), index=feature_names, columns=corpus_index)

print(df)

            art1.dat  art3.dat  art4.dat  art2.dat  art6.dat  art5.dat
$           0.000000  0.000000       0.0  0.241775  0.000000  0.000000
%           0.242364  0.000000       0.0  0.000000  0.000000  0.000000
''          0.000000  0.000000       0.0  0.000000  0.053996  0.107424
's          0.000000  0.173227       0.0  0.055795  0.000000  0.136042
+bloomberg  0.080788  0.000000       0.0  0.000000  0.000000  0.000000
...              ...       ...       ...       ...       ...       ...
year        0.000000  0.000000       0.0  0.161183  0.000000  0.000000
yen         0.000000  0.062554       0.0  0.000000  0.000000  0.000000
ying        0.000000  0.062554       0.0  0.000000  0.000000  0.000000
yoo         0.000000  0.062554       0.0  0.000000  0.000000  0.000000
you         0.000000  0.000000       0.0  0.000000  0.000000  0.131003

[511 rows x 6 columns]


In [84]:
arr = pd.DataFrame()
strings = ['win','ringgit','trade','game','killed']
for i in df.index:
    if i in strings:
        print(i +'\n')
        print(df.loc[i])
#         arr.append(df.loc[i])
        print('\n')

game

art1.dat    0.000000
art3.dat    0.102590
art4.dat    0.131752
art2.dat    0.000000
art6.dat    0.000000
art5.dat    0.000000
Name: game, dtype: float64


ringgit

art1.dat    0.242364
art3.dat    0.000000
art4.dat    0.000000
art2.dat    0.000000
art6.dat    0.000000
art5.dat    0.000000
Name: ringgit, dtype: float64


trade

art1.dat    0.132495
art3.dat    0.000000
art4.dat    0.000000
art2.dat    0.264345
art6.dat    0.000000
art5.dat    0.000000
Name: trade, dtype: float64


win

art1.dat    0.000000
art3.dat    0.043307
art4.dat    0.055617
art2.dat    0.000000
art6.dat    0.000000
art5.dat    0.045347
Name: win, dtype: float64




In [40]:

for i in df:
    print(i)
    if i in strings:
        print(df[i])

art1.dat    0.242364
art3.dat    0.000000
art4.dat    0.000000
art2.dat    0.000000
art6.dat    0.000000
art5.dat    0.000000
Name: %, dtype: float64

In [85]:
arr

In [83]:
def k_means(tfs):
    true_k=2
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=50,n_init=1)
    model.fit(tfs)
    print("Top terms per cluster: ")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = tfidf.get_feature_names()
    
    for i in range(true_k):
        print("Cluster %d: " % i)
        for ind in order_centroids[i, :10]:
#             print(ind)
            print(' %s' % terms[ind])
k_means(tfs)           

Top terms per cluster: 
Cluster 0: 
 the
 ,
 to
 a
 of
 on
 and
 .
 wa
 in
Cluster 1: 
 trade
 $
 deficit
 high
 billion
 year
 export
 near
 two-year
 china


In [77]:
strings

['win', 'ringgit', 'trade', 'game', 'killed']

In [78]:
tfidf.get_feature_names()

['$',
 '%',
 "''",
 "'s",
 '+bloomberg',
 '+it',
 '+the',
 ',',
 '-',
 '.',
 '1',
 '1-1',
 '1.45pm',
 '100-day',
 '15',
 '2-0',
 '20',
 '2013',
 '2016',
 '21',
 '21-13',
 '21-16',
 '21-17',
 '21-17.+the',
 '21-18',
 '21-19',
 '22-20',
 '22.+the',
 '23',
 '23-21',
 '26.6',
 '27th',
 '28-year-old',
 '3',
 '4.4305',
 '4.4312.+',
 '43.6',
 '44',
 '48.2',
 '5-3',
 '50-day',
 '52-year-old',
 '54',
 '6-21',
 '6-3',
 '6-4',
 '7',
 '75',
 '9.6',
 '94th-minut',
 ':',
 ';',
 '``',
 'a',
 'abbey',
 'abov',
 'across',
 'ad',
 'addit',
 'administr',
 'affect',
 'after',
 'again',
 'against',
 'ahead',
 'all',
 'allow',
 'also',
 'although',
 'alway',
 'american',
 'an',
 'and',
 'argentin',
 'argentina',
 'arsen',
 'as',
 'ashley',
 'asian',
 'assault',
 'at',
 'atmospher',
 'attack',
 'averag',
 'aysha',
 'bank',
 'base',
 'be',
 'beat',
 'befor',
 'behind',
 'below',
 'between',
 'billion',
 'blow',
 'boost',
 'bottom',
 'break',
 'bridg',
 'bridge.+',
 'british',
 'brother',
 'but',
 'by',
 'camp