## STEP 1: Tokenization with NLTK

In [1]:
import nltk
import string
from collections import Counter

In [2]:
def get_tokens():
    with open('./article/art1.dat','r') as article:
        text = article.read()
        lowers = text.lower()
        no_punctuation = lowers.translate(string.punctuation)
        tokens = nltk.word_tokenize(no_punctuation)
        return tokens

In [3]:
tokens = get_tokens()
count = Counter(tokens)
print(count.most_common)

<bound method Counter.most_common of Counter({'the': 9, ',': 6, 'to': 4, 'a': 4, 'ringgit': 3, 'in': 3, 'on': 3, '.': 3, '%': 3, 'us': 2, 'death-cross': 2, 'pattern': 2, 'has': 2, 'this': 2, 'previous': 2, 'took': 2, 'dollar': 2, 'of': 2, '3': 2, 'from': 2, 'trading': 2, 'strengthen': 1, 'against': 1, 'dollar.+the': 1, 'dollar-ringgit': 1, 'exchange': 1, 'rate': 1, 'is': 1, 'forming': 1, 'which': 1, 'past': 1, 'led': 1, 'decline': 1, 'currency': 1, 'pair': 1, 'based': 1, 'technical': 1, 'charts': 1, '+bloomberg': 1, 'reported': 1, 'wednesday': 1, 'that': 1, 'occurs': 1, 'when': 1, '50-day': 1, 'moving': 1, 'average': 1, 'drops': 1, 'below': 1, '100-day': 1, 'gauge': 1, '+it': 1, 'said': 1, 'three': 1, 'occasions': 1, 'move': 1, 'place': 1, 'posted': 1, 'additional': 1, 'losses': 1, 'and': 1, '7': 1, 'before': 1, 'finding': 1, 'bottom': 1, '+the': 1, 'underperformed': 1, 'asian': 1, 'currencies': 1, 'since': 1, 'policy': 1, 'makers': 1, 'steps': 1, 'november': 1, 'deter': 1, 'foreign': 

## STEP 2: Stop words Removal

In [4]:
from nltk.corpus import stopwords

In [5]:
tokens = get_tokens()
filtered = [w for w in tokens if not w in stopwords.words('english')]
count = Counter(filtered)
print(count.most_common())

[(',', 6), ('ringgit', 3), ('.', 3), ('%', 3), ('us', 2), ('death-cross', 2), ('pattern', 2), ('previous', 2), ('took', 2), ('dollar', 2), ('3', 2), ('trading', 2), ('strengthen', 1), ('dollar.+the', 1), ('dollar-ringgit', 1), ('exchange', 1), ('rate', 1), ('forming', 1), ('past', 1), ('led', 1), ('decline', 1), ('currency', 1), ('pair', 1), ('based', 1), ('technical', 1), ('charts', 1), ('+bloomberg', 1), ('reported', 1), ('wednesday', 1), ('occurs', 1), ('50-day', 1), ('moving', 1), ('average', 1), ('drops', 1), ('100-day', 1), ('gauge', 1), ('+it', 1), ('said', 1), ('three', 1), ('occasions', 1), ('move', 1), ('place', 1), ('posted', 1), ('additional', 1), ('losses', 1), ('7', 1), ('finding', 1), ('bottom', 1), ('+the', 1), ('underperformed', 1), ('asian', 1), ('currencies', 1), ('since', 1), ('policy', 1), ('makers', 1), ('steps', 1), ('november', 1), ('deter', 1), ('foreign', 1), ('banks', 1), ('non-deliverable', 1), ('forwards', 1), ('wire', 1), ('report', 1), ('said.+at', 1), ('

## STEP 3: Stemming with Porter Stemmer

In [6]:
from nltk.stem.porter import *

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)
count = Counter(stemmed)
print(count.most_common())

[(',', 6), ('ringgit', 3), ('.', 3), ('%', 3), ('us', 2), ('death-cross', 2), ('pattern', 2), ('currenc', 2), ('report', 2), ('move', 2), ('previou', 2), ('took', 2), ('dollar', 2), ('3', 2), ('trade', 2), ('strengthen', 1), ('dollar.+th', 1), ('dollar-ringgit', 1), ('exchang', 1), ('rate', 1), ('form', 1), ('past', 1), ('led', 1), ('declin', 1), ('pair', 1), ('base', 1), ('technic', 1), ('chart', 1), ('+bloomberg', 1), ('wednesday', 1), ('occur', 1), ('50-day', 1), ('averag', 1), ('drop', 1), ('100-day', 1), ('gaug', 1), ('+it', 1), ('said', 1), ('three', 1), ('occas', 1), ('place', 1), ('post', 1), ('addit', 1), ('loss', 1), ('7', 1), ('find', 1), ('bottom', 1), ('+the', 1), ('underperform', 1), ('asian', 1), ('sinc', 1), ('polici', 1), ('maker', 1), ('step', 1), ('novemb', 1), ('deter', 1), ('foreign', 1), ('bank', 1), ('non-deliver', 1), ('forward', 1), ('wire', 1), ('said.+at', 1), ('1.45pm', 1), ('4.4305', 1), ('close', 1), ('4.4312.+', 1)]


## STEP 4: tf-idf with Scikit-learn (combined)

In [7]:
import nltk
import string
import math
import os
from collections import Counter
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from collections import defaultdict 

In [8]:
path = '/home/muhdlaziem/Workspace/NLP/Week8/article'
token_dict ={}
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [9]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems  = stem_tokens(tokens, stemmer)
    return stems

for subdir, dirs, files in os.walk(path):
    for file in files:
        file_path =  subdir + os.path.sep + file
        print(file_path)
        article = open(file_path,'r')
        text = article.read()
        lowers = text.lower()
        no_punctuation = lowers.translate(string.punctuation)
        token_dict[file] = no_punctuation

/home/muhdlaziem/Workspace/NLP/Week8/article/art4.dat
/home/muhdlaziem/Workspace/NLP/Week8/article/art3.dat
/home/muhdlaziem/Workspace/NLP/Week8/article/art1.dat
/home/muhdlaziem/Workspace/NLP/Week8/article/art2.dat
/home/muhdlaziem/Workspace/NLP/Week8/article/art5.dat
/home/muhdlaziem/Workspace/NLP/Week8/article/art6.dat


In [10]:
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform((token_dict.values()))

  'stop_words.' % sorted(inconsistent))


In [11]:
def k_means(tfs):
    true_k=2
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=50,n_init=1)
    model.fit(tfs)
    print("Top terms per cluster: ")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = tfidf.get_feature_names()
    
    for i in range(true_k):
        print("Cluster %d: " % i)
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind])
k_means(tfs)           

Top terms per cluster: 
Cluster 0: 
 ,
 's
 wa
 feder
 kill
 servic
 hi
 unit
 ibrahimov
 game
Cluster 1: 
 trade
 ,
 %
 ringgit
 $
 billion
 high
 deficit
 thi
 death-cross
