In [13]:
%matplotlib inline
import csv
import nltk
#from nltk.stem.porter import PorterStemmer
from stem import IndonesianStemmer
import string
#import os
import re
from collections import defaultdict

In [1]:
#stopwords = nltk.corpus.stopwords.words('indonesian')

In [2]:
stemmer = IndonesianStemmer()
tokenizer = nltk.tokenize.RegexpTokenizer("[\w']+", flags=re.UNICODE)

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    global stemmer, tokenizer
    #tokens = nltk.word_tokenize(text, language="indonesian")
    tokens = tokenizer.tokenize(text)
    
    stems = stem_tokens(tokens, stemmer)
    return stems

In [4]:
token_dict = {}
with open('/data/corpus.csv') as raw_file:
    for (prod_id, title, _) in csv.reader(raw_file):
        token_dict[int(prod_id)] = tokenize(
            title.lower().translate(None, string.punctuation))

In [11]:
collection = nltk.text.TextCollection(token_dict.values())

In [12]:
#get a list of unique terms
unique_terms = list(set(collection))
print "Unique terms found: ", len(unique_terms)

Unique terms found:  101633


In [72]:
# Function to create a TF*IDF vector for one document.  For each of
# our unique words, we have a feature which is the td*idf for that word
# in the current document
def TFIDF(document):
    global collection
    word_tfidf = defaultdict(lambda: 0)
    for word in unique_terms:
        ifidf = collection.tf_idf(word,document)
        if ifidf > 0.01:
            word_tfidf[word] = ifidf
    return word_tfidf

In [35]:
%time terms = [(t, collection.idf(t)) for t in unique_terms]

CPU times: user 49min 37s, sys: 1.17 s, total: 49min 38s
Wall time: 49min 35s


In [77]:
# Inverted Document Frequencies (of the unique terms):
sorted(terms, key=lambda t: -t[1])[:30] + ["..."] + sorted(terms, key=lambda t: -t[1])[-10:]

[('fawn', 12.339562787303892),
 ('tsukino', 12.339562787303892),
 ('5quot', 12.339562787303892),
 ('400410', 12.339562787303892),
 ('daiich', 12.339562787303892),
 ('asustrixgtx750tioc2gd5', 12.339562787303892),
 ('lodyne', 12.339562787303892),
 ('inlite', 12.339562787303892),
 ('n7100note', 12.339562787303892),
 ('ds1015', 12.339562787303892),
 ('avanzaxeniakonfim', 12.339562787303892),
 ('br226', 12.339562787303892),
 ('xonmedaxon', 12.339562787303892),
 ('40z125ml', 12.339562787303892),
 ('spdquick', 12.339562787303892),
 ('ehnd11a', 12.339562787303892),
 ('replacer', 12.339562787303892),
 ('eos450d', 12.339562787303892),
 ('xp748fe', 12.339562787303892),
 ('bb402apinktua', 12.339562787303892),
 ('cp145335', 12.339562787303892),
 ('ks11', 12.339562787303892),
 ('wednesday', 12.339562787303892),
 ('330ma', 12.339562787303892),
 ('chameleons', 12.339562787303892),
 ('x0821656565s7x', 12.339562787303892),
 ('elgar', 12.339562787303892),
 ('adssuara', 12.339562787303892),
 ('330ml', 12.

In [None]:
collection.

In [73]:
%time len([TFIDF(token_dict[k]) for k in token_dict.keys()[1000:2000]])

CPU times: user 2min 9s, sys: 844 ms, total: 2min 10s
Wall time: 2min 9s


1000

In [None]:
# Vectorize products:
vectors = [(prod_id,TFIDF(token_dict[k])) for prod_id in token_dict.keys()]

In [60]:
sorted (_.values())

[0.08125752443140973,
 0.08418035664711482,
 0.08471049230957717,
 0.08572306924639782,
 0.09490421607068966,
 0.09610310742401315,
 0.09888906653094279,
 0.10309338499850827,
 0.11219112130519665,
 0.11575702874521299,
 0.11606919200875113,
 0.11961919908903075,
 0.12047141587224015,
 0.12249164852945398,
 0.1252307888935559,
 0.1273432417639753,
 0.12874552394640823,
 0.13010061483301233,
 0.13415534476661178,
 0.13481606573946714,
 0.1350518254670126,
 0.14870098089993758,
 0.15073557285628914,
 0.15355791836801627,
 0.15486953902992615,
 0.15990072051809956,
 0.1616393269183114,
 0.16506548307544766,
 0.1704401742606427,
 0.17254841538315738,
 0.17449490419424216,
 0.17940906916324698,
 0.18033942578217022,
 0.18783448628517618,
 0.20636806708149222,
 0.21031920747208285,
 0.21210899912641143,
 0.22731659139168264,
 0.22917804970967187,
 0.25458938197304437,
 0.2869665764489277,
 0.2869665764489277]

In [None]:
prod_ids = sorted(token_dict.keys())
for i in range(5):
    for j in range(i,7):
        nltk_repr.similar()

In [None]:
#this can take some time
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='indonesian')
tfs = tfidf.fit_transform(token_dict.values())