In [1]:
import pickle
import csv
import os
import nltk
import numpy as np

from sklearn.naive_bayes import GaussianNB
from preprocessors import clean, tokenize, remove_stopwords

#### Stopwords

In [2]:
# Read the stopword file into one flat list: every field of every CSV row
# becomes a separate stopword entry.
# The reader is consumed inline rather than bound to the name `csv`, which would
# shadow the imported module and make this cell fail when it is run a second time.
# newline='' is the file mode the csv module documentation asks for.
with open('id.stopwords.02.01.2016.txt', 'r', newline='') as stopword_file:
    stopwords = [word for row in csv.reader(stopword_file) for word in row]

#### Load Tweets

In [5]:
# Pickled tweet batches live in ./tweet-data relative to the working directory.
tweet_dir = os.path.join(os.getcwd(), 'tweet-data')
# os.listdir returns entries in arbitrary order; sorting makes the order of
# `tweets` (and everything derived from it) reproducible between runs/machines.
tweet_data = sorted(os.path.join(tweet_dir, name) for name in os.listdir(tweet_dir))

tweets = []
for tweet_path in tweet_data:
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at files from a trusted source.
    with open(tweet_path, 'rb') as tweet_file:
        # Each pickle is expected to hold a sequence of tweets; concatenate them.
        tweets += pickle.load(tweet_file)

# Last expression of the cell: total number of tweets loaded.
len(tweets)

566

#### Creating Bag of Words

In [6]:
def clean_tokens(tweet):
    """Return the stopword-filtered tokens of a tweet's text.

    The value of ``tweet['text']`` is passed through the ``clean``,
    ``tokenize`` and ``remove_stopwords`` helpers (in that order), using the
    module-level ``stopwords`` list as the filter.
    """
    cleaned_text = clean(tweet['text'])
    tokens = tokenize(cleaned_text)
    return remove_stopwords(tokens, stopwords)

def create_fdist(tweets):
    """Build one frequency distribution over the tokens of all tweets.

    Every tweet is run through ``clean_tokens``; the resulting tokens are
    pooled in tweet order and counted with ``nltk.probability.FreqDist``.
    """
    all_tokens = [token for tweet in tweets for token in clean_tokens(tweet)]
    return nltk.probability.FreqDist(all_tokens)

# Build the corpus-wide token frequency distribution once; the feature
# selection cell further down picks its vocabulary from it.
fdist = create_fdist(tweets)
# Print a preview of the most frequent tokens and their counts.
fdist.pprint()
# Last expression of the cell: number of distinct tokens in the distribution.
len(fdist)

FreqDist({'yg': 184, 'prabowo': 123, 'presiden': 118, 'jokowi': 115, '@prabowo': 103, '@jokowi': 74, '#prabowosandi': 53, '#jokowi2periode': 53, 'sandi': 50, '@sandiuno': 49, ...})


3627

In [14]:
def extract_feature(tweet, features, capres):
    """Encode a single tweet as a row of token counts followed by its label.

    Parameters:
        tweet: mapping with a ``'text'`` entry (consumed by ``clean_tokens``)
            and an ``'aspect'`` mapping indexed by ``capres``.
        features: iterable of tokens; one count column is produced per token,
            in the same order.
        capres: key selecting which label to read from ``tweet['aspect']``.

    Returns:
        A 1-D NumPy array: the per-feature token counts with the label
        appended as the final element.
    """
    token_counts = nltk.probability.FreqDist(clean_tokens(tweet))
    label = tweet['aspect'][capres]
    counts = [token_counts[feature] for feature in features]
    return np.hstack((counts, label))

def create_feature_matrix(tweets, features, capres):
    """Stack the feature rows of all non-neutral tweets into one matrix.

    Parameters:
        tweets: iterable of tweets accepted by ``extract_feature``.
        features: sequence of tokens used as count columns.
        capres: key selecting which label ``extract_feature`` appends.

    Returns:
        A 2-D array with ``len(features) + 1`` columns (counts, then the
        label in the last column) and one row per tweet whose label is not 0.
        When no tweet qualifies the result has shape
        ``(0, len(features) + 1)``.
    """
    n_columns = len(features) + 1

    kept_rows = []
    for tweet in tweets:
        row = extract_feature(tweet, features, capres)
        # A label of 0 marks a neutral tweet; those are left out.
        if row[-1] != 0:
            kept_rows.append(row)

    # Stack once instead of calling np.vstack per tweet, which re-copies the
    # growing matrix on every iteration. The leading empty float block keeps
    # the original result shape and dtype, including for an empty selection.
    return np.vstack([np.empty((0, n_columns))] + kept_rows)

In [15]:
# Number of most-frequent tokens kept as features.
# NOTE(review): the rationale for 1461 is not recorded in this notebook — confirm.
N_FEATURES = 1461

# Vocabulary: the N_FEATURES most frequent tokens, most frequent first.
features = [word for word, _count in fdist.most_common(N_FEATURES)]

# One count-plus-label matrix per candidate, built from the same vocabulary.
feature_matrix_jokowi = create_feature_matrix(tweets, features, 'jokowi')
feature_matrix_prabowo = create_feature_matrix(tweets, features, 'prabowo')

In [16]:
# Split each matrix into its count columns (everything but the last column)
# and a flat copy of the label column (the last column).
train_jokowi, label_jokowi = (
    feature_matrix_jokowi[:, :-1],
    feature_matrix_jokowi[:, -1].copy(),
)

train_prabowo, label_prabowo = (
    feature_matrix_prabowo[:, :-1],
    feature_matrix_prabowo[:, -1].copy(),
)

In [17]:
# Fit one Gaussian naive Bayes classifier per candidate; fit() returns the
# estimator itself, so construction and fitting can be chained.
nbc_jokowi = GaussianNB().fit(train_jokowi, label_jokowi)
nbc_prabowo = GaussianNB().fit(train_prabowo, label_prabowo)

# Last expression of the cell: display the fitted estimator.
nbc_prabowo

GaussianNB(priors=None)

In [18]:
# Each model is scored on the same rows it was fitted on, so these figures
# describe fit to the training data, not performance on unseen tweets.
print(nbc_jokowi.score(train_jokowi, label_jokowi),
      nbc_prabowo.score(train_prabowo, label_prabowo))

0.996078431373 0.980694980695
