In [16]:
from preprocessors import clean, remove_stopwords, tokenize
import pickle
import os
import csv

import pandas as pd
import numpy as np

import nltk
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Read the stopword file and flatten every CSV row into one list of words.
with open('id.stopwords.02.01.2016.txt', 'r') as file:
    # Give the reader its own name. Rebinding `csv` here would replace the
    # imported module with a reader object, so re-running this cell would
    # fail with AttributeError on `csv.reader`.
    stopword_reader = csv.reader(file)
    stopwords = []
    for row in stopword_reader:
        stopwords += row

In [3]:
# Load every pickled batch found in tweet-data/ into one flat list of tweets.
tweets = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/validation split downstream) reproducible.
for filename in sorted(os.listdir('tweet-data/')):
    # NOTE(review): pickle.load can execute arbitrary code; only keep files
    # from a trusted source in this directory.
    # The context manager closes each file handle as soon as it has been read.
    with open('tweet-data/' + filename, 'rb') as batch_file:
        subdata = pickle.load(batch_file)
    # Extend in place instead of rebuilding the whole list on every batch.
    tweets.extend(subdata)

In [4]:
def clean_tokens(tweet):
    """Return the tokens of ``tweet['text']`` after cleaning and stopword removal.

    The text is passed through ``clean``, then ``tokenize``, and the resulting
    tokens are filtered with ``remove_stopwords`` using the module-level
    ``stopwords`` list.
    """
    cleaned_text = clean(tweet['text'])
    raw_tokens = tokenize(cleaned_text)
    return remove_stopwords(raw_tokens, stopwords)

def create_fdist(tweets):
    """Build an nltk ``FreqDist`` over the cleaned tokens of all ``tweets``.

    Each tweet is tokenized with ``clean_tokens``; the tokens of every tweet
    are concatenated in input order before counting.
    """
    all_tokens = [token for tweet in tweets for token in clean_tokens(tweet)]
    return nltk.probability.FreqDist(all_tokens)

# Token frequency distribution over every loaded tweet; later cells pick the
# feature vocabulary from it.
fdist = create_fdist(tweets)
# Print a preview of the distribution.
fdist.pprint()
# Number of distinct tokens, shown as the cell's output value.
len(fdist)

FreqDist({'yg': 184, 'prabowo': 123, 'presiden': 118, 'jokowi': 115, '@prabowo': 103, '@jokowi': 74, '#jokowi2periode': 53, '#prabowosandi': 53, 'sandi': 50, 'boyolali': 49, ...})


3627

In [5]:
def extract_feature(tweet, features, capres):
    """Turn one tweet into a 1-D array of token counts followed by its label.

    Parameters:
        tweet: mapping with a ``'text'`` entry and an ``'aspect'`` mapping.
        features: ordered sequence of tokens; one count column per token.
        capres: key into ``tweet['aspect']`` selecting which label to append.

    Returns:
        ``np.hstack`` of the per-feature counts and the selected label, so the
        label is the last element.
    """
    token_counts = nltk.probability.FreqDist(clean_tokens(tweet))
    target = tweet['aspect'][capres]
    # Tokens absent from the tweet are counted as 0 by FreqDist.
    counts = [token_counts[feature] for feature in features]
    return np.hstack((counts, target))

def create_feature_matrix(tweets, features, capres):
    """Stack the feature rows of all ``tweets`` into one 2-D array.

    Parameters:
        tweets: iterable of tweets accepted by ``extract_feature``.
        features: ordered sequence of tokens used as count columns.
        capres: key selecting which label ``extract_feature`` appends.

    Returns:
        Array with one row per tweet and ``len(features) + 1`` columns. The
        last column is the label collapsed to binary: any non-zero label
        becomes 1, zero stays 0. Empty input yields an array of shape
        ``(0, len(features) + 1)``.
    """
    n_columns = len(features) + 1
    rows = []
    for tweet in tweets:
        row = extract_feature(tweet, features, capres)
        # Collapse every non-zero label value into the single class 1.
        if row[-1] != 0:
            row[-1] = 1
        rows.append(row)
    # Stack once at the end rather than calling vstack per tweet, which
    # recopied the whole matrix on every iteration. The leading empty float
    # block preserves the original empty-input shape and dtype promotion.
    return np.vstack([np.empty((0, n_columns))] + rows)

In [43]:
# Vocabulary: the 800 most frequent tokens of the corpus, in frequency order.
features = [token for token, _ in fdist.most_common(800)]
# One labelled feature matrix per candidate, built from the same vocabulary.
feature_matrix_jokowi, feature_matrix_prabowo = (
    create_feature_matrix(tweets, features, candidate)
    for candidate in ('jokowi', 'prabowo')
)

In [44]:
# Display the matrix as a sanity check; the last column holds the binary label.
feature_matrix_jokowi

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [2., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [45]:
# Split each candidate's matrix into features (all columns but the last) and
# labels (last column), then hold out 10% for validation with a fixed seed.
(train_jokowi, validation_jokowi,
 label_jokowi, label_validation_jokowi) = train_test_split(
    feature_matrix_jokowi[:, :-1],
    feature_matrix_jokowi[:, -1:].flatten(),
    test_size=0.1,
    random_state=12121,
)

(train_prabowo, validation_prabowo,
 label_prabowo, label_validation_prabowo) = train_test_split(
    feature_matrix_prabowo[:, :-1],
    feature_matrix_prabowo[:, -1:].flatten(),
    test_size=0.1,
    random_state=12121,
)

In [46]:
# Fit one Gaussian naive Bayes and one logistic regression model per candidate.
# `fit` returns the estimator itself, so construction and fitting can be chained.
nbc_jokowi = GaussianNB().fit(train_jokowi, label_jokowi)
nbc_prabowo = GaussianNB().fit(train_prabowo, label_prabowo)

lr_jokowi = LogisticRegression().fit(train_jokowi, label_jokowi)
lr_prabowo = LogisticRegression().fit(train_prabowo, label_prabowo)
# Keep the last fitted estimator as the cell's displayed value.
lr_prabowo

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
# Accuracy report. Each printed line is "<jokowi score> <prabowo score>".
# The first group is on the training data and the second on the validation
# data; within a group, naive Bayes comes before logistic regression.
model_pairs = ((nbc_jokowi, nbc_prabowo), (lr_jokowi, lr_prabowo))
data_splits = (
    ((train_jokowi, label_jokowi), (train_prabowo, label_prabowo)),
    ((validation_jokowi, label_validation_jokowi),
     (validation_prabowo, label_validation_prabowo)),
)

for split_index, (jokowi_split, prabowo_split) in enumerate(data_splits):
    # Blank line between the training and validation groups.
    if split_index > 0:
        print('')
    for jokowi_model, prabowo_model in model_pairs:
        print(jokowi_model.score(*jokowi_split), prabowo_model.score(*prabowo_split))

0.8644400785854617 0.831041257367387
0.9646365422396856 0.9646365422396856

0.7192982456140351 0.7368421052631579
0.8245614035087719 0.8245614035087719
