In [1]:
import csv
import time
from math import sqrt, floor
from random import randint
import pandas as pd
import numpy as np
import re, string
from collections import Counter
from itertools import dropwhile

In [2]:
#Preprocessing

def remove_punctuations(text):
    punctuations = string.punctuation
    translations = text.maketrans(punctuations, ''.join([' ']*len(punctuations)))
    return text.translate(translations)

def remove_numerics(string):
    return re.sub(r'\d+', '', string)

def remove_hashtags(string):
    return re.sub(r'#\w+', '', string)

def remove_handles(string):
    return re.sub(r'@\w+', '', string)

url_re = r'(http:\/\/)?t\.co\/([a-zA-Z0-9])+'
def remove_urls(text):
    return re.sub(url_re, '', text)

def remove_stopwords(string, stopwords):
    return ' '.join(['' if word in stopwords else word for word in string.split()])

def is_stopword(word):
    return word in stop_words

def is_hashtag(word):
    return re.match(r'#\w+', word)

def is_handle(word):
    return re.match(r'@\w+', word)

def is_numeric(word):
    return re.match(r'^\d+$', word)

def get_valid_word(word):
    word = word.lower()
    sw = is_stopword(word)
    ht = is_hashtag(word)
    th = is_handle(word)
    if sw or ht or th:
        return ''
    word = remove_punctuations(word)
    return '' if is_numeric(word) else word
    
    return word

# param: tweets string[], args = {}
def create_bow(tweets, args):
    bow = []
    text = ' '.join(tweets)
    text = text.lower()
    if len(args['stopwords']):
        print('removing stop words...')
        text = remove_stopwords(text, args['stopwords'])
    if args['remove_urls']:
        print('removing urls...')
        text = remove_urls(text)
    if args['remove_handles']:
        print('removing handles...')
        text = remove_handles(text)
    if args['remove_hashtags']:
        print('removing hashtags...')
        text = remove_hashtags(text)
    if args['remove_punctuations']:
        print('removing punctuations...')
        text = remove_punctuations(text)
    if args['remove_numerics']:
        print('removing numerics...')
        text = remove_numerics(text)
    text = text.split()
    bow = Counter(text)
    most_common = args.get('most_common', 0)
    if args['remove_singles']:
        for k, c in dropwhile(lambda item: item[1]>1, bow.most_common()):
            del bow[k]
    if most_common:
        print('returning most common {}'.format(most_common))
        return bow.most_common(most_common)
    return bow

def tokenize(text, args):
    text = text.lower()
    if len(args['stopwords']):
        text = remove_stopwords(text, args['stopwords'])
    if args['remove_urls']:
        text = remove_urls(text)
    if args['remove_handles']:
        text = remove_handles(text)
    if args['remove_hashtags']:
        text = remove_hashtags(text)
    if args['remove_punctuations']:
        text = remove_punctuations(text)
    if args['remove_numerics']:
        text = remove_numerics(text)
    words = text.split()
    return words

In [12]:
Label = { 'positive': 1, 'negative': -1, 'neutral': 0 }
labels = [-1, 0, 1]
def process_data(bow, tweets, labels, options):
    vocab_size = len(bow)
    sample_size = len(tweets)
    vocab = list(bow)
    data_x = np.zeros((sample_size, vocab_size))
    for i, sample in enumerate(tweets):
        for word in tokenize(sample, options):
            try:
                word_index = vocab.index(word)
                data_x[i][word_index] += 1
            except:
                pass
    return data_x, [Label[l] for l in labels]

In [4]:
train_df = pd.read_csv('data/train/train.csv')
test_df = pd.read_csv('data/test/test.csv')

stopwprds = []
with open('data/stop_words.txt', 'r') as fd:
    stopwords = fd.read().split('\n')

bow_options = {'stopwords': stopwords,
               'remove_urls': True,
               'remove_handles': True,
               'remove_hashtags': False,
               'remove_punctuations': True,
               'remove_numerics': True,
               'remove_singles': False,
              }

bow = create_bow(train_df['Tweet'], bow_options)

starttime = time.time()
training_x, training_y = process_data(bow, train_df['Tweet'], train_df['Sentiment'], bow_options)
test_x, test_y = process_data(bow, test_df['Tweet'], test_df['Sentiment'], bow_options)
endtime = time.time()
print('Time taken to create BOW', endtime - starttime)

removing stop words...
removing urls...
removing handles...
removing punctuations...
removing numerics...
Time taken to create BOW 2.659074306488037


In [5]:
def compute_euclidean(v1, v2):
    raw_distance = v1-v2
    squared_distance = np.dot(raw_distance, raw_distance)
    d = sqrt(squared_distance)
    return d

In [6]:
def compute_distance_matrix(training_x, test_x):
    return np.array([[compute_euclidean(ts, tr) for tr in training_x] for ts in test_x])

print('computing distance matrix...')
starttime = time.time()
distance_matrix = compute_distance_matrix(training_x, test_x)
endtime = time.time()
print('distance matrix computed')
print('time taken to compute distance matrix', endtime - starttime)
#time taken to compute distance matrix 271.697865486145 [with multiply sum] 

computing distance matrix...
distance matrix computed
time taken to compute distance matrix 988.8986740112305


In [33]:
def partition(arr, l, r):       
    x = arr[r] 
    i = l 
    for j in range(l, r): 
        if arr[j] <= x: 
            arr[i], arr[j] = arr[j], arr[i] 
            i += 1              
    arr[i], arr[r] = arr[r], arr[i] 
    return i 

def quick_select(arr, l, r, k): 
    if (k > 0 and k <= r - l + 1): 
        index = partition(arr, l, r) 
        if (index - l == k - 1): 
            return arr[index] 
        if (index - l > k - 1): 
            return quick_select(arr, l, index - 1, k) 
        return quick_select(arr, index + 1, r, k - index + l - 1) 
    return INT_MAX

def get_kth_smallest(arr, k):
    n = len(arr) 
    return quick_select(arr, 0, n - 1, k)

def get_k_nearest_indexes(distance_matrix, k):
    temp_distance = np.array(distance_matrix)
    kth_smallest = get_kth_smallest(temp_distance, k)
    for i, d in enumerate(distance_matrix):
        if d <= kth_smallest:
            yield i

def get_mod(arr):
    label_counter = Counter(arr)
    max_count = label_counter.most_common(1)[0][1]
    return [k for k, c in label_counter.items() if c == max_count]
    
def assign_label(distance_matrix, training_labels, k):
    k_nearest_labels = [training_labels[i] for i in get_k_nearest_indexes(distance_matrix, k)]
    mod = get_mod(k_nearest_labels)
    if len(mod) > 1:
        if k == 1:
            return mod[randint(0, len(mod) - 1)]
        return assign_label(distance_matrix, training_labels, k-1)
    return mod[0]

def compute_accuracy(gold, predicted):
    if len(gold) != len(predicted):
        print('label arrays should have same size')
        return
    correct = 0
    incorrect = 0
    for i in range(len(gold)):
        if gold[i] == predicted[i]:
            correct += 1
        else:
            incorrect += 1
    return correct/(correct + incorrect)

def compute_performance(gold, prediction):
    accuracy = compute_accuracy(gold, prediction)
    return accuracy

def predict(distance_matrix, training_y, k):
    test_size = distance_matrix.shape[0]
    return [assign_label(distance_matrix[test_i], training_y, k) for test_i in range(test_size)]

def generate_confusion_matrix(gold, prediction, labels):
    if len(gold) != len(prediction):
        print('label arrays should have same size')
        return
    label_count = len(labels)
    cm = np.zeros((label_count, label_count))
    for i in range(len(prediction)):
        cm[labels.index(prediction[i])][labels.index(gold[i])] += 1
    return cm

print('starting prediction...')
starttime = time.time()
#for k in [3]:
prediction = predict(distance_matrix, training_y, 2)
endtime = time.time()
print('completed prediction')
print('Time taken to compute predictions', endtime - starttime)

starting prediction...
completed prediction
Time taken to compute predictions 30.336062908172607


In [34]:
def compute_macro_average(prediction, gold, labels):
    label_count = len(labels)
    cm = generate_confusion_matrix(test_y, prediction, labels)
    print(cm)
    print('-------------')
    precision = 0
    recall = 0
    for i in range(label_count):
        negatives = [n for n in range(label_count) if n != i]
        tp = cm[i][i]
        fn = np.array([cm[j][i] for j in negatives]).sum()
        fp = np.array([cm[i][j] for j in negatives]).sum()
        tn = np.array([np.array([cm[j][k] for j in negatives]).sum() for k in negatives]).sum()
        print('{}\t{}'.format(tp, fp))
        print('{}\t{}'.format(fn, tn))
        print('-------({})--------'.format(labels[i]))

        precision += (tp/(tp+fp))
        recall += (tp/(tp+fn))
    macro_avg_precision = precision/label_count
    macro_avg_recall = recall/label_count
    f1_score = (2*macro_avg_precision*macro_avg_recall)/(macro_avg_precision+macro_avg_recall)
    return {'f1': f1_score, 'precision': macro_avg_precision, 'recall': macro_avg_recall}

In [39]:
macro = compute_macro_average(prediction, test_y, [-1, 0, 1])
print('F1 Score: %.4f' % macro['f1'])
print('Precision: %.4f' % macro['precision'])
print('Recall: %.4f' % macro['recall'])


[[799.  89.  46.]
 [861. 438. 135.]
 [174.  88. 291.]]
-------------
799.0	135.0
1035.0	952.0
-------(-1)--------
438.0	996.0
177.0	1310.0
-------(0)--------
291.0	262.0
181.0	2187.0
-------(1)--------
F1 Score: 0.5750
Precision: 0.5624
Recall: 0.5881
True


In [32]:
print(len(training_x[0]))

10060


In [8]:
#removed everything
#starting prediction...
#Accuracy for k=1 is 0.5234508729887025
#Accuracy for k=3 is 0.47107155083875385
#Accuracy for k=5 is 0.4577199589181787
#Accuracy for k=7 is 0.4471071550838754
#Accuracy for k=10 is 0.43957548784662787
#completed prediction
#Time taken to compute predictions 166.01523804664612