In [125]:
from scipy.sparse import dok_matrix
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from random import shuffle
import csv
from utils.processing import tokenize_text, process_tokens

In [126]:
import os
os.getcwd()

'c:\\Uni\\TTDS'

In [127]:
def convert_to_bow(data, word2id):
    matrix_size = (len(data), len(word2id)+1)
    oov_index = len(word2id)
    bow = dok_matrix(matrix_size)
    for doc_id, doc in enumerate(data):
        for word in doc:
            word_id = word2id.get(word, oov_index)
            bow[doc_id, word_id] += 1
    return bow

In [128]:
def preprocess_data(data):
    documents = []
    categories = []
    vocab = set()
    # Skip the header
    lines = data.split("\n")[1:]
    shuffle(lines)
    
    for line in lines:
        if not line:
            continue
        tweet_id, category, tweet = line.split("\t")
        tokens = tokenize_text(tweet)
        # processed_tokens = process_tokens(tokens)
        processed_tokens = tokens
        documents.append(processed_tokens)
        categories.append(category)
        vocab.update(processed_tokens)
    
    word2id = {word: i for i, word in enumerate(vocab)}
    cat2id = {cat: i for i, cat in enumerate(set(categories))}

    return documents, categories, vocab, word2id, cat2id


In [129]:
train_data = open('data/collections/train.txt', encoding="utf-8").read()
test_data = open('data/collections/test.txt', encoding="utf-8").read()

train_docs, train_cats, train_vocab, word2id, cat2id = preprocess_data(train_data)
cat_names = []
for cat,cid in sorted(cat2id.items(),key=lambda x:x[1]):
    cat_names.append(cat)
# baseline
X = convert_to_bow(train_docs, word2id)
Y = [cat2id[cat] for cat in train_cats]


X_train, X_dev, Y_train, Y_dev = train_test_split(X, Y, test_size=0.2, random_state=42)

In [130]:
model = SVC(C=1000, kernel='linear')
model.fit(X_train, Y_train)

In [131]:
def export_results(reports):
    # reports: [{system: str, split: str, report: classification_report}]
    with open("data/cw2/classification.csv", "w", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerow("system,split,p-pos,r-pos,f-pos,p-neg,r-neg,f-neg,p-neu,r-neu,f-neu,p-macro,r-macro,f-macro".split(","))
        # 
        for report in reports:
            metrics = []
            # 
            for cat in ["positive", "negative", "neutral"]:
                data = report["report"][cat]
                for metric in ["precision", "recall", "f1-score"]:
                    metrics.append(round(data[metric], 3))
            # 
            macros = report["report"]["macro avg"]
            for metric in ["precision", "recall", "f1-score"]:
                metrics.append(round(macros[metric], 3))
            # 
            writer.writerow([report["system"], report["split"], *metrics])


In [132]:
reports = []

### Train data baseline

In [133]:
# baseline train
Y_train_pred = model.predict(X_train)
train_report = classification_report(Y_train, Y_train_pred, output_dict=True, target_names=cat_names)
reports.append({"system": "baseline", "split": "train", "report": train_report})
print(classification_report(Y_train, Y_train_pred, target_names=cat_names))

              precision    recall  f1-score   support

     neutral       1.00      1.00      1.00      7039
    positive       1.00      1.00      1.00      4768
    negative       1.00      1.00      1.00      3109

    accuracy                           1.00     14916
   macro avg       1.00      1.00      1.00     14916
weighted avg       1.00      1.00      1.00     14916



### Dev data baseline

In [134]:
# baseline dev
Y_dev_pred = model.predict(X_dev)
dev_report = classification_report(Y_dev, Y_dev_pred, output_dict=True, target_names=cat_names)
reports.append({"system": "baseline", "split": "dev", "report": dev_report})
print(classification_report(Y_dev, Y_dev_pred, target_names=cat_names))

              precision    recall  f1-score   support

     neutral       0.58      0.61      0.60      1750
    positive       0.61      0.59      0.60      1211
    negative       0.53      0.49      0.51       769

    accuracy                           0.58      3730
   macro avg       0.57      0.56      0.57      3730
weighted avg       0.58      0.58      0.58      3730



In [None]:
# print 3 misclassified examples from the dev set
cnt = 0
for i, (gold, pred) in enumerate(zip(Y_dev, Y_dev_pred)):
    if gold != pred:
        cnt += 1
        # labels
        print("Gold:", cat_names[gold], "Pred:", cat_names[pred])
        # text
        # print(X_dev[i])
        print(" ".join(train_docs[X_train.shape[0] + i]))
        print()
    if cnt == 3:
        break

Gold: neutral Pred: positive
this book is a slap in the face to all muslims living across the middle east who r trying to fix a broken religion https t co aztf0stonv

Gold: neutral Pred: positive
kurt cobain solo release http t co tmrslrvlyx any real nirvana fan will not be buying this just money grabbing brett morgan and courtney

Gold: positive Pred: neutral
i really sat through and watched bad blood music video becos i found out my gf was in it

Gold: neutral Pred: negative
kpop fans are all excited to see got7 at the smart araneta coliseum this november long queue happening now at http t co grj9j5kp5m

Gold: negative Pred: neutral
im siltitng in my room the sun is rising outside my windwo and im laughign at my fucking naruto joke

Gold: neutral Pred: positive
who wants to see ant-man with me tomorrow



### Test data baseline

In [137]:
test_docs, test_cats, _, _, _ = preprocess_data(test_data)
X_test = convert_to_bow(test_docs, word2id)
Y_test = [cat2id[cat] for cat in test_cats]

In [138]:
Y_test_pred = model.predict(X_test)
test_report = classification_report(Y_test, Y_test_pred, output_dict=True, target_names=cat_names)
reports.append({"system": "baseline", "split": "test", "report": test_report})
print(classification_report(Y_test, Y_test_pred, target_names=cat_names))

              precision    recall  f1-score   support

     neutral       0.59      0.65      0.62      2197
    positive       0.60      0.58      0.59      1495
    negative       0.53      0.45      0.49       970

    accuracy                           0.58      4662
   macro avg       0.58      0.56      0.57      4662
weighted avg       0.58      0.58      0.58      4662



### Export results

In [139]:
export_results(reports)
