In [1]:
from scipy.sparse import dok_matrix
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from utils.processing import tokenize_text, process_tokens

In [2]:
import os
os.getcwd()

'/afs/inf.ed.ac.uk/user/s21/s2150635/Desktop/ttds'

In [3]:
def convert_to_bow(data, word2id):
    matrix_size = (len(data), len(word2id)+1)
    oov_index = len(word2id)
    bow = dok_matrix(matrix_size)
    for doc_id, doc in enumerate(data):
        for word in doc:
            word_id = word2id.get(word, oov_index)
            bow[doc_id, word_id] += 1
    return bow

In [4]:
def preprocess_data(data):
    documents = []
    categories = []
    vocab = set()
    
    for line in data.split("\n"):
        if not line:
            continue
        tweet_id, category, tweet = line.split("\t")
        tokens = tokenize_text(tweet)
        processed_tokens = process_tokens(tokens)
        documents.append(processed_tokens)
        categories.append(category)
        vocab.update(processed_tokens)
    
    word2id = {word: i for i, word in enumerate(vocab)}
    cat2id = {cat: i for i, cat in enumerate(set(categories))}

    return documents, categories, vocab, word2id, cat2id


In [8]:
train_data = open('data/collections/train.txt', encoding="utf-8").read()
test_data = open('data/collections/test.txt', encoding="utf-8").read()

In [7]:
train_data = open('data/collections/train.txt', encoding="utf-8").read()
train_docs, train_cats, train_vocab, word2id, cat2id = preprocess_data(train_data)
# baseline
X_train = convert_to_bow(train_docs, word2id)
Y_train = [cat2id[cat] for cat in train_cats]
model = SVC(C=1000, kernel='linear')
model.fit(X_train, Y_train)

SVC(C=1000, kernel='linear')

In [9]:
test_docs, test_cats, _, _, _ = preprocess_data(test_data)
X_test = convert_to_bow(test_docs, word2id)
Y_test = [cat2id[cat] for cat in test_cats]

In [10]:
Y_test_pred = model.predict(X_test)

In [11]:
cat_names = []
for cat,cid in sorted(cat2id.items(),key=lambda x:x[1]):
    cat_names.append(cat)
print(classification_report(Y_test, Y_test_pred, target_names=cat_names))

              precision    recall  f1-score   support

   sentiment       0.07      1.00      0.13         1
    negative       0.44      0.54      0.49       970
    positive       0.55      0.54      0.55      1495
     neutral       0.58      0.52      0.55      2197

    accuracy                           0.53      4663
   macro avg       0.41      0.65      0.43      4663
weighted avg       0.54      0.53      0.54      4663

