In [4]:
from scipy.sparse import dok_matrix
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from random import shuffle
import csv
from utils.processing import tokenize_text, process_tokens

In [2]:
import os
os.getcwd()

'c:\\Uni\\TTDS'

In [5]:
def convert_to_bow(data, word2id):
    matrix_size = (len(data), len(word2id)+1)
    oov_index = len(word2id)
    bow = dok_matrix(matrix_size)
    for doc_id, doc in enumerate(data):
        for word in doc:
            word_id = word2id.get(word, oov_index)
            bow[doc_id, word_id] += 1
    return bow

In [6]:
def preprocess_data(data):
    documents = []
    categories = []
    vocab = set()
    # Skip the header
    lines = data.split("\n")[1:]
    shuffle(lines)
    
    for line in lines:
        if not line:
            continue
        tweet_id, category, tweet = line.split("\t")
        tokens = tokenize_text(tweet)
        # processed_tokens = process_tokens(tokens)
        processed_tokens = tokens
        documents.append(processed_tokens)
        categories.append(category)
        vocab.update(processed_tokens)
    
    word2id = {word: i for i, word in enumerate(vocab)}
    cat2id = {cat: i for i, cat in enumerate(set(categories))}

    return documents, categories, vocab, word2id, cat2id


In [7]:
train_data = open('data/collections/train.txt', encoding="utf-8").read()
test_data = open('data/collections/test.txt', encoding="utf-8").read()

train_docs, train_cats, train_vocab, word2id, cat2id = preprocess_data(train_data)
cat_names = []
for cat,cid in sorted(cat2id.items(),key=lambda x:x[1]):
    cat_names.append(cat)
# baseline
X = train_docs 
Y = [cat2id[cat] for cat in train_cats]
X_train, X_dev, Y_train, Y_dev = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train_BoW = convert_to_bow(X_train, word2id)
X_dev_BoW = convert_to_bow(X_dev, word2id)


In [8]:
print(cat2id)

{'positive': 0, 'neutral': 1, 'negative': 2}


In [15]:
model = SVC(C=1000, kernel='linear')
model.fit(X_train_BoW, Y_train)

In [9]:
def export_results(reports):
    # reports: [{system: str, split: str, report: classification_report}]
    with open("data/cw2/classification.csv", "w", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerow("system,split,p-pos,r-pos,f-pos,p-neg,r-neg,f-neg,p-neu,r-neu,f-neu,p-macro,r-macro,f-macro".split(","))
        # 
        for report in reports:
            metrics = []
            # 
            for cat in ["positive", "negative", "neutral"]:
                data = report["report"][cat]
                for metric in ["precision", "recall", "f1-score"]:
                    metrics.append(round(data[metric], 3))
            # 
            macros = report["report"]["macro avg"]
            for metric in ["precision", "recall", "f1-score"]:
                metrics.append(round(macros[metric], 3))
            # 
            writer.writerow([report["system"], report["split"], *metrics])


In [10]:
reports = []

### Train data baseline

In [16]:
# baseline train
Y_train_pred = model.predict(X_train_BoW)
train_report = classification_report(Y_train, Y_train_pred, output_dict=True, target_names=cat_names)
reports.append({"system": "baseline", "split": "train", "report": train_report})
print(classification_report(Y_train, Y_train_pred, target_names=cat_names))

              precision    recall  f1-score   support

    positive       1.00      1.00      1.00      4784
     neutral       1.00      1.00      1.00      7050
    negative       1.00      1.00      1.00      3082

    accuracy                           1.00     14916
   macro avg       1.00      1.00      1.00     14916
weighted avg       1.00      1.00      1.00     14916



### Dev data baseline

In [17]:
# baseline dev
Y_dev_pred = model.predict(X_dev_BoW)
dev_report = classification_report(Y_dev, Y_dev_pred, output_dict=True, target_names=cat_names)
reports.append({"system": "baseline", "split": "dev", "report": dev_report})
print(classification_report(Y_dev, Y_dev_pred, target_names=cat_names))

              precision    recall  f1-score   support

    positive       0.57      0.63      0.60      1195
     neutral       0.61      0.61      0.61      1739
    negative       0.56      0.47      0.51       796

    accuracy                           0.58      3730
   macro avg       0.58      0.57      0.57      3730
weighted avg       0.58      0.58      0.58      3730



In [12]:
# print 3 misclassified examples from the dev set
cnt = 0
for i, (gold, pred) in enumerate(zip(Y_dev, Y_dev_pred)):
    if gold != pred:
        cnt += 1
        # labels
        print("Gold:", cat_names[gold], "Pred:", cat_names[pred])
        # text
        # print(X_dev[i])
        print(" ".join(X_train[i]))
        print()
    if cnt == 3:
        break

Gold: neutral Pred: negative
juventus have sign cuadrado and presenting him like its the 2nd coming of del piero why o

Gold: neutral Pred: positive
https t co 6hmmdiz0qp - turns out immigration does not harm the economy - but we knew that already

Gold: positive Pred: neutral
rod serling wrote or adapted two-thirds of the 156 episodes of the twilight zone and the quality never wavered still the gold standard



### Test data baseline

In [11]:
test_docs, test_cats, _, _, _ = preprocess_data(test_data)
X_test_BoW = convert_to_bow(test_docs, word2id)
Y_test = [cat2id[cat] for cat in test_cats]

In [18]:
Y_test_pred = model.predict(X_test_BoW)
test_report = classification_report(Y_test, Y_test_pred, output_dict=True, target_names=cat_names)
reports.append({"system": "baseline", "split": "test", "report": test_report})
print(classification_report(Y_test, Y_test_pred, target_names=cat_names))

              precision    recall  f1-score   support

    positive       0.55      0.61      0.58      1495
     neutral       0.60      0.60      0.60      2197
    negative       0.54      0.45      0.49       970

    accuracy                           0.57      4662
   macro avg       0.56      0.55      0.55      4662
weighted avg       0.57      0.57      0.57      4662



### DistillBert results

In [1]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_model")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
tokenizer = DistilBertTokenizer.from_pretrained("./fine_tuned_model")

In [13]:

splits = ["dev", "train", "test"]
for i, (collection, labels) in enumerate([(X_dev, Y_dev), (X_train, Y_train), (test_docs, Y_test)]):
    tweets = [" ".join(doc) for doc in collection]
    all_logits = None
    print()
    print(f"Processing {splits[i]} set")
    # do in batches of size 32
    N = 20
    preds = []
    for j in tqdm(range(0, len(tweets), N)):
        inputs = tokenizer(tweets[j:j+N], padding=True, truncation=True, return_tensors="pt")
        outputs = model(**inputs)
        logits = outputs.logits
        ps = torch.argmax(logits, dim=1).tolist()
        if j == 0:
            print(ps)
        preds.extend(ps)
    # get classification report
    report = classification_report(labels, preds, output_dict=True, target_names=cat_names)
    print(classification_report(labels, preds, target_names=cat_names))
    reports.append({"system": "improved", "split": splits[i], "report": report})


Processing dev set


  1%|          | 1/187 [00:00<00:45,  4.07it/s]

[0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 0, 1, 0, 1, 1, 1, 1, 1, 1]


100%|██████████| 187/187 [00:45<00:00,  4.11it/s]


              precision    recall  f1-score   support

    positive       0.84      0.73      0.78      1195
     neutral       0.74      0.83      0.78      1739
    negative       0.79      0.72      0.75       796

    accuracy                           0.78      3730
   macro avg       0.79      0.76      0.77      3730
weighted avg       0.78      0.78      0.78      3730


Processing train set


  0%|          | 1/746 [00:00<03:27,  3.60it/s]

[1, 1, 2, 1, 1, 0, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1]


100%|██████████| 746/746 [03:01<00:00,  4.10it/s]


              precision    recall  f1-score   support

    positive       0.82      0.74      0.78      4784
     neutral       0.74      0.82      0.78      7050
    negative       0.79      0.72      0.75      3082

    accuracy                           0.77     14916
   macro avg       0.79      0.76      0.77     14916
weighted avg       0.78      0.77      0.77     14916


Processing test set


  0%|          | 1/234 [00:00<01:00,  3.87it/s]

[0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 2, 0, 2, 1]


100%|██████████| 234/234 [00:56<00:00,  4.14it/s]

              precision    recall  f1-score   support

    positive       0.76      0.68      0.72      1495
     neutral       0.68      0.75      0.71      2197
    negative       0.67      0.63      0.65       970

    accuracy                           0.70      4662
   macro avg       0.70      0.69      0.69      4662
weighted avg       0.70      0.70      0.70      4662






### Export results

In [19]:
export_results(reports)
