In [1]:
import numpy as np
import pandas as pd
import gensim
import pandas as pd
import os 

from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from gensim.models import KeyedVectors
from typing import List

In [2]:
# Loading data
data_path = "./data"

train_df = pd.read_csv(os.path.join(data_path, "train_data.csv"))
test_df = pd.read_csv(os.path.join(data_path, "test_data.csv"))

train_texts, train_labels = train_df.text.tolist(), train_df.label.tolist()
test_texts, test_labels = test_df.text.tolist(), test_df.label.tolist()

# Simple whitespace split
tok_train_texts = [[w for w in txt.split() if w != ""] for txt in train_texts]
tok_test_texts = [[w for w in txt.split() if w != ""] for txt in test_texts]

In [3]:
# Experimentally-determined hyperparameters for Word2Vec
hyperparameters = {
    "sentences": tok_train_texts,
    "vector_size": 150,
    "window": 5, 
    "min_count": 5, 
    "sg": 0, 
    "epochs": 2, 
    "negative": 7, 
    "seed": 1
}

# Creating word2vec model
model = gensim.models.Word2Vec(**hyperparameters)                         

print("Created word2vec model...")

Created word2vec model...


In [4]:
# Courtesy for the code to https://www.kaggle.com/code/mehmetlaudatekman/tutorial-word-embeddings-with-svm
class Sequencer():
    def __init__(self,
                 all_words: List[str],
                 max_words: int,
                 seq_len: int,
                 vector_size: int,
                 embedding_matrix: KeyedVectors) -> None:
        
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix

        temp_vocab = list(set(all_words))
        self.vocab = []

        self.word_cnts = {}
        self.vector_size = vector_size

        for word in temp_vocab:
            count = len([0 for w in all_words if w == word])  # counting words, 0 is used a placeholder
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))

        # Sorting counts and indices by number of in descending order
        # Indices are used to find the most used N words
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1

        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])

    def textToVector(self, text: str):
        # Split text into tokens, get the length
        # If shorter than max. length - add spaces, if longer - trim from the end

        tokens = text.split()   
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1

        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            # In case token did not appear in training data
            except Exception as E:
                pass

        last_pieces = self.seq_len - len(vec)
        for _ in range(last_pieces):
            vec.append(np.zeros(self.vector_size,))

        return np.asarray(vec).flatten()

In [5]:
max_words = 1500
sequence_length = 200

# Init. seqeuncer
sequencer = Sequencer(all_words = [token for seq in tok_train_texts for token in seq],
                      max_words=max_words,
                      seq_len=sequence_length,
                      vector_size=hyperparameters["vector_size"],
                      embedding_matrix=model.wv)

In [6]:
# Utilizing PCA to reduce dimensions while preserving most of the information (~99%)
pca_model = PCA(n_components=100,
            svd_solver="randomized", 
            random_state=hyperparameters["seed"])


# Transform text -> vectors
train_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in tok_train_texts])
test_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in tok_test_texts])
print("Transformed texts to vectors...")

Transformed texts to vectors...


In [7]:
# Fit-transform
pca_model.fit(train_vecs)
train_comps = pca_model.transform(train_vecs)
test_comps = pca_model.transform(test_vecs)
print("Applied PCA...")

Applied PCA...


In [6]:
svm_classifier = SVC(random_state=hyperparameters["seed"])
svm_classifier.fit(train_comps, train_labels)
print(f"Done tuning SVM...")

predicted_labels = svm_classifier.predict(test_comps)
# Performance on test set
print(classification_report(test_labels, predicted_labels))

Created word2vec model...
Transformed texts to vectors...
Applied PCA...
Done tuning SVM...
              precision    recall  f1-score   support

          -1       0.88      0.54      0.67        13
           0       0.88      0.70      0.78        20
           1       0.61      0.89      0.72        19

    accuracy                           0.73        52
   macro avg       0.79      0.71      0.72        52
weighted avg       0.78      0.73      0.73        52

