# SVM baseline for MetaHate

In [None]:
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, f1_score

import pandas as pd

## Read Data

In [None]:
# Load the tab-separated dataset. Because `names` is supplied and `header` is
# left at its default, pandas treats the file as having no header row: the
# first line is read as a sample.
# NOTE(review): if the file does start with a header line, that line is being
# ingested as data -- confirm against the actual file.
data = pd.read_csv('/data/metahate.csv', sep='\t', names=['label', 'text'])

# A row whose label or text is missing is parsed as NaN. A NaN text is not a
# string and makes the TF-IDF vectorizer fail, and a NaN label is rejected by
# the classifier, so such rows are removed here (and reported) instead of
# crashing several cells later.
n_rows_read = len(data)
data = data.dropna(subset=['label', 'text'])
if len(data) < n_rows_read:
    print(f"Dropped {n_rows_read - len(data)} rows with a missing label or text")

# Plain Python lists, aligned by position: texts[i] is labelled labels[i].
texts = data['text'].tolist()
labels = data['label'].tolist()

## Split the data into training and testing sets

In [None]:
# Hold out 20% of the samples for evaluation. The fixed seed (42) keeps the
# shuffle, and therefore the train/test partition, identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

## Create a TF-IDF vectorizer for text data

In [None]:
# Word-level TF-IDF feature extractor; nothing is fitted at this point.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),       # unigrams only
    stop_words='english',     # drop the built-in English stop-word list
    use_idf=True,             # weight terms by inverse document frequency
    sublinear_tf=True,        # scale term frequency as 1 + log(tf)
    max_features=10_000_000,  # keep at most ten million vocabulary terms
)

## Fit the vectorizer on the training data, then transform the training and testing data

In [None]:
# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are then encoded with that already-fitted vectorizer.
# tqdm wraps each list so a progress bar advances as documents are consumed.
X_train_vectorized = vectorizer.fit_transform(
    tqdm(X_train, desc='Fitting and transforming training data')
)
X_test_vectorized = vectorizer.transform(
    tqdm(X_test, desc='Transforming testing data')
)

## Create and train the SVM model

In [None]:
# Linear-kernel SVM classifier with a fixed seed and a stopping tolerance of
# 1e-5, trained on the TF-IDF matrix of the training split.
svm_model = LinearSVC(
    tol=1e-5,
    random_state=0,
)
svm_model.fit(X=X_train_vectorized, y=y_train)


## Make predictions on the test set

In [None]:
# Predict one label per held-out document with the fitted SVM.
predictions = svm_model.predict(X=X_test_vectorized)

## Evaluate the model

In [None]:
# Score the predictions against the held-out labels: overall accuracy, the
# per-class precision/recall/F1 table, and F1 under three averaging modes.
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
micro_f1, macro_f1, f1_default = (
    f1_score(y_test, predictions, average=averaging)
    for averaging in ('micro', 'macro', 'weighted')
)

# Print the metrics, one per line; the report is a multi-line table.
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)
print(f"Micro F1 Score: {micro_f1}")
print(f"Macro F1 Score: {macro_f1}")
print(f"Default (Weighted) F1 Score: {f1_default}")