# DATA

In [26]:
import csv
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
train_df = pd.read_csv('data/olid-train-small.csv')

In [17]:
train_df

Unnamed: 0,id,text,labels
0,24590,@USER @USER @USER @USER @USER Hahah a left tel...,0
1,19287,@USER @USER I’m glad you do babe (I kiss you b...,0
2,44676,@USER And I have concerns with”Democrats”,0
3,96110,@USER AS IT SHOULD BE!,0
4,51557,@USER @USER A horrendous act of course. Conser...,1
...,...,...,...
5847,24288,@USER This information is out there and should...,1
5848,13930,#MorningJoe the closer it gets to monday the l...,0
5849,68792,@USER @USER Long time prisoners will also have...,0
5850,19909,@USER @USER It wasn’t that long ago it was tab...,0


# NLP Preprocessing

1. TF-IDF (Term frequency, inverse-document frequency) --> bag of words (will )
2. pre-trained word embeddings like word2vec, glove, or fastText --> more semantic and dense representation


In [36]:
vectorizer = TfidfVectorizer(
    min_df=3,
    max_df=0.9,
    stop_words='english',
    lowercase=True
)

tfidf_matrix = vectorizer.fit_transform(train_df['text'])
X = tfidf_matrix.toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
train_df = train_df.drop(columns=['id','text'])

Unnamed: 0,labels,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_3334,feature_3335,feature_3336,feature_3337,feature_3338,feature_3339,feature_3340,feature_3341,feature_3342,feature_3343
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5847,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5848,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5849,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5850,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# convert numpy array into pd dataframe
feature_df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])

train_df = pd.concat([train_df.reset_index(drop=True), feature_df.reset_index(drop=True)], axis=1) # drop old ordering numbering

In [49]:
train_df

Unnamed: 0,id,text,labels,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_3334,feature_3335,feature_3336,feature_3337,feature_3338,feature_3339,feature_3340,feature_3341,feature_3342,feature_3343
0,24590,@USER @USER @USER @USER @USER Hahah a left tel...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,19287,@USER @USER I’m glad you do babe (I kiss you b...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,44676,@USER And I have concerns with”Democrats”,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,96110,@USER AS IT SHOULD BE!,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,51557,@USER @USER A horrendous act of course. Conser...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5847,24288,@USER This information is out there and should...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5848,13930,#MorningJoe the closer it gets to monday the l...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5849,68792,@USER @USER Long time prisoners will also have...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5850,19909,@USER @USER It wasn’t that long ago it was tab...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# SVM

In [None]:
'''

- load data
- preprocess data (tokenize, lowecase, no stop words ...)
- convert data into vectors (bag of words, tf-idf ...)
- vectors are inputs
- SVM model training (learn boundary)
- Predict

'''

Support Vector Machine (SVM) Model
- maximizing the boundary space between points in space
- loss functions for determining the weights --> Hinge Function
- regularization 

Steps
- training -> init weights, classes (-1 and 1)
- apply update rules
- prediction y

In [23]:
class SVM:
    """
    Support Vector Machine classifier implementation using gradient descent optimization.
    This implementation uses the hinge loss function and L2 regularization.
    """

    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        """
        Initialize the SVM classifier
        Args:
            learning_rate: Step size for gradient descent (controls how much to adjust weights)
            lambda_param: Regularization parameter (helps prevent overfitting)
            n_iters: Number of iterations for training
        """
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None  # Weight vector
        self.b = None  # Bias term

    def fit(self, X, y):
        """
        Train the SVM classifier
        Args:
            X: Training features (matrix)
            y: Target labels
        """
        # Get dimensions of input data
        n_samples, n_features = X.shape

        # Convert labels to -1 and 1
        # Any value <= 0 becomes -1, any value > 0 becomes 1
        y_ = np.where(y <= 0, -1, 1)

        # Initialize weights and bias
        self.w = np.zeros(n_features)  # One weight for each feature
        self.b = 0  # Bias term (intercept)

        # Training loop
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                # Check if point satisfies classification constraint
                # The condition y * (w^T * x - b) >= 1 is the SVM margin constraint
                print(f'y type - {y_[idx]}')
                print(f'x type - {x_i}')
                print(f'w type - {self.w}')
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1

                if condition:
                    # If point is correctly classified and outside margin
                    # Update weights with only regularization term
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    # If point is misclassified or inside margin
                    # Update weights with both regularization and margin terms
                    self.w -= self.lr * (2 * self.lambda_param * self.w - np.dot(y_[idx], x_i))
                    # Update bias
                    self.b -= self.lr * y_[idx]

    def predict(self, X):
        """
        Make predictions for input data X
        Args:
            X: Features to predict
        Returns:
            Predicted labels (-1 or 1)
        """
        # Calculate the decision function: w^T * x - b
        approx = np.dot(X, self.w) - self.b
        # Return sign of approximation
        # If approx > 0, return 1; if approx < 0, return -1
        return np.sign(approx)

In [19]:
# Prepare features (X) and target (y)
# Assuming your target variable is in a column named 'label' - adjust this based on your actual column name
X = train_df.drop('labels', axis=1)  # Features: all columns except the target
y = train_df['labels']               # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Create and train the SVM model
svm = SVM(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
svm.fit(X_train.values, y_train.values)  # Convert to numpy arrays

# Make predictions
y_pred = svm.predict(X_test.values)

# Calculate accuracy
accuracy = np.mean(y_pred == y_test.values)
print(f"Accuracy: {accuracy:.2f}")

y type - -1
x type - [83041
 "@USER She isn't a radical but you are so far right you are Kemp. You refused to pay any attention to what the people of Georgia want and need. That will be your undoing."]
w type - [0. 0.]


TypeError: can't multiply sequence by non-int of type 'float'

# CNN