In [1]:
# PLAIN AND SIMPLE LINEAR SVM USING SVC KERNEL = LINEAR

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

data = pd.read_csv('data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label']).values
y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svm_model = SVC(kernel='linear')

# Train the model
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')


print(f'Accuracy: {accuracy}')
print(f'F1 Score: {macro_f1}')
print('Classification Report:')
print(report)

# HISTORY
# first run: 0.7247599650858306 (6min)



Accuracy: 0.7247599650858306
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.84      0.79      2153
           1       0.66      0.54      0.59      1284

    accuracy                           0.72      3437
   macro avg       0.71      0.69      0.69      3437
weighted avg       0.72      0.72      0.72      3437



In [None]:
# LINEAR SVM WITH DIFFERING REGULARISATION VALUE USING LINEARSVC
# https://scikit-learn.org/stable/auto_examples/svm/plot_svm_scale_c.html#sphx-glr-auto-examples-svm-plot-svm-scale-c-py







In [3]:
# PLAIN AND SIMPLE LINEAR SVM USING SVC KERNEL = LINEAR WITH PCA

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,f1_score
from sklearn.decomposition import PCA

data = pd.read_csv('data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label']).values
y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pca = PCA(n_components=700)  
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.fit_transform(X_test)
svm_model = SVC(kernel='linear')

# Train the model
svm_model.fit(X_train_pca, y_train)
# Make predictions
y_pred = svm_model.predict(X_test_pca)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


# HISTORY
# first run: 0.5609 (1min 500 components)


Accuracy: 0.5609543206284551
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.73      0.67      2153
           1       0.38      0.28      0.33      1284

    accuracy                           0.56      3437
   macro avg       0.51      0.50      0.50      3437
weighted avg       0.54      0.56      0.54      3437



In [12]:
# LINEAR SVM USING SGDClassifier SCIKIT LEARN 
# https://stackoverflow.com/questions/29704231/in-sklearn-what-is-the-difference-between-a-svm-model-with-linear-kernel-and-a-s
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score,f1_score
from sklearn.model_selection import train_test_split


data = pd.read_csv('data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label']).values
y = data['label'].values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

sgd_model = SGDClassifier(loss='perceptron', max_iter=2000, tol=1e-3) # we are changing loss to diff methodss
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')


print(f'Accuracy with SGDClassifier: {accuracy}')
print(f'F1 Score: {macro_f1}')
print('Classification Report with SGDClassifier:')
print(report)


# HISTORY using hinge loss
# first run: 0.7247599650858306 (14 sec with 1000 max iter)
# second run 0.7235961594413733 (14 sec with 5000 max iter)
# third run 0.7235961594413733 (14 sec with 10000 max iter)


# HISTORY using squared hinge loss
# first run: 0.6659877800407332 (1min 21 sec with 1000 max iter)
# second run 0.7235961594413733 (5min sec with 5000 max iter)


# HISTORY using Perceptoron loss
# first run: 0.6825720104742508, 0.648364347180668 (12sec with 1000 max iter)
# second run 0.6825720104742508, 0.5907522822434935 (13sec with 2000 max iter)


# HISTORY using Modified Huber loss
# first run: 0.7142857142857143, 0.6728646420942034 (15 sec with 1000 max iter)
# second run 0.7125400058190282, 0.6827629631844068 (16 sec with 2000 max iter)



Accuracy with SGDClassifier: 0.6825720104742508
F1 Score: 0.5907522822434935
Classification Report with SGDClassifier:
              precision    recall  f1-score   support

           0       0.68      0.92      0.78      2153
           1       0.68      0.28      0.40      1284

    accuracy                           0.68      3437
   macro avg       0.68      0.60      0.59      3437
weighted avg       0.68      0.68      0.64      3437



In [13]:
# Radial Basis function SVM
# https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,f1_score
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('data/train_tfidf_features.csv')
X = data.drop(columns=['id', 'label']).values
y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
ksvm =SVC(kernel='rbf',
               gamma=0.1,
               C=10.0)

ksvm.fit(X_train, y_train)
 
# Evaluate the model on the test data
accuracy = ksvm.score(X_test, y_test)
print('Accuracy:', accuracy)


# Make predictions
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')


print(f'Accuracy: {accuracy}')
print(f'F1 Score: {macro_f1}')
print('Classification Report:')
print(report)

# HISTORY
# NA took 30 mins. nothing pooped out



In [1]:
# POLY SVM
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Load the dataset
# Replace 'your_dataset.csv' with your actual dataset path
data = pd.read_csv('data/train_tfidf_features.csv')

X = data.drop(columns=['label']).values
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svm_poly = SVC(kernel='poly', degree=3, C=1.0, coef0=1) # here we will explore different degrees

svm_poly.fit(X_train, y_train)

# Make predictions
y_pred = svm_poly.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')


print(f'Accuracy: {accuracy}')
print(f'F1 Score: {macro_f1}')
print('Classification Report:')
print(report)


In [None]:
# Sigmoid SVM
# https://stats.stackexchange.com/questions/90736/the-difference-of-kernels-in-svm


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


data = pd.read_csv('data/train_tfidf_features.csv')

X = data.drop(columns=['label']).values
y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm_sigmoid = SVC(kernel='sigmoid', C=1.0, coef0=1)
svm_sigmoid.fit(X_train, y_train)

# Make predictions
y_pred = svm_sigmoid.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


https://www.cs.cornell.edu/people/tj/publications/joachims_98a.pdf


using the publication above