### Loading Data

In [2]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import sklearn.metrics as m



In [3]:
training = pd.read_csv("data/training/training.csv")
validation = pd.read_csv("data/training/validation.csv")
testing = pd.read_csv("data/training/test.csv")

In [4]:
training.head(3)

Unnamed: 0,text,claim
0,What do you do if you are a global warming ala...,5_1
1,(2.) A sun-blocking volcanic aerosols componen...,0_0
2,"Now, I am very interested in the AMO, since it...",1_1


# Simple Machine Leaning

### Feature Extraction Using Term Frequency and Inverse Document Frequency 

In [4]:
corpus = training["text"]

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(corpus)
y = training["claim"]


### Logistic Regression

In [58]:
#With class weights for class imbalance.
lr = LogisticRegression(max_iter=5000, class_weight="balanced")

#Without class weights
#lr = LogisticRegression(max_iter=5000)


#Hyperparameter grid search
param_grid = [{
    "penalty": ['none', 'l1', 'l2', 'elasticnet'],
    "C": np.logspace(-4,4,20),
    "solver": ['newton-cg', 'lbfgs', 'liblinear'] 
}]

clf = RandomizedSearchCV(lr,param_grid,cv=3,n_iter=5,scoring=["precision","recall", "accuracy"], refit="accuracy")

clf.fit(X, y)


Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1954, in precision_score
    p, _, _, _ = precision_recall_fscore_support(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 1573, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklea

In [59]:
print("best parameters: ",clf.best_params_)

best parameters:  {'solver': 'lbfgs', 'penalty': 'l2', 'C': 206.913808111479}


In [60]:
#Fitting with the best parameters
#With class weights for class imbalance.
lr = LogisticRegression(max_iter=5000,solver="lbfgs", class_weight="balanced", C=206.913808111479, penalty="l2")
lr.fit(X,y)

#### Validation of Logistic Regression

In [61]:
validation_X = vectorizer.transform(validation["text"])
validation_y = validation["claim"]
validation_predictions = lr.predict(validation_X)

In [62]:
#Using Macro-averaged as the authors use that. For comparison: 
print("Accuracy: " + str(m.accuracy_score(validation_y, validation_predictions)))
print("Recall: " + str(m.recall_score(validation_y, validation_predictions,average="macro")))
print("Precision: " + str(m.precision_score(validation_y, validation_predictions,average="macro")))
print("F1: " + str(m.f1_score(validation_y, validation_predictions,average="macro")))


Accuracy: 0.8084452975047984
Recall: 0.6690962268871067
Precision: 0.6401748479345559
F1: 0.6510399546722085


### Random Forest Classifier

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
rf = RandomForestClassifier()
rf.fit(X,y)

#### RF validation

In [48]:
rf_validation_X = vectorizer.transform(validation["text"])
rf_validation_y = validation["claim"]
rf_validation_predictions = rf.predict(validation_X)

In [49]:
#Using Macro-averaged as the authors use that. For comparison: 
print("Accuracy: " + str(m.accuracy_score(rf_validation_y, rf_validation_predictions)))
print("Recall: " + str(m.recall_score(rf_validation_y, rf_validation_predictions,average="macro")))
print("Precision: " + str(m.precision_score(rf_validation_y, rf_validation_predictions,average="macro")))
print("F1: " + str(m.f1_score(rf_validation_y, rf_validation_predictions,average="macro")))

#Results are so bad that I don't even think it's worth 

Accuracy: 0.7197696737044146
Recall: 0.15027056113179288
Precision: 0.5727799087764723
F1: 0.19448282427710437


  _warn_prf(average, modifier, msg_start, len(result))


### SVM

In [5]:
SVM = svm.SVC(class_weight="balanced")

param_grid = {'C': [0.1, 1, 10], 
              'kernel': ['linear', 'rbf'], 
              'gamma': ['scale', 'auto']}

svm_search = RandomizedSearchCV(SVM, param_grid, cv=3, n_iter=10)
svm_search.fit(X,y)

In [None]:
print("best parameters: ",svm_search.best_params_)

best parameters:  {'kernel': 'linear', 'gamma': 'scale', 'C': 1}


In [72]:
SVM = svm.SVC(kernel="linear", gamma="scale", C=1,class_weight="balanced")

SVM.fit(X,y)

In [None]:
SVM_validation_X = vectorizer.transform(validation["text"])
SVM_validation_y = validation["claim"]
SVM_validation_predictions = SVM.predict(validation_X)

In [71]:
#Using Macro-averaged as the authors use that. For comparison: 
print("Accuracy: " + str(m.accuracy_score(SVM_validation_y, SVM_validation_predictions)))
print("Recall: " + str(m.recall_score(SVM_validation_y, SVM_validation_predictions,average="macro")))
print("Precision: " + str(m.precision_score(SVM_validation_y, SVM_validation_predictions,average="macro")))
print("F1: " + str(m.f1_score(SVM_validation_y, SVM_validation_predictions,average="macro")))

Accuracy: 0.8138195777351248
Recall: 0.49864363274873724
Precision: 0.80932434925119
F1: 0.5676219924389994
