In [4]:
## Code adapted from UIUC PS 590: Images and Text as Data,
## by Prof. Nora Webb Williams, who adapted it from Diyi Yang
## and colleagues at Georgia Tech.
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

## Load the labelled dataset and pull out the raw documents and their labels
df = pd.read_csv("../Data/prelim_dataset.csv")
text = df['text'].tolist()
labels = df['Islamophobic?'].tolist()

## TF-IDF (unigrams + bigrams, English stop words removed) over the FULL corpus.
## NOTE(review): `features` is not used by the train/test code below and was
## fit on test documents too -- do not use it for held-out evaluation.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
features = tfidf.fit_transform(text)

## Split the raw documents into train and test sets (train_test_split's
## default proportions; random_state fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    text, labels, random_state=0
)

## Extract features: a fresh vectorizer learns its vocabulary and IDF weights
## from the training documents only, then the test documents are projected
## onto that same vocabulary
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_features = tfidf.fit_transform(X_train)
X_test_features = tfidf.transform(X_test)

print("LINEAR SVC")
## Fit the same linear SVM twice on the training features -- once counting
## class 1 five times as heavily as class 0, once with the default (uniform)
## class weights -- and score each fit on the held-out test features.
## After the loop, `clf` and `y_pred` hold the unweighted model and its
## predictions, exactly as when the two runs were written out by hand.
for run_label, run_weights in (('With weights', {0: 1, 1: 5}),
                               ('Without weights', None)):
    ## random_state pins the random number generator LinearSVC's solver may
    ## use, so re-running the cell reproduces the same fitted model
    clf = LinearSVC(class_weight=run_weights, random_state=0).fit(X_train_features, y_train)
    ## predict labels for test data
    y_pred = clf.predict(X_test_features)

    ## Metrics. zero_division=0 reports 0 (without a warning) for any class
    ## that never gets predicted, matching the other model cells.
    print(run_label, metrics.classification_report(y_test, y_pred, zero_division=0))
    print("Macro:", f1_score(y_test, y_pred, average='macro'))
    print("Micro:", f1_score(y_test, y_pred, average='micro'))


LINEAR SVC
With weights               precision    recall  f1-score   support

         0.0       0.89      0.92      0.90       164
         1.0       0.54      0.44      0.48        34

    accuracy                           0.84       198
   macro avg       0.71      0.68      0.69       198
weighted avg       0.83      0.84      0.83       198

Macro: 0.6940312922542013
Micro: 0.8383838383838383
Without weights               precision    recall  f1-score   support

         0.0       0.86      0.97      0.91       164
         1.0       0.64      0.26      0.37        34

    accuracy                           0.85       198
   macro avg       0.75      0.62      0.64       198
weighted avg       0.83      0.85      0.82       198

Macro: 0.6443965517241379
Micro: 0.8484848484848486


In [68]:

print("LOGIT")
## Fit logistic regression twice on the training features -- once counting
## class 1 five times as heavily as class 0, once with the default (uniform)
## class weights -- and score each fit on the held-out test features.
## After the loop, `clf` and `y_pred` hold the unweighted model and its
## predictions.
for run_label, run_weights in (('With weights', {0: 1, 1: 5}),
                               ('Without weights', None)):
    clf = LogisticRegression(class_weight=run_weights).fit(X_train_features, y_train)
    ## predict labels for test data
    y_pred = clf.predict(X_test_features)

    ## The unweighted model predicts no class-1 samples on this split, which
    ## leaves that class's precision undefined. zero_division=0 reports it as
    ## 0 (the same value the default setting falls back to) without emitting
    ## an UndefinedMetricWarning for every averaging mode.
    print(run_label, metrics.classification_report(y_test, y_pred, zero_division=0))
    print("Macro:", f1_score(y_test, y_pred, average='macro'))
    print("Micro:", f1_score(y_test, y_pred, average='micro'))

LOGIT
With weights               precision    recall  f1-score   support

         0.0       0.90      0.88      0.89       164
         1.0       0.47      0.50      0.49        34

    accuracy                           0.82       198
   macro avg       0.68      0.69      0.69       198
weighted avg       0.82      0.82      0.82       198

Macro: 0.6876424189307625
Micro: 0.8181818181818182
Without weights               precision    recall  f1-score   support

         0.0       0.83      1.00      0.91       164
         1.0       0.00      0.00      0.00        34

    accuracy                           0.83       198
   macro avg       0.41      0.50      0.45       198
weighted avg       0.69      0.83      0.75       198

Macro: 0.4530386740331492
Micro: 0.8282828282828283


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes

In [69]:
## Fit each naive Bayes variant (default settings, no class weighting) on the
## training features and score it on the held-out test features.
## After the loop, `clf` and `y_pred` hold the Bernoulli model and its
## predictions.
for model_name, model in (("MULTINOMIAL NB", MultinomialNB()),
                          ("BERNOULLI NB", BernoulliNB())):
    clf = model.fit(X_train_features, y_train)
    y_pred = clf.predict(X_test_features)

    print(model_name)
    ## A model that never predicts class 1 leaves that class's precision
    ## undefined. zero_division=0 reports it as 0 (the same value the default
    ## setting falls back to) without emitting an UndefinedMetricWarning.
    print('Without weights', metrics.classification_report(y_test, y_pred, zero_division=0))
    print("Macro:", f1_score(y_test, y_pred, average='macro'))
    print("Micro:", f1_score(y_test, y_pred, average='micro'))

MULTINOMIAL NB
Without weights               precision    recall  f1-score   support

         0.0       0.83      1.00      0.91       164
         1.0       0.00      0.00      0.00        34

    accuracy                           0.83       198
   macro avg       0.41      0.50      0.45       198
weighted avg       0.69      0.83      0.75       198

Macro: 0.4530386740331492
Micro: 0.8282828282828283


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
