### Dependencies

In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, MaxAbsScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report

### Read the data and vectorize the text

In [2]:
# Load the training and test splits from the dataset directory, in that order.
train_df, test_df = map(pd.read_csv, ("../dataset/train.csv", "../dataset/test.csv"))

In [8]:
# Word-level TF-IDF with max_features=10000. The vectorizer is fitted on the
# training tweets only; the test tweets are transformed with that fitted instance.
vectorizer = TfidfVectorizer(max_features=10000, analyzer="word")
train = vectorizer.fit_transform(train_df["Tweets"])
test = vectorizer.transform(test_df["Tweets"])

#### Encode the labels

In [6]:
# Fit the label encoder on the training labels only.
# The fit call stays the last expression so the cell still echoes the encoder.
le = LabelEncoder()
le.fit(train_df["Label"])

LabelEncoder()

In [7]:
# Store the encoded labels in a new `class` column on each frame,
# using the encoder that was fitted on the training labels.
train_df["class"] = le.transform(train_df["Label"])
test_df["class"] = le.transform(test_df["Label"])

In [42]:
# Print the label values the encoder learned from the training labels.
print(le.classes_)

['none' 'racism' 'sexism']


In [9]:
# Alias the TF-IDF matrices and the encoded `class` columns under X/y names.
X_train, y_train = train, train_df["class"]
X_test, y_test = test, test_df["class"]

In [11]:
# Sanity check: print the shapes of both feature matrices and both label vectors.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(12829, 10000) (3206, 10000) (12829,) (3206,)


### Scaling the features

In [18]:
# Scaler instance; the following cell fits it on X_train and applies it to X_test.
scaler = MaxAbsScaler()

In [19]:
# Fit the scaler on the training matrix, then apply the same fitted scaler
# to the test matrix (the test data is never used for fitting).
X_norm, X_test_norm = scaler.fit_transform(X_train), scaler.transform(X_test)

### Feature Selection

In [20]:
# Fit an L1-penalised linear SVM on the scaled training matrix, then hand the
# already-fitted model to SelectFromModel (prefit=True) and use the selector to
# filter the columns of both the training and the test matrices.
lsvc = LinearSVC(penalty="l1", C=100, dual=False, max_iter=500).fit(X_norm, y_train)
fs = SelectFromModel(lsvc, prefit=True)
X_selected, X_test_selected = (fs.transform(matrix) for matrix in (X_norm, X_test_norm))



In [21]:
from IPython.display import Markdown, display
def show_top10_features(classifier, feature_names, categories, top_n=10):
    """Render the highest-weighted features of each category as Markdown.

    For every category, the features with the largest values in the matching
    row of ``classifier.coef_`` are shown on one line, prefixed by the
    category in bold. Features are listed from the smallest to the largest of
    the selected coefficients, so the strongest feature comes last.

    Args:
        classifier: Fitted model exposing ``coef_``; row ``i`` is used for the
            ``i``-th entry of ``categories``.
        feature_names: NumPy array of feature-name strings, aligned with the
            columns of ``classifier.coef_`` (it is indexed with an integer
            array, so a plain list will not work).
        categories: Iterable of category labels, in the row order of
            ``classifier.coef_``.
        top_n: How many features to list per category. Defaults to 10; values
            below 0 or above the number of features are clamped.
    """
    for i, category in enumerate(categories):
        ranked = np.argsort(classifier.coef_[i])
        # Clamp so top_n == 0 gives an empty list instead of the "[-0:]"
        # slice, which would silently return every feature.
        keep = max(0, min(int(top_n), len(ranked)))
        top = ranked[len(ranked) - keep:]
        display(Markdown("**%s**: %s" % (category, ", ".join(feature_names[top]))))

In [24]:
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# method only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())
# lsvc was fitted on the full (unselected) matrix, so its coef_ columns line up
# with the vectorizer's vocabulary; class ids are mapped back to label strings.
show_top10_features(lsvc, feature_names, le.inverse_transform(lsvc.classes_))

**none**: smug, oppressive, freebsdgirl, andre, logic, educate, _dirtytruths, jillwetzler, usqinyw5gn, khaledhamaki

**racism**: banislam, slaughtered, disinformation, islamofascists, islamolunatic, sandukankanack, golf, quran, islamolunatics, zones

**sexism**: notsexist, knux1995, questionsformen, evidently, womenagainstfeminism, somekindaboogin, geno, everydaysexism, cuntfacebitch, bitch

### MultinomialNB

In [26]:
# Multinomial naive Bayes trained on the selected training features.
# The fit call stays the last expression so the cell still echoes the model.
mb = MultinomialNB()
mb.fit(X=X_selected, y=y_train)

MultinomialNB()

In [27]:
# Print the shape of the selected training matrix next to the label vector's shape.
print(*(part.shape for part in (X_selected, y_train)))

(12829, 6287) (12829,)


In [31]:
# Per-class precision/recall/F1 of the naive Bayes model on the held-out test set.
# The row labels are the encoded class ids.
print(classification_report(y_test, mb.predict(X_test_selected)))

              precision    recall  f1-score   support

           0       0.75      0.95      0.84      2186
           1       0.99      0.46      0.63       387
           2       0.60      0.23      0.33       633

    accuracy                           0.75      3206
   macro avg       0.78      0.55      0.60      3206
weighted avg       0.75      0.75      0.71      3206



### Ensemble Methods

In [35]:
# NOTE: despite the name `gbc`, this model is an AdaBoostClassifier; the name is
# kept because the evaluation cell below refers to it.
# random_state is fixed so that repeated runs (Restart & Run All) give the same
# model and therefore the same report; the seed value itself is arbitrary.
gbc = AdaBoostClassifier(random_state=42)
gbc.fit(X_selected, y_train)

AdaBoostClassifier()

In [36]:
# Per-class precision/recall/F1 of the boosted model on the held-out test set.
# The row labels are the encoded class ids.
print(classification_report(y_test, gbc.predict(X_test_selected)))

              precision    recall  f1-score   support

           0       0.75      0.99      0.85      2186
           1       1.00      0.62      0.76       387
           2       0.72      0.07      0.12       633

    accuracy                           0.76      3206
   macro avg       0.82      0.56      0.58      3206
weighted avg       0.77      0.76      0.70      3206



In [48]:
# Print the encoder's label values again, for reference next to the reports above,
# whose rows are labelled with the encoded class ids.
print(le.classes_)

['none' 'racism' 'sexism']
