In [1]:
import pandas as pd
import nltk
import string
import re
import numpy as np
from tqdm import tqdm
import pandas as pd
from tqdm import tqdm
import pickle
import emoji

from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from scipy.sparse import csr_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textstat import flesch_reading_ease, flesch_kincaid_grade

from myclasses.Features import Features
from myclasses.Data import Data

In [2]:
def save_model(model, name, subtask):
    """Pickle ``model`` to ``./models/subtask{subtask}/{name}``.

    Args:
        model: Any picklable object (estimator, matrix, label array, ...).
        name: File name to write inside the subtask directory.
        subtask: Subtask identifier used to build the directory name (e.g. "a").

    The target directory is created when it does not exist yet, so the
    notebook also runs from a fresh checkout without a ``models`` tree.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    target_dir = Path(f"./models/subtask{subtask}")
    target_dir.mkdir(parents=True, exist_ok=True)

    with open(target_dir / name, "wb") as fp:
        pickle.dump(model, fp)
        
def load_model(name, subtask=None):
    """Load a pickled object from ``./models/subtask{subtask}/{name}``.

    Args:
        name: File name inside the subtask directory.
        subtask: Subtask identifier used to build the directory name (e.g. "a").
            Optional only for backward compatibility: when omitted, a
            notebook-level ``subtask`` global is used if one is defined.

    Returns:
        The unpickled object.

    Raises:
        ValueError: If ``subtask`` is not passed and no global ``subtask`` exists.
    """
    if subtask is None:
        # The path below needs a subtask; previously it was read from a global
        # that this notebook never defines. Keep that lookup as a fallback only.
        if "subtask" not in globals():
            raise ValueError("load_model() needs a subtask, e.g. load_model(name, subtask='a')")
        subtask = globals()["subtask"]

    # NOTE(security): pickle.load can execute arbitrary code. Only load files
    # that were written by save_model in this project.
    with open(f"./models/subtask{subtask}/{name}", "rb") as fp:
        return pickle.load(fp)

# def save_obj(obj, path):
#     with open(path, "wb") as fp:
#         pickle.dump(obj, fp)

In [3]:
# """ Save after running subtask"""

# with open(f"./models/dataobj", "wb") as fp:
#     pickle.dump(d, fp)

# Subtask A

In [4]:
# Instantiate the project's Data helper (myclasses.Data) used to build the subtask A splits.
d = Data()

In [5]:
# Build the subtask A data through the Data helper.
# NOTE(review): the names suggest (train features, test features, train labels,
# test labels) - confirm the return order against Data.read.
X_tr, X_te, y_tr, y_te = d.read(subtask="a")

100%|██████████| 13240/13240 [00:11<00:00, 1110.56it/s]
100%|██████████| 13240/13240 [00:07<00:00, 1831.74it/s]
100%|██████████| 13240/13240 [00:00<00:00, 20043.94it/s]
100%|██████████| 860/860 [00:00<00:00, 1299.26it/s]
100%|██████████| 860/860 [00:00<00:00, 1834.75it/s]
100%|██████████| 860/860 [00:00<00:00, 16786.74it/s]


In [6]:
# Sanity check: display the dimensions of the subtask A training matrix.
X_tr.shape

(13240, 13156)

In [7]:
# Pickle the Data object itself to ./models/subtaska/data_obj.
save_model(d, "data_obj", subtask="a")

In [8]:
# Rebind both subtask A feature matrices to SciPy CSR (sparse) form.
X_tr, X_te = csr_matrix(X_tr), csr_matrix(X_te)

In [9]:
# Persist the subtask A test features and labels under ./models/subtaska/.
save_model(model=X_te, name="testX", subtask="a")
save_model(model=y_te, name="testY", subtask="a")

In [10]:
""" Logistic Regression """

# fit() returns the estimator itself, so construction and training are chained.
log = LogisticRegression(class_weight="balanced", max_iter=3000).fit(X_tr, y_tr)
y_pred = log.predict(X_te)

# Macro-averaged F1 on the test split; shown as the cell output.
f1_score(y_true=y_te, y_pred=y_pred, average="macro")

0.7635732850292878

In [11]:
# Pickle the fitted subtask A logistic regression to ./models/subtaska/log.
save_model(log, "log", subtask="a")

In [63]:
""" SVM """

# Linear-kernel SVM with balanced class weights; fit() returns the estimator.
svm = SVC(class_weight="balanced", kernel="linear").fit(X_tr, y_tr)
y_pred = svm.predict(X_te)

# Macro-averaged F1 on the test split; shown as the cell output.
f1_score(y_true=y_te, y_pred=y_pred, average="macro")

0.6887930455741511

In [12]:
# Pickle the fitted subtask A SVM to ./models/subtaska/svm.
save_model(svm, "svm", subtask="a")

In [22]:
""" Random Forest """

# Seeded random forest (random_state=0); fit() returns the estimator.
rf = RandomForestClassifier(random_state=0).fit(X_tr, y_tr)
y_pred = rf.predict(X_te)

# Macro-averaged F1 on the test split; shown as the cell output.
f1_score(y_true=y_te, y_pred=y_pred, average="macro")

0.7106758373205742

In [None]:
# Pickle the fitted subtask A random forest to ./models/subtaska/rf.
save_model(rf, "rf", subtask="a")

In [68]:
""" Gaussian NB """

# GaussianNB only accepts dense input, while X_tr / X_te are CSR matrices after
# the conversion cell above, so densify them just for this model. Inputs that
# are already dense are passed through unchanged.
# Memory note: this materialises the full feature matrices in RAM.
X_tr_dense = X_tr.toarray() if hasattr(X_tr, "toarray") else np.asarray(X_tr)
X_te_dense = X_te.toarray() if hasattr(X_te, "toarray") else np.asarray(X_te)

nb = GaussianNB()
nb.fit(X_tr_dense, y_tr)
y_pred = nb.predict(X_te_dense)

# Macro-averaged F1 on the test split; shown as the cell output.
f1_score(y_te, y_pred, average="macro")

0.5963947163947163

In [23]:
""" Bagging """

# Bagging resamples the training data at random; seed it (as the random forest
# cell does) so the reported score is reproducible across kernel restarts.
# The displayed score may differ slightly from the earlier unseeded run.
clf = BaggingClassifier(random_state=0)
clf.fit(X_tr, y_tr)
y_pred = clf.predict(X_te)

# Macro-averaged F1 on the test split; shown as the cell output.
f1_score(y_te, y_pred, average="macro")

0.7126462311616022

# Subtask B

In [12]:
""" Get ids of tweets we predicted as offensive in the previous  """

# Recompute the subtask A predictions from the logistic-regression model here.
# The shared `y_pred` name is rebound by every classifier cell in subtask A, so
# reading it would silently depend on whichever model happened to run last.
y_pred_a = log.predict(X_te)

# Pair each prediction with the id in the same row position; .iloc is
# positional, so this does not depend on the DataFrame's index labels.
# NOTE(review): assumes the rows of d.df_te are in the same order as X_te - confirm in Data.
test_ids_a = d.df_te["id"]
ids_b = {
    test_ids_a.iloc[pos]
    for pos, label in enumerate(y_pred_a)
    if label == "OFF"
}

len(ids_b)

237

In [13]:
# Fresh Data helper for subtask B; this rebinds `d`, replacing the previous instance.
d = Data()

In [14]:
# Build the subtask B data, passing the ids collected from the subtask A predictions.
# NOTE(review): `ids` presumably restricts the test set to those tweets - confirm in Data.read.
X_tr2, X_te2, y_tr2, y_te2 = d.read(subtask="b", ids=ids_b)

100%|██████████| 4400/4400 [00:04<00:00, 882.75it/s] 
100%|██████████| 4400/4400 [00:02<00:00, 1632.79it/s]
100%|██████████| 4400/4400 [00:00<00:00, 19574.09it/s]
100%|██████████| 157/157 [00:00<00:00, 1868.18it/s]
100%|██████████| 157/157 [00:00<00:00, 1526.88it/s]
100%|██████████| 157/157 [00:00<00:00, 19715.74it/s]


In [15]:
# Sanity check: display the dimensions of the subtask B training matrix.
X_tr2.shape

(4400, 4721)

In [16]:
# Pickle the subtask B Data object to ./models/subtaskb/data_obj.
save_model(d, "data_obj", subtask="b")

In [17]:
# Rebind both subtask B feature matrices to SciPy CSR (sparse) form.
X_tr2, X_te2 = csr_matrix(X_tr2), csr_matrix(X_te2)

In [18]:
# Persist the subtask B test split. The subtask B variables carry the "2"
# suffix; X_te / y_te still hold the subtask A data at this point and must not
# be written into the subtask B directory.
save_model(X_te2, "testX", subtask="b")
save_model(y_te2, "testY", subtask="b")

In [19]:
""" Logistic Regression """

# fit() returns the estimator itself, so construction and training are chained.
log2 = LogisticRegression(class_weight="balanced", max_iter=2000).fit(X_tr2, y_tr2)
y_pred2 = log2.predict(X_te2)

# Macro-averaged F1 on the subtask B test split; shown as the cell output.
f1_score(y_true=y_te2, y_pred=y_pred2, average="macro")

0.6985407066052227

In [20]:
# Pickle the fitted subtask B logistic regression to ./models/subtaskb/log.
save_model(log2, "log", subtask="b")

In [25]:
""" SVM """

# Linear-kernel SVM with balanced class weights; fit() returns the estimator.
# Note: this rebinds y_pred2 to the SVM predictions.
svm2 = SVC(class_weight="balanced", kernel="linear").fit(X_tr2, y_tr2)
y_pred2 = svm2.predict(X_te2)

# Macro-averaged F1 on the subtask B test split; shown as the cell output.
f1_score(y_true=y_te2, y_pred=y_pred2, average="macro")

0.6319612590799031

In [26]:
""" Random Forest """

# Use subtask-B-specific names (matching log2 / svm2) so this cell does not
# rebind the subtask A forest `rf` or the shared `y_pred` predictions.
rf2 = RandomForestClassifier(random_state=0)
rf2.fit(X_tr2, y_tr2)
y_pred_rf2 = rf2.predict(X_te2)

# Macro-averaged F1 on the subtask B test split; shown as the cell output.
f1_score(y_te2, y_pred_rf2, average="macro")

0.476010101010101

# Subtask C

In [21]:
# Recompute the subtask B predictions from the logistic-regression model here.
# `y_pred2` is rebound by the SVM cell above, so reading it would depend on
# which classifier cell ran last.
y_pred_b = log2.predict(X_te2)

# Collect the ids of the tweets predicted as "TIN". .iloc is positional, so
# this does not depend on the DataFrame's index labels.
# NOTE(review): assumes the rows of d.df_te are in the same order as X_te2 - confirm in Data.
test_ids_b = d.df_te["id"]
ids_c = {
    test_ids_b.iloc[pos]
    for pos, label in enumerate(y_pred_b)
    if label == "TIN"
}

len(ids_c)

120

In [22]:
# Fresh Data helper for subtask C; this rebinds `d`, replacing the previous instance.
d = Data()

In [23]:
# Build the subtask C data, passing the ids collected from the subtask B predictions.
# Caution: this reuses the unsuffixed names X_tr / X_te / y_tr / y_te, so the
# subtask A matrices and labels are no longer reachable under those names.
# NOTE(review): `ids` presumably restricts the test set to those tweets - confirm in Data.read.
X_tr, X_te, y_tr, y_te = d.read(subtask="c", ids=ids_c)

100%|██████████| 3876/3876 [00:01<00:00, 2107.23it/s]
100%|██████████| 3876/3876 [00:01<00:00, 2321.71it/s]
100%|██████████| 3876/3876 [00:00<00:00, 22343.12it/s]
100%|██████████| 111/111 [00:00<00:00, 2027.88it/s]
100%|██████████| 111/111 [00:00<00:00, 1655.07it/s]
100%|██████████| 111/111 [00:00<00:00, 17609.79it/s]


In [24]:
# Sanity check: display the dimensions of the subtask C training matrix.
X_tr.shape

(3876, 4288)

In [25]:
# Pickle the subtask C Data object to ./models/subtaskc/data_obj.
save_model(d, "data_obj", subtask="c")

In [26]:
# Rebind both subtask C feature matrices to SciPy CSR (sparse) form.
X_tr, X_te = csr_matrix(X_tr), csr_matrix(X_te)

In [27]:
# Persist the subtask C test features and labels under ./models/subtaskc/.
save_model(model=X_te, name="testX", subtask="c")
save_model(model=y_te, name="testY", subtask="c")

In [28]:
""" Logistic Regression """

# fit() returns the estimator itself, so construction and training are chained.
# NOTE(review): C=0.47 looks hand-tuned; its provenance is not shown in this notebook.
log3 = LogisticRegression(C=0.47, class_weight="balanced", max_iter=2000).fit(X_tr, y_tr)
y_pred3 = log3.predict(X_te)

# Macro-averaged F1 on the subtask C test split; shown as the cell output.
f1_score(y_true=y_te, y_pred=y_pred3, average="macro")

0.6384019466033122

In [29]:
# Pickle the fitted subtask C logistic regression to ./models/subtaskc/log.
save_model(log3, "log", subtask="c")

In [36]:
""" SVM """

# Linear-kernel SVM with balanced class weights; fit() returns the estimator.
# Note: this rebinds y_pred3 to the SVM predictions.
svm3 = SVC(class_weight="balanced", kernel="linear").fit(X_tr, y_tr)
y_pred3 = svm3.predict(X_te)

# Macro-averaged F1 on the subtask C test split; shown as the cell output.
f1_score(y_true=y_te, y_pred=y_pred3, average="macro")

0.603157791507306