In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import csv
import string

In [2]:
df_train = pd.read_csv('./data/train.tsv', sep='\t', index_col=0, quoting=csv.QUOTE_NONE, encoding='utf-8')
df_test = pd.read_csv('./data/test.tsv', sep='\t', index_col=0, quoting=csv.QUOTE_NONE, encoding='utf-8')

In [3]:
df_train.head()

Unnamed: 0_level_0,text,hateful
id,Unnamed: 1_level_1,Unnamed: 2_level_1
518301959,RT @AaronDagloria: @YesYoureSexist #science,0
858885115,Pretty fucking sick of this pattern: Many wome...,0
903196327,When you got a sugar daddy and went from rags ...,0
618163271,Apparently cooking skills have nothing to do w...,0
323263148,@edgeofthesandbx @batchelorshow @FearDept You ...,0


In [4]:
df_train['text'] = df_train['text'].apply(lambda x : x.translate(str.maketrans('', '', string.punctuation)).lower())
df_test['text'] = df_test['text'].apply(lambda x : x.translate(str.maketrans('', '', string.punctuation)).lower())

In [5]:
vectorizer = TfidfVectorizer(max_df=0.8, min_df=5)
vectors = vectorizer.fit_transform(df_train['text'])
x_train = np.array(vectors.todense())
y_train = np.array(df_train['hateful'])

In [6]:
x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
len(y_train)

16112

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
clf = RandomForestClassifier(n_estimators=10)
clf.fit(x_train, y_train)

RandomForestClassifier(n_estimators=10)

In [10]:
vectors_test = vectorizer.transform(df_test['text'])
x_test = np.array(vectors_test.todense())
y_predict = clf.predict(x_test)

In [11]:
y_predict

array([0, 0, 0, ..., 1, 1, 0])

In [12]:
import os
try:
    os.mkdir('predictions')
except:
    pass

pd.DataFrame({'hateful': y_predict}, index=df_test.index).to_csv('./predictions/RF.csv')

In [13]:
!ls

cooking.stackexchange.id      main.py	       temp.csv       temp.txt
cooking.stackexchange.tar.gz  predictions      temp_test.csv  Untitled.ipynb
cooking.stackexchange.txt     readme.txt       temp_test.txt
data			      SoComp20_A3.pdf  temp.train


In [14]:
len(y_predict)

5000

In [15]:
import spacy
nlp = spacy.load("en_core_web_md")

In [16]:
vectors = df_train['text'].apply(nlp)

In [17]:
x_train = vectors.apply(lambda x : x.vector)
x_train = x_train.to_numpy()
x_train = np.vstack(x_train).astype(np.float)
x_train.shape

(16112, 300)

In [18]:
from sklearn.svm import SVC

In [19]:
x_train.shape

(16112, 300)

In [20]:
clf = SVC()
clf.fit(x_train, y_train)

SVC()

In [21]:
vectors = df_test['text'].apply(nlp)
x_test = vectors.apply(lambda x : x.vector)
x_test = x_test.to_numpy()
x_test = np.vstack(x_test).astype(np.float)

In [22]:
x_test.shape

(5000, 300)

In [23]:
y_predict = clf.predict(x_test)

In [24]:
y_predict

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
pd.DataFrame({'hateful': y_predict}, index=df_test.index).to_csv('./predictions/SVM.csv')

In [26]:
import fasttext

In [27]:
file = open('temp.train', 'w')
for index, line in df_train.iterrows():
    file.write(f'__label__{line.hateful} {line.text}\n')
file.close()
file = open('temp_test.txt', 'w')
for index, line in df_test.iterrows():
    file.write(f'{line.text}\n')
file.close()

In [28]:
model = fasttext.train_supervised(input='temp.train')

In [29]:
model.predict('temp_test.txt')

(('__label__0',), array([1.00001001]))

In [39]:
x_test = list(df_test['text'])
labels = model.predict(x_test)[0]
df_final = pd.DataFrame({'hateful': labels}, index=df_test.index)
df_final['hateful'] = df_final['hateful'].apply(lambda x : int(x[0][-1]))

In [41]:
df_final

Unnamed: 0_level_0,hateful
id,Unnamed: 1_level_1
397250289,0
557603687,0
51361623,0
947566416,1
752528635,1
...,...
494945715,0
346666309,0
349076166,1
210148111,1
