In [None]:
import pandas as pd
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer

def _load_abusive_rows(csv_path):
  """Read the CSV at `csv_path` and return only the rows whose 'final abusive' column equals 1."""
  frame = pd.read_csv(csv_path)
  return frame[frame['final abusive'] == 1]

# Read the three splits in file order: train, validation, test.
train_rows = _load_abusive_rows('/dataset/train.csv')
df_val = _load_abusive_rows('/dataset/val.csv')

url = '/dataset/test.csv'   # `url` ends up bound to the test-split path
df_test = _load_abusive_rows(url)

# The training frame is the filtered train split followed by the filtered
# validation split, with the index renumbered from 0 (ignore_index=True).
df_train = pd.concat([train_rows, df_val], ignore_index=True)

In [None]:
from sklearn.metrics import classification_report
from sklearn.pipeline import make_union
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import nltk
# nltk.word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK
# releases load it from the 'punkt' package, while NLTK >= 3.8.2 looks for
# 'punkt_tab' instead, so both are requested.
# NOTE(review): on an NLTK version whose index lacks one of these ids the call
# is expected to report the problem and return False rather than raise — confirm
# against the NLTK version this notebook runs on.
nltk.download('punkt')
nltk.download('punkt_tab')

def token(doc):
  """Return the tokens that NLTK's `word_tokenize` produces for `doc`."""
  # The import lives inside the function, so `nltk` is resolved at call time.
  from nltk import word_tokenize
  return word_tokenize(doc)

# word unigram TFIDF vectorizer
# token_pattern=None: the regex pattern is never consulted once a custom
# tokenizer is supplied, and leaving the default makes scikit-learn warn
# about the unused parameter.
u = TfidfVectorizer(strip_accents=None, tokenizer=token, token_pattern=None, analyzer='word', ngram_range=(1,1))

# char ngram TFIDF vectorizer (character n-grams of length 2 to 5)
# No tokenizer is passed: scikit-learn only applies `tokenizer` when
# analyzer='word', so giving one to a char analyzer has no effect on the
# features and only triggers a warning.
c2345 = TfidfVectorizer(strip_accents=None, analyzer='char', ngram_range=(2,5))

# word unigram + char ngram TFIDF vectorizer
# The union gets its own vectorizer instances (same settings as `u` and
# `c2345`) so it does not share objects with the standalone feature sets.
w = TfidfVectorizer(strip_accents=None, tokenizer=token, token_pattern=None, analyzer='word', ngram_range=(1,1))
c = TfidfVectorizer(strip_accents=None, analyzer='char', ngram_range=(2,5))
u_c2345 = make_union(w, c, n_jobs=-1)

# Feature-set name -> vectorizer; the names become the 'ngram' column of the results.
features = {
    'word unigram': u,
    'char ngram': c2345,
    'word unigram + char ngram': u_c2345,
}

In [None]:
from sklearn.metrics import classification_report
# Class display names passed to classification_report.
target_names = ['NH', 'HS']

# Result columns; the evaluation loop appends one entry per feature set to each.
ngram = []                       # feature-set name
nh_p, nh_r, nh_f = [], [], []    # 'NH' precision / recall / F1
hs_p, hs_r, hs_f = [], [], []    # 'HS' precision / recall / F1
w_p, w_r, w_f = [], [], []       # weighted-average precision / recall / F1


def rnd(var):
  """Express a score as a percentage rounded to two decimals, for readable result tables."""
  return round(var * 100, 2)

# NOTE(review): train_x, train_y, test_x and test_y are not defined in the cells
# visible here — presumably an earlier cell derives them from df_train / df_test;
# confirm, otherwise this cell fails with NameError on a fresh kernel.

# Seed for LinearSVC. With loss='hinge' the solver runs in its dual form, which
# shuffles the data pseudo-randomly; a fixed seed makes the scores repeatable.
SVM_SEED = 42

# Report row name -> the (precision, recall, F1) lists that collect its scores.
score_lists = {
    'NH': (nh_p, nh_r, nh_f),
    'HS': (hs_p, hs_r, hs_f),
    'weighted avg': (w_p, w_r, w_f),
}

for feature, vec in features.items():
  # Fit the vectorizer on the training data only, then reuse it on the test data.
  train_vector = vec.fit_transform(train_x)
  test_vector = vec.transform(test_x)

  model = LinearSVC(penalty='l2', loss='hinge', C=1, random_state=SVM_SEED)
  model.fit(train_vector, train_y)
  prediction = model.predict(test_vector)

  # output_dict=True returns nested dicts keyed by row name, then by metric name.
  r = classification_report(test_y, prediction, target_names=target_names, output_dict=True)

  ngram.append(feature)
  for label, (p_scores, r_scores, f_scores) in score_lists.items():
    p_scores.append(rnd(r[label]['precision']))
    r_scores.append(rnd(r[label]['recall']))
    f_scores.append(rnd(r[label]['f1-score']))

# One row per feature set, one column per collected metric.
result = {  'ngram':ngram,
    'NH precision':nh_p, 'NH recall':nh_r, 'NH F1':nh_f,
    'HS precision':hs_p, 'HS recall':hs_r, 'HS F1':hs_f,
    'weighted avg precision':w_p, 'weighted avg recall':w_r, 'weighted avg F1':w_f,
}
df_result = pd.DataFrame(data=result)
df_result

Unnamed: 0,ngram,NH precision,NH recall,NH F1,HS precision,HS recall,HS F1,weighted avg precision,weighted avg recall,weighted avg F1
0,word unigram,85.55,92.46,88.87,91.07,83.11,86.91,88.2,87.97,87.93
1,char ngram,89.6,93.26,91.39,92.38,88.29,90.29,90.93,90.87,90.86
2,word unigram + char ngram,90.18,92.42,91.29,91.58,89.11,90.33,90.85,90.83,90.83
