In [5]:
import pandas as pd
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer

def _load_abusive(path):
  """Read the CSV at `path` and keep only rows whose 'final abusive' column equals 1.

  Args:
    path: location of a CSV file that has a 'final abusive' column.

  Returns:
    A new DataFrame with the matching rows; the original row labels are kept.
  """
  frame = pd.read_csv(path)
  # .copy() detaches the filtered rows from `frame`, so adding columns to the
  # result later does not trigger pandas' SettingWithCopyWarning.
  return frame[frame['final abusive'] == 1].copy()

df_train = _load_abusive('/dataset/train.csv')
df_val = _load_abusive('/dataset/val.csv')
df_test = _load_abusive('/dataset/test.csv')

# Fold the validation rows into the training frame; ignore_index renumbers the
# merged rows 0..n-1.
df_train = pd.concat([df_train, df_val], ignore_index=True)

In [6]:
def cnv(df):
  """Add a 'type2' column holding each row's 'type' string split on '_'.

  A value without an underscore becomes a one-element list, so every row ends
  up with a list of label strings.

  Args:
    df: DataFrame with a 'type' column of strings.

  Returns:
    The same DataFrame object, modified in place with the new 'type2' column.
  """
  # Iterate over the column's values instead of looking each row up by index
  # label: a label lookup returns a Series (not a string) when labels repeat,
  # and str.split already yields [value] when there is no '_' to split on.
  df['type2'] = [label.split('_') for label in df['type']]
  return df

# Attach the list-valued 'type2' label column to both splits.
df_train, df_test = cnv(df_train), cnv(df_test)

# Sentence texts as plain Python lists.
train_x = df_train['sentence'].tolist()
test_x = df_test['sentence'].tolist()

# Fit the binarizer on the training labels, then reuse it for the test labels.
mlb = MultiLabelBinarizer()
train_y = mlb.fit_transform(df_train['type2'])
test_y = mlb.transform(df_test['type2'])
target_names = [*mlb.classes_]

In [None]:
from sklearn.metrics import classification_report
from sklearn.pipeline import make_union
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import nltk
nltk.download('punkt')

def token(doc):
  """Return the list of tokens NLTK's word_tokenize produces for `doc`."""
  # Imported inside the function, as in the original definition.
  import nltk
  return nltk.word_tokenize(doc)

def _word_unigram_tfidf():
  """Return a fresh word-level unigram TF-IDF vectorizer that tokenizes with `token`."""
  # token_pattern=None: the default regex is never used once a custom tokenizer
  # is supplied, and leaving it set makes scikit-learn emit a warning.
  return TfidfVectorizer(strip_accents=None, tokenizer=token, token_pattern=None,
                         analyzer='word', ngram_range=(1,1))

def _char_ngram_tfidf():
  """Return a fresh character 2- to 5-gram TF-IDF vectorizer."""
  # No tokenizer is passed: scikit-learn only applies `tokenizer` when
  # analyzer == 'word' and warns if one is given for a char analyzer.
  return TfidfVectorizer(strip_accents=None, analyzer='char', ngram_range=(2,5))

# word unigram TFIDF vectorizer
u = _word_unigram_tfidf()

# char ngram TFIDF vectorizer
c2345 = _char_ngram_tfidf()

# word unigram + char ngram TFIDF vectorizer
# (separate instances so the union does not share fitted state with u / c2345)
w = _word_unigram_tfidf()
c = _char_ngram_tfidf()
u_c2345 = make_union(w, c, n_jobs=-1)

# Display name -> feature extractor, in the order they are evaluated.
features = {
    'word unigram':u, 'char ngram':c2345, 'word unigram + char ngram':u_c2345
}

In [12]:
# Result accumulators: the evaluation loop appends one value per feature set.
# Suffixes: _p = precision, _r = recall, _f = f1-score.
feature_name = []
slander_p, slander_r, slander_f = [], [], []
religion_p, religion_r, religion_f = [], [], []
gender_p, gender_r, gender_f = [], [], []
cv_p, cv_r, cv_f = [], [], []
w_p, w_r, w_f = [], [], []

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Key in the classification report -> (precision, recall, f1) accumulator lists.
_report_columns = [
    ('slander', slander_p, slander_r, slander_f),
    ('religion', religion_p, religion_r, religion_f),
    ('callToViolence', cv_p, cv_r, cv_f),
    ('gender', gender_p, gender_r, gender_f),
    ('weighted avg', w_p, w_r, w_f),
]

# Start from empty accumulators so re-running this cell does not append a
# second copy of every row.
feature_name.clear()
for _, *_metric_lists in _report_columns:
  for _metric_list in _metric_lists:
    _metric_list.clear()

# Train and score one one-vs-rest linear SVM per feature representation.
for feature, vec in features.items():
  train_vector = vec.fit_transform(train_x)
  # Transformed test split (the name is kept from the original notebook).
  val_vector = vec.transform(test_x)

  # random_state pins the data shuffling LinearSVC performs in its dual
  # solver, so the reported scores are reproducible across runs.
  clf = OneVsRestClassifier(
      LinearSVC(penalty='l2', loss='hinge', C=1, random_state=0)
  ).fit(train_vector, train_y)
  prediction = clf.predict(val_vector)

  r = classification_report(test_y, prediction, target_names=list(mlb.classes_), output_dict=True)

  feature_name.append(feature)
  # Store each metric as a percentage rounded to two decimals.
  for _key, _precisions, _recalls, _f1s in _report_columns:
    _scores = r[_key]
    _precisions.append(round(_scores['precision']*100, 2))
    _recalls.append(round(_scores['recall']*100, 2))
    _f1s.append(round(_scores['f1-score']*100, 2))

In [None]:
# Column names and their value lists, paired position by position.
_column_names = [
    'feature name',
    'slander_p', 'slander_r', 'slander_f',
    'religion_p', 'religion_r', 'religion_f',
    'gender_p', 'gender_r', 'gender_f',
    'cv_p', 'cv_r', 'cv_f',
    'w_p', 'w_r', 'w_f',
]
_column_values = [
    feature_name,
    slander_p, slander_r, slander_f,
    religion_p, religion_r, religion_f,
    gender_p, gender_r, gender_f,
    cv_p, cv_r, cv_f,
    w_p, w_r, w_f,
]
result = dict(zip(_column_names, _column_values))

# One row per feature set; the bare name on the last line renders the table.
dd = pd.DataFrame(result)
dd