In [None]:
import pandas as pd
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import MultiLabelBinarizer

# Locations of the three dataset splits.
TRAIN_CSV = '/dataset/train.csv'
VAL_CSV = '/dataset/val.csv'
TEST_CSV = '/dataset/test.csv'


def _load_abusive(csv_path):
  """Read one dataset split and keep only the rows flagged as abusive.

  Args:
    csv_path: Location of the CSV file to read.

  Returns:
    A new DataFrame holding the rows whose 'final abusive' column equals 1.
  """
  df = pd.read_csv(csv_path)
  # .copy() detaches the filtered rows from the parent frame, so columns can
  # be added to the result later without a SettingWithCopyWarning.
  return df[df['final abusive']==1].copy()


df_train = _load_abusive(TRAIN_CSV)
df_val = _load_abusive(VAL_CSV)
df_test = _load_abusive(TEST_CSV)

# The validation rows are appended to the training rows; ignore_index gives the
# combined frame a fresh 0..n-1 index.
df_train = pd.concat([df_train, df_val], ignore_index=True)

In [None]:
def cnv(df):
  """Add a 'target2' column holding each row's target labels as a list.

  A 'target' value such as 'a_b' becomes ['a', 'b']; a value without an
  underscore becomes a one-element list.

  Args:
    df: DataFrame with a 'target' column of strings. It is modified in place.

  Returns:
    The same DataFrame object, now carrying the 'target2' column.
  """
  # Iterate over the values instead of looking each row up by index label:
  # a label lookup returns a Series rather than a string when the index
  # contains duplicates. str.split already yields a one-element list when the
  # value has no '_', so no separate branch is needed.
  df['target2'] = [label.split('_') for label in df['target']]
  return df

# Attach the list-valued label column ('target2') to both splits.
df_train = cnv(df_train)
df_test = cnv(df_test)

# Binarize the label lists; the binarizer is fitted on the training labels
# only and then reused to encode the test labels.
mlb = MultiLabelBinarizer()
train_y = mlb.fit_transform(df_train['target2'])
test_y = mlb.transform(df_test['target2'])
target_names = list(mlb.classes_)

# The raw sentences are the model inputs.
train_x = df_train['sentence'].tolist()
test_x = df_test['sentence'].tolist()

In [None]:
from sklearn.metrics import classification_report
from sklearn.pipeline import make_union
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import nltk
# word_tokenize needs the Punkt sentence-tokenizer data. NLTK 3.8.2 and later
# load it from the 'punkt_tab' package, older releases from 'punkt', so both
# are fetched to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

def token(doc):
  """Split a document into tokens with NLTK's word_tokenize.

  Args:
    doc: Text to tokenize.

  Returns:
    The tokens produced by nltk.word_tokenize for `doc`.
  """
  # nltk is imported inside the function as well, so the name is bound locally
  # every time the function runs.
  import nltk
  return nltk.word_tokenize(doc)

def _word_tfidf():
  """Build a fresh word-unigram TF-IDF vectorizer that tokenizes with `token`."""
  # token_pattern=None: the custom tokenizer replaces the regex pattern, and
  # leaving the default pattern in place makes scikit-learn warn that it is
  # unused.
  return TfidfVectorizer(strip_accents=None, tokenizer=token, token_pattern=None,
                         analyzer='word', ngram_range=(1,1))


def _char_tfidf():
  """Build a fresh character 2- to 5-gram TF-IDF vectorizer."""
  # No tokenizer is passed: scikit-learn only applies it when analyzer='word'
  # and warns about the unused parameter otherwise.
  return TfidfVectorizer(strip_accents=None, analyzer='char', ngram_range=(2,5))


# word unigram TFIDF vectorizer
u = _word_tfidf()

# char ngram TFIDF vectorizer
c2345 = _char_tfidf()

# word unigram + char ngram TFIDF vectorizer (separate instances, so the union
# does not share fitted state with the standalone vectorizers above)
w = _word_tfidf()
c = _char_tfidf()
u_c2345 = make_union(w, c, n_jobs=-1)

# Display name -> feature extractor, in the order they are evaluated.
features = {
    'word unigram':u, 'char ngram':c2345, 'word unigram + char ngram':u_c2345
}

In [None]:
# Accumulators for the results table; the evaluation loop appends to them.
feature_name = []

# Precision / recall / F1 per class.
female_p, female_r, female_f = [], [], []
group_p, group_r, group_f = [], [], []
ind_p, ind_r, ind_f = [], [], []
male_p, male_r, male_f = [], [], []

# Precision / recall / F1 of the weighted average.
w_p, w_r, w_f = [], [], []

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Fixed seed for the LinearSVC solver so the reported scores are reproducible
# from run to run.
RANDOM_STATE = 42

# Maps each classification_report row to the (precision, recall, f1) lists
# that collect its scores.
metric_lists = {
    'female': (female_p, female_r, female_f),
    'group': (group_p, group_r, group_f),
    'ind': (ind_p, ind_r, ind_f),
    'male': (male_p, male_r, male_f),
    'weighted avg': (w_p, w_r, w_f),
}

# Start from empty accumulators so that re-running this cell does not append
# duplicate rows to the results table.
feature_name.clear()
for score_lists in metric_lists.values():
  for scores in score_lists:
    scores.clear()

for feature, vec in features.items():
  train_vector = vec.fit_transform(train_x)
  # Despite its name, val_vector holds the features of the test split.
  val_vector = vec.transform(test_x)

  # One binary linear SVM per label.
  clf = OneVsRestClassifier(
      LinearSVC(penalty='l2', loss='hinge', C=1, random_state=RANDOM_STATE)
  ).fit(train_vector, train_y)
  prediction = clf.predict(val_vector)

  r = classification_report(test_y, prediction, target_names=list(mlb.classes_), output_dict=True)

  feature_name.append(feature)
  # Scores are stored as percentages rounded to two decimals.
  for label, (p_list, r_list, f_list) in metric_lists.items():
    p_list.append(round(r[label]['precision']*100, 2))
    r_list.append(round(r[label]['recall']*100, 2))
    f_list.append(round(r[label]['f1-score']*100, 2))

In [None]:
# Column-wise results table: one list per column, one entry per evaluated
# feature set. Each *_p / *_r / *_f column is filled from the list of the same
# name.
result = {
    'feature name':feature_name,
    'female_p':female_p, 'female_r':female_r, 'female_f':female_f,
    'group_p':group_p, 'group_r':group_r, 'group_f':group_f,
    'ind_p':ind_p, 'ind_r':ind_r, 'ind_f':ind_f,
    'male_p':male_p, 'male_r':male_r, 'male_f':male_f,
    'w_p':w_p, 'w_r':w_r, 'w_f':w_f
}
dd = pd.DataFrame(result)
dd