In [1]:
import pandas as pd
import re
from sklearn.metrics import classification_report, recall_score, make_scorer, f1_score
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.svm import SVC
from data import *

Task C: Who or what is the target of the offensive content?
– IND: the target is an individual explicitly or implicitly mentioned in the conversation;
– GRP: hate speech targeting a group of people based on ethnicity, gender, sexual orientation,
religious belief, or other common characteristic;
– OTH: targets that do not fall into any of the previous categories, e.g., organizations, events,
and issues.

In [2]:
# Load the distantly annotated task C data, indexed by tweet id.
data_c = pd.read_csv(
    'data2/task_c_distant_ann.tsv',
    sep='\t',
    header=0,
    index_col='id',
)

# Show, per target class, how many rows round to 0 and how many to 1.
for score_column in ("average_ind", "average_grp", "average_oth"):
    print(round(data_c[score_column]).value_counts())

data_c.head()
#print(len(data_c))

1.0    126989
0.0     61984
Name: average_ind, dtype: int64
0.0    175147
1.0     13826
Name: average_grp, dtype: int64
0.0    185902
1.0      3071
Name: average_oth, dtype: int64


Unnamed: 0_level_0,text,average_ind,average_grp,average_oth,std_ind,std_grp,std_oth
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1159533712079503361,@USER Trump is a fucking idiot his dementia is...,0.833432,0.07611,0.107765,0.208334,0.098937,0.138649
1159533713044234241,@USER HELL YES! His grinned and thumbs up are ...,0.481062,0.367363,0.138841,0.345225,0.335924,0.08323
1159533718345830400,@USER Can't wait to see the shit show his deat...,0.438813,0.268574,0.377573,0.182609,0.18688,0.254621
1159533739871002625,@USER @USER @USER This guys is dumb check his ...,0.712995,0.123504,0.11113,0.248839,0.107572,0.067552
1159533742366633984,@USER @USER Fuck him better than his hoes,0.691414,0.146723,0.192282,0.204415,0.154818,0.104436


# data analysis 

For each tweet, choose the class (ind / grp / oth) with the largest average score as its label.

In [3]:
def compare(x):
    """Pick the target class whose score has the largest absolute value.

    Parameters
    ----------
    x : tuple or list
        Scores in the fixed order (ind, grp, oth). Must provide ``.index``.

    Returns
    -------
    tuple
        ``(class_name, score)`` where ``class_name`` is 'ind', 'grp' or 'oth'
        and ``score`` is the matching element of ``x``. On ties the earliest
        position wins.
    """
    labels = ('ind', 'grp', 'oth')
    # The winner is selected by magnitude, then located by its value in x.
    winner = max(x, key=abs)
    position = x.index(winner)
    return labels[position], x[position]

In [4]:
# Tasks A and B are not annotated in this file; fill them with a placeholder.
data_c["subtask_a"] = 'Null'
data_c["subtask_b"] = 'Null'

# Run compare() on each row's (ind, grp, oth) averages to get the winning
# class name and its score.
winners = [
    compare(row_scores)
    for row_scores in zip(data_c['average_ind'], data_c['average_grp'], data_c['average_oth'])
]
type_list = [winning_class for winning_class, _ in winners]
average_list = [winning_score for _, winning_score in winners]

data_c['subtask_c'] = type_list
data_c['average'] = average_list
data_c['subtask_c'].value_counts()
#data_c.head()

ind    152562
grp     24917
oth     11494
Name: subtask_c, dtype: int64

In [5]:
# Keep every 'grp' and 'oth' row, but only those 'ind' rows whose winning
# average score is at least 0.6.
confident_ind = (data_c.subtask_c == 'ind') & (data_c.average >= 0.6)
grp_or_oth = data_c.subtask_c.isin(['grp', 'oth'])
data_c = data_c[confident_ind | grp_or_oth]

# The score columns are not needed once the rows have been selected.
data_c = data_c.drop(columns=[
    'average_ind', 'average_grp', 'average_oth',
    'std_ind', 'std_grp', 'std_oth',
    'average',
])
data_c['subtask_c'].value_counts()
#data_c.head()

ind    91925
grp    24917
oth    11494
Name: subtask_c, dtype: int64

In [6]:
# Write the filtered training set as a tab-separated file (index 'id' included).
data_c.to_csv(
    'data/subtask_c_train.csv',
    sep='\t',
)

In [7]:
# Read the saved training file back in and preview it.
data_c = pd.read_csv(
    'data/subtask_c_train.csv',
    sep='\t',
    header=0,
    index_col='id',
)
data_c.head()

Unnamed: 0_level_0,text,subtask_a,subtask_b,subtask_c
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1159533712079503361,@USER Trump is a fucking idiot his dementia is...,Null,Null,ind
1159533739871002625,@USER @USER @USER This guys is dumb check his ...,Null,Null,ind
1159533742366633984,@USER @USER Fuck him better than his hoes,Null,Null,ind
1159533763791130624,Junhee and I are gonna take turns beating his ...,Null,Null,ind
1159534097754132480,"@USER Exactly, a pervert. So disgusting and h...",Null,Null,ind


In [8]:
# Build the evaluation set from the OLID training file: keep only the rows
# that carry a task C label (IND / GRP / OTH).
test = pd.read_csv('data/olid-training-v1.0.tsv', sep='\t', header=0, index_col='id')
print(len(test))  # row count before filtering
test = test[test.subtask_c.isin(['IND','GRP','OTH'])]
print(len(test))  # row count after filtering
test.to_csv('data/subtask_c_test.csv',sep='\t')
# Last expression of the cell, so the class distribution is displayed.
test["subtask_c"].value_counts()

13240
3876


IND    2407
GRP    1074
OTH     395
Name: subtask_c, dtype: int64

# Data preprocessing is complete, ready for training and testing

In [9]:
def train_test_taskc(task_sign,C=1,g='scale',k='rbf'):
    """Train a bag-of-words / TF-IDF / SVM pipeline and report its scores.

    The model is fitted on the samples read from 'data/subtask_c_train',
    scored on that same training data, and then scored on the samples read
    from 'data/subtask_c_test'. Both classification reports are printed;
    nothing is returned.

    Parameters
    ----------
    task_sign
        Passed through unchanged to ``read_file`` (imported from the project's
        ``data`` module). NOTE(review): presumably selects which subtask's
        label column is used - confirm against ``read_file``.
    C : float, default 1
        Regularisation parameter forwarded to ``SVC``.
    g : default 'scale'
        Forwarded to ``SVC`` as ``gamma``.
    k : str, default 'rbf'
        Forwarded to ``SVC`` as ``kernel``.
    """
    # read_file must yield items with "text" and "label" keys.
    train_samples = read_file('data/subtask_c_train',task_sign)
    train_texts = [sample["text"] for sample in train_samples]
    train_labels = [sample["label"] for sample in train_samples]

    bow = CountVectorizer(max_features=3000)
    tfidf = TfidfTransformer()

    # SVC parameters are keyword-only in current scikit-learn; passing C
    # positionally raises TypeError there.
    svm_clf = SVC(C=C, gamma=g, kernel=k)

    pipeline = Pipeline([('bow', bow),
                         ('tfidf', tfidf),
                         ('clf', svm_clf)])

    print('\tTraining on', len(train_texts), 'samples')
    pipeline.fit(train_texts, train_labels)

    # Score on the training data itself as a fit sanity check.
    train_predictions = pipeline.predict(train_texts)
    print ('-'* 40, '\nTraining data\n', classification_report(train_labels, train_predictions, digits=3))

    # Testing
    print("Evaluating SVM classifier")
    test_samples = read_file('data/subtask_c_test',task_sign)
    test_texts = [sample["text"] for sample in test_samples]
    test_labels = [sample["label"] for sample in test_samples]

    test_predictions = pipeline.predict(test_texts)
    print ('Test data\n', classification_report(test_labels, test_predictions, digits=3))


In [10]:
train_test_taskc('C')

1159533739871002625	@USER @USER @USER This guys is dumb check his latest tweets he is sick	Null	Null	ind
Total number of data: 128336
	Training on 128336 samples
---------------------------------------- 
Training data
               precision    recall  f1-score   support

           0      0.994     0.998     0.996     91925
           1      0.980     0.982     0.981     24917
           2      0.975     0.934     0.954     11494

    accuracy                          0.989    128336
   macro avg      0.983     0.971     0.977    128336
weighted avg      0.989     0.989     0.989    128336

Evaluating SVM classifier
97670	@USER Liberals are all Kookoo !!!	OFF	TIN	OTH
Total number of data: 3876
Test data
               precision    recall  f1-score   support

           0      0.874     0.745     0.804      2407
           1      0.571     0.764     0.653      1074
           2      0.362     0.357     0.360       395

    accuracy                          0.711      3876
   macro avg