In [22]:
import pandas as pd
import re
from sklearn.metrics import classification_report, recall_score, make_scorer, f1_score
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.svm import SVC
from data import *

Task b: Is the offensive text targeted (TIN) or untargeted (UNT)?

In [23]:
# Load the distantly supervised Task B scores (tab-separated, indexed by the `id` column).
data_b = pd.read_csv('data2/task_b_distant.tsv', sep='\t', header=0, index_col='id')

# Distribution of the mean score, rounded to one decimal place.
# A larger score means the text is more likely UNT; a smaller score means it is
# more likely TIN. The data is clearly imbalanced: with a plain 0.5 cut-off,
# far more rows fall below 0.5 (TIN) than above it.
print(data_b["average"].round(1).value_counts())
print(len(data_b))

0.2    61866
0.3    51800
0.5    25999
0.4    21966
0.6    13090
0.7     8072
0.8     5056
0.1     1106
0.9       19
Name: average, dtype: int64
188974


# 数据分析

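根据正态分布的经验法则（68–95–99.7 法则，注意这不是中心极限定理），约 68% 的数据落在均值的一个标准差之内，约 95% 落在两个标准差之内，约 99.7% 落在三个标准差之内。为了确保标签的准确性，可以要求“均值 ± k 倍标准差”的整个区间都落在阈值的同一侧：例如均值减去标准差后仍不小于 0.5 才标为 UNT，均值加上标准差后仍小于 0.5 才标为 TIN。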
![image.png](attachment:image.png)

In [24]:
# Build a band of `average -/+ k * std` around every mean score.
# Both multipliers are tunable; after labelling, check whether the two
# resulting classes are reasonably balanced and adjust them if needed.
min_edge = 1
max_edge = 1
data_b['std_min'] = data_b['average'].sub(data_b['std'].mul(min_edge))
data_b['std_max'] = data_b['average'].add(data_b['std'].mul(max_edge))
data_b.head()

Unnamed: 0_level_0,text,average,std,std_min,std_max
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1159533712079503361,@USER Trump is a fucking idiot his dementia is...,0.230133,0.219593,0.01054,0.449726
1159533713044234241,@USER HELL YES! His grinned and thumbs up are ...,0.26986,0.178943,0.090918,0.448803
1159533718345830400,@USER Can't wait to see the shit show his deat...,0.229661,0.22853,0.001131,0.458192
1159533739871002625,@USER @USER @USER This guys is dumb check his ...,0.169093,0.180201,-0.011108,0.349293
1159533742366633984,@USER @USER Fuck him better than his hoes,0.27498,0.276721,-0.001741,0.5517


In [25]:
# Labelling rule used to filter the distantly supervised rows.
# A larger score means the text is more likely UNT; a smaller one, more likely TIN.
def select(x, unt_min=0.5, tin_max=0.5, std_min=0.5, std_max=0.5, method='std'):
    """Map one row of scores to a Task B label.

    Parameters
    ----------
    x : sequence
        Indexable triple ``(average, lower, upper)``. The caller in this
        notebook passes ``(average, average - k*std, average + k*std)``.
    unt_min : float
        ``method='average'`` only: label 'UNT' when ``x[0] >= unt_min``.
    tin_max : float
        ``method='average'`` only: label 'TIN' when ``x[0] < tin_max``.
    std_min : float
        ``method='std'`` only: label 'UNT' when the lower edge ``x[1]`` is
        still ``>= std_min``.
    std_max : float
        ``method='std'`` only: label 'TIN' when the upper edge ``x[2]`` is
        still ``< std_max``.
    method : {'std', 'average'}
        'average' thresholds the mean alone and ignores the spread;
        'std' thresholds the edges of the mean -/+ std band.

    Returns
    -------
    str
        'UNT', 'TIN', or 'Null' when the row satisfies neither condition.

    Raises
    ------
    ValueError
        If ``method`` is not 'average' or 'std'. Previously such a call fell
        through and returned ``None``, which would end up stored as a label.
    """
    # Threshold the mean only; the variance is not taken into account.
    if method == 'average':
        if x[0] >= unt_min:
            return 'UNT'
        if x[0] < tin_max:
            return 'TIN'
        return 'Null'

    # Threshold the edges of the (assumed normal) distribution instead.
    if method == 'std':
        if x[1] >= std_min:
            return 'UNT'
        if x[2] < std_max:
            return 'TIN'
        return 'Null'

    raise ValueError("method must be 'average' or 'std', got %r" % (method,))

In [26]:
# Label every row with `select`. Columns are added in the order
# subtask_a, subtask_b, subtask_c so the saved file keeps that layout.
# Only subtask_b is derived here; the other two are filled with 'Null'.
data_b["subtask_a"] = 'Null'
type_list = [
    select(scores)
    for scores in zip(data_b['average'], data_b['std_min'], data_b['std_max'])
]
data_b['subtask_b'] = type_list
data_b["subtask_c"] = 'Null'

# Kept as the last expression so the label counts are actually displayed.
data_b["subtask_b"].value_counts()

In [27]:
# Drop the raw score columns, then keep only rows that received a real label
# (rows labelled 'Null' were not selected and are discarded).
data_b = (
    data_b
    .drop(columns=['average', 'std', 'std_min', 'std_max'])
    .loc[lambda frame: frame['subtask_b'].isin(['TIN', 'UNT'])]
)
data_b.head()

Unnamed: 0_level_0,text,subtask_a,subtask_b,subtask_c
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1159533712079503361,@USER Trump is a fucking idiot his dementia is...,Null,TIN,Null
1159533713044234241,@USER HELL YES! His grinned and thumbs up are ...,Null,TIN,Null
1159533718345830400,@USER Can't wait to see the shit show his deat...,Null,TIN,Null
1159533739871002625,@USER @USER @USER This guys is dumb check his ...,Null,TIN,Null
1159533763791130624,Junhee and I are gonna take turns beating his ...,Null,TIN,Null


In [28]:
# Persist the labelled training rows as a tab-separated file.
data_b.to_csv(path_or_buf='data/subtask_b_train.csv', sep='\t')

In [29]:
# Read the saved training file back and show the class balance.
data_b = pd.read_csv(
    'data/subtask_b_train.csv',
    sep='\t',
    header=0,
    index_col='id',
)
data_b.subtask_b.value_counts()

TIN    69506
UNT    13216
Name: subtask_b, dtype: int64

In [30]:
# Build the evaluation set from the OLID training file: keep only the rows
# that carry a Task B label (TIN / UNT) and save them as a tab-separated file.
test = pd.read_csv('data/olid-training-v1.0.tsv', sep='\t', header=0, index_col='id')
print(len(test))  # rows before filtering
test = test[test.subtask_b.isin(['TIN', 'UNT'])]
print(len(test))  # rows with a Task B label
test.to_csv('data/subtask_b_test.csv', sep='\t')

# Class balance of the evaluation set (displayed as the cell output).
test["subtask_b"].value_counts()

13240
4400


TIN    3876
UNT     524
Name: subtask_b, dtype: int64

# 数据预处理完毕，准备训练测试

In [31]:
def train_test_taskb(task_sign,C=1,g='scale',k='rbf'):
    """Train a bag-of-words + TF-IDF + SVM pipeline and print its metrics.

    Samples are loaded with ``read_file`` from 'data/subtask_b_train' and
    'data/subtask_b_test'; each sample is expected to expose "text" and
    "label" keys. A classification report is printed for the training data
    and for the test data. Nothing is returned.

    Parameters
    ----------
    task_sign : str
        Task identifier forwarded unchanged to ``read_file``.
    C : float
        Regularisation parameter passed to ``SVC``.
    g : str or float
        Value for the ``gamma`` parameter of ``SVC``.
    k : str
        Value for the ``kernel`` parameter of ``SVC``.
    """
    train_samples = read_file('data/subtask_b_train', task_sign)
    X_train = [sample["text"] for sample in train_samples]
    y_train = [sample["label"] for sample in train_samples]

    # Token counts restricted to the 3000 most frequent terms, re-weighted by TF-IDF.
    bow = CountVectorizer(max_features=3000)
    tfidf = TfidfTransformer()

    # Pass C by keyword: SVC's constructor parameters are keyword-only in
    # current scikit-learn, so the positional form raises a TypeError there.
    svm_clf = SVC(C=C, gamma=g, kernel=k)

    pipeline = Pipeline([('bow', bow),
                        ('tfidf', tfidf),
                        ('clf', svm_clf),])

    print('\tTraining on', len(X_train), 'samples')
    pipeline.fit(X_train, y_train)

    # Metrics on the data the model was fitted on (an optimistic estimate).
    train_predictions = pipeline.predict(X_train)
    print ('-'* 40, '\nTraining data\n', classification_report(y_train, train_predictions, digits=3))

    # Testing
    print("Evaluating SVM classifier")
    test_samples = read_file('data/subtask_b_test', task_sign)
    X_test = [sample["text"] for sample in test_samples]
    y_test = [sample["label"] for sample in test_samples]

    test_predictions = pipeline.predict(X_test)
    print ('Test data\n', classification_report(y_test, test_predictions, digits=3))


In [32]:
# Train and evaluate the Task B classifier with the default SVM settings.
train_test_taskb(task_sign='B')

Total number of data: 82722
	Training on 82722 samples
---------------------------------------- 
Training data
               precision    recall  f1-score   support

           0      1.000     1.000     1.000     69506
           1      0.999     0.999     0.999     13216

    accuracy                          1.000     82722
   macro avg      0.999     0.999     0.999     82722
weighted avg      1.000     1.000     1.000     82722

Evaluating SVM classifier
Total number of data: 4400
Test data
               precision    recall  f1-score   support

           0      0.908     0.968     0.937      3876
           1      0.543     0.279     0.368       524

    accuracy                          0.886      4400
   macro avg      0.726     0.623     0.653      4400
weighted avg      0.865     0.886     0.870      4400

