## IMPORT PACKAGES 

In [30]:
import nltk
import numpy as np
import csv
#tqdm : 进度条
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split as split
#用于在jupyter 显示图像
%matplotlib inline

In [3]:
# If the download fails, fetch the packages manually from www.nltk.org/nltk_data
# and place them in the corresponding nltk_data directory.
# (The site can be downloaded from using Chrome.)
nltk.download(['stopwords','punkt','wordnet','averaged_perceptron_tagger'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joker\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joker\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joker\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\joker\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## READ DATA 


In [18]:
#Variables

# Relative path of the tab-separated training file read by the loading cells.
FILE_PATH = './datasets/training-v1/offenseval-training-v1.tsv'


In [13]:
def str_to_label(text_label):
    """Map a textual label to a binary class id.

    Returns 1 when ``text_label`` equals 'OFF', and 0 for any other value.
    """
    return 1 if text_label == 'OFF' else 0

In [20]:
def get_training_data(FILE_PATH):
    """Read tweets and their binary labels from a tab-separated file.

    Expected input columns: id, tweet, label_a, label_b, label_c.
    The first row is treated as the header and is not returned.

    Args:
        FILE_PATH: path of the TSV file, opened as UTF-8.

    Returns:
        A ``(data, labels)`` tuple where ``data`` is the list of tweet texts
        ``[tweet1, tweet2, ...]`` and ``labels`` is the parallel list
        ``[label1, label2, ...]`` produced by ``str_to_label`` from the
        third-from-last column (label_a in the layout above).
    """
    data = []
    labels = []
    with open(FILE_PATH, encoding='utf-8') as tsvfile:
        reader = csv.reader(tsvfile,delimiter='\t')
        for i,line in enumerate(tqdm(reader,'READING DATA.....')):
            # Compare by value: `i is 0` tests object identity, which only
            # works through an interpreter caching detail and triggers a
            # SyntaxWarning on Python 3.8+.
            if i == 0:
                # Skip the first row (column titles).
                continue
            if not line:
                # csv.reader yields [] for a blank line; indexing it would
                # raise IndexError, so such rows are skipped.
                continue
            label = str_to_label(line[-3])
            data.append(line[1])
            labels.append(label)
    return data, labels
            

In [24]:
# Load every tweet text and its 0/1 label from the file at FILE_PATH.
all_data, all_labels = get_training_data(FILE_PATH)

READING DATA.....: 13241it [00:00, 212044.47it/s]


In [25]:
# Sanity check: show the first three tweets and their labels.
print(all_data[:3])
print(all_labels[:3])

['@USER She should ask a few native Americans what their take on this is.', '@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL', 'Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT']
[1, 1, 0]


In [27]:
## Evaluating the model requires labeled test data.
## Plan: hold out 1/10 of the data as validation data and 1/10 as test data.
## Shuffle the data first (random_state=0 keeps the order the same on every run).
## NOTE(review): only a 10% test split is visible in this part of the notebook;
## confirm whether the validation split is created elsewhere.
shuffled_data, shuffled_labels = shuffle(all_data, all_labels, random_state=0)

In [28]:
# Sanity check: show the first three tweets and labels after shuffling.
print(shuffled_data[:3])
print(shuffled_labels[:3])

['@USER Good! #Antifa is violent fascism.', "@USER It has been being de-created slowly and stealthily since the CONservatives came to power... Hunt's done a sterling job.", "* Christian Kabasele reveals the secrets behind Watford's perfect start to the Premier League season : Beans are banned at Watford's training ground so Christian Kabasele spills the avocado and quinoa. He is is intelligent.. . URL URL"]
[0, 0, 0]


In [32]:
#split train and test
# 10% of the samples are held out for testing. random_state is fixed so the
# split, and every statistic computed from it, is the same on each
# Restart & Run All; without it train_test_split draws a new split per run.
train_data,test_data,train_labels,test_labels = split(shuffled_data,
                                                     shuffled_labels,
                                                     test_size=0.1,
                                                     random_state=0)

## Data Statistics

In [None]:
# 1. Number of samples and share of offensive tweets in each split
train_size = len(train_data)
train_off = sum(train_labels)
print(f'training set size: {train_size}\n', f'with {train_off},{round(train_off/train_size,4)} offensive\n')
test_size = len(test_data)
test_off = sum(test_labels)
print(f'\ntest set size: {test_size}\n', f'with {test_off},{round(test_off/test_size,4)} offensive\n')

# 2. Average length (tokens obtained by splitting on single spaces)
print('\nAverage Length:\n')
train_tokens = [tweet.split(' ') for tweet in train_data]
test_tokens = [tweet.split(' ') for tweet in test_data]
train_wc = sum(len(tokens) for tokens in train_tokens)
test_wc = sum(len(tokens) for tokens in test_tokens)
print(f'training: {round(train_wc/train_size,2)}')
print(f'test: {round(test_wc/test_size,2)}\n')

# 3. Vocabulary size: number of distinct space-separated tokens
train_vocab = {word for tokens in train_tokens for word in tokens}
test_vocab = {word for tokens in test_tokens for word in tokens}
print('\nVocab size:\n')
print(f'training: {len(train_vocab)}')
print(f'testing: {len(test_vocab)}')

In [72]:
def get_statics(data,label,datatype):
    """Print summary statistics for one data split.

    Args:
        data: sequence of tweet strings.
        label: sequence of numeric labels parallel to ``data``; their sum is
            reported as the number of offensive tweets.
        datatype: name of the split, used only in the printed header.

    Prints the number of samples, the count and ratio of offensive tweets,
    the average number of space-separated tokens per tweet and the number of
    distinct tokens. Returns None.

    An empty ``data`` is reported with ratio and average length 0 instead of
    raising ZeroDivisionError.
    """
    length = len(data)
    off_d = sum(label)
    wc = 0
    vocab = set()
    for t in data:
        # Tokens are obtained by splitting on single spaces only.
        t_list = t.split(' ')
        wc += len(t_list)
        vocab.update(t_list)

    # Guard the two divisions so an empty split does not crash the report.
    off = round(off_d/length,4) if length else 0.0
    a_wc = round(wc/length,2) if length else 0.0

    print('--------'+'For %s set'%(datatype)+'-------')
    print('Length: %d'%(length))
    print('with offensive tweets %d,%.4f'%(off_d,off))
    print('average length: %.2f'%(a_wc))
    print('vocab size: %d'%(len(vocab)))
    print()

In [73]:
# Report the same statistics for the training split and the test split.
get_statics(train_data,train_labels,'training')
get_statics(test_data,test_labels,'testing')


--------For training set-------
Length: 11916
with offensive tweets 4009,0.3364
average length: 22.66
vocab size: 36539

--------For testing set-------
Length: 1324
with offensive tweets 391,0.2953
average length: 23.24
vocab size: 7973



## Pre-processing

In [46]:
##接下来对数据做预处理
##问题：
##1. 不能完全凭空格做好分词，有相连的标点、专有名词等的干扰
##2. 口语化，不规范用词
##3. 待补充