# Process DocuScope output

In [8]:
# Load token csvs
import os
import pandas as pd

ds_output_dirpath = '/storage2/mamille3/data/hate_speech/degibert2019/docuscope_output_by_author/degibert2019_by_author-2022-02-15-141033/'
for fname in os.listdir(os.path.join(ds_output_dirpath, 'token_csv'))[:1]:
    print(fname)
    fpath = os.path.join(ds_output_dirpath, 'token_csv', fname)
    tokens = pd.read_csv(fpath, names=['original_token', 'lowercased_token', 'whitespace_after', 'lat', 'pattern_position'])
    
tokens

577938_tokens.csv


Unnamed: 0,original_token,lowercased_token,whitespace_after,lat,pattern_position
0,GLORY,glory,s,PositiveValuesDimensionGeneral_LAT,0
1,To,to,s,PositiveRelationsInclusive_LAT,0
2,our,our,s,PositiveRelationsInclusive_LAT,1
3,British,british,s,CharacterTypesDimensionGeneral_LAT,0
4,Activist,activist,s,StrategicActivismDimensionGeneral_LAT,0
5,&,&,s,,0
6,Nationalist,nationalist,s,PublicDimensionGeneral_LAT,0
7,!,!,s,ForceDimensionGeneral_LAT,0
8,Good,good,s,PositiveEmotionGood_LAT,0
9,To,to,s,PositiveEmotionGood_LAT,1


In [13]:
# Load sentence splits and annotations
annotations_fpath = '/storage2/mamille3/data/hate_speech/degibert2019/combined_data.csv'
annotations = pd.read_csv(annotations_fpath).sort_values(['comment_id', 'sentence_id']).set_index('file_id')
annotations

Unnamed: 0_level_0,comment_id,sentence_id,text,user_id,subforum_id,num_contexts,label
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12834217_1,12834217,1,"As of March 13th , 2014 , the booklet had been...",572066,1346,0,noHate
12834217_2,12834217,2,In order to help increase the booklets downloa...,572066,1346,0,noHate
12834217_3,12834217,3,( Simply copy and paste the following text int...,572066,1346,0,noHate
12834217_4,12834217,4,Click below for a FREE download of a colorfull...,572066,1346,0,hate
12834217_5,12834217,5,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,572066,1346,0,noHate
...,...,...,...,...,...,...,...
33677015_1,33677015,1,Apparently he came to the conclusion that his ...,572948,1388,0,noHate
33677019_1,33677019,1,Wish we at least had a Marine Le Pen to vote f...,735154,1388,0,noHate
33677019_2,33677019,2,Its like the choices are white genocide candid...,735154,1388,0,noHate
33677053_1,33677053,1,Why White people used to say that sex was a si...,572266,1388,0,hate


In [28]:
# Select user data
selected_user = int(fname.split('_')[0])
user_annotations = annotations.query('user_id == @selected_user').copy()
user_annotations['sentence_length'] = [len(x) for x in user_annotations.text.str.split()]
user_annotations

Unnamed: 0_level_0,comment_id,sentence_id,text,user_id,subforum_id,num_contexts,label,sentence_length
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
30430878_1,30430878,1,GLORY To our British Activist & Nationalist ! ...,577938,1359,0,noHate,17
30430878_2,30430878,2,Glory To all our Valiant British Brothers & Si...,577938,1359,0,noHate,10


In [29]:
# Explode to tokens
s = user_annotations.text.str.split().apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'original_token'
del user_annotations['text']
merged = user_annotations.join(s)
token_annotations = merged.reset_index()
token_annotations

In [30]:
token_data = pd.concat([token_annotations, tokens], axis=1)
token_data = token_data.loc[:, ~token_data.columns.duplicated()]
token_data

Unnamed: 0,file_id,comment_id,sentence_id,user_id,subforum_id,num_contexts,label,sentence_length,original_token,lowercased_token,whitespace_after,lat,pattern_position
0,30430878_1,30430878,1,577938,1359,0,noHate,17,GLORY,glory,s,PositiveValuesDimensionGeneral_LAT,0
1,30430878_1,30430878,1,577938,1359,0,noHate,17,To,to,s,PositiveRelationsInclusive_LAT,0
2,30430878_1,30430878,1,577938,1359,0,noHate,17,our,our,s,PositiveRelationsInclusive_LAT,1
3,30430878_1,30430878,1,577938,1359,0,noHate,17,British,british,s,CharacterTypesDimensionGeneral_LAT,0
4,30430878_1,30430878,1,577938,1359,0,noHate,17,Activist,activist,s,StrategicActivismDimensionGeneral_LAT,0
5,30430878_1,30430878,1,577938,1359,0,noHate,17,&,&,s,,0
6,30430878_1,30430878,1,577938,1359,0,noHate,17,Nationalist,nationalist,s,PublicDimensionGeneral_LAT,0
7,30430878_1,30430878,1,577938,1359,0,noHate,17,!,!,s,ForceDimensionGeneral_LAT,0
8,30430878_1,30430878,1,577938,1359,0,noHate,17,Good,good,s,PositiveEmotionGood_LAT,0
9,30430878_1,30430878,1,577938,1359,0,noHate,17,To,to,s,PositiveEmotionGood_LAT,1


## Get LAT counts per sentence

In [31]:
# Remove multiword labels (just count them once)
processed = token_data.query('pattern_position == 0')

# Get dummy variables for each column value
processed.pivot_table(index=['file_id'], columns=['lat'])

Unnamed: 0_level_0,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,comment_id,...,user_id,user_id,user_id,user_id,user_id,user_id,user_id,user_id,user_id,user_id
lat,CharacterTypesDimensionGeneral_LAT,DescriptObjectsDimensionGeneralWarehouseAThruZ_LAT,ForceDimensionGeneral_LAT,OrphanedDimensionGeneral_LAT,PositiveActHelp_LAT,PositiveEmotionGood_LAT,PositiveRelationsInclusive_LAT,PositiveValuesDimensionGeneral_LAT,PositiveValuesStrong_LAT,PublicDimensionGeneral_LAT,...,OrphanedDimensionGeneral_LAT,PositiveActHelp_LAT,PositiveEmotionGood_LAT,PositiveRelationsInclusive_LAT,PositiveValuesDimensionGeneral_LAT,PositiveValuesStrong_LAT,PublicDimensionGeneral_LAT,ReasonGenericGeneralize_LAT,StrategicActivismDimensionGeneral_LAT,StrategicGoalsDimensionGeneral_LAT
file_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
30430878_1,30430878.0,30430878.0,30430878.0,30430878.0,30430878.0,30430878.0,30430878.0,30430878.0,,30430878.0,...,577938.0,577938.0,577938.0,577938.0,577938.0,,577938.0,,577938.0,577938.0
30430878_2,30430878.0,,30430878.0,,,,30430878.0,30430878.0,30430878.0,,...,,,,577938.0,577938.0,577938.0,,577938.0,,
