# Dependency parsing

In [1]:
# Load data with identity term matches extracted, tokenized
# NOTE: pd.read_pickle unpickles arbitrary Python objects, so only point `path`
# at pickle files produced by this project.
import pandas as pd

path = '../../data/incels/processed_comments.pkl'
# Alternative input file, currently disabled:
# path = '../data/white_supremacist_identities.pkl'
data = pd.read_pickle(path)
# Print the column names, dtypes and memory footprint of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6248230 entries, 0 to 6248229
Data columns (total 9 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   type                        object        
 1   forum                       object        
 2   thread                      object        
 3   username                    object        
 4   date                        object        
 5   content                     object        
 6   parsed_date                 datetime64[ns]
 7   content_orig                object        
 8   netmapper_identity_matches  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 429.0+ MB


In [7]:
import spacy

# Load the small English pipeline with the named-entity recognizer switched off;
# the visible cells below only read token text, dependency labels and heads.
nlp = spacy.load('en_core_web_sm', disable=['ner'])

In [18]:
# Keep only comments with at least one matched identity term and take the first one as a sample
has_identity = data['netmapper_identity_matches'].map(len) > 0
samp = data.loc[has_identity].head(1)
samp

Unnamed: 0,type,forum,thread,username,date,content,parsed_date,content_orig,netmapper_identity_matches
12,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,THE TRUE HONKLER,"Nov 20, 2020","frothysolutions said : do we ? in order to have this as a replacement for socializing , we have to want to be here . fag we were forced here by foids and lack of any other choice",2020-11-20,"FrothySolutions said: Do we? In order to have this as a replacement for socializing, we have to want to be here. fag we were forced here by foids and lack of any other choice",[fag]


In [11]:
# Show full cell contents instead of truncating long text columns
pd.set_option('display.max_colwidth', None)
# Text of the sampled comment. Row label 12 is the single row kept in `samp`;
# reading from `samp` (not from the not-yet-defined `test`) keeps this cell
# runnable on a fresh kernel.
test = samp.loc[12, 'content']

In [12]:
# Run the spaCy pipeline on the sampled comment text; the bare `doc` displays the parsed text
doc = nlp(test)
doc

frothysolutions said : do we ? in order to have this as a replacement for socializing , we have to want to be here . fag we were forced here by foids and lack of any other choice

In [14]:
# Dependency label (`dep_`) of every token in `doc`, in token order
parse = [tok.dep_ for tok in doc]
parse

['nsubj',
 'ROOT',
 'punct',
 'xcomp',
 'nsubj',
 'punct',
 'prep',
 'pobj',
 'aux',
 'acl',
 'dobj',
 'prep',
 'det',
 'pobj',
 'prep',
 'pobj',
 'punct',
 'nsubj',
 'ROOT',
 'aux',
 'xcomp',
 'aux',
 'xcomp',
 'advmod',
 'punct',
 'intj',
 'nsubjpass',
 'auxpass',
 'ROOT',
 'advmod',
 'agent',
 'pobj',
 'cc',
 'conj',
 'prep',
 'det',
 'amod',
 'pobj']

In [26]:
# Dependency label of the identity mention at token position `tok_idx`.
# NOTE(review): `tok_idx` is only assigned in the identity-matching cell further down,
# so this cell fails on a fresh top-to-bottom run — TODO move it below that cell.
parse[tok_idx]

'intj'

In [25]:
# Syntactic head of the token at position `tok_idx`
doc[tok_idx].head

forced

In [22]:
# Match extracted identities to tokens
from collections import defaultdict

# Dependency labels used to classify the identity token and the tokens around it
SUBJ_DEPS = {'nsubj', 'agent'}                      # identity acts as subject/agent of its head
OBJ_DEPS = {'dobj', 'nsubjpass', 'dative', 'pobj'}  # identity is acted upon by its head
MODIFIER_DEPS = {'amod', 'appos', 'nsubj', 'nmod'}  # children that describe the identity
COPULAS = {'is', 'was'}

# identity -> {'actions': [...], 'attributes': [...]}, accumulated over every mention
# of that identity so a repeated term does not overwrite its earlier mentions.
# TODO: replace with separate columns of attributes and actions in the exploded df
actions_attributes = {}
identity_ctr = defaultdict(int)  # how many mentions of each identity have been consumed so far

for identity in samp.loc[12, 'netmapper_identity_matches']:
    entry = actions_attributes.setdefault(identity, {'actions': [], 'attributes': []})

    # Token positions whose text equals the identity term. The n-th occurrence of
    # the term in the match list is paired with the n-th matching token.
    mention_idx = [i for i, tok in enumerate(doc) if tok.text == identity]
    nth_mention = identity_ctr[identity]
    identity_ctr[identity] += 1
    if nth_mention >= len(mention_idx):
        # No (further) single token equals this term, so there is nothing to
        # inspect; keep the (possibly empty) entry instead of raising IndexError.
        continue
    tok_idx = mention_idx[nth_mention]
    identity_tok = doc[tok_idx]

    # Head word when the identity term is the subject/agent
    verbs_subj = [identity_tok.head.text] if identity_tok.dep_ in SUBJ_DEPS else []

    # Head word when the identity term is the object
    # NOTE(review): for a `pobj` mention the head is whatever governs it in the
    # parse, so `head.text` may not be a verb — confirm this is intended.
    verbs_obj = [identity_tok.head.text] if identity_tok.dep_ in OBJ_DEPS else []

    # Words that describe the identity term: selected direct children ...
    adjs = [
        child.text.lower()
        for child in identity_tok.children
        if child.dep_ in MODIFIER_DEPS
    ]
    # ... plus `attr` tokens hanging off an "is"/"was" that also governs the identity token
    adjs += [
        tok.text.lower()
        for tok in doc
        if tok.dep_ == 'attr'
        and tok.head.text in COPULAS
        and any(c.i == tok_idx for c in tok.head.children)
    ]

    entry['actions'].extend(verbs_subj + verbs_obj)
    entry['attributes'].extend(adjs)

actions_attributes

{'fag': {'actions': [], 'attributes': []}}

In [3]:
# Goal: create a list of identity term unique indexes for each identity term list.
# This cell only displays the per-comment lists of matched identity terms.
data['netmapper_identity_matches']

0                                 []
1                                 []
2                                 []
3                                 []
4                                 []
                     ...            
6248225    [incels, teenager, teens]
6248226                           []
6248227                           []
6248228            [parenting, kids]
6248229                     [incels]
Name: netmapper_identity_matches, Length: 6248230, dtype: object

In [12]:
from collections import defaultdict, Counter

def unique_term_index(l):
    """Number each term by how many times it has already appeared in ``l``.

    Args:
        l: iterable of hashable terms.

    Returns:
        A list with one int per term, in input order: 0 for the first
        occurrence of a term, 1 for its second occurrence, and so on.
    """
    times_seen = {}
    occurrence_numbers = []
    for term in l:
        nth = times_seen.get(term, 0)
        occurrence_numbers.append(nth)
        times_seen[term] = nth + 1
    return occurrence_numbers

In [13]:
# Sanity check: the repeated term is numbered 0 then 1, and a new term starts again at 0
unique_term_index(['incels', 'incels', 'teen'])

[0, 1, 0]

# Aggregate extracted actions and attributes

In [14]:
# Load data with extracted actions and attributes
# NOTE(review): this is the same pickle path as the first load cell, and the frame
# printed below has a `netmapper_identity_matches_spans` column but no
# action/attribute columns — confirm this is the intended input file.
import pandas as pd

path = '../../data/incels/processed_comments.pkl'
data = pd.read_pickle(path)
# Print the column names, dtypes and memory footprint of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6248230 entries, 0 to 6248229
Data columns (total 10 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   type                              object        
 1   forum                             object        
 2   thread                            object        
 3   username                          object        
 4   date                              object        
 5   content                           object        
 6   parsed_date                       datetime64[ns]
 7   content_orig                      object        
 8   netmapper_identity_matches        object        
 9   netmapper_identity_matches_spans  object        
dtypes: datetime64[ns](1), object(9)
memory usage: 476.7+ MB


In [16]:
# First five comments that carry at least one identity-match span
has_spans = data['netmapper_identity_matches_spans'].map(len) > 0
data.loc[has_spans].head(5)

Unnamed: 0,type,forum,thread,username,date,content,parsed_date,content_orig,netmapper_identity_matches,netmapper_identity_matches_spans
12,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,THE TRUE HONKLER,"Nov 20, 2020",frothysolutions said : do we ? in order to hav...,2020-11-20,FrothySolutions said: Do we? In order to have ...,"[fag, foids]","[(117, 120), (144, 149)]"
19,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,ChronicPaincel,"Nov 20, 2020",we had to . mogged loner said : the older i ge...,2020-11-20,We had to. Mogged Loner said: the older I get ...,[loner],"[(19, 24)]"
20,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,TigerFestival,"Nov 20, 2020",mogged loner said : i have nt left my house in...,2020-11-20,Mogged Loner said: I havent left my house in a...,"[loner, mother]","[(7, 12), (204, 210)]"
21,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,Ika-Sama,"Nov 20, 2020",tigerfestival said : a week ? i have n't left ...,2020-11-20,TigerFestival said: A Week? I haven't left my ...,[mother],"[(114, 120)]"
23,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,FrothySolutions,"Nov 20, 2020",the true honkler said : fag we were forced her...,2020-11-20,THE TRUE HONKLER said: fag we were forced here...,"[fag, foids]","[(24, 27), (51, 56)]"


In [None]:
exp = data.explode(['netmapper_identity_matches', 'netmapper_identity'