# Dependency parsing

In [1]:
# Load the pickled comments DataFrame (identity term matches already extracted, content tokenized)
import pandas as pd

# Dataset to analyse; the commented-out line below is an alternative dataset path.
path = '../../data/incels/processed_comments.pkl'
# path = '../data/white_supremacist_identities.pkl'
# NOTE: unpickling can execute arbitrary code, so only read pickle files from a trusted source.
data = pd.read_pickle(path)
# Print column names, dtypes and memory usage as a quick check of what was loaded.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6248230 entries, 0 to 6248229
Data columns (total 9 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   type                        object        
 1   forum                       object        
 2   thread                      object        
 3   username                    object        
 4   date                        object        
 5   content                     object        
 6   parsed_date                 datetime64[ns]
 7   content_orig                object        
 8   netmapper_identity_matches  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 429.0+ MB


In [7]:
import spacy

# Load the small English spaCy pipeline with the named-entity recognizer disabled;
# the cells below only read dependency labels and heads from the parse.
nlp = spacy.load('en_core_web_sm', disable=['ner'])

In [18]:
# Take the first comment that has at least one extracted identity match as a worked example
has_identity_match = data['netmapper_identity_matches'].map(lambda matches: len(matches) > 0)
samp = data.loc[has_identity_match].head(1)
samp

Unnamed: 0,type,forum,thread,username,date,content,parsed_date,content_orig,netmapper_identity_matches
12,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,THE TRUE HONKLER,"Nov 20, 2020","frothysolutions said : do we ? in order to have this as a replacement for socializing , we have to want to be here . fag we were forced here by foids and lack of any other choice",2020-11-20,"FrothySolutions said: Do we? In order to have this as a replacement for socializing, we have to want to be here. fag we were forced here by foids and lack of any other choice",[fag]


In [11]:
# Show full cell contents instead of truncating long comment text in displayed frames
pd.set_option('display.max_colwidth', None)
# Pull the tokenized comment text out of the one-row sample frame (row label 12).
# The source frame is `samp`; reading from `test` here raised NameError on a fresh
# kernel because `test` is not defined before this cell.
test = samp.loc[12, 'content']

In [12]:
# Run the spaCy pipeline on the sample comment text and display the parsed result
doc = nlp(test)
doc

frothysolutions said : do we ? in order to have this as a replacement for socializing , we have to want to be here . fag we were forced here by foids and lack of any other choice

In [14]:
# Dependency relation label of every token, in token order
parse = [token.dep_ for token in doc]
parse

['nsubj',
 'ROOT',
 'punct',
 'xcomp',
 'nsubj',
 'punct',
 'prep',
 'pobj',
 'aux',
 'acl',
 'dobj',
 'prep',
 'det',
 'pobj',
 'prep',
 'pobj',
 'punct',
 'nsubj',
 'ROOT',
 'aux',
 'xcomp',
 'aux',
 'xcomp',
 'advmod',
 'punct',
 'intj',
 'nsubjpass',
 'auxpass',
 'ROOT',
 'advmod',
 'agent',
 'pobj',
 'cc',
 'conj',
 'prep',
 'det',
 'amod',
 'pobj']

In [26]:
# Dependency label of the identity-term token.
# NOTE(review): `tok_idx` is assigned in the cell further down that loops over
# `netmapper_identity_matches`, so that cell must be run before this one.
parse[tok_idx]

'intj'

In [25]:
# Syntactic head of the identity-term token.
# NOTE(review): `tok_idx` is assigned in the cell further down that loops over
# `netmapper_identity_matches`, so that cell must be run before this one.
[tok.head for tok in doc][tok_idx]

forced

In [22]:
# Match extracted identities to tokens
from collections import defaultdict

# Dependency labels (on the identity token) that make its head word count as an action
subj_deps = {'nsubj', 'agent'}
obj_deps = {'dobj', 'nsubjpass', 'dative', 'pobj'}
# Dependency labels (on a dependent of the identity token) that make the dependent count as an attribute
attr_deps = {'amod', 'appos', 'nsubj', 'nmod'}

# identity -> {'actions': [...], 'attributes': [...]}, accumulated over every mention of that identity
actions_attributes = {} # replace with separate columns of attributes and actions in exploded df
identity_ctr = defaultdict(int) # keep track of how many of this identity I've seen

for identity in samp.loc[12, 'netmapper_identity_matches']:

    # A repeated identity shares one entry; extend it instead of overwriting earlier mentions
    entry = actions_attributes.setdefault(identity, {'actions': [], 'attributes': []})

    # Get identity mention locations (positions of tokens whose text equals the identity term)
    mention_idx = [i for i, tok in enumerate(doc) if tok.text==identity]
    nth_mention = identity_ctr[identity]
    identity_ctr[identity] += 1
    if nth_mention >= len(mention_idx):
        # No token matches this mention (e.g. the term is not a single token), so there is
        # nothing to extract; previously this raised IndexError.
        continue
    tok_idx = mention_idx[nth_mention]
    mention = doc[tok_idx]

    # Verbs where identity term was the subject
    verbs_subj = [mention.head.text] if mention.dep_ in subj_deps else []

    # Verbs where identity term was the object
    verbs_obj = [mention.head.text] if mention.dep_ in obj_deps else []

    # Adjectives that describe the identity term: direct dependents of the mention ...
    adjs = [tok.text.lower() for tok in doc
            if tok.head.i == tok_idx and tok.dep_ in attr_deps]
    # ... plus 'attr' dependents of an "is"/"was" that also has the mention as a child
    adjs += [tok.text.lower() for tok in doc
             if tok.dep_ == 'attr' and tok.head.text in ('is', 'was')
             and any(c.i == tok_idx for c in tok.head.children)]

    entry['actions'].extend(verbs_subj + verbs_obj)
    entry['attributes'].extend(adjs)

actions_attributes

{'fag': {'actions': [], 'attributes': []}}

In [3]:
# Create a list of identity term unique indexes for each identity term list
# Inspect the column first: each row holds the list of identity terms matched in that comment (possibly empty).
data['netmapper_identity_matches']

0                                 []
1                                 []
2                                 []
3                                 []
4                                 []
                     ...            
6248225    [incels, teenager, teens]
6248226                           []
6248227                           []
6248228            [parenting, kids]
6248229                     [incels]
Name: netmapper_identity_matches, Length: 6248230, dtype: object

In [12]:
from collections import defaultdict, Counter

def unique_term_index(l):
    """Number each term in `l` by how many times it has already occurred.

    Returns a list the same length as `l` whose i-th entry is the count of
    earlier positions holding the same term (0 for a first occurrence).
    """
    occurrences = {}
    ordinals = []
    for item in l:
        nth = occurrences.get(item, 0)
        ordinals.append(nth)
        occurrences[item] = nth + 1
    return ordinals

In [13]:
# Quick check: a repeated term gets increasing indexes, a new term starts again at 0
unique_term_index(['incels', 'incels', 'teen'])

[0, 1, 0]

# Aggregate extracted actions and attributes

In [1]:
# Load data with extracted actions and attributes
import pandas as pd

# NOTE(review): this is the same path as the first load cell of this notebook, but the info()
# output here lists extra columns (netmapper_identity_matches_spans, actions_attributes,
# dep_parse, dep_head) — presumably the pickle was rewritten by a processing step that is not
# in this notebook; confirm.
path = '../../data/incels/processed_comments.pkl'
data = pd.read_pickle(path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6248230 entries, 0 to 6248229
Data columns (total 13 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   type                              object        
 1   forum                             object        
 2   thread                            object        
 3   username                          object        
 4   date                              object        
 5   content                           object        
 6   parsed_date                       datetime64[ns]
 7   content_orig                      object        
 8   netmapper_identity_matches        object        
 9   netmapper_identity_matches_spans  object        
 10  actions_attributes                object        
 11  dep_parse                         object        
 12  dep_head                          object        
dtypes: datetime64[ns](1), object(12)
memory usage: 619.7+ MB


In [2]:
# One row per identity match: the two list columns are exploded together
exp = data.explode(['netmapper_identity_matches', 'actions_attributes'])

# Group identities
import json

identity_groups_fpath = '../resources/identity_groups.json'
with open(identity_groups_fpath, 'r') as groups_file:
    identity_groups = json.load(groups_file)
print(len(identity_groups))

# Terms without an entry in the mapping keep the term itself as their group
exp['identity_group'] = exp['netmapper_identity_matches'].map(
    lambda term: identity_groups.get(term, term)
)
# Count intersectional mentions as a mention in each of their categories
exploded = exp.explode('identity_group')
exploded.info()

423
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13416057 entries, 0 to 6248229
Data columns (total 14 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   type                              object        
 1   forum                             object        
 2   thread                            object        
 3   username                          object        
 4   date                              object        
 5   content                           object        
 6   parsed_date                       datetime64[ns]
 7   content_orig                      object        
 8   netmapper_identity_matches        object        
 9   netmapper_identity_matches_spans  object        
 10  actions_attributes                object        
 11  dep_parse                         object        
 12  dep_head                          object        
 13  identity_group                    object        
dtypes: datetime64

In [None]:
%%timeit -n 1 -r 1
samp = exploded.sample(int(1e6))
gped = samp.groupby('identity_group')
# gped = exploded.groupby('identity_group')

# Aggregate actions and attribute for different identities
# agg = gped.agg({'content': 'count'})
# agg = gped.agg({'actions_attributes': lambda x: {'actions': sum([el['actions'] for el in x], [])}})
agg = gped.agg({'actions_attributes': lambda x: {'actions': sum([el['actions'] for el in x], []),
                                                'attributes': sum([el['attributes'] for el in x], [])},
                                               'content': 'count'
                                              })
agg.rename(columns = {'content': 'count'}, inplace=True)
agg.info()

In [22]:
# Order identity groups from most to least mentioned and preview the top five
agg = agg.sort_values(by='count', ascending=False)
agg.head(5)

Unnamed: 0_level_0,actions_attributes,count
identity_group,Unnamed: 1_level_1,Unnamed: 2_level_1
women_girls,"{'actions': ['count', 'want', 'are', 'finding'...",1938
men_boys,"{'actions': ['over', 'from', 'do', 'off', 're'...",1528
women_girls_derogatory,"{'actions': ['with', 'lmfao', 'banned', 'norma...",648
incels,"{'actions': ['accepting', 'with', 'offtopic', ...",461
men_boys_address,"{'actions': ['lol', 'picking', 'see', 'fucking...",234


In [26]:
# Make counters for associated words with identity group terms
from collections import Counter

# For each identity group, rank the pooled words by frequency as (word, count) pairs
for field in ('actions', 'attributes'):
    agg[field] = agg['actions_attributes'].map(
        lambda pooled, key=field: Counter(pooled[key]).most_common()
    )
agg[['actions', 'attributes']].head(5)

Unnamed: 0_level_0,actions,attributes
identity_group,Unnamed: 1_level_1,Unnamed: 2_level_1
women_girls,"[(with, 85), (are, 75), (get, 45), (have, 38),...","[(white, 18), (fucking, 13), (black, 12), (sin..."
men_boys,"[(are, 47), (have, 38), (with, 35), (is, 32), ...","[(ugly, 30), (white, 27), (most, 19), (other, ..."
women_girls_derogatory,"[(are, 23), (with, 21), (get, 14), (like, 13),...","[(white, 8), (dumb, 7), (young, 7), (foid, 7),..."
incels,"[(are, 22), (as, 20), (said, 10), (about, 7), ...","[(incel, 12), (many, 4), (white, 3), (most, 3)..."
men_boys_address,"[(is, 4), (get, 3), (looks, 3), (fucking, 2), ...","[(good, 4), (other, 3), (random, 3), (looking,..."
