# Dependency parsing

In [None]:
# Load data with identity term matches extracted, tokenized
import pandas as pd

# Pickled comments table; later cells read its `content` and
# `netmapper_identity_matches` columns.
path = '../../data/incels/processed_comments.pkl'
# path = '../data/white_supremacist_identities.pkl'
data = pd.read_pickle(path)
data.info()  # quick sanity check of the loaded columns

In [None]:
import spacy

# Small English pipeline with the NER component disabled; the cells below
# only read dependency labels (`dep_`) and heads from the parse.
nlp = spacy.load('en_core_web_sm', disable=['ner'])

In [None]:
# Keep the first row that has at least one extracted identity match
has_match = data['netmapper_identity_matches'].map(lambda matches: len(matches) > 0)
samp = data.loc[has_match].head(1)
samp

In [None]:
# Show full cell text instead of truncating long comments
pd.set_option('display.max_colwidth', None)
# Read the comment text from the one-row sample `samp`; no `test` frame is
# defined before this cell, so indexing `test` itself raised a NameError.
# Positional access avoids depending on the row's index label.
test = samp['content'].iloc[0]

In [None]:
# Run the spaCy pipeline over the sample comment text
doc = nlp(test)
doc

In [None]:
# Dependency label of every token, in document order
parse = [tok.dep_ for tok in doc]
parse

In [None]:
# Dependency label of the identity mention. `tok_idx` is assigned inside the
# identity-matching loop in a later cell, so that cell has to run first.
parse[tok_idx]

In [None]:
# Syntactic head of the identity mention (`tok_idx` comes from the
# identity-matching loop). Index the Doc directly rather than building a
# list of every token's head just to pick one element.
doc[tok_idx].head

In [None]:
# Match extracted identities to tokens
from collections import defaultdict

# Dependency labels under which the mention's head is recorded as an action.
# NOTE(review): for 'pobj' the head is presumably a preposition rather than a
# verb -- confirm that is the intended "action".
SUBJ_DEPS = {'nsubj', 'agent'}
OBJ_DEPS = {'dobj', 'nsubjpass', 'dative', 'pobj'}
# Labels of direct dependents of the mention that count as attributes
ATTR_DEPS = {'amod', 'appos', 'nsubj', 'nmod'}
COPULAS = {'is', 'was'}

# identity: {'actions': [actions], 'attributes': [attributes]}
# TODO: replace with separate columns of attributes and actions in exploded df
actions_attributes = {}
identity_ctr = defaultdict(int)  # mentions of each identity handled so far

# `samp` holds a single row, so read it by position instead of by a
# hard-coded index label.
for identity in samp['netmapper_identity_matches'].iloc[0]:
    # Accumulate per identity so a repeated term does not overwrite the
    # results collected for its earlier mentions.
    entry = actions_attributes.setdefault(identity, {'actions': [], 'attributes': []})

    # Token positions whose text is exactly this identity term
    mention_idx = [i for i, tok in enumerate(doc) if tok.text == identity]
    occurrence = identity_ctr[identity]
    identity_ctr[identity] += 1
    if occurrence >= len(mention_idx):
        # No single token matches this occurrence of the term, so there is
        # no parse position to inspect; leave the entry as it is.
        continue
    tok_idx = mention_idx[occurrence]
    mention = doc[tok_idx]

    # Head of the mention when the mention is its subject / agent
    verbs_subj = [mention.head.text] if mention.dep_ in SUBJ_DEPS else []

    # Head of the mention when the mention is its object
    verbs_obj = [mention.head.text] if mention.dep_ in OBJ_DEPS else []

    # Direct dependents of the mention that describe it ...
    adjs = [tok.text.lower() for tok in doc
            if tok.head.i == tok_idx and tok.dep_ in ATTR_DEPS]
    # ... plus 'attr' tokens under an "is"/"was" that also governs the mention
    adjs += [tok.text.lower() for tok in doc
             if tok.dep_ == 'attr' and tok.head.text in COPULAS
             and any(child.i == tok_idx for child in tok.head.children)]

    entry['actions'].extend(verbs_subj + verbs_obj)
    entry['attributes'].extend(adjs)

actions_attributes

In [None]:
# Create a list of identity term unique indexes for each identity term list
# (this cell only displays the source column of identity term lists)
data['netmapper_identity_matches']

In [None]:
from collections import defaultdict, Counter

def unique_term_index(l):
    """Return, for each term in `l`, how many times it occurred earlier in `l`.

    The first occurrence of a term gets 0, the second gets 1, and so on, so
    pairing each term with its result distinguishes repeated terms.

    Args:
        l: Iterable of hashable terms.

    Returns:
        List of occurrence indexes, one per term, in input order.
    """
    seen = {}
    occurrence_idx = []
    for term in l:
        prior = seen.get(term, 0)
        occurrence_idx.append(prior)
        seen[term] = prior + 1
    return occurrence_idx

In [None]:
# Quick check: the repeated term gets occurrence indexes 0 and 1 -> [0, 1, 0]
unique_term_index(['incels', 'incels', 'teen'])

# Aggregate extracted actions and attributes

In [None]:
# Load data with extracted actions and attributes
import pandas as pd

path = '../../data/incels/processed_comments.pkl'
data = pd.read_pickle(path)
data.info()

# One row per list element: both list columns are exploded together, so they
# need the same number of elements in every row.
exp = data.explode(['netmapper_identity_matches', 'actions_attributes'])
# exp.info()

# Group identities
import json

# Lookup from identity term to its group(s)
# NOTE(review): values are presumably lists of group names, since the column
# is exploded below -- confirm against the JSON file.
identity_groups_fpath = '../resources/identity_groups.json'
# Be explicit about the encoding so non-ASCII terms do not depend on the
# platform's default text encoding.
with open(identity_groups_fpath, 'r', encoding='utf-8') as f:
    identity_groups = json.load(f)
print(len(identity_groups))

# Terms without an entry in the lookup keep their own text as the group label
exp['identity_group'] = exp['netmapper_identity_matches'].map(
    lambda term: identity_groups.get(term, term))
# Count intersectional mentions as a mention in each of their categories
exploded = exp.explode('identity_group')
exploded.info()

# %%timeit -n 1 -r 1 # make Counter right away
from collections import Counter


def _count_terms(cells, field):
    """Tally the terms listed under `field` across one group's cells.

    Args:
        cells: Iterable of `actions_attributes` values for a group. Values
            that are not dicts (e.g. missing values) are skipped.
        field: Key to read from each dict, e.g. 'verbs_subj'.

    Returns:
        List of (term, count) pairs, most frequent first.
    """
    return Counter(
        term
        for cell in cells
        if isinstance(cell, dict)
        for term in cell[field]
    ).most_common()


# To iterate faster, group a sample instead, e.g. exploded.sample(int(1e6))
gped = exploded.groupby('identity_group')

# Aggregate actions and attributes for the different identity groups.
# Lambdas are kept as the aggfuncs so pandas can tell apart several
# aggregations applied to the same column.
agg = gped.agg(
    verbs_subj=pd.NamedAgg(column='actions_attributes',
                           aggfunc=lambda x: _count_terms(x, 'verbs_subj')),
    verbs_obj=pd.NamedAgg(column='actions_attributes',
                          aggfunc=lambda x: _count_terms(x, 'verbs_obj')),
    adjs=pd.NamedAgg(column='actions_attributes',
                     aggfunc=lambda x: _count_terms(x, 'adjs')),
    count=pd.NamedAgg(column='content', aggfunc='count'),
)
# Most-mentioned groups first
agg = agg.sort_values('count', ascending=False)
agg.info()
# agg[['verbs_subj', 'verbs_obj', 'adjs']].head()

In [None]:
import plotly.express as px

N_GROUPS = 12  # number of leading rows of `agg` (sorted by count) to chart
N_TERMS = 50   # top terms shown per chart

# One bar chart per (identity group, term type)
for gp in agg.index[:N_GROUPS]:
    for col in ['verbs_subj', 'verbs_obj', 'adjs']:
        top_terms = agg.loc[gp, col][:N_TERMS]
        if not top_terms:
            # Unpacking zip(*[]) into two names raises ValueError; a group
            # with no terms of this type simply gets no chart.
            continue
        terms, counts = zip(*top_terms)
        fig = px.bar(x=terms, y=counts, title=f'{gp} {col}', labels=dict(x="term", y="count"))
        fig.update_xaxes(tickangle=45)
        # fig.show(config = {'staticPlot': True})
        fig.show()

## Look into example uses of actions and attributes

In [None]:
# Show a few example comments where `term` appears under `field` for group `gp`
gp = 'incels'
term = 'transcended'
field = 'adjs'  # one of 'verbs_subj', 'verbs_obj', 'adjs'

pd.set_option('display.max_colwidth', None)
# Cells that are not dicts (e.g. missing values) never match
crit = (exploded.identity_group == gp) & (exploded.actions_attributes.map(lambda x: term in x[field] if isinstance(x, dict) else False))
matches = exploded.loc[crit, ['identity_group', 'content', 'actions_attributes']]
# sample(5) raises when fewer than five rows match, so cap the sample size
matches.sample(min(5, len(matches)))