In [11]:
from collections import Counter

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import seaborn as sns
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from torch.utils.tensorboard import SummaryWriter

from src.evaluation.hatespeech.evaluation_rulesbased_hatespeech import find_most_common_nouns

# Load the tweet dataset from a path relative to the notebook's working directory.
# Later cells read its "annotation", "text" and "normalize" columns.
data = pd.read_csv("../../data/external/hatespeech/hs_data.csv")

# spaCy pipeline used by the cells below to turn tweet strings into parsed docs.
nlp = spacy.load("en_core_web_md")


In [None]:
def create_spacy_docs(data, label, misogynistic = True):
    """Run the spaCy pipeline over one column of one annotation class.

    Args:
        data: DataFrame with an 'annotation' column and the column named by
            `label`.
        label: Name of the column whose values are passed to `nlp`.
        misogynistic: If truthy, keep the rows annotated "misogynistic";
            otherwise keep the rows annotated "not_misogynistic".

    Returns:
        Series holding `nlp(value)` for every selected row, indexed like the
        selected rows of `data`.
    """
    wanted_annotation = "misogynistic" if misogynistic else "not_misogynistic"
    row_mask = data.loc[:, 'annotation'] == wanted_annotation
    selected_texts = data.loc[row_mask, label]
    return selected_texts.apply(lambda text: nlp(text))


# Parsed docs of the normalized tweets, one Series per annotation class.
# (The spelling `misogyistic_docs` is kept because the noun-count cell reads it.)
misogyistic_docs = create_spacy_docs(data, "normalize", True)
not_misogynistic_docs = create_spacy_docs(data, "normalize", False)

# The POS, dependency, noun-chunk and graph cells further down read the four
# names below, but no visible cell bound them, so those cells only worked with
# leftover kernel state and fail after Restart & Run All.
# NOTE(review): assumes the un-suffixed names mean the raw "text" column and the
# `_norm` names mean the "normalize" column — confirm against the original run.
misogyny_docs = create_spacy_docs(data, "text", True)
not_misogyny_docs = create_spacy_docs(data, "text", False)
misogyny_docs_norm = misogyistic_docs
not_misogyny_docs_norm = not_misogynistic_docs



In [None]:
def count_most_common_nouns(docs):
    """Tabulate the output of `find_most_common_nouns` for `docs`.

    Args:
        docs: Parsed documents, forwarded unchanged to
            `find_most_common_nouns`.

    Returns:
        DataFrame with the columns 'noun' and 'count'; presumably one row per
        (noun, count) pair produced by the helper — verify against its
        definition.
    """
    noun_counts = find_most_common_nouns(docs)
    column_names = ['noun', 'count']
    return pd.DataFrame(noun_counts, columns=column_names)

# Noun frequency tables, one per annotation class.
misogynistic_most_common_nouns = count_most_common_nouns(misogyistic_docs)
# The docs of the negative class are bound to `not_misogynistic_docs`; the
# previous spelling `not_misogyistic_docs` was never assigned and raised a
# NameError on a fresh kernel.
not_misogyny_most_common_nouns = count_most_common_nouns(not_misogynistic_docs)


In [None]:
# `.loc[0:10]` is label-based and end-inclusive, so it selected 11 rows;
# `head(10)` keeps exactly the first ten rows of each table.
top_10_misogynistic = misogynistic_most_common_nouns.head(10)
top_10_not_misogynistic = not_misogyny_most_common_nouns.head(10)

fig, ax = plt.subplots(ncols=2, sharey=True, figsize=(15,15))
# Plot the frames defined above (the bars used to read `top_10_m` and
# `top_10_nonm`, which are never assigned in this notebook).
sns.barplot(x=top_10_misogynistic['noun'], y=top_10_misogynistic['count'], ax=ax[0])
sns.barplot(x=top_10_not_misogynistic['noun'], y=top_10_not_misogynistic['count'], ax=ax[1])

ax[0].set_title("Misogyny")
ax[1].set_title("Non Misogyny")

# Rotate the labels per axis; zipping both label lists stops at the shorter
# one and would leave the remaining labels of the other panel unrotated.
for axis in ax:
    axis.tick_params(axis='x', labelrotation=90)

plt.show()


In [None]:
# Character length of every raw tweet. `.str.len()` yields NaN for a missing
# text instead of raising the way `len` does.
data['tweet_length'] = data["text"].str.len()
# Select the column before aggregating: a frame-wide `.mean()` raises on the
# text columns in pandas >= 2.0. The value is printed because only the last
# expression of a cell is displayed.
print(data.groupby('annotation')['tweet_length'].mean())

fig, ax = plt.subplots()
# NOTE(review): `sns.distplot` is deprecated since seaborn 0.11; switch to
# `sns.histplot` once the installed seaborn version is confirmed to provide it.
for annotation, legend_label in (('misogynistic', "Misogynistic"),
                                 ('not_misogynistic', "Not misogynistic")):
    lengths = data.loc[data['annotation'] == annotation, "tweet_length"]
    sns.distplot(lengths, kde=False, label=legend_label, ax=ax)

ax.set(title="Tweet length (raw text)", xlabel="characters", ylabel="tweets")
ax.legend();

In [None]:
# Character length of every normalized tweet. NOTE: this overwrites the
# `tweet_length` column written by the raw-text cell, so the column's meaning
# depends on which of the two cells ran last.
# `.str.len()` yields NaN for a missing text instead of raising like `len`.
data['tweet_length'] = data["normalize"].str.len()
# Select the column before aggregating: a frame-wide `.mean()` raises on the
# text columns in pandas >= 2.0. The value is printed because only the last
# expression of a cell is displayed.
print(data.groupby('annotation')['tweet_length'].mean())

fig, ax = plt.subplots()
# NOTE(review): `sns.distplot` is deprecated since seaborn 0.11; switch to
# `sns.histplot` once the installed seaborn version is confirmed to provide it.
for annotation, legend_label in (('misogynistic', "Misogynistic"),
                                 ('not_misogynistic', "Not misogynistic")):
    lengths = data.loc[data['annotation'] == annotation, "tweet_length"]
    sns.distplot(lengths, kde=False, label=legend_label, ax=ax)

ax.set(title="Tweet length (normalized text)", xlabel="characters", ylabel="tweets")
ax.legend();


In [None]:
def pos(docs):
    """Count how often each `tag_` value occurs across all tokens of `docs`.

    Args:
        docs: Iterable of token sequences; every token must expose `tag_`.

    Returns:
        List of (tag, count) tuples sorted by descending count. Tags with
        equal counts keep the order in which they were first seen, so the
        result is deterministic. Empty list when there are no tokens.
    """
    # A single counting pass replaces one `list.count` scan per distinct tag.
    tag_counts = Counter(token.tag_ for doc in docs for token in doc)
    return tag_counts.most_common()

# NOTE: expects `misogyny_docs`, `not_misogyny_docs`, `misogyny_docs_norm` and
# `not_misogyny_docs_norm` to be bound by an earlier cell.
pos_misogyny = pd.DataFrame(pos(misogyny_docs), columns=['POS', 'count'])
pos_nmisogyny = pd.DataFrame(pos(not_misogyny_docs), columns=['POS', 'count'])
pos_misogyny_norm = pd.DataFrame(pos(misogyny_docs_norm), columns=['POS', 'count'])
pos_nmisogyny_norm = pd.DataFrame(pos(not_misogyny_docs_norm), columns=['POS', 'count'])

# `.loc[0:10]` is label-based and end-inclusive (11 rows); `head(10)` keeps
# exactly the ten most frequent tags of each table.
top_10_pos_m = pos_misogyny.head(10)
top_10_pos_nonm = pos_nmisogyny.head(10)
top_10_pos_m_norm = pos_misogyny_norm.head(10)
top_10_pos_nonm_norm = pos_nmisogyny_norm.head(10)

fig, ax = plt.subplots(ncols=4, sharey=True, figsize=(15,15))

# One (table, title) pair per panel. The `_norm` panels get their own titles so
# the four plots can be told apart.
# NOTE(review): "normalized" is inferred from the `_norm` suffix — confirm.
pos_panels = [
    (top_10_pos_m, "Misogyny"),
    (top_10_pos_nonm, "Non Misogyny"),
    (top_10_pos_m_norm, "Misogyny (normalized)"),
    (top_10_pos_nonm_norm, "Non Misogyny (normalized)"),
]
for axis, (top_tags, title) in zip(ax, pos_panels):
    sns.barplot(x=top_tags['POS'], y=top_tags['count'], ax=axis)
    axis.set_title(title)
    # Rotating per axis avoids the zipped label loops, which stop at the
    # shorter label list.
    axis.tick_params(axis='x', labelrotation=90)

plt.show()

In [None]:
def dep(docs):
    """Count how often each `dep_` value occurs across all tokens of `docs`.

    Args:
        docs: Iterable of token sequences; every token must expose `dep_`.

    Returns:
        List of (dependency label, count) tuples sorted by descending count.
        Labels with equal counts keep the order in which they were first
        seen, so the result is deterministic. Empty list when there are no
        tokens.
    """
    # A single counting pass replaces one `list.count` scan per distinct label.
    label_counts = Counter(token.dep_ for doc in docs for token in doc)
    return label_counts.most_common()

# NOTE: expects `misogyny_docs`, `not_misogyny_docs`, `misogyny_docs_norm` and
# `not_misogyny_docs_norm` to be bound by an earlier cell.
# The frames are named `dep_*` so this cell no longer overwrites the `pos_*`
# POS-tag frames of the previous cell with dependency counts.
dep_misogyny = pd.DataFrame(dep(misogyny_docs), columns=['dependency', 'count'])
dep_nmisogyny = pd.DataFrame(dep(not_misogyny_docs), columns=['dependency', 'count'])
dep_misogyny_norm = pd.DataFrame(dep(misogyny_docs_norm), columns=['dependency', 'count'])
dep_nmisogyny_norm = pd.DataFrame(dep(not_misogyny_docs_norm), columns=['dependency', 'count'])

# `.loc[0:10]` is label-based and end-inclusive (11 rows); `head(10)` keeps
# exactly the ten most frequent labels of each table.
top_10_dep_m = dep_misogyny.head(10)
top_10_dep_nonm = dep_nmisogyny.head(10)
top_10_dep_m_norm = dep_misogyny_norm.head(10)
top_10_dep_nonm_norm = dep_nmisogyny_norm.head(10)

fig, ax = plt.subplots(ncols=4, sharey=True, figsize=(15,15))

# One (table, title) pair per panel. The `_norm` panels get their own titles so
# the four plots can be told apart.
# NOTE(review): "normalized" is inferred from the `_norm` suffix — confirm.
dep_panels = [
    (top_10_dep_m, "Misogyny"),
    (top_10_dep_nonm, "Non Misogyny"),
    (top_10_dep_m_norm, "Misogyny (normalized)"),
    (top_10_dep_nonm_norm, "Non Misogyny (normalized)"),
]
for axis, (top_labels, title) in zip(ax, dep_panels):
    sns.barplot(x=top_labels['dependency'], y=top_labels['count'], ax=axis)
    axis.set_title(title)
    # Rotating per axis avoids the zipped label loops, which stop at the
    # shorter label list.
    axis.tick_params(axis='x', labelrotation=90)

plt.show()

In [None]:
# Merge every noun chunk of each doc into a single token, giving the merged
# token the tag, lemma and entity type of the chunk's root. This mutates the
# docs in `misogyny_docs` in place, so cells that count tokens see different
# results depending on whether they run before or after this one.
# NOTE: expects `misogyny_docs` to be bound by an earlier cell.
# `Span.merge` is deprecated since spaCy 2.1 and removed in 3.0. The
# retokenizer collects the merges and applies them together when the `with`
# block exits, so the chunks can be iterated without taking a snapshot first.
for doc in misogyny_docs:
    with doc.retokenize() as retokenizer:
        for noun_phrase in doc.noun_chunks:
            chunk_root = noun_phrase.root
            retokenizer.merge(
                noun_phrase,
                attrs={
                    "tag": chunk_root.tag,
                    "lemma": chunk_root.lemma,
                    "ent_type": chunk_root.ent_type,
                },
            )


In [None]:
# Show the (text, pos_) pairs of one document. This cell used to read `doc`,
# which only existed because it leaked out of a `for doc in misogyny_docs` loop
# in another cell. Bind the document that loop ends on — the last one —
# explicitly, so the cell does not depend on which loop ran last.
# NOTE: expects `misogyny_docs` to be bound by an earlier cell.
example_doc = list(misogyny_docs)[-1]
[(token.text, token.pos_) for token in example_doc]

In [None]:
def load_deptree_into_graph(tweet):
    """Build an undirected graph from the head/child links of `tweet`.

    Args:
        tweet: Iterable of tokens; every token must expose `lower_` and
            `children`.

    Returns:
        `nx.Graph` with one edge per (token, child) pair. Nodes are the
        `lower_` strings, so tokens sharing the same lowercase form collapse
        into one node and repeated pairs into one edge.
    """
    head_child_pairs = [
        (f'{head.lower_}', f'{child.lower_}')
        for head in tweet
        for child in head.children
    ]
    return nx.Graph(head_child_pairs)
    
# Collect the dependency edges of every doc into one flat list.
# NOTE: expects `misogyny_docs` to be bound by an earlier cell.
edges = []
for doc in misogyny_docs:
    graph = load_deptree_into_graph(doc)
    edges.extend(graph.edges)

df = pd.DataFrame(edges, columns = ['e1', 'e2'])
# Despite the name, `nouns` holds the first endpoint of every edge, whatever
# its part of speech.
nouns = df['e1'].to_list()
# Counter tallies in one pass; the previous `nouns.count(word)` per distinct
# word was quadratic, and its sorted result was computed but thrown away.
frequencies = Counter(nouns).most_common()
# Printed because only the last expression of a cell is displayed.
print(frequencies[:10])

# Second endpoints of all edges whose first endpoint is "feminazi"; a single
# `.loc` with row mask and column replaces the chained indexing.
df.loc[df["e1"] == "feminazi", "e2"]