In [1]:
from nltk.corpus import stopwords
from datetime import datetime
from apyori import apriori
import lxml.etree as ET
import pandas as pd
import numpy as np
import unidecode
import html
import os
import re

# Import a dictionary with regular expressions
from utilities.regex_categories import * 

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Define paths 

In [2]:
# Root directory of the news corpus: one sub-directory per group of XML article files
NEWS_PATH = '../data/news/'
# CSV file that provides (at least) the `cluster_id` and `tweet_id` columns used below
INFO_FILE_PATH = '../data/cases_info.csv'

### Functions to read the dataset

In [3]:
def normalize_string(to_normalize, encoded = False, remove_stopwords = True):
    '''
    Normalize text given a string.

    Steps: lowercase, optionally unescape HTML entities, transliterate to
    ASCII (strips accents), remove punctuation, optionally remove Spanish
    stopwords.

    Parameters:
        to_normalize: value to normalize (converted with str()).
        encoded: when True, HTML entities are unescaped token by token.
        remove_stopwords: when True, Spanish stopwords are dropped.

    Returns:
        The remaining tokens joined by single spaces.
    '''
    text = str(to_normalize).lower()  # lowering text
    if encoded:
        text = ' '.join([html.unescape(term) for term in text.split()])
    text = unidecode.unidecode(text)

    text = re.sub(r'[^\w\s]', '', text)  # removing all the punctuations
    tokens = text.split()  # tokenize the text

    # remove stopwords
    stopwords_set = set()
    if remove_stopwords:
        # The text has already been accent-stripped, so the stopwords must be
        # stripped the same way; otherwise accented entries of the stopword
        # list could never match a token.
        stopwords_set = {unidecode.unidecode(word) for word in stopwords.words("spanish")}

    return ' '.join([token for token in tokens if token not in stopwords_set])

def listdir_checked(path, unwanted = ('.DS_Store',)):
    '''
    Discard unwanted files or directories when listing the elements in a given path.

    Parameters:
        path: directory to list.
        unwanted: collection of entry names to skip (immutable default, so it
            cannot be altered by accident between calls).

    Returns:
        A generator over the entry names of `path` not present in `unwanted`.
    '''
    entries = os.listdir(path)  # listed eagerly: a bad path fails at call time
    return (entry for entry in entries if entry not in unwanted)


def create_articles_dictionary(NEWS_PATH):
    '''
    Import articles information.
    Articles are stored in directories in the NEWS_PATH, one XML file per
    article (info stored following NewsML-G2 format).

    Parameters:
        NEWS_PATH: root directory holding the sub-directories of XML files.

    Returns:
        (data, repeated_data): two dictionaries with the same layout
        (keys: media, value: list of dictionaries with info about the news
        articles of the given media). `data` holds the first article seen for
        each url; `repeated_data` holds the articles whose url was already seen.
    '''
    data = {}
    repeated_data = {}
    seen_urls = set()       # set membership is O(1); the urls are only used to detect repetitions

    for directory in listdir_checked(NEWS_PATH):
        directory_path = os.path.join(NEWS_PATH, directory)
        for file in listdir_checked(directory_path):
            full_path = os.path.join(directory_path, file)
            # Read xml file - info stored following NewsML-G2 format
            root = ET.parse(full_path).getroot()
            # The media name is everything before the last underscore of the file name
            media = file.rsplit('_', 1)[0]
            url = root.findall(".//infoSource")[0].get("uri")
            # Keep only the leading YYYY-MM-DD part of the creation timestamp
            str_date = root.findall('.//contentMeta')[0].find('contentCreated').text[:10]
            item_refs = root.findall('.//itemRef')
            info = {
                'id': file.split(':')[-1].replace('.xml', ''),
                'media': media,
                'publication_date': datetime.strptime(str_date, '%Y-%m-%d'),
                'title': normalize_string(item_refs[0].find('title').text, encoded = True, remove_stopwords = False),
                'headline': normalize_string(item_refs[0].find('description').text.strip(), encoded = True, remove_stopwords = False),
                'article': normalize_string(item_refs[1].find('description').text.strip(), encoded = True, remove_stopwords = False),
                'url': url
            }

            # First occurrence of a url goes to `data`, later ones to `repeated_data`
            if url not in seen_urls:
                seen_urls.add(url)
                data.setdefault(media, []).append(info)
            else:
                repeated_data.setdefault(media, []).append(info)

    return data, repeated_data


def get_data_info(data, dict_key, value):
    '''
    Collect every article whose `dict_key` entry equals `value`.

    `data` maps each media to its list of article dictionaries; the matches
    are returned in iteration order, across all media.
    '''
    return [
        article
        for news in data.values()
        for article in news
        if article[dict_key] == value
    ]

# 1. REGULAR EXPRESSIONS 
Check presence of regular expressions in articles
### Import articles and update regular expressions

In [4]:
# IMPORT INFORMATION ABOUT ARTICLES
# `data` holds the first article found for each url, `repeated_data` the ones
# whose url had already been seen; both are keyed by media.
data, repeated_data = create_articles_dictionary(NEWS_PATH)

In [5]:
# UPDATE CATEGORIES - REGULAR EXPRESSIONS' DICT
# categories is a dictionary with regular expressions imported from `utilities.regex_categories`
nacionalidades = []
filename = 'utilities/nacionalidades.txt'

# Context manager so the file handle is always closed
with open(filename, 'r') as file:
    for line in file:
        string = line.split(' - ')[-1].split('\n')[0][:-1] #remove last char that may insert gender
        normalized = normalize_string(string)
        # Skip empty results (blank line, or a term made only of stopwords):
        # an empty pattern matches at every position and would inflate the counts.
        if normalized:
            nacionalidades.append(normalized)

categories['STIGMA_ORIGIN'] = categories['STIGMA_ORIGIN'] + nacionalidades

### Create a dataset with the presence frequency of each category 

In [6]:
# Frame that stores, per article, the occurrences of the regular expressions
dataset = pd.DataFrame()

# Identifier column: the id of every article, following the iteration order of `data`
tweet_ids = [element['id'] for articles in data.values() for element in articles]
dataset.insert(0, 'tweet_id', tweet_ids)

In [7]:
# For every category, count how many times its regular expressions match in
# the title, the headline (summary) and the body of each article
for key, categ_list in categories.items():
    general_results, title_results, summary_results, article_results = [], [], [], []

    pos = 0  # position in tweet_ids, to check rows stay aligned with the articles
    for articles in data.values():
        for element in articles:
            assert tweet_ids[pos] == element['id']

            total_title = sum(len(re.findall(categ, element['title'])) for categ in categ_list)
            total_summary = sum(len(re.findall(categ, element['headline'])) for categ in categ_list)
            total_article = sum(len(re.findall(categ, element['article'])) for categ in categ_list)

            title_results.append(total_title)
            summary_results.append(total_summary)
            article_results.append(total_article)
            # 1 when the category shows up anywhere in the article, 0 otherwise
            general_results.append(int(bool(total_title or total_summary or total_article)))
            pos += 1

    # Four columns per category, appended at the right end of the frame
    dataset.insert(len(dataset.columns), f'''{key}_bool''', general_results)
    dataset.insert(len(dataset.columns), f'''{key}_title''', title_results)
    dataset.insert(len(dataset.columns), f'''{key}_summary''', summary_results)
    dataset.insert(len(dataset.columns), f'''{key}_article''', article_results)

In [8]:
dataset.head()

Unnamed: 0,tweet_id,AGE_bool,AGE_title,AGE_summary,AGE_article,SEXUAL_ASSAULT_bool,SEXUAL_ASSAULT_title,SEXUAL_ASSAULT_summary,SEXUAL_ASSAULT_article,SEXUAL_HARASSMENT_bool,SEXUAL_HARASSMENT_title,SEXUAL_HARASSMENT_summary,SEXUAL_HARASSMENT_article,SEXUAL_ABUSE_bool,SEXUAL_ABUSE_title,SEXUAL_ABUSE_summary,SEXUAL_ABUSE_article,BOND_RELATIONSHIP_bool,BOND_RELATIONSHIP_title,BOND_RELATIONSHIP_summary,BOND_RELATIONSHIP_article,BOND_RELATIVE_bool,BOND_RELATIVE_title,BOND_RELATIVE_summary,BOND_RELATIVE_article,BOND_KNOWN_bool,BOND_KNOWN_title,BOND_KNOWN_summary,BOND_KNOWN_article,PLACE_PUBLIC_bool,PLACE_PUBLIC_title,PLACE_PUBLIC_summary,PLACE_PUBLIC_article,PLACE_WORKPLACE_bool,PLACE_WORKPLACE_title,PLACE_WORKPLACE_summary,PLACE_WORKPLACE_article,PLACE_HOUSE_bool,PLACE_HOUSE_title,PLACE_HOUSE_summary,PLACE_HOUSE_article,PLACE_EDUCATIONAL_bool,PLACE_EDUCATIONAL_title,PLACE_EDUCATIONAL_summary,PLACE_EDUCATIONAL_article,PLACE_LEISURE_bool,PLACE_LEISURE_title,PLACE_LEISURE_summary,PLACE_LEISURE_article,TIME_bool,TIME_title,TIME_summary,TIME_article,STIGMA_INTOXICATED_bool,STIGMA_INTOXICATED_title,STIGMA_INTOXICATED_summary,STIGMA_INTOXICATED_article,STIGMA_CLOTHING_bool,STIGMA_CLOTHING_title,STIGMA_CLOTHING_summary,STIGMA_CLOTHING_article,STIGMA_ORIGIN_bool,STIGMA_ORIGIN_title,STIGMA_ORIGIN_summary,STIGMA_ORIGIN_article,STIGMA_AGGRESSOR_bool,STIGMA_AGGRESSOR_title,STIGMA_AGGRESSOR_summary,STIGMA_AGGRESSOR_article,STIGMA_VULNERABILITY_bool,STIGMA_VULNERABILITY_title,STIGMA_VULNERABILITY_summary,STIGMA_VULNERABILITY_article,EXPRESSION_EUPHEMISM_bool,EXPRESSION_EUPHEMISM_title,EXPRESSION_EUPHEMISM_summary,EXPRESSION_EUPHEMISM_article,EXPRESSION_DOUBT_bool,EXPRESSION_DOUBT_title,EXPRESSION_DOUBT_summary,EXPRESSION_DOUBT_article
0,1287463951375179776,1,3,1,12,1,0,0,1,1,0,1,2,1,1,0,3,1,0,0,1,0,0,0,0,1,0,1,1,1,1,0,6,0,0,0,0,1,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,5,1,0,0,1,0,0,0,0
1,1288437861973471232,1,0,0,3,0,0,0,0,1,1,0,9,0,0,0,0,1,0,0,2,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,3
2,1288541048520744962,1,0,1,2,0,0,0,0,0,0,0,0,1,1,0,3,1,1,0,4,1,1,0,2,0,0,0,0,1,1,0,4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,3,0,0,0,0,1,0,0,1
3,1288632903275106304,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,3,0,0,0,0,1,0,0,1,1,0,0,6,1,1,0,10,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,2
4,1288919794729836544,0,0,0,0,0,0,0,0,1,0,0,2,1,1,0,2,1,0,0,1,1,0,0,2,1,0,0,1,1,1,0,6,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,4,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1


### Add cluster

In [9]:
# Read only the two columns needed. `tweet_id` is read directly as text so the
# ids never pass through a numeric dtype (a missing value would otherwise turn
# the whole column into floats and corrupt the long ids), and no column of a
# sliced frame has to be reassigned afterwards.
clusters_df = pd.read_csv(INFO_FILE_PATH, usecols=['cluster_id', 'tweet_id'], dtype={'tweet_id': str})
clusters_df = clusters_df[['cluster_id', 'tweet_id']]

# Default merge: inner join on the column(s) shared by both frames
dataset = pd.merge(dataset, clusters_df)

# 2. CASES ANALYSIS

The following characteristics of each case are determined according to the presence of regular expressions in articles: 
- type of sexual violence
- victim-perpetrator bond
- place where the offence took place

## Type of sexual violence: harassment, assault and abuse
**Rules definition**

In [10]:
# RULES DEFINITION
def violence_rules(violence_type, assault, harrassment, abuse):
    '''
    Decide the type of sexual violence of a case from its three counts.

    Any abuse mention wins; otherwise assault; otherwise harassment. The
    matching counter of `violence_type` is incremented in place.

    Returns (violence_type, label), or (None, None) when no rule applies.
    '''
    if abuse > 0:
        type_ = 'abuse'
    elif abuse == 0 and assault > 0:
        type_ = 'assault'
    elif abuse == 0 and assault == 0 and harrassment > 0:
        type_ = 'harassment'
    else:
        return None, None

    violence_type[type_] += 1
    return violence_type, type_

**Assessment**

In [11]:
violence_type = {
    'assault': 0,
    'abuse': 0,
    'harassment': 0
}

not_classified = []
general_dict = {}
columns = [col for col in dataset.columns if 'SEXUAL_' in col]

for cluster_id, group in dataset[['cluster_id'] + columns].groupby(by='cluster_id'):
    # Number of articles of the cluster mentioning each type (anywhere in the article)
    assault = group['SEXUAL_ASSAULT_bool'].sum()
    harassment = group['SEXUAL_HARASSMENT_bool'].sum()
    abuse = group['SEXUAL_ABUSE_bool'].sum()

    # Fallback evidence: matches found in the titles only
    title_assault = group['SEXUAL_ASSAULT_title'].sum()
    title_harassment = group['SEXUAL_HARASSMENT_title'].sum()
    title_abuse = group['SEXUAL_ABUSE_title'].sum()

    # Try the whole-article counts first, then the title counts
    attempts = ((assault, harassment, abuse),
                (title_assault, title_harassment, title_abuse))
    for counts in attempts:
        aux_dict, aux_type = violence_rules(violence_type, *counts)
        if aux_dict:
            violence_type = aux_dict
            general_dict[cluster_id] = {'type': aux_type}
            break
    else:
        # No rule applied at either level
        not_classified.append([harassment, abuse, assault, f'''{cluster_id} - {len(group)}'''])

In [12]:
# Print statistics about the results
# `total` = classified + not classified clusters, i.e. every cluster of the groupby above
total_detected = sum(violence_type.values())
total = total_detected + len(not_classified)

# Each percentage is relative to `total`, rounded to 3 decimals
print(f'''Total clusters classified: {total_detected} -- Clusters left out: {len(not_classified)}''')
print(f'''* Assault cases detected: {violence_type['assault']} which represent {round(violence_type['assault']/total*100, 3)}%''')
print(f'''* Abuse cases detected: {violence_type['abuse']} which represent {round(violence_type['abuse']/total*100, 3)}%''')
print(f'''* Harassment cases detected: {violence_type['harassment']} which represent {round(violence_type['harassment']/total*100, 3)}%''')
print(f'''Clusters not classified: {len(not_classified)} which represent {round(len(not_classified)/total*100, 3)}%''')

Total clusters classified: 276 -- Clusters left out: 13
* Assault cases detected: 71 which represent 24.567%
* Abuse cases detected: 181 which represent 62.63%
* Harassment cases detected: 24 which represent 8.304%
Clusters not classified: 13 which represent 4.498%


**Display conflicting cases that could not be classified**

In [13]:
pd.DataFrame(data=not_classified, columns = ['Harassment', 'Abuse', 'Assault', 'Cluster id - size']).head(5)

Unnamed: 0,Harassment,Abuse,Assault,Cluster id - size
0,0,0,0,58 - 1
1,0,0,0,102 - 1
2,0,0,0,112 - 1
3,0,0,0,147 - 3
4,0,0,0,184 - 1


## Relationship between the victim and the aggressor

**Rules definition**

In [14]:
# RULES DEFINITION
def rules(bond, relationship, relative, known):
    '''
    Decide the victim-aggressor bond of a case from its three counts.

    - no mention at all                     -> 'stranger'
    - exactly one kind of bond mentioned    -> that bond
    - relationship strictly above the rest  -> 'relationship'
    The matching counter of `bond` is incremented in place.

    Returns (bond, label), or (None, None) when no rule applies.
    '''
    only_relationship = known == relative == 0 and relationship > 0
    relationship_is_max = relationship > relative and relationship > known

    if relationship == relative == known == 0:
        type_ = 'stranger'
    elif relationship == relative == 0 and known > 0:
        type_ = 'known'
    elif relationship == known == 0 and relative > 0:
        type_ = 'relative'
    elif only_relationship or relationship_is_max:
        type_ = 'relationship'
    else:
        return None, None

    bond[type_] += 1
    return bond, type_

**Assessment**

In [15]:
bond = {
    'relationship': 0,
    'relative': 0,
    'known': 0,
    'stranger': 0
}
not_classified = []

columns = [col for col in dataset.columns if 'BOND_' in col]
for cluster_id, group in dataset[['cluster_id']+columns].groupby(by='cluster_id'):
    # Number of articles of the cluster mentioning each bond (anywhere in the article)
    relationship = group['BOND_RELATIONSHIP_bool'].sum()
    relative = group['BOND_RELATIVE_bool'].sum()
    known = group['BOND_KNOWN_bool'].sum()

    # Fallback evidence: matches found in the titles only
    title_relationship = group['BOND_RELATIONSHIP_title'].sum()
    title_relative = group['BOND_RELATIVE_title'].sum()
    title_known = group['BOND_KNOWN_title'].sum()

    # A cluster whose violence type could not be determined has no entry in
    # general_dict yet; create it with type None instead of relying on an
    # exception (the unclassified branch previously raised KeyError for it).
    case_info = general_dict.setdefault(cluster_id, {'type': None})

    # Try the whole-article counts first, then the title counts
    attempts = ((relationship, relative, known),
                (title_relationship, title_relative, title_known))
    for counts in attempts:
        aux_dict, aux_type = rules(bond, *counts)
        if aux_dict:
            bond = aux_dict # Update bond dictionary
            case_info['bond'] = aux_type
            break
    else:
        # No rule applied at either level
        case_info['bond'] = None
        not_classified.append([known, relative, relationship, f'''{cluster_id} - {len(group)}'''])

In [16]:
# `total` = classified + not classified clusters, i.e. every cluster of the groupby above
total_detected = sum(bond.values())
total = total_detected + len(not_classified)
# Each percentage is relative to `total`, rounded to 3 decimals
print(f'''Total clusters classified: {total_detected} -- Clusters left out: {len(not_classified)}''')
print(f'''* Relationship cases: {bond['relationship']} which represent {round(bond['relationship']/total*100, 3)}%''')
print(f'''* Relative agressor: {bond['relative']} which represent {round(bond['relative']/total*100, 3)}%''')
print(f'''* Known agressor: {bond['known']} which represent {round(bond['known']/total*100, 3)}%''')
print(f'''* Stranger agressor: {bond['stranger']} which represent {round(bond['stranger']/total*100, 3)}%''')
print(f'''Clusters not classified: {len(not_classified)} which represent {round(len(not_classified)/total*100, 3)}%''')

Total clusters classified: 278 -- Clusters left out: 11
* Relationship cases: 33 which represent 11.419%
* Relative agressor: 39 which represent 13.495%
* Known agressor: 82 which represent 28.374%
* Stranger agressor: 124 which represent 42.907%
Clusters not classified: 11 which represent 3.806%



**Display conflicting cases that could not be classified**

In [17]:
pd.DataFrame(data = not_classified, columns = ['Known', 'Relative', 'Relationship', 'Cluster id - size']).head(5)

Unnamed: 0,Known,Relative,Relationship,Cluster id - size
0,3,1,1,24 - 3
1,0,1,1,54 - 1
2,0,3,3,73 - 3
3,0,3,3,96 - 3
4,0,2,2,122 - 2


## Place where sexual offences occur

**Assessment**

In [18]:
type_place = {
    'public': 0,
    'workplace': 0,
    'house': 0,
    'education': 0,
    'leisure': 0
}
not_classified = []
positions = dict(zip(range(5), ['public', 'workplace', 'house', 'education', 'leisure']))

# Column prefix of each place, listed in the same order as `positions`
place_prefixes = {
    'public': 'PLACE_PUBLIC',
    'workplace': 'PLACE_WORKPLACE',
    'house': 'PLACE_HOUSE',
    'education': 'PLACE_EDUCATIONAL',
    'leisure': 'PLACE_LEISURE'
}

columns = [col for col in dataset.columns if 'PLACE_' in col]
for cluster_id, group in dataset[['cluster_id']+columns].groupby(by='cluster_id'):
    # If bond is relationship, skip cluster
    if general_dict[cluster_id]['bond'] == 'relationship':
        general_dict[cluster_id]['place'] = None
        continue

    # Per place: totals of the cluster for each article section
    case_places = {
        place: {section: group[f'{prefix}_{section}'].sum()
                for section in ('bool', 'title', 'summary', 'article')}
        for place, prefix in place_prefixes.items()
    }

    # Successive tie-breakers: presence flags, then title + summary matches,
    # then matches in the article's body
    stages = (
        [totals['bool'] for totals in case_places.values()],
        [totals['title'] + totals['summary'] for totals in case_places.values()],
        [totals['article'] for totals in case_places.values()],
    )
    for places in stages:
        max_pos = list(np.flatnonzero(np.asarray(places) == np.max(places)))
        if len(max_pos) == 1:
            # A single place holds the maximum: assign it
            winner = positions[max_pos[0]]
            type_place[winner] += 1
            general_dict[cluster_id]['place'] = winner
            break
    else:
        # Still tied after the three stages
        not_classified.append([case_places['public']['bool'], case_places['workplace']['bool'],
                             case_places['house']['bool'], case_places['education']['bool'],
                             case_places['leisure']['bool'], f'''{cluster_id} - {len(group)}'''])
        general_dict[cluster_id]['place'] = None

In [19]:
total_detected = sum(type_place.values())
# Percentages are relative to ALL clusters, including the 'relationship' ones
# skipped above. Compute that total here instead of silently reusing the
# `total` left over from the bond section (same value, no hidden state).
total = dataset['cluster_id'].nunique()
print(f'''Total clusters classified: {total_detected} -- Clusters left out: {len(not_classified)} ''')
print(f'''Public places: {type_place['public']} which represent {round(type_place['public']/total*100, 3)}%''')
print(f'''House: {type_place['house']} which represent {round(type_place['house']/total*100, 3)}%''')
print(f'''Workplace: {type_place['workplace']} which represent {round(type_place['workplace']/total*100, 3)}%''')
print(f'''Educational place: {type_place['education']} which represent {round(type_place['education']/total*100, 3)}%''')
print(f'''Leisure place: {type_place['leisure']} which represent {round(type_place['leisure']/total*100, 3)}%''')
print(f'''Clusters not classified: {len(not_classified)} which represent {round(len(not_classified)/total*100, 3)}%''')

Total clusters classified: 232 -- Clusters left out: 24 
Public places: 113 which represent 39.1%
House: 25 which represent 8.651%
Workplace: 5 which represent 1.73%
Educational place: 3 which represent 1.038%
Leisure place: 86 which represent 29.758%
Clusters not classified: 24 which represent 8.304%


**Display conflicting cases that could not be classified**

In [20]:
pd.DataFrame(data = not_classified, columns = ['Public', 'Workplace', 'House', 'Educational', 'Leisure', 'Cluster id - size']).head(5)

Unnamed: 0,Public,Workplace,House,Educational,Leisure,Cluster id - size
0,0,0,0,0,0,27 - 2
1,2,0,2,0,2,46 - 2
2,0,0,0,0,0,58 - 1
3,1,0,1,0,1,69 - 1
4,6,0,6,0,6,75 - 6


## Assign characteristics to each case and compute frequency statistics

In [21]:
# CREATE DATAFRAME WITH type, bond, and place for each case id
general_df = (
    pd.DataFrame.from_dict(data=general_dict, orient = 'index')
    .reset_index()
    .rename(columns = {'index': 'case_id'})
)

# Number of rows of each cluster in clusters_df.
# rename_axis + reset_index(name=...) gives the columns (case_id, cluster_size)
# regardless of how the installed pandas version names the value_counts()
# result (the naming changed in pandas 2.0, which broke the previous rename).
cluster_size_df = (
    clusters_df['cluster_id']
    .value_counts()
    .rename_axis('case_id')
    .reset_index(name='cluster_size')
)

general_df = general_df.merge(cluster_size_df, on='case_id')
general_df.head()

Unnamed: 0,case_id,type,bond,place,cluster_size
0,0,abuse,stranger,public,4
1,1,abuse,known,public,3
2,2,abuse,known,public,2
3,3,abuse,relative,public,2
4,4,abuse,relationship,,7


In [22]:
def create_summary_df(original_df, key, total_cases, total_articles):
    '''
    Summarize the cases of `original_df` by the values of column `key`.

    Parameters:
        original_df: frame with one row per case; must contain `key`,
            'cluster_size' and (for key == 'place') 'bond'.
        key: name of the column to group by ('type', 'bond' or 'place').
        total_cases: denominator of 'Percentage of cases'.
        total_articles: denominator of 'Percentage of articles'.

    Returns:
        DataFrame with one row per value of `key` plus a last row (Type None)
        for the cases without a value. For key == 'place', cases whose bond is
        'relationship' are left out of that last row.
    '''
    def _summary_row(label, cases):
        # One output row for the given subset of cases
        n_articles = cases['cluster_size'].sum()
        return {'Type': label,
                'Number of cases': len(cases),
                'Percentage of cases': len(cases)/total_cases*100,
                'Number of articles': n_articles,
                'Percentage of articles': n_articles/total_articles*100}

    # Use the frame received as argument (not a notebook-level global)
    records = [_summary_row(value, group) for value, group in original_df.groupby(by=key)]

    unclassified = original_df[key].isna()
    if key == 'place':
        unclassified = unclassified & (original_df['bond'] != 'relationship')
    records.append(_summary_row(None, original_df[unclassified]))

    # Build the frame once from the records (DataFrame.append no longer exists in pandas 2.x)
    return pd.DataFrame(records, columns = ['Type', 'Number of cases', 'Percentage of cases',
                                            'Number of articles', 'Percentage of articles'])

In [23]:
# Denominators of the percentages: number of cases (rows of general_df) and
# number of distinct articles in the dataset
total_cases = len(general_df)
total_articles = len(dataset.tweet_id.unique())
# One summary table per characteristic of the cases
type_violence_df = create_summary_df(general_df, 'type', total_cases, total_articles)
bond_agressor_df = create_summary_df(general_df, 'bond', total_cases, total_articles)
place_df = create_summary_df(general_df, 'place', total_cases, total_articles)

In [24]:
type_violence_df

Unnamed: 0,Type,Number of cases,Percentage of cases,Number of articles,Percentage of articles
0,abuse,181.0,62.629758,342.0,68.951613
1,assault,71.0,24.567474,101.0,20.362903
2,harasment,24.0,8.304498,38.0,7.66129
3,,13.0,4.49827,15.0,3.024194


In [25]:
bond_agressor_df

Unnamed: 0,Type,Number of cases,Percentage of cases,Number of articles,Percentage of articles
0,known,82.0,28.373702,150.0,30.241935
1,relationship,33.0,11.418685,66.0,13.306452
2,relative,39.0,13.49481,51.0,10.282258
3,stranger,124.0,42.906574,211.0,42.540323
4,,11.0,3.806228,18.0,3.629032


In [26]:
place_df

Unnamed: 0,Type,Number of cases,Percentage of cases,Number of articles,Percentage of articles
0,education,3.0,1.038062,14.0,2.822581
1,house,25.0,8.650519,36.0,7.258065
2,leisure,86.0,29.757785,145.0,29.233871
3,public,113.0,39.100346,196.0,39.516129
4,workplace,5.0,1.730104,6.0,1.209677
5,,24.0,8.304498,33.0,6.653226


# CONTENT ANALYSIS
Check which information is present in each article, and the part of the article in which it appears (headline, subtitle and body).

## General information
Type of information considered:
- place
- time
- age

In [27]:
# AGGREGATE INFORMATION FROM THE DATASET PREVIOUSLY CREATED
# To avoid running again all the regular expressions
def mentions_info(dataset, key, type_info = '_bool'):
    '''
    Count the articles (rows) with a value >= 1 in at least one of the
    columns whose name contains both `key` and `type_info`.

    Parameters:
        dataset: frame with one row per article.
        key: substring identifying the category (e.g. 'AGE_').
        type_info: substring identifying the kind of column
            ('_bool', '_title', '_summary' or '_article').

    Returns:
        int, number of matching rows (0 when no column matches).
    '''
    columns = [col for col in dataset.columns if (key in col) and (type_info in col)]
    # Vectorized version of "any selected column >= 1" evaluated per row
    return int((dataset[columns] >= 1).any(axis=1).sum())

In [28]:
information_keys = ['TIME', 'AGE',
                    'PLACE_PUBLIC','PLACE_WORKPLACE', 'PLACE_HOUSE',
                    'PLACE_EDUCATIONAL', 'PLACE_LEISURE',
                    'BOND_RELATIONSHIP', 'BOND_RELATIVE', 'BOND_KNOWN']

# (count label, percentage label, column suffix) for each part of the article
sections = (('Total', 'Total percentage', '_bool'),
            ('Title', 'Title percentage', '_title'),
            ('Summary', 'Summary percentage', '_summary'),
            ('Article', 'Article percentage', '_article'))

information_dict = {}
for key in information_keys:
    aux_dict = {}
    for count_label, percentage_label, suffix in sections:
        # Number of articles mentioning the category in that part, and its share of the dataset
        aux_dict[count_label] = mentions_info(dataset, key+'_', suffix)
        aux_dict[percentage_label] = round(aux_dict[count_label]/len(dataset)*100, 4)
    information_dict[key] = aux_dict

In [29]:
# One row per key of information_dict, one column per count / percentage
information_df = pd.DataFrame.from_dict(data=information_dict, orient='index')
information_df

Unnamed: 0,Total,Total percentage,Title,Title percentage,Summary,Summary percentage,Article,Article percentage
PLACE,478,96.371,221,44.5565,195,39.3145,474,95.5645
TIME,140,28.2258,2,0.4032,10,2.0161,139,28.0242
AGE,471,94.9597,229,46.1694,238,47.9839,470,94.7581
STIGMA_ORIGIN,283,57.0565,49,9.879,56,11.2903,283,57.0565


## Stigmas
Type of information considered:
- stigmas about intoxication
- stigmas about clothing
- stigmas about the origin
- stigmas about the perpetrator
- mentions of vulnerability

In [30]:
stigma_keys = ['STIGMA_INTOXICATED', 'STIGMA_CLOTHING', 'STIGMA_ORIGIN', 'STIGMA_AGGRESSOR', 'STIGMA_VULNERABILITY']

# (count label, percentage label, column suffix) for each part of the article
sections = (('Total', 'Total percentage', '_bool'),
            ('Title', 'Title percentage', '_title'),
            ('Summary', 'Summary percentage', '_summary'),
            ('Article', 'Article percentage', '_article'))

stigmas_dict = {}
for key in stigma_keys:
    aux_dict = {}
    for count_label, percentage_label, suffix in sections:
        # Number of articles mentioning the stigma in that part, and its share of the dataset
        aux_dict[count_label] = mentions_info(dataset, key+'_', suffix)
        aux_dict[percentage_label] = round(aux_dict[count_label]/len(dataset)*100, 4)
    stigmas_dict[key] = aux_dict

In [31]:
# One row per key of stigmas_dict, one column per count / percentage
stigmas_df = pd.DataFrame.from_dict(data=stigmas_dict, orient='index')
stigmas_df

Unnamed: 0,Total,Total percentage,Title,Title percentage,Summary,Summary percentage,Article,Article percentage
STIGMA_INTOXICATED,92,18.5484,19,3.8306,11,2.2177,90,18.1452
STIGMA_CLOTHING,62,12.5,4,0.8065,6,1.2097,62,12.5
STIGMA_ORIGIN,283,57.0565,49,9.879,56,11.2903,283,57.0565
STIGMA_AGGRESSOR,20,4.0323,5,1.0081,0,0.0,18,3.629
STIGMA_VULNERABILITY,378,76.2097,160,32.2581,142,28.629,375,75.6048


## Expression
Type of information considered:
- Euphemisms
- Doubt-related expressions
- Type of sexual violence 

In [32]:
expression_keys = ['EXPRESSION_EUPHEMISM', 'EXPRESSION_DOUBT', 'SEXUAL_ASSAULT', 'SEXUAL_HARASSMENT', 'SEXUAL_ABUSE']

# (count label, percentage label, column suffix) for each part of the article
sections = (('Total', 'Total percentage', '_bool'),
            ('Title', 'Title percentage', '_title'),
            ('Summary', 'Summary percentage', '_summary'),
            ('Article', 'Article percentage', '_article'))

expression_dict = {}
for key in expression_keys:
    aux_dict = {}
    for count_label, percentage_label, suffix in sections:
        # Number of articles using the expression in that part, and its share of the dataset
        aux_dict[count_label] = mentions_info(dataset, key+'_', suffix)
        aux_dict[percentage_label] = round(aux_dict[count_label]/len(dataset)*100, 4)
    expression_dict[key] = aux_dict

In [33]:
# One row per key of expression_dict, one column per count / percentage
expression_df = pd.DataFrame.from_dict(data=expression_dict, orient='index')
expression_df

**Store all the results**

In [42]:
# Stack the three per-category tables (rows indexed by category name).
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
# NOTE: this rebinds `general_df`, which earlier in the notebook held the
# per-case table; the association-rules cell reads this version through
# general_df.loc[<category>, 'Total percentage'].
general_df = pd.concat([information_df, stigmas_df, expression_df])

Unnamed: 0,Total,Total percentage,Title,Title percentage,Summary,Summary percentage,Article,Article percentage
PLACE,478,96.371,221,44.5565,195,39.3145,474,95.5645
TIME,140,28.2258,2,0.4032,10,2.0161,139,28.0242
AGE,471,94.9597,229,46.1694,238,47.9839,470,94.7581
STIGMA_ORIGIN,283,57.0565,49,9.879,56,11.2903,283,57.0565
STIGMA_INTOXICATED,92,18.5484,19,3.8306,11,2.2177,90,18.1452
STIGMA_CLOTHING,62,12.5,4,0.8065,6,1.2097,62,12.5
STIGMA_ORIGIN,283,57.0565,49,9.879,56,11.2903,283,57.0565
STIGMA_AGGRESSOR,20,4.0323,5,1.0081,0,0.0,18,3.629
STIGMA_VULNERABILITY,378,76.2097,160,32.2581,142,28.629,375,75.6048
EXPRESSION_EUPHEMISM,77,15.5242,1,0.2016,3,0.6048,76,15.3226


# ASSOCIATION RULES
We checked for relevant associations in terms of content mentioned in the dataset, focusing on three groups of associations:

- A: between informative terms (information about the case)
- B: between different stigmas and expressions 
- C: rules containing categories from both A and B

**Format the dataset**

In [35]:
# Keep only the presence flags (columns whose name contains '_bool')
bool_columns = [name for name in dataset.columns if '_bool' in name]
bool_dataset = dataset[bool_columns]

In [36]:
# Drop the '_bool' marker so every column is named after its category
rename_dict = {col: col.replace('_bool', '') for col in bool_dataset.columns}
bool_dataset = bool_dataset.rename(columns=rename_dict)

In [37]:
info_columns = ['SEXUAL_ASSAULT', 'SEXUAL_HARASSMENT', 'SEXUAL_ABUSE',
        'BOND_RELATIONSHIP', 'BOND_RELATIVE', 'BOND_KNOWN',
        'PLACE_PUBLIC', 'PLACE_WORKPLACE', 'PLACE_HOUSE', 'PLACE_EDUCATIONAL', 'PLACE_LEISURE',
        'AGE', 'TIME']

stigma_expression_columns = ['STIGMA_INTOXICATED', 'STIGMA_CLOTHING', 'STIGMA_ORIGIN',
       'STIGMA_AGGRESSOR', 'STIGMA_VULNERABILITY', 'EXPRESSION_EUPHEMISM',
       'EXPRESSION_DOUBT']


def _present_categories(frame):
    '''
    For each row of `frame`, list the names of the columns whose value equals 1.
    '''
    return [[col for col, value in row.items() if value == 1]
            for _, row in frame.iterrows()]


# One "transaction" (list of categories present) per article, for each group of columns
info_presence = _present_categories(bool_dataset[info_columns])
stigma_expression_presence = _present_categories(bool_dataset[stigma_expression_columns])
general_presence = _present_categories(bool_dataset)

**Set thresholds and format the result**

In [38]:
# Same thresholds for info and stigma
min_sup_ = 0.1
min_conf_ = 0.5
min_lift_ = 1.05
min_len_ = 1
max_len_ = 3

info_rules = apriori(info_presence, min_support = min_sup_, min_confidence = min_conf_, min_lift = min_lift_, min_length = min_len_, max_length = max_len_)
stigma_rules = apriori(stigma_expression_presence, min_support = min_sup_, min_confidence = min_conf_, min_lift = min_lift_, min_length = min_len_, max_length = max_len_)

# Rules over all the categories use a higher support threshold.
# They get their own name: the previous version stored them in `info_rules`,
# overwriting the rules computed from the informative terms just above.
general_min_sup_ = 0.3
general_rules = apriori(general_presence, min_support = general_min_sup_, min_confidence = min_conf_, min_lift = min_lift_, min_length = min_len_, max_length = max_len_)

In [39]:
# Format the rules between informative terms (group A) as a table.
# Rows are gathered in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
info_records = []
for element in list(info_rules):
    # An itemset with a single item cannot be split into antecedent -> consequent
    if len(element.items) == 1:
        continue
    for item in element.ordered_statistics:
        # Skip the statistics whose antecedent is empty
        if len(item.items_base) == 0:
            continue
        set_A = list(item.items_base)
        set_B = list(item.items_add)

        # GET ONLY MEANINGFUL RULES
        # Discard the rule when the overall frequency of its single consequent
        # is higher than the rule's confidence.
        # NOTE(review): assumes general_df is indexed by category name and that
        # 'Total percentage' is expressed in the 0-100 range - confirm.
        if len(set_B) == 1:
            item_B = set_B[0]
            if general_df.loc[item_B, 'Total percentage']/100 > item.confidence:
                continue

        info_records.append({
            'set_A': set_A,
            'set_B': set_B,
            'support': element.support,
            'confidence': item.confidence,
            'lift': item.lift
        })

info_results = pd.DataFrame(info_records, columns = ['set_A', 'set_B', 'support', 'confidence', 'lift'])
info_results = info_results.sort_values(by='lift', ascending = False)
info_results

Unnamed: 0,set_A,set_B,support,confidence,lift
75,"['SEXUAL_ABUSE', 'BOND_RELATIONSHIP']",['BOND_RELATIVE'],0.110887,0.617978,1.892079
72,"['PLACE_HOUSE', 'BOND_RELATIVE']",['BOND_RELATIONSHIP'],0.108871,0.504673,1.800847
74,"['PLACE_PUBLIC', 'BOND_RELATIONSHIP']",['BOND_RELATIVE'],0.118952,0.541284,1.657266
76,"['SEXUAL_ASSAULT', 'BOND_RELATIONSHIP']",['BOND_RELATIVE'],0.102823,0.51,1.561481
92,"['PLACE_HOUSE', 'SEXUAL_HARASSMENT']",['BOND_RELATIVE'],0.108871,0.5,1.530864
80,"['SEXUAL_ASSAULT', 'BOND_RELATIONSHIP']",['PLACE_HOUSE'],0.153226,0.76,1.484094
77,"['PLACE_LEISURE', 'BOND_RELATIONSHIP']",['PLACE_HOUSE'],0.177419,0.752137,1.468739
79,"['SEXUAL_ABUSE', 'BOND_RELATIONSHIP']",['PLACE_HOUSE'],0.133065,0.741573,1.448111
27,"['AGE', 'BOND_RELATIONSHIP']",['PLACE_HOUSE'],0.189516,0.717557,1.401214
78,"['PLACE_PUBLIC', 'BOND_RELATIONSHIP']",['PLACE_HOUSE'],0.155242,0.706422,1.37947


In [40]:
# Format the rules between stigmas and expressions (group B) as a table.
# Rows are gathered in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
stigma_records = []
for element in list(stigma_rules):
    # An itemset with a single item cannot be split into antecedent -> consequent
    if len(element.items) == 1:
        continue
    for item in element.ordered_statistics:
        # Skip the statistics whose antecedent is empty
        if len(item.items_base) == 0:
            continue
        set_A = list(item.items_base)
        set_B = list(item.items_add)

        # GET ONLY MEANINGFUL RULES
        # Discard the rule when the overall frequency of its single consequent
        # is higher than the rule's confidence.
        # NOTE(review): assumes general_df is indexed by category name and that
        # 'Total percentage' is expressed in the 0-100 range - confirm.
        if len(set_B) == 1:
            item_B = set_B[0]
            if general_df.loc[item_B, 'Total percentage']/100 > item.confidence:
                continue

        stigma_records.append({
            'set_A': set_A,
            'set_B': set_B,
            'support': element.support,
            'confidence': item.confidence,
            'lift': item.lift
        })

stigma_results = pd.DataFrame(stigma_records, columns = ['set_A', 'set_B', 'support', 'confidence', 'lift'])
stigma_results = stigma_results.sort_values(by='lift', ascending = False)
stigma_results.head()

Unnamed: 0,set_A,set_B,support,confidence,lift
1,['STIGMA_INTOXICATED'],['STIGMA_ORIGIN'],0.133065,0.717391,1.257336
4,"['STIGMA_VULNERABILITY', 'STIGMA_INTOXICATED']",['STIGMA_ORIGIN'],0.108871,0.701299,1.229131
0,['EXPRESSION_EUPHEMISM'],['STIGMA_VULNERABILITY'],0.137097,0.883117,1.158799
2,['STIGMA_INTOXICATED'],['STIGMA_VULNERABILITY'],0.155242,0.836957,1.098229
3,"['STIGMA_VULNERABILITY', 'STIGMA_INTOXICATED']",['EXPRESSION_DOUBT'],0.104839,0.675325,1.056659


In [41]:
# Format the rules mixing informative terms with stigmas/expressions (group C) as a table.
# Rows are gathered in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
general_records = []
for element in list(general_rules):
    # An itemset with a single item cannot be split into antecedent -> consequent
    if len(element.items) == 1:
        continue
    for item in element.ordered_statistics:
        # Skip the statistics whose antecedent is empty
        if len(item.items_base) == 0:
            continue
        set_A = set(item.items_base)
        set_B = set(item.items_add)

        # GET ONLY MEANINGFUL RULES
        # Discard the rule when the overall frequency of its single consequent
        # is higher than the rule's confidence.
        # NOTE(review): assumes general_df is indexed by category name and that
        # 'Total percentage' is expressed in the 0-100 range - confirm.
        if len(set_B) == 1:
            item_B = next(iter(set_B))
            if general_df.loc[item_B, 'Total percentage']/100 > item.confidence:
                continue

        # Keep only the rules that involve at least one category of each group
        # (info_columns and stigma_expression_columns do not share any name)
        rule_categories = set_A.union(set_B)
        has_stigma = any(elem in stigma_expression_columns for elem in rule_categories)
        has_info = any(elem in info_columns for elem in rule_categories)
        if not (has_info and has_stigma):
            continue

        general_records.append({
            'set_A': list(set_A),
            'set_B': list(set_B),
            'support': element.support,
            'confidence': item.confidence,
            'lift': item.lift
        })

general_results = pd.DataFrame(general_records, columns = ['set_A', 'set_B', 'support', 'confidence', 'lift'])
general_results = general_results.sort_values(by='lift', ascending = False)
general_results.head(10)

Unnamed: 0,set_A,set_B,support,confidence,lift
30,"['STIGMA_ORIGIN', 'SEXUAL_ABUSE']",['PLACE_PUBLIC'],0.342742,0.944444,1.216739
20,"['EXPRESSION_DOUBT', 'SEXUAL_ABUSE']",['PLACE_PUBLIC'],0.372984,0.906863,1.168322
31,"['STIGMA_VULNERABILITY', 'SEXUAL_ABUSE']",['PLACE_PUBLIC'],0.477823,0.901141,1.16095
14,"['PLACE_PUBLIC', 'EXPRESSION_DOUBT']",['BOND_KNOWN'],0.336694,0.668,1.138584
28,"['PLACE_LEISURE', 'STIGMA_VULNERABILITY']",['SEXUAL_ABUSE'],0.429435,0.71,1.136
35,"['STIGMA_VULNERABILITY', 'SEXUAL_ASSAULT']",['SEXUAL_ABUSE'],0.40121,0.705674,1.129078
17,"['PLACE_PUBLIC', 'STIGMA_ORIGIN']",['BOND_KNOWN'],0.300403,0.659292,1.123742
13,"['PLACE_LEISURE', 'EXPRESSION_DOUBT']",['BOND_KNOWN'],0.330645,0.656,1.118131
2,['STIGMA_VULNERABILITY'],['SEXUAL_ABUSE'],0.530242,0.695767,1.113228
9,"['STIGMA_VULNERABILITY', 'AGE']",['SEXUAL_ABUSE'],0.52621,0.694149,1.110638
