# Data Exploration 

**Dataset**: Detecting Patronizing and Condescending Language

In [358]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import country_converter as coco

In [204]:
# Input TSV files of the dontpatronizeme v1.4 dataset (paths relative to this notebook).
CATEGORIES_DATA_PATH = (
    "../data/dontpatronizeme_v1.4/"
    "dontpatronizeme_categories.tsv"
)
SINGLE_DATA_PATH = (
    "../data/dontpatronizeme_v1.4/"
    "dontpatronizeme_pcl.tsv"
)

# Plotly layout template shared by every figure in the notebook.
TEMPLATE = "plotly_white"

# Colour palette (hex strings), grouped by hue.
# Blues
BLUE_COLOR = "#37689e"
LIGHT_BLUE_COLOR = "#9bbdcf"
TURQUOISE_COLOR = "#4bb4c8"
# Yellows
YELLOW_COLOR = "#e49a21"
LIGHT_YELLOW_COLOR = "#f2c576"
# Reds
RED_COLOR = "#da2f20"
LIGHT_RED_COLOR = "#e66f4b"

In [16]:
# Read the paragraph-level file as tab-separated text. The first four lines
# are skipped and column names are supplied explicitly
# (NOTE(review): presumably the file starts with a non-tabular preamble — confirm).
# Rows with any missing value are dropped, and the article id column is discarded.
binary_df = (
    pd.read_csv(
        SINGLE_DATA_PATH,
        sep='\t',
        skiprows=4,
        names=["par_id", "art_id", "keyword", "country_code", "text", "label"],
    )
    .dropna()
    .drop(columns="art_id")
)

# Collapse the original label into a binary target: values <= 1 become 0,
# anything greater becomes 1.
binary_df["label"] = (binary_df["label"] > 1).astype(int)
binary_df.head()

Unnamed: 0,par_id,keyword,country_code,text,label
0,1,hopeless,ph,"We 're living in times of absolute insanity , ...",0
1,2,migrant,gh,"In Libya today , there are countless number of...",0
2,3,immigrant,ie,White House press secretary Sean Spicer said t...,0
3,4,disabled,nz,Council customers only signs would be displaye...,0
4,5,refugee,ca,""" Just like we received migrants fleeing El Sa...",0


## Label analysis

In [206]:
# Overall class balance, shown both as a donut chart and as a bar chart.
labels = ["No PCL", "PCL"]

# value_counts() sorts by frequency, not by label value, so pairing its output
# positionally with `labels` only works while class 0 is the majority.
# Reindexing on the class ids pins the order to [No PCL, PCL] and yields 0 for
# a class that is absent.
values = (
    binary_df["label"]
    .value_counts()
    .reindex([0, 1], fill_value=0)
    .to_numpy()
)

# One colour per class, in the same order as `labels`.
binary_colors = [LIGHT_BLUE_COLOR, LIGHT_RED_COLOR]

# Left cell hosts the pie ('domain' subplot), right cell the bar chart ('xy').
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'xy'}]])
fig.add_trace(go.Pie(
    labels=labels,
    values=values,
    marker_colors = binary_colors, hole=0.4, 
), 1, 1)

fig.add_trace(go.Bar(
    x=labels,
    y=values, marker_color=binary_colors,
    text =values, textfont_color='white'
), 1, 2)

fig.update_layout(
    title = dict(text="PCL vs. No PCL<br>For each 10 texts, <b>only 1</b> contains PCL"),
    showlegend=False, template=TEMPLATE, width=1000
)
fig.show()

## Keyword analysis

In [212]:
# Per-keyword class counts, drawn as a stacked bar chart annotated with the
# share of each class within the keyword.

# Keyword frequencies, most frequent first; this order drives the x axis.
s = binary_df["keyword"].value_counts()
keywords_df = pd.DataFrame({'keyword': s.index, 'frequency': s.values})

# Cross-tabulate keyword x label once instead of filtering the frame per
# keyword. Reindexing the columns on [0, 1] guarantees that column 0 is
# "No PCL" and column 1 is "PCL" regardless of which class is more frequent
# for a keyword, and fills 0 when a keyword has no paragraph of a class
# (indexing value_counts() positionally would swap or crash in those cases).
label_counts = pd.crosstab(binary_df["keyword"], binary_df["label"]).reindex(
    index=s.index, columns=[0, 1], fill_value=0
)

keywords = list(s.index)
not_pcl_per_kw = label_counts[0].tolist()
pcl_per_kw = label_counts[1].tolist()

table_df = pd.DataFrame({'Keyword': keywords, "No PCL Frequency": not_pcl_per_kw, "PCL Frequency": pcl_per_kw})
# Share of each class within a keyword (the two ratios sum to 1).
per_keyword_total = table_df["No PCL Frequency"] + table_df["PCL Frequency"]
table_df["No PCL Ratio"] = table_df["No PCL Frequency"] / per_keyword_total
table_df["PCL Ratio"] = table_df["PCL Frequency"] / per_keyword_total

fig = go.Figure()
fig.add_trace(go.Bar(
    y = table_df["No PCL Frequency"],
    x = table_df["Keyword"],
    orientation = 'v', marker_color=LIGHT_BLUE_COLOR,
    name = 'No PCL', text=np.round(table_df["No PCL Ratio"], 2),
    textfont_color='white', textfont_size=10, textposition='auto'
))

fig.add_trace(go.Bar(
    y = table_df["PCL Frequency"],
    x = table_df["Keyword"],
    orientation = 'v', marker_color=LIGHT_RED_COLOR,
    name = 'PCL', text=np.round(table_df["PCL Ratio"], 2),
    textfont_color='black', textfont_size=10, textposition='outside'
))
fig.update_layout(
    template=TEMPLATE,
    title = dict(
        text="Keywords distribution<br>The corpus contains an <b>equal distribution</b> of news per vulnerable groups",
        font_size=14
    ),
    barmode='stack'
)
fig.show()

## Country analysis

In [602]:
# Two-letter country codes grouped under the continent label used in the plots.
countries_by_continent = {
    'Africa': ['za', 'ke', 'ng', 'gh', 'tz'],
    'Asia': ['my', 'ph', 'pk', 'sg', 'in', 'bd', 'lk', 'hk'],
    'America': ['us', 'ca', 'jm'],
    'Australia': ['au', 'nz'],
    'Europe': ['gb', 'ie'],
}

# Flat lookup: country code -> continent label.
continent_countries = {
    code: continent
    for continent, codes in countries_by_continent.items()
    for code in codes
}

# Continent labels in the order in which they are laid out as Sankey nodes.
continents = ['America', 'Africa', 'Asia', 'Europe', 'Australia']

In [603]:
# Country frequencies (most frequent first), computed once and reused below.
country_counts = binary_df["country_code"].value_counts()
countries_short = country_counts.index
num_countries = countries_short.shape[0]
# Short display name for each country code, resolved through country_converter.
countries = [coco.convert(names=[code], to='short_name') for code in countries_short]

# Keep only the NUM_COUNTRIES most frequent country codes.
NUM_COUNTRIES = 20
top_countries = country_counts[:NUM_COUNTRIES].index.to_numpy()
#top_countries_names = [coco.convert(names=[code], to='short_name') for code in top_countries]
keywords = np.array(keywords)
pcl_labels = np.array(["No PCL", "PCL"])

# Sankey node labels: continents first, then keywords, then the two classes.
labels = [*continents, *keywords, *pcl_labels]
# Node label -> node index, used to express links as (source, target) indices.
labels_dict = {label: position for position, label in enumerate(labels)}

In [604]:
# Build the keyword -> class links of the Sankey diagram and, along the way,
# the number of paragraphs per (country, keyword) pair.
countries_df = binary_df[["country_code", "keyword", "label"]]
countries_series = countries_df.groupby(by=["country_code", "keyword", "label"])["keyword"].count()

# Plain dict keyed by (country_code, keyword, label). Looking counts up with a
# default of 0 covers country/keyword pairs that never occur and pairs that
# contain only one class; indexing the grouped series directly raises KeyError
# for a missing pair and assumed that a lone class is always "No PCL".
group_sizes = countries_series.to_dict()

# Parallel lists describing the Sankey links: node index of the origin, node
# index of the destination, link weight and link colour.
sources = []
targets = []
values = []
colors = []

pcl_label = labels_dict["PCL"]
no_pcl_label = labels_dict["No PCL"]

# (country_code, keyword) -> number of paragraphs across both classes.
countries_dict = dict()

for country in top_countries:
    for kw in keywords:
        kw_label = labels_dict[kw]

        no_pcl = group_sizes.get((country, kw, 0), 0)
        pcl = group_sizes.get((country, kw, 1), 0)

        key = (country, kw)
        countries_dict[key] = countries_dict.get(key, 0) + no_pcl + pcl

        # keyword -> "No PCL" link
        sources.append(kw_label)
        targets.append(no_pcl_label)
        values.append(no_pcl)
        colors.append("#d7ecf7")

        # keyword -> "PCL" link (appended even when its weight is 0)
        sources.append(kw_label)
        targets.append(pcl_label)
        values.append(pcl)
        colors.append("#f9a990")

In [605]:
# Roll the per-country totals up to continent level:
# (continent, keyword) -> number of paragraphs.
continents_dict = {}
for (country, keyword), total in countries_dict.items():
    continent = continent_countries[country]
    key = (continent, keyword)
    continents_dict[key] = continents_dict.get(key, 0) + total

In [606]:
# Append one continent -> keyword link per aggregated pair. Links leaving
# "Asia" get a highlight colour; every other continent uses neutral grey.
for (continent, keyword), total in continents_dict.items():
    sources.append(labels_dict[continent])
    targets.append(labels_dict[keyword])
    values.append(total)
    colors.append("#f7d7a3" if continent == "Asia" else "#d1d1d1")

In [607]:
# Render the continent -> keyword -> class Sankey diagram.
#labels = [coco.convert(names=[labels[i]], to='short_name') if i < NUM_COUNTRIES else labels[i].capitalize() for i in range(len(labels))]
# Force the last two node labels to the class names "No PCL" and "PCL".
labels = list(labels[:-2]) + ["No PCL", "PCL"]

# Node colours follow the node order (continents, keywords, classes). The
# highlighted continent is selected by name, matching the name-based highlight
# of the continent links, rather than by a hard-coded position in `continents`.
node_colors = (
    [YELLOW_COLOR if continent == "Asia" else "gray" for continent in continents]
    + ["gray"] * len(keywords)
    + binary_colors
)

fig = go.Figure()
fig.add_trace(go.Sankey(
    node = dict(
        pad = 15, thickness=20, line = dict(color="grey", width=0.5),
        label = labels,
        color = node_colors
    ),
    link = dict(
        source=sources, target=targets, value=values, color=colors
    )
))
fig.update_layout(
    title=dict(
        text="<b>Asia</b> is the continent that contributes more <br>to vulnerable groups news",
        font_size=13
    ), template=TEMPLATE
)
fig.show()