# Data Exploration 

**Dataset**: Detecting Patronizing and Condescending Language

In [98]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

In [53]:
# --- Dataset locations (relative to this notebook) ---------------------------
# Both TSV files live in the same dontpatronizeme v1.4 directory.
DATA_DIR = "../data/dontpatronizeme_v1.4"
CATEGORIES_DATA_PATH = DATA_DIR + "/dontpatronizeme_categories.tsv"
SINGLE_DATA_PATH = DATA_DIR + "/dontpatronizeme_pcl.tsv"

# --- Plot styling ------------------------------------------------------------
# Plotly layout template passed to the figures in this notebook.
TEMPLATE = "plotly_white"

# Hex colour palette: each base colour is followed by its lighter variant(s).
BLUE_COLOR = "#37689e"
LIGHT_BLUE_COLOR = "#9bbdcf"
TURQUOISE_COLOR = "#4bb4c8"

YELLOW_COLOR = "#e49a21"
LIGHT_YELLOW_COLOR = "#f2c576"

RED_COLOR = "#da2f20"
LIGHT_RED_COLOR = "#e66f4b"

In [16]:
# Load the paragraph-level PCL file and reduce it to a binary classification
# frame. Built as a single method chain so the cell is idempotent and no
# intermediate frame is mutated in place.
binary_df = (
    pd.read_csv(
        SINGLE_DATA_PATH,
        skiprows=4,  # skip the first 4 lines of the file (presumably a preamble; confirm)
        sep="\t",
        names=["par_id", "art_id", "keyword", "country_code", "text", "label"],
    )
    .dropna()                  # discard rows with any missing field
    .drop(columns="art_id")    # article id is not used in this exploration
    # Binarise the label: values <= 1 become 0 ("No PCL"), anything above 1
    # becomes 1 ("PCL"). Vectorised comparison instead of a per-row lambda.
    .assign(label=lambda frame: frame["label"].gt(1).astype("int64"))
)
binary_df.head()

Unnamed: 0,par_id,keyword,country_code,text,label
0,1,hopeless,ph,"We 're living in times of absolute insanity , ...",0
1,2,migrant,gh,"In Libya today , there are countless number of...",0
2,3,immigrant,ie,White House press secretary Sean Spicer said t...,0
3,4,disabled,nz,Council customers only signs would be displaye...,0
4,5,refugee,ca,""" Just like we received migrants fleeing El Sa...",0


## Label analysis

In [63]:
# Class balance of the binarised labels, shown as a donut chart next to a bar chart.
labels = ["No PCL", "PCL"]

# value_counts() orders its result by frequency, not by label value. Pin the
# order to [0, 1] so each count is guaranteed to line up with `labels` and
# `binary_colors`, whatever the class balance is (a missing class counts as 0).
values = (
    binary_df["label"]
    .value_counts()
    .reindex([0, 1], fill_value=0)
    .to_numpy()
)

binary_colors = [LIGHT_BLUE_COLOR, LIGHT_RED_COLOR]

# Left cell is a 'domain' subplot (required for pie traces), right cell a
# regular cartesian subplot for the bars.
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'xy'}]])
fig.add_trace(
    go.Pie(
        labels=labels,
        values=values,
        marker_colors=binary_colors,
        hole=0.4,
    ),
    1, 1,
)

fig.add_trace(
    go.Bar(
        x=labels,
        y=values,
        marker_color=binary_colors,
    ),
    1, 2,
)

fig.update_layout(
    title=dict(text="PCL vs. No PCL<br>For each 10 texts, <b>only 1</b> contains PCL"),
    showlegend=False,
    template=TEMPLATE,
    width=1000,
)
fig.show()

## Keyword analysis

In [194]:
# Per-keyword breakdown of PCL vs. No PCL paragraphs, drawn as horizontal
# stacked bars annotated with each class's share of the keyword.

# Keywords ordered by how many paragraphs mention them (most frequent first).
keyword_counts = binary_df["keyword"].value_counts()
keywords_df = pd.DataFrame(
    {'keyword': keyword_counts.index, 'frequency': keyword_counts.values}
)

# Count labels per keyword in one pass. Columns are pinned to [0, 1] so that
# column 0 is always "No PCL" and column 1 is always "PCL": value_counts()
# would order them by frequency instead, and would omit a class entirely for
# a keyword that only has one label. Rows follow the frequency order above.
label_counts = pd.crosstab(binary_df["keyword"], binary_df["label"]).reindex(
    index=keyword_counts.index, columns=[0, 1], fill_value=0
)

table_df = pd.DataFrame({
    'Keyword': label_counts.index.to_numpy(),
    "No PCL Frequency": label_counts[0].to_numpy(),
    "PCL Frequency": label_counts[1].to_numpy(),
})
# Every keyword occurs at least once, so the denominator is never zero.
paragraphs_per_kw = table_df["No PCL Frequency"] + table_df["PCL Frequency"]
table_df["No PCL Ratio"] = table_df["No PCL Frequency"] / paragraphs_per_kw
table_df["PCL Ratio"] = table_df["PCL Frequency"] / paragraphs_per_kw

fig = go.Figure()
fig.add_trace(go.Bar(
    x=table_df["No PCL Frequency"],
    y=table_df["Keyword"],
    orientation='h', marker_color=LIGHT_BLUE_COLOR,
    name='No PCL', text=np.round(table_df["No PCL Ratio"], 2),
    textfont_color='white', textfont_size=10, textposition='auto'
))

fig.add_trace(go.Bar(
    x=table_df["PCL Frequency"],
    y=table_df["Keyword"],
    orientation='h', marker_color=LIGHT_RED_COLOR,
    name='PCL', text=np.round(table_df["PCL Ratio"], 2),
    textfont_color='white', textfont_size=10, textposition='inside'
))
fig.update_layout(
    template=TEMPLATE,
    title=dict(
        text="Keywords distribution<br>The corpus contains an <b>equal distribution</b> of vulnerable groups news",
        font_size=14
    ),
    barmode='relative'  # both traces are non-negative, so the bars stack per keyword
)
fig.show()