# Data Exploration 

**Dataset**: Don't Patronize Me! (v1.4) — Detecting Patronizing and Condescending Language (PCL)

In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import country_converter as coco
import plotly.io as pio

# NLP
from textblob import TextBlob
import re
import nltk
# Fetch every NLTK resource this notebook reads: "wordnet" backs the
# WordNetLemmatizer used in the text-cleaning step, and "stopwords" backs the
# stop-word list loaded on the next line. Without the second download a fresh
# environment fails with a LookupError.
nltk.download("wordnet")
nltk.download("stopwords")
lst_stopwords = nltk.corpus.stopwords.words("english")


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nestorivanmo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Dataset TSV files, addressed relative to the notebook's working directory.
# NOTE(review): CATEGORIES_DATA_PATH is not read anywhere in the visible cells.
CATEGORIES_DATA_PATH = "../data/dontpatronizeme_v1.4/dontpatronizeme_categories.tsv"
SINGLE_DATA_PATH = "../data/dontpatronizeme_v1.4/dontpatronizeme_pcl.tsv"

# Plotly template passed to every figure's layout.
TEMPLATE='plotly_white'

# Shared hex colour palette for the figures.
BLUE_COLOR = "#37689e"
TURQUOISE_COLOR = "#4bb4c8"
LIGHT_BLUE_COLOR = "#9bbdcf"
YELLOW_COLOR = "#e49a21"
LIGHT_YELLOW_COLOR = "#f2c576"
RED_COLOR = "#da2f20"
LIGHT_RED_COLOR = "#e66f4b"

In [3]:
# Read the paragraph-level TSV: the first four lines are skipped and the
# columns are named explicitly. Rows with any missing value are discarded and
# the article id column is dropped.
raw_columns = ["par_id", "art_id", "keyword", "country_code", "text", "label"]
binary_df = (
    pd.read_csv(SINGLE_DATA_PATH, sep='\t', skiprows=4, names=raw_columns)
    .dropna()
    .drop(columns="art_id")
)

# Collapse the raw label into two classes: values <= 1 become 0, larger values become 1.
binary_df["label"] = (binary_df["label"] > 1).astype("int64")
binary_df.head()

Unnamed: 0,par_id,keyword,country_code,text,label
0,1,hopeless,ph,"We 're living in times of absolute insanity , ...",0
1,2,migrant,gh,"In Libya today , there are countless number of...",0
2,3,immigrant,ie,White House press secretary Sean Spicer said t...,0
3,4,disabled,nz,Council customers only signs would be displaye...,0
4,5,refugee,ca,""" Just like we received migrants fleeing El Sa...",0


In [4]:
# Print the frame's index range, columns, non-null counts and dtypes.
binary_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10468 entries, 0 to 10468
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   par_id        10468 non-null  int64 
 1   keyword       10468 non-null  object
 2   country_code  10468 non-null  object
 3   text          10468 non-null  object
 4   label         10468 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 490.7+ KB


## Label analysis

In [44]:
# Class counts in a fixed [0, 1] order so they always line up with `labels`.
# value_counts() sorts by frequency, so relying on its raw order would silently
# swap the two classes if class 1 ever outnumbered class 0.
labels = ["Sin PCL", "PCL"]
values = binary_df["label"].value_counts().reindex([0, 1], fill_value=0).to_numpy()

binary_colors = [LIGHT_BLUE_COLOR, LIGHT_RED_COLOR]

# Left panel: donut chart of the class shares; right panel: absolute counts.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'xy'}]])
fig.add_trace(go.Pie(
    labels=labels,
    values=values,
    marker_colors = binary_colors, hole=0.4, 
    textfont_color='white', textfont_size=20
), 1, 1)

fig.add_trace(go.Bar(
    x=labels,
    y=values, marker_color=binary_colors,
    text =values, textfont_color='white', textfont_size=15
), 1, 2)

fig.update_layout(
    title = dict(text="PCL vs. Sin PCL<br>De cada 10 textos, <b>solo 1</b> contiene PCL", font_size=25),
    showlegend=False, template=TEMPLATE, width=1000
)
fig.show()

## Keyword analysis

In [74]:
# Keywords ordered by how many paragraphs mention them (most frequent first).
s = binary_df["keyword"].value_counts()
keywords_df = pd.DataFrame({'keyword': s.index, 'frequency': s.values})

# Paragraph counts per keyword and class. Re-indexing the columns pins the
# class order to [0, 1] and fills an absent class with 0, so the counts no
# longer depend on which class is more frequent for a keyword and a keyword
# with a single class no longer raises an IndexError.
kw_label_counts = (
    pd.crosstab(binary_df["keyword"], binary_df["label"])
    .reindex(columns=[0, 1], fill_value=0)
    .loc[s.index]
)
keywords = kw_label_counts.index.tolist()
not_pcl_per_kw = kw_label_counts[0].tolist()
pcl_per_kw = kw_label_counts[1].tolist()

table_df = pd.DataFrame({'Keyword': keywords, "No PCL Frequency": not_pcl_per_kw, "PCL Frequency": pcl_per_kw})
kw_totals = table_df["No PCL Frequency"] + table_df["PCL Frequency"]
table_df["No PCL Ratio"] = table_df["No PCL Frequency"] / kw_totals
table_df["PCL Ratio"] = table_df["PCL Frequency"] / kw_totals
table_df["total"] = kw_totals
table_df = table_df.sort_values(by="total", ascending=False)

# Stacked bars: the "Sin PCL" segment sits at the bottom of each bar and the
# "PCL" segment is stacked on top. Each segment is labelled with its ratio.
fig = go.Figure()
fig.add_trace(go.Bar(
    y = table_df["No PCL Frequency"],
    x = table_df["Keyword"],
    orientation = 'v', marker_color=LIGHT_BLUE_COLOR,
    name = 'Sin PCL', text=np.round(table_df["No PCL Ratio"], 2),
    textfont_color='white', textfont_size=20, textposition='auto'
))

fig.add_trace(go.Bar(
    y = table_df["PCL Frequency"],
    x = table_df["Keyword"],
    orientation = 'v', marker_color=LIGHT_RED_COLOR,
    name = 'PCL', text=np.round(table_df["PCL Ratio"], 2),
    textfont_color='black', textfont_size=20, textposition='outside'
))
fig.update_layout(
    template=TEMPLATE,
    title = dict(
        text="El corpus contiene una <b>distribución uniforme</b> de noticias de grupos vulnerables",
        font_size=25
    ),
    barmode='stack', height=600, showlegend=False
)

# In-figure legend. y=200 falls inside the bottom (blue) segment, so that box
# names the "Sin PCL" class; the texts of the two boxes were previously swapped
# relative to their position and background colour.
fig.add_annotation(
    x = 9, y = 200, 
    text = "Sin PCL", showarrow=False,
    font = dict(size=30, color='white'), 
    bordercolor='white', borderwidth=2,
    bgcolor=LIGHT_BLUE_COLOR
)

# y=1000 falls inside the stacked (red) segment of the second bar, i.e. "PCL".
fig.add_annotation(
    x = 1, y = 1000, 
    text = "PCL", showarrow=False,
    font = dict(size=20, color='white'), 
    bordercolor='white', borderwidth=2,
    bgcolor=LIGHT_RED_COLOR
)

fig.show()

In [69]:
# `total` is already set where table_df is built in the keyword-analysis cell;
# recomputing it here yields the same values and is harmless.
table_df["total"] = table_df["No PCL Frequency"] + table_df["PCL Frequency"]
table_df

Unnamed: 0,Keyword,No PCL Frequency,PCL Frequency,No PCL Ratio,PCL Ratio,total
0,migrant,1052,36,0.966912,0.033088,1088
1,in-need,906,176,0.837338,0.162662,1082
2,vulnerable,1000,80,0.925926,0.074074,1080
3,homeless,899,178,0.834726,0.165274,1077
4,women,1018,52,0.951402,0.048598,1070
5,refugee,982,86,0.919476,0.080524,1068
6,immigrant,1031,30,0.971725,0.028275,1061
7,disabled,947,81,0.921206,0.078794,1028
8,hopeless,881,124,0.876617,0.123383,1005
9,poor-families,759,150,0.834983,0.165017,909


## Country analysis

In [76]:
# Lookup from two-letter country code to continent name (written in Spanish,
# because the continent names are used as Sankey node labels).
continent_countries = {
    'za': 'África',
    'my': 'Asia',
    'ph': 'Asia',
    'pk': 'Asia',
    'us': 'América',
    'au': 'Oceanía',
    'gb': 'Europa',
    'ke': 'África',
    'ng': 'África',
    'ie': 'Europa',
    'sg': 'Asia',
    'in': 'Asia',
    'ca': 'América',
    'gh': 'África',
    'nz': 'Oceanía',
    'bd': 'Asia',
    'lk': 'Asia',
    'hk': 'Asia',
    'jm': 'América',
    'tz': 'África',
}
# Order matters: these become the first Sankey nodes, and the node colour list
# built for the Sankey figure highlights position 2 (Asia).
continents = ['América', 'África', 'Asia', 'Europa', 'Oceanía']

In [77]:
# Country codes ordered by number of paragraphs (most frequent first).
country_counts = binary_df["country_code"].value_counts()
countries_short = country_counts.index
num_countries = countries_short.shape[0]
countries = [coco.convert(names=[code], to='short_name') for code in countries_short]

# Only the NUM_COUNTRIES most frequent countries feed the Sankey diagram.
NUM_COUNTRIES = 20
top_countries = country_counts[:NUM_COUNTRIES].index.to_numpy()

keywords = np.array(keywords)
pcl_labels = np.array(["No PCL", "PCL"])

# Sankey node labels: continents first, then keywords, then the two classes.
# labels_dict maps every label to its node position.
labels = [*continents, *keywords, *pcl_labels]
labels_dict = {node_label: position for position, node_label in enumerate(labels)}

In [78]:
# Paragraph counts for every (country, keyword, label) combination present in the data.
countries_df = binary_df[["country_code", "keyword", "label"]]
countries_series = countries_df.groupby(by=["country_code", "keyword", "label"])["keyword"].count()

# One row per (country, keyword) and one column per class, with absent classes
# filled with 0. Looking counts up here (instead of indexing the raw series)
# avoids a KeyError when a pair has no class-0 paragraphs.
label_counts = (
    countries_series.unstack("label", fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)

# Parallel lists describing the Sankey links (source node, target node, size, colour).
sources = []
targets = []
values = []
colors = []

pcl_label = labels_dict["PCL"]
no_pcl_label = labels_dict["No PCL"]

# Total paragraphs per (country, keyword); later rolled up to continents.
countries_dict = dict()

for country in top_countries:
    for kw in keywords:
        kw_label = labels_dict[kw]
        key = (country, kw)

        # A pair that never occurs in the data contributes zero-sized links.
        if key in label_counts.index:
            no_pcl, pcl = (int(count) for count in label_counts.loc[key])
        else:
            no_pcl, pcl = 0, 0

        countries_dict[key] = countries_dict.get(key, 0) + no_pcl + pcl

        # keyword -> "No PCL" link
        sources.append(kw_label)
        targets.append(no_pcl_label)
        values.append(no_pcl)
        colors.append("#d7ecf7")

        # keyword -> "PCL" link
        sources.append(kw_label)
        targets.append(pcl_label)
        values.append(pcl)
        colors.append("#f9a990")

In [79]:
# Roll the per-(country, keyword) totals up to (continent, keyword) totals.
continents_dict = {}
for (country, keyword), paragraph_total in countries_dict.items():
    key = (continent_countries[country], keyword)
    continents_dict[key] = continents_dict.get(key, 0) + paragraph_total

In [80]:
# Append one continent -> keyword link per pair; Asia's links get their own colour.
for (continent, keyword), paragraph_total in continents_dict.items():
    sources.append(labels_dict[continent])
    targets.append(labels_dict[keyword])
    values.append(paragraph_total)
    colors.append("#f7d7a3" if continent == "Asia" else "#d1d1d1")

In [92]:
# The last two node labels are the two classes; rename them for display.
labels = list(labels)
labels[-2:] = ["Sin PCL", "PCL"]

# Node colours follow the node order: five continents (Asia highlighted),
# then one grey node per keyword, then the two class colours.
node_colors = ["gray", "gray", YELLOW_COLOR, "gray", "gray"] + ["gray"]*len(keywords) + binary_colors

sankey_trace = go.Sankey(
    node=dict(
        label=labels,
        color=node_colors,
        pad=40, thickness=50,
        line=dict(color="grey", width=0.5),
    ),
    link=dict(source=sources, target=targets, value=values, color=colors),
)

fig = go.Figure(data=[sankey_trace])
fig.update_layout(
    title=dict(
        text="<b>Asia</b> es el continente que más contribuye <br>noticias sobre grupos vulnerables",
        font_size=25
    ),
    template=TEMPLATE,
    height=800, width=1000
)
fig.update_traces(textfont_size=20, orientation='h')
fig.show()

## Text cleaning

In [14]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    """Normalise a piece of text into a cleaned, space-separated string.

    Steps, in order: lowercase and strip the input, delete every character
    that is neither a word character nor whitespace, split on whitespace,
    optionally drop stop words, optionally stem, optionally lemmatise, and
    join the remaining tokens with single spaces.

    Parameters
    ----------
    text : object
        Input text; it is passed through ``str()`` first, so non-string
        values are accepted.
    flg_stemm : bool
        When true, apply NLTK's Porter stemmer to every token.
    flg_lemm : bool
        When true, apply NLTK's WordNet lemmatiser to every token
        (after stemming, if both flags are set).
    lst_stopwords : iterable of str or None
        Tokens to remove. ``None`` disables stop-word removal.

    Returns
    -------
    str
        The cleaned text.
    """
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords
    if lst_stopwords is not None:
        # A set gives constant-time membership checks instead of scanning the
        # whole stop-word list for every token.
        stopwords = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [15]:
# Clean every paragraph: stop words removed, lemmatised, no stemming.
binary_df["text"] = binary_df["text"].apply(
    utils_preprocess_text,
    flg_stemm=False, flg_lemm=True, lst_stopwords=lst_stopwords,
)

## Sentiment Analysis

In [16]:
# Work on an explicit copy so the new columns are added to text_df itself
# rather than to a slice of binary_df (which triggers pandas'
# SettingWithCopyWarning).
text_df = binary_df[["text", "label", "keyword"]].copy()

# Analyse each paragraph once and read both scores from the same result,
# instead of building a second TextBlob per row.
paragraph_sentiments = [TextBlob(paragraph).sentiment for paragraph in text_df["text"]]
text_df["sentiment"] = [result.polarity for result in paragraph_sentiments]
text_df["subjectivity"] = [result.subjectivity for result in paragraph_sentiments]
text_df.head()

Unnamed: 0,text,label,keyword,sentiment,subjectivity
0,living time absolute insanity pretty sure peop...,0,hopeless,0.067361,0.467824
1,libya today countless number ghanaian nigerian...,0,migrant,0.0,0.75
2,white house press secretary sean spicer said f...,0,immigrant,-0.133333,0.2
3,council customer sign would displayed two spac...,0,disabled,-0.2,0.3
4,like received migrant fleeing el salvador guat...,0,refugee,0.1,0.1


In [20]:
def key_from_sentiment(sentiment):
    """Map a polarity score to its bucket name.

    Returns "neutral" for exactly 0, "negativo" for values below 0 and
    "positivo" for everything else.
    """
    if sentiment == 0:
        return "neutral"
    return "negativo" if sentiment < 0 else "positivo"

# Bucket every paragraph's polarity, then count the buckets separately for
# paragraphs labelled 0 ("no PCL") and for all other paragraphs ("PCL").
# The key order (neutral, positivo, negativo) is the bar order used later.
bucket_order = ("neutral", "positivo", "negativo")
sentiment_buckets = text_df["sentiment"].map(key_from_sentiment)
is_no_pcl = text_df["label"] == 0

pcl_bucket_counts = sentiment_buckets[~is_no_pcl].value_counts()
no_pcl_bucket_counts = sentiment_buckets[is_no_pcl].value_counts()

sentiment_pcl_dict = {bucket: int(pcl_bucket_counts.get(bucket, 0)) for bucket in bucket_order}
sentiment_no_pcl_dict = {bucket: int(no_pcl_bucket_counts.get(bucket, 0)) for bucket in bucket_order}

In [98]:
# Left panel: smoothed polarity curves for both classes.
sentiment_by_class = [
    text_df.loc[text_df["label"] == class_id, "sentiment"] for class_id in (0, 1)
]
fig = ff.create_distplot(
    sentiment_by_class,
    group_labels=["Sin PCL", "PCL"], colors=[LIGHT_BLUE_COLOR, LIGHT_RED_COLOR],
    bin_size=0.2, show_hist=False, show_rug=False
)

# Right panel: paragraph counts per sentiment bucket for the "Sin PCL" class,
# each bar labelled with its share of that class (rounded to 3 decimals).
no_pcl_buckets = list(sentiment_no_pcl_dict.keys())
no_pcl_counts = list(sentiment_no_pcl_dict.values())
no_pcl_shares = np.round(np.array(no_pcl_counts) / np.sum(np.array(no_pcl_counts)), 3)

trace2 = go.Bar(
    x=no_pcl_buckets,
    y=no_pcl_counts,
    name="Sin PCL", marker_color=LIGHT_BLUE_COLOR,
    text=no_pcl_shares,
    textposition='inside', textfont_color="white", textfont_size=20,
    xaxis='x2', yaxis='y2'
)
fig.add_traces([trace2])

# Create the second pair of axes and place the two panels side by side.
fig.layout.xaxis2 = {}
fig.layout.yaxis2 = {}
fig.layout.xaxis.domain = [0, .5]
fig.layout.xaxis2.domain = [0.6, 1.]
fig.layout.yaxis2.anchor = 'x2'

fig.update_layout(
    title=dict(text="Distribución en porcentaje de párrafos sin PCL", font_size=27),
    template=TEMPLATE, showlegend=False
)
fig.show()

In [22]:
# Quick look at the raw "Sin PCL" counts, in dict order: neutral, positivo, negativo.
np.array([list(sentiment_no_pcl_dict.values())])

array([[1690, 3966, 3819]])

In [99]:
# Left panel: smoothed polarity curves for both classes. The first group is
# labelled "Sin PCL" to match the naming used by every other figure in this
# notebook (it previously read "No PCL" here only).
fig = ff.create_distplot(
    [text_df[text_df["label"] == 0]["sentiment"], text_df[text_df["label"] == 1]["sentiment"]],
    group_labels=["Sin PCL", "PCL"], bin_size=0.2, show_rug=False,
    show_hist=False, colors=[LIGHT_BLUE_COLOR, LIGHT_RED_COLOR]
)

# Right panel: paragraph counts per sentiment bucket for the "PCL" class,
# each bar labelled with its share of that class (rounded to 3 decimals).
trace2 = go.Bar(
    x = list(sentiment_pcl_dict.keys()), 
    y = list(sentiment_pcl_dict.values()),
    marker_color=LIGHT_RED_COLOR, name="PCL",
    text = np.round(np.array(list(sentiment_pcl_dict.values())) / np.sum(np.array(list(sentiment_pcl_dict.values()))), 3),
    textfont_color="white", textposition='inside',
    textfont_size=20,
    xaxis='x2', yaxis='y2'
)

fig.add_traces([trace2])

# Create the second pair of axes that the bar trace refers to.
fig['layout']['xaxis2'] = {}
fig['layout']['yaxis2'] = {}

# Place the two panels side by side: curves on the left half, bars on the right.
fig.layout.xaxis.update({'domain': [0, .5]})
fig.layout.xaxis2.update({'domain': [0.6, 1.]})
fig.layout.yaxis2.update({'anchor': 'x2'})

fig.update_layout(
    template=TEMPLATE, title=dict(text="Distribución en porcentaje de párrafos con PCL", font_size=27),
    showlegend=False
)
fig.show()