In [None]:
"""
*******************************************************
Title: Python notebook to check the word cloud of downloaded posts and related comments

Organization: DANE
Author: Andrés D. Pérez
Version: 2.0
Modification date: 08/10/2021
Descripción:
    [Sec 1] Libraries
    [Sec 2] Functions
    [Sec 3] Merge all bases
    [Sec 4] Dataset exploration
    [Sec 5] Text closeness
    
    Returns:
        Posts dataset for each profile
*******************************************************
"""

# 1. Libraries

In [1]:
#Data handling
import pandas as pd
import seaborn as sns
import numpy as np
import collections, os
from collections import Counter
import re, unicodedata, spacy

#Progress bar
from tqdm import tqdm

#Directory listing
from os import listdir
from os.path import isfile, join

#Text processing and analysis 
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
stopwords = set(stopwords.words('spanish'))
from sklearn.preprocessing import MinMaxScaler
from difflib import SequenceMatcher

#Date handling
import dateparser
from datetime import datetime
sns.set()
date_fmt = '%b %Y'

#Graphing
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import matplotlib.pyplot as plt

# 2. Functions

In [2]:
def txt_preproc(input_str):
    '''
    Function to preprocess text strings
    
    Based on:
    https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string
    https://stackoverflow.com/questions/5843518/remove-all-special-characters-punctuation-and-spaces-from-string
    
    Receive a text string in string format called `` input_str``
    
    First remove special characters
    Remove accents and unicode characters
    Finally with the method .to_lower () convert the characters to lowercase
    
    Args:
        input_str (string):   Text string to preprocess
    Returns:
        nfkd_form (string):  Preprocessed text string
    '''
    
    nfkd_form = re.sub(r'[?|$|.|!]',r'',input_str) #Remove special chars
    nfkd_form = unicodedata.normalize('NFKD', nfkd_form) #Remove accents
    nfkd_form = u"".join([c for c in nfkd_form if not unicodedata.combining(c)]) #Remove unicode chars
    nfkd_form = (re.sub(r'[^a-zA-Z0-9 ]',r'',nfkd_form)).lower() #Remove missing special chars and convert to lowercase
    
    return nfkd_form

def get_sim_value(in_text, in_query, th=0.5):
    """
    Function to get the indicators
    
    For description please refer to subsection 
    
    Args:
        in_text (string):   Text string to preprocess
        in_query (list):   List of string
        th (float):   Threshold to filter indicators
    Returns:
        nfkd_form (string):  Preprocessed text string
    
    """
    #Get string and query as list
    try:
        in_text = str(in_text)
    except:
        pass
    
    results = []
    #--------------------
    ucp = in_text.split()
    filtered_ucp = [word for word in ucp if word not in stopwords]
    
    w_vect = []
    for w1 in in_query:
        for w2 in filtered_ucp:
            w_vect.append((SequenceMatcher(None, str(w1), str(w2))).ratio())
    
    e_vect = []
    for el in w_vect:
        if el>=th:
            e_vect.append(el)
            
    w_array = np.asarray(w_vect)
    e_array = np.asarray(e_vect)
    #--Get values--
    w_array_sum = w_array.sum()
    w_array_mean = w_array.mean()
    w_array_median = np.median(w_array)
    
    if (len(w_array) == 0):
        w_array_max = 0
        w_array_min = 0
    else:
        w_array_max = w_array.max()
        w_array_min = w_array.min()
        
    e_array_sum = e_array.sum()
    e_array_mean = e_array.mean()
    e_array_median = np.median(e_array)
    
    if (len(e_array) == 0):
        e_array_min = 0
    else:
        e_array_min = e_array.min()
        
    e_array_tot = len(e_array)
    
    results = [w_array_sum, w_array_mean, w_array_median, e_array_sum, e_array_mean, e_array_median,
               w_array_max, w_array_min, e_array_min, e_array_tot]
    
    return results

## 2.1. The ``get_sim_value`` method

This function calculates 10 indicators for each discrimination type based on sintactic similarity.

This indicators are:
- No threshold (sum, mean & median)
- Threshold (sum, mean & median)
- Max term value
- Min term value
- Min term value with threshold
- Total terms with threshold

The "star" of this function is the ``SecuenceMatcher`` class from the ``difflib`` library.

This is an algorithm based on the one published in 1980 by Ratcliff and Obershelp under the name "gestalt pattern matching."

The idea is to find the longest contiguous matching subsequence that contains no "garbage" elements; These "junk" items are the ones that are not interesting in some sense, such as blank lines or blank spaces.

For a detailed explination visit: https://docs.python.org/3/library/difflib.html or https://towardsdatascience.com/sequencematcher-in-python-6b1e6f3915fc


In [3]:
#Lets take a look for its use before continue
word_ref = 'Machine'
word_1 = 'machine'
word_2 = 'machin'
word_3 = 'mach'
word_4 = 'learning'
word_5 = 'robot'

print('Reference word vs Reference word', SequenceMatcher(None, word_ref, word_ref).ratio())
print('Reference word vs word 1', SequenceMatcher(None, word_ref, word_1).ratio())
print('Reference word vs word 2', SequenceMatcher(None, word_ref, word_2).ratio())
print('Reference word vs word 3', SequenceMatcher(None, word_ref, word_3).ratio())
print('Reference word vs word 4', SequenceMatcher(None, word_ref, word_4).ratio())
print('Reference word vs word 5', SequenceMatcher(None, word_ref, word_5).ratio())

Reference word vs Reference word 1.0
Reference word vs word 1 0.8571428571428571
Reference word vs word 2 0.7692307692307693
Reference word vs word 3 0.5454545454545454
Reference word vs word 4 0.4
Reference word vs word 5 0.0


As you noticed, the more similar the second word is with respect to the reference word, a value closer to 1 will be obtained.

Explained this, lets talk about the function using an example. Supose you have the following text: ``My pet fly, has feathers and has a beak``.

You want to check if that pet is land or air one, so you have related terms both for **land** and **air** as follows:

land: ``fur``, ``fangs``, ``walk`` and ``snout``.

air: ``fly``, ``feathers`` and ``beak``.

As you noticed we hava a ``in_text (string)`` and two ``in_query (list)``. 

1. The function takes the text, removes the stop words so the text now works as ``[pet, fly, feathers, beak]``
2. Now takes the first query **land** (which have 4 terms) and compares as follows
    - ``fur`` vs ``pet``
    - ``fur`` vs ``fly``
    - ``fur`` vs ``feathers``
    - ``fur`` vs ``beak``
    - ``fangs`` vs ``pet``
    - ``fangs`` vs ``fly``
    - ``fangs`` vs ``feathers``
    - ``fangs`` vs ``beak``
    - ``walk`` vs ``pet``
    - ``walk`` vs ``fly``
    - ``walk`` vs ``feathers``
    - ``walk`` vs ``beak``
    - ``snout`` vs ``pet``
    - ``snout`` vs ``fly``
    - ``snout`` vs ``feathers``
    - ``snout`` vs ``beak``
3. For each comparison were obtained values (16 values in this case) which are stored in an array (the one termed as ``w_vect``
    $$w_{vect} = [val_{1}, val_{2}, val_{3}, ... , val_{n}]$$
    - Being $n$ the product of $(number-of-query-terms) \times (number-of-terms-in-text-after-removing-stop-words)$
4. The values in this vector are added obtaining the ``w_array_sum`` value
5. The mean over the values of this vector is calculated obtaining ``w_array_mean`` value
6. The meadian over the values of this vector is calculated obtaining ``w_array_median`` value
7. A second vector is obtained termed ``e_vect``.
    - The values in this vector are those values > the threshold value
8. As in the ``w_vect`` sum, mean and median are calculated over ``e_vect`` obtaining ``e_array_sum``, ``e_array_mean`` and ``e_array_median`` value respectively.
9. Finally, ``w_array_max`` value corresponds to the ``max_value`` in ``w_vect``.
10. ``w_array_min`` value corresponds to the ``min_value`` in ``w_vect``.
11. ``e_array_min`` value corresponds to the ``min_value`` in ``e_vect``.
12. ``e_array_tot`` value corresponds to the ``total of elements`` in ``e_vect``.

13. Those values are sorted as a list in ``results`` variable to be added to the list of features.
14. The process is repeated for each ``query_list`` (in this case **land** and **air** or in our project case, the 7 discrimination types).
15. Then are added to the full dataset.

# 3. Merge all bases

In [4]:
#List the available files(posts) to concatenate
mypath = './bases/popular_base/'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

onlyfiles

['base_AlcaldeJorgeMendoza_0.csv',
 'base_AlvaroUribeVel_0.csv',
 'base_AlvaroUribeVel_1.csv',
 'base_AMIGOSDEALEXCAMPOS_0.csv',
 'base_andreshurtadoalcalde_0.csv',
 'base_andreshurtadoalcalde_1.csv',
 'base_andreshurtadoalcalde_2.csv',
 'base_andreshurtadoalcalde_3.csv',
 'base_andreshurtadoalcalde_4.csv',
 'base_andreshurtadoalcalde_5.csv',
 'base_Antonio-Caballero-58050573565_0.csv',
 'base_BancoDavivienda_0.csv',
 'base_Bancolombia_0.csv',
 'base_CanalRCN_0.csv',
 'base_carlospenagos_si_0.csv',
 'base_ClaudiaLopezCL_0.csv',
 'base_DANEColombia_0.csv',
 'base_departamentonacionaldeplaneacion_0.csv',
 'base_DeportivoCaliOficial_0.csv',
 'base_DIANCol_0.csv',
 'base_DQuinteroCalle_0.csv',
 'base_EdgarTovarPedraza_0.csv',
 'base_ejercitocolombia_0.csv',
 'base_elespectadorcom_0.csv',
 'base_eltiempo_0.csv',
 'base_FCFSeleccionColPage_0.csv',
 'base_FiscaliaCol_0.csv',
 'base_gustavopetrourrego_0.csv',
 'base_icetexcolombia_0.csv',
 'base_independiente_santafe_0.csv',
 'base_ivanduquema

In [5]:
len(onlyfiles)

42

In [6]:
#All bases are concatenated
all_base_list = []

for base in tqdm(onlyfiles):
    post_df = pd.read_csv(mypath+base, index_col=0)
    all_base_list.append(post_df)
    
full_df_posts = pd.concat(all_base_list)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:04<00:00,  9.68it/s]


In [7]:
full_df_posts

Unnamed: 0,post_time,post_id,text,user,user_comment
0,2021-03-22 17:36:51,7.561767e+14,Hoy lamento profundamente el fallecimiento del...,Luz Marina Cuevas Valderrama,"Que triste noticia, lamentablemente su falleci..."
1,2021-03-22 17:36:51,7.561767e+14,Hoy lamento profundamente el fallecimiento del...,Ana Josef Aparicio,Descansa en paz mi doctor de los leticianos ve...
2,2021-03-22 17:36:51,7.561767e+14,Hoy lamento profundamente el fallecimiento del...,Jorge Enrique Peña,Descanse en Paz gran PROFESIONAL ...
3,2021-03-22 17:36:51,7.561767e+14,Hoy lamento profundamente el fallecimiento del...,Yolanda Murayari,"Qué mi Dios lo tenga en su Santo Reino,paz en ..."
4,2021-03-22 17:36:51,7.561767e+14,Hoy lamento profundamente el fallecimiento del...,Maruja Quiñonez,Hay luto y trztesa en mi corazon con la partid...
...,...,...,...,...,...
4518,2020-09-26 19:20:13,1.016039e+16,"""Con la pandemia, con las evidencias de inesta...",Ninna Mezza,"No, las universidades se van a quedar sin estu..."
4519,2020-09-26 19:20:13,1.016039e+16,"""Con la pandemia, con las evidencias de inesta...",Diego Fernando Escobar Garcia,Peluca o transplante?
4520,2020-09-26 19:20:13,1.016039e+16,"""Con la pandemia, con las evidencias de inesta...",Sebastián Mazo,... a maquillarnos?
4521,2020-09-26 19:20:13,1.016039e+16,"""Con la pandemia, con las evidencias de inesta...",Andres Almeida,Pensé que estaba mostrando una empanada


In [8]:
#Lets save the complete posts dataset
full_df_posts.to_csv('./bases/all_posts_comments_fb_raw_final.csv', index=False)

# 4. Dataset exploration

In [9]:
#The dataset is loaded
fb_base_posts = pd.read_csv('./bases/all_posts_comments_fb_raw_final.csv')
fb_base_posts['text_proc'] = 'ok'
fb_base_posts['user_comment_proc'] = 'ok'
print('Found: ', len(fb_base_posts), 'records')
fb_base_posts.head()

Found:  886849 records


Unnamed: 0,post_time,post_id,text,user,user_comment,text_proc,user_comment_proc
0,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Luz Marina Cuevas Valderrama,"Que triste noticia, lamentablemente su falleci...",ok,ok
1,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Ana Josef Aparicio,Descansa en paz mi doctor de los leticianos ve...,ok,ok
2,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Jorge Enrique Peña,Descanse en Paz gran PROFESIONAL ...,ok,ok
3,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Yolanda Murayari,"Qué mi Dios lo tenga en su Santo Reino,paz en ...",ok,ok
4,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Maruja Quiñonez,Hay luto y trztesa en mi corazon con la partid...,ok,ok


In [10]:
# Looking for nan in text
if (fb_base_posts['text'].isnull().values.any()):
    tot_nan = fb_base_posts['text'].isnull().sum()
    fb_base_posts['text'].fillna('no_text_in_this_field', inplace=True)
    print('Found: ', tot_nan, 'NaN registers in text column')
    print('Filling NaN with no_text')
else:
    print('No NaN records found...')

Found:  52766 NaN registers in text column
Filling NaN with no_text


In [11]:
# Looking for nan in user_comment
if (fb_base_posts['user_comment'].isnull().values.any()):
    tot_nan = fb_base_posts['user_comment'].isnull().sum()
    fb_base_posts['user_comment'].fillna('no_comment_in_this_field', inplace=True)
    print('Found: ', tot_nan, 'NaN registers in user_comment column')
    print('Filling NaN with no_comment')
else:
    print('No NaN records found...')

Found:  66836 NaN registers in user_comment column
Filling NaN with no_comment


In [12]:
orig_len = len(fb_base_posts)

In [13]:
fb_base_posts = fb_base_posts[fb_base_posts['text']!='no_text_in_this_field']
fb_base_posts = fb_base_posts[fb_base_posts['user_comment']!='no_comment_in_this_field']

In [14]:
fitler_len = len(fb_base_posts)
delta_len = orig_len-fitler_len

print(delta_len, 'regs removed from bse')

115347 regs removed from bse


In [15]:
#Preprocessing text
for row in tqdm(range(len(fb_base_posts['text']))):
    df_text = fb_base_posts['text'].iloc[row]
    df_text = txt_preproc(df_text)
    
    df_comment = fb_base_posts['user_comment'].iloc[row]
    df_comment = txt_preproc(df_comment)
    
    fb_base_posts['text_proc'].iloc[row] = df_text
    fb_base_posts['user_comment_proc'].iloc[row] = df_comment

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 771502/771502 [2:28:41<00:00, 86.47it/s]


In [16]:
fb_base_posts.head()

Unnamed: 0,post_time,post_id,text,user,user_comment,text_proc,user_comment_proc
0,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Luz Marina Cuevas Valderrama,"Que triste noticia, lamentablemente su falleci...",hoy lamento profundamente el fallecimiento del...,que triste noticia lamentablemente su fallecim...
1,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Ana Josef Aparicio,Descansa en paz mi doctor de los leticianos ve...,hoy lamento profundamente el fallecimiento del...,descansa en paz mi doctor de los leticianos ve...
2,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Jorge Enrique Peña,Descanse en Paz gran PROFESIONAL ...,hoy lamento profundamente el fallecimiento del...,descanse en paz gran profesional ...
3,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Yolanda Murayari,"Qué mi Dios lo tenga en su Santo Reino,paz en ...",hoy lamento profundamente el fallecimiento del...,que mi dios lo tenga en su santo reinopaz en s...
4,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Maruja Quiñonez,Hay luto y trztesa en mi corazon con la partid...,hoy lamento profundamente el fallecimiento del...,hay luto y trztesa en mi corazon con la partid...


In [17]:
#Saving preproc FB posts and comments
fb_base_posts.to_csv('./bases/all_posts_comments_fb_preproc_final.csv', index=False)

# 4.1. Wordcloud - Posts

In [3]:
fb_base_posts = pd.read_csv('./bases/all_posts_comments_fb_preproc_final.csv')
fb_base_posts

Unnamed: 0,post_time,post_id,text,user,user_comment,text_proc,user_comment_proc
0,2021-03-22 17:36:51,7.561767e+14,Hoy lamento profundamente el fallecimiento del...,Luz Marina Cuevas Valderrama,"Que triste noticia, lamentablemente su falleci...",hoy lamento profundamente el fallecimiento del...,que triste noticia lamentablemente su fallecim...
1,2021-03-22 17:36:51,7.561767e+14,Hoy lamento profundamente el fallecimiento del...,Ana Josef Aparicio,Descansa en paz mi doctor de los leticianos ve...,hoy lamento profundamente el fallecimiento del...,descansa en paz mi doctor de los leticianos ve...
2,2021-03-22 17:36:51,7.561767e+14,Hoy lamento profundamente el fallecimiento del...,Jorge Enrique Peña,Descanse en Paz gran PROFESIONAL ...,hoy lamento profundamente el fallecimiento del...,descanse en paz gran profesional ...
3,2021-03-22 17:36:51,7.561767e+14,Hoy lamento profundamente el fallecimiento del...,Yolanda Murayari,"Qué mi Dios lo tenga en su Santo Reino,paz en ...",hoy lamento profundamente el fallecimiento del...,que mi dios lo tenga en su santo reinopaz en s...
4,2021-03-22 17:36:51,7.561767e+14,Hoy lamento profundamente el fallecimiento del...,Maruja Quiñonez,Hay luto y trztesa en mi corazon con la partid...,hoy lamento profundamente el fallecimiento del...,hay luto y trztesa en mi corazon con la partid...
...,...,...,...,...,...,...,...
771497,2020-09-26 19:20:13,1.016039e+16,"""Con la pandemia, con las evidencias de inesta...",Ninna Mezza,"No, las universidades se van a quedar sin estu...",con la pandemia con las evidencias de inestabi...,no las universidades se van a quedar sin estud...
771498,2020-09-26 19:20:13,1.016039e+16,"""Con la pandemia, con las evidencias de inesta...",Diego Fernando Escobar Garcia,Peluca o transplante?,con la pandemia con las evidencias de inestabi...,peluca o transplante
771499,2020-09-26 19:20:13,1.016039e+16,"""Con la pandemia, con las evidencias de inesta...",Sebastián Mazo,... a maquillarnos?,con la pandemia con las evidencias de inestabi...,a maquillarnos
771500,2020-09-26 19:20:13,1.016039e+16,"""Con la pandemia, con las evidencias de inesta...",Andres Almeida,Pensé que estaba mostrando una empanada,con la pandemia con las evidencias de inestabi...,pense que estaba mostrando una empanada


In [None]:
#fb_base_posts = pd.read_csv('./bases/all_posts_comments_fb_preproc_final.csv')
title_words = ''
title_list_full, title_list_filtered = [], []

# iterate through the csv file 
for val in fb_base_posts.text_proc: 
    # typecaste each val to string 
    val = str(val) 
    # split the value 
    tokens = val.split()
    
    for tok in tokens:
        title_list_full.append(tok)
        if tok not in stopwords:
            title_list_filtered.append(tok)
        
    title_words += " ".join(tokens)+" "

wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stopwords, 
                min_font_size = 10).generate(title_words) 
  
# plot the WordCloud image                        
plt.figure(figsize = (9, 9), facecolor = None) 
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

In [None]:
sorted_tc_dict = sorted(dict(Counter(title_list_filtered)).items(), key=lambda kv: kv[1])
title_array = np.asarray(sorted_tc_dict)

labels = np.flipud(title_array[:,0])
values = np.flipud(title_array[:,1]).astype(int)

In [None]:
data = [go.Bar(
   x = labels[:100],
   y = values[:100]
)]
fig = go.Figure(data=data)
fig.update_layout(xaxis_tickangle=-45)
fig.update_layout(
    title_text='Top 100 high-frequency words for texts',
    xaxis_title="Words",
    yaxis_title="Frequency",
)
fig.show()

# 4.2. Wordcloud - Comments

In [None]:
# Posts and comments
#fb_base_posts = pd.read_csv('./bases/all_posts_comments_fb_preproc_final.csv')
title_words = ''
title_list_full, title_list_filtered = [], []

# iterate through the csv file 
for val in fb_base_posts.user_comment_proc: 
    # typecaste each val to string 
    val = str(val) 
    # split the value 
    tokens = val.split()
    
    for tok in tokens:
        title_list_full.append(tok)
        if tok not in stopwords:
            title_list_filtered.append(tok)
        
    title_words += " ".join(tokens)+" "

wordcloud = WordCloud(width = 800, height = 800, 
                background_color ='white', 
                stopwords = stopwords, 
                min_font_size = 10).generate(title_words) 
  
# plot the WordCloud image                        
plt.figure(figsize = (9, 9), facecolor = None) 
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

In [None]:
sorted_tc_dict = sorted(dict(Counter(title_list_filtered)).items(), key=lambda kv: kv[1])
title_array = np.asarray(sorted_tc_dict)

labels = np.flipud(title_array[:,0])
values = np.flipud(title_array[:,1]).astype(int)

In [None]:
data = [go.Bar(
   x = labels[:100],
   y = values[:100]
)]
fig = go.Figure(data=data)
fig.update_layout(xaxis_tickangle=-45)
fig.update_layout(
    title_text='Top 100 high-frequency words for texts',
    xaxis_title="Words",
    yaxis_title="Frequency",
)
fig.show()

# 5. Text closeness

In [3]:
#The dataset is loaded
fb_base_posts = pd.read_csv('./bases/all_posts_comments_fb_preproc_final.csv')

#Indicator variables are defined
fb_base_new_cols = ['d_eco_sum', 'd_eco_mean', 'd_eco_median', 'd_eco_sum_th', 'd_eco_mean_th', 'd_eco_median_th',
                    'd_eco_max', 'd_eco_min', 'd_eco_minth', 'd_eco_tot_th',
                    'd_osex_sum', 'd_osex_mean', 'd_osex_median', 'd_osex_sum_th', 'd_osex_mean_th', 'd_osex_median_th',
                    'd_osex_max', 'd_osex_min', 'd_osex_minth', 'd_osex_tot_th',
                    'd_pol_sum', 'd_pol_mean', 'd_pol_median', 'd_pol_sum_th', 'd_pol_mean_th', 'd_pol_median_th',
                    'd_pol_max', 'd_pol_min', 'd_pol_minth', 'd_pol_tot_th',
                    'd_mig_sum', 'd_mig_mean', 'd_mig_median', 'd_mig_sum_th', 'd_mig_mean_th', 'd_mig_median_th',
                    'd_mig_max', 'd_mig_min', 'd_mig_minth', 'd_mig_tot_th',
                    'd_sex_sum', 'd_sex_mean', 'd_sex_median', 'd_sex_sum_th', 'd_sex_mean_th', 'd_sex_median_th',
                    'd_sex_max', 'd_sex_min', 'd_sex_minth', 'd_sex_tot_th',
                    'd_rac_sum', 'd_rac_mean', 'd_rac_median', 'd_rac_sum_th', 'd_rac_mean_th', 'd_rac_median_th',
                    'd_rac_max', 'd_rac_min', 'd_rac_minth', 'd_rac_tot_th',
                    'd_dis_sum', 'd_dis_mean', 'd_dis_median', 'd_dis_sum_th', 'd_dis_mean_th', 'd_dis_median_th',
                    'd_dis_max', 'd_dis_min', 'd_dis_minth', 'd_dis_tot_th']

#Indicator variables are set as 0
for n_col in fb_base_new_cols:
    fb_base_posts[n_col] = 0
    
fb_base_posts.head()

Unnamed: 0,post_time,post_id,text,user,user_comment,text_proc,user_comment_proc,d_eco_sum,d_eco_mean,d_eco_median,...,d_dis_sum,d_dis_mean,d_dis_median,d_dis_sum_th,d_dis_mean_th,d_dis_median_th,d_dis_max,d_dis_min,d_dis_minth,d_dis_tot_th
0,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Luz Marina Cuevas Valderrama,"Que triste noticia, lamentablemente su falleci...",hoy lamento profundamente el fallecimiento del...,que triste noticia lamentablemente su fallecim...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Ana Josef Aparicio,Descansa en paz mi doctor de los leticianos ve...,hoy lamento profundamente el fallecimiento del...,descansa en paz mi doctor de los leticianos ve...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Jorge Enrique Peña,Descanse en Paz gran PROFESIONAL ...,hoy lamento profundamente el fallecimiento del...,descanse en paz gran profesional ...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Yolanda Murayari,"Qué mi Dios lo tenga en su Santo Reino,paz en ...",hoy lamento profundamente el fallecimiento del...,que mi dios lo tenga en su santo reinopaz en s...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2021-03-22 17:36:51,756176700000000.0,Hoy lamento profundamente el fallecimiento del...,Maruja Quiñonez,Hay luto y trztesa en mi corazon con la partid...,hoy lamento profundamente el fallecimiento del...,hay luto y trztesa en mi corazon con la partid...,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
"""
#Previous query terms
d_economic = ["indigente", "mendigo", "mendiga", "vagabundo", "vagabunda", "arrastrado", "arrastrada",
              "nero", "nera", "guiso", "guisa", "pobre", "muerto de hambre", "pordiosero", "pordiosera",
              "desechable"]

d_racial = ["negro", "negra", "indio", "india", "negrito", "indiecito", "niche", "africa", "choco",
            "chocoano", "indigena", "afro", "afrocolombiano", "salvaje"]

d_discap = ["discapacitado", "discapacitada", "minusvalido", "minusvalida", "invalido", "invalido",
            "excepcional", "imbecil", "idiota", "anormal", "impedido", "impedida", "retrasado", "cojo", "coja",
            "autista", "cieguito", "manguito", "loquita"]

d_sexo = ["zorra", "perra", "verdulera", "fulana", "bruja", "mujerzuela", "loba", "prostituta", "puta"]

d_o_sexual = ["marica", "gay", "lgbti", "hermafrodita", "maricon", "lesbiana",
              "travesti", "loquita", "rarito", "desviado", "homosexualismo", "lesbianismo"]

d_politic = ["vandalo", "vandala", "comunista", "mamerto", "mamerta", "facho", "facha",
             "fascista", "tibio", "tibia", "gente de bien", "petrista", "uribista",
             "guerrillero", "guerrillera", "paraco", "paraca", "reinsertados", "desmovilizados"]
"""
#=============================================================================================
#New query terms
d_economic = ['indigent', 'mendig', 'vagabund', 'arrastrad', 'ner', 'guis',
              'pobre', 'muert de hambre', 'pordioser', 'desechable',
              'miserable', 'desvalid', 'mugrient', 'pelafustan', 'limosnero', 
              'necesitad', 'vag', 'recostad']

d_o_sexual = ['marica', 'gay', 'lgbti', 'hermafrodita', 'maricon', 'lesbiana', 'travesti',
              'loquita', 'rarit', 'desviad', 'homosexual', 'lesbianismo', 'rosco', 'roscon',
              'afeminado', 'marimach', 'machorra', 'arepera', 'pirob', 'machista', 
              'feminista', 'monosexual', 'pansexualista', 'pasiv', 'activ', 'transexual',
              'bollera', 'salir del closet', 'coming out', 'cross-dressing', 'drag',
              'hetero', 'queer', 'torcid', 'rosca floja']

d_politic = ['vandal', 'comunista', 'mamert', 'fach', 'fascista', 'tibi',
             'gente de bien', 'petrista', 'uribista', 'guerriller', 'subversiv',
             'parac', 'reinsertados', 'desmovilizados', 'sindicalista', 'socialista',
             'derechista', 'izquierdista', 'god', 'furibista', 'zurd',
             'dictadura', 'agitador', 'primera linea', 'revolucionario', 'delfin',
             'lagart', 'lentej', 'mermelada', 'palanca', 'serrucho', 'voltearepas']

d_migra = ['venec', 'extranjer', 'desplazad', 'cham', 'refugiad', 'expatriad', 'apatrida', 'atrasad']

d_sexo = ['zorra', 'perra', 'verdulera', 'fulana', 'bruja', 'mujerzuela', 'loba',
          'prostituta', 'puta', 'mujercita', 'jovencita', 'ninita', 'señorita',
          'feminazi', 'prostituta', 'perdida', 'vida facil', 'nina', 'nena', 
          'mantenida', 'zunga', 'morronga', 'golfa', 'aventurera', 'callejera',
          'atrevida', 'mujer publica', 'vagabunda']

d_racial = ['negr', 'indi', 'negrit', 'indiecit', 'niche', 'africa', 'choco',
            'chocoano', 'indigena', 'afro', 'afrocolombiano', 'salvaje', 'gitan',
            'costen', 'cachac', 'paisa', 'pastus', 'amarill', 'oriental', 
            'achinad', 'mon', 'gring', 'europe', 'chol', 'cuajada', 'african']

d_discap = ['discapacitad', 'minusvalid', 'invalid', 'excepcional', 'imbecil', 'idiota', 'anormal', 
            'impedid', 'retrasad', 'coj', 'autista', 'cieguito', 'manguito', 'deficiente', 'enferm',
            'incapacitad', 'disminuid', 'inutil', 'especial', 'sordomud', 'mud', 'invidente', 'cieg',
            'cegaton', 'vista', 'retardado', 'mental', 'mongolic', 'tont', 'bobit', 'incapaz',
            'tarad', 'enferm', 'locos', 'demente', 'trastornad', 'depresiv', 'esquizofrenic', 'bipolar',
            'loquit', 'paralitic', 'lisiad', 'tullid', 'minusvalid', 'mutilad', 'moch', 'chuequit',
            'enan', 'gigante', 'gord', 'obes', 'rellen', 'regordeta']

all_dis_list = [d_economic, d_o_sexual, d_politic, d_migra, d_sexo, d_racial, d_discap]

In [5]:
#Indicators calculus
for row in tqdm(range(len(fb_base_posts))):
    user_comment_proc = fb_base_posts['user_comment_proc'].iloc[row]

    all_indic_list = []
    for disc_type in all_dis_list:
        values_disc = get_sim_value(user_comment_proc, disc_type, 0.5)

        for indic_val in values_disc:
            all_indic_list.append(indic_val)
            
    for col_name_index in range(len(fb_base_new_cols)):
        fb_base_posts[fb_base_new_cols[col_name_index]].iloc[row]=all_indic_list[col_name_index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
  e_array_mean = e_array.mean()
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  w_array_mean = w_array.mean()
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 771502/771502 [19:29:18<00:00, 11.00it/s]


In [6]:
#Saving preproc FB posts and comments
fb_base_posts.to_csv('./bases/all_posts_comments_fb_indices_final_v1.csv', index=False)

# Closeness review

In [3]:
fb_base_posts = pd.read_csv('./bases/all_posts_comments_fb_indices.csv')
    
fb_base_posts.head()

Unnamed: 0,post_time,post_id,text,user,user_comment,text_proc,user_comment_proc,d_eco_sum,d_eco_mean,d_eco_median,...,d_rac_median,d_rac_sum_th,d_rac_mean_th,d_rac_median_th,d_dis_sum,d_dis_mean,d_dis_median,d_dis_sum_th,d_dis_mean_th,d_dis_median_th
0,2020-10-03 07:52:06,1.015893e+16,Más aprendices mejor remuneración para ayudar ...,Freddy Vera Perez,"Señor presidente, en este momento tan difícil ...",mas aprendices mejor remuneracion para ayudar ...,senor presidente en este momento tan dificil m...,136.378352,0.236768,0.235294,...,0.222222,17.930609,0.578407,0.545455,394.802505,0.232784,0.222222,32.699834,0.544997,0.533333
1,2020-10-03 07:52:06,1.015893e+16,Más aprendices mejor remuneración para ayudar ...,Francy Elena Ortiz Olaya,"Le debe preocupar y mucho, ver cómo sumergió a...",mas aprendices mejor remuneracion para ayudar ...,le debe preocupar y mucho ver como sumergio a ...,148.749832,0.250421,0.25,...,0.2,13.161381,0.526455,0.5,406.215308,0.232256,0.222222,27.696568,0.532626,0.5
2,2020-10-03 07:52:06,1.015893e+16,Más aprendices mejor remuneración para ayudar ...,Gloria Zapata,Muchos colombianos lo sabemos y siempre será r...,mas aprendices mejor remuneracion para ayudar ...,muchos colombianos lo sabemos y siempre sera r...,75.469591,0.246633,0.25,...,0.181818,11.210876,0.560544,0.5,202.245521,0.224468,0.222222,16.777844,0.541221,0.526316
3,2020-10-03 07:52:06,1.015893e+16,Más aprendices mejor remuneración para ayudar ...,Sirley Beatriz Noriega Herrera,"Por lo menos Ud hace, esté en la situación q ...",mas aprendices mejor remuneracion para ayudar ...,por lo menos ud hace este en la situacion q e...,40.419357,0.187127,0.181818,...,0.181818,6.365079,0.578644,0.571429,113.86995,0.179041,0.181818,5.424242,0.542424,0.5
4,2020-10-03 07:52:06,1.015893e+16,Más aprendices mejor remuneración para ayudar ...,Daniel Valenzuela,Que irónico antes los aprendices ganaban el mí...,mas aprendices mejor remuneracion para ayudar ...,que ironico antes los aprendices ganaban el mi...,111.715179,0.248256,0.235294,...,0.235294,16.386922,0.52861,0.5,304.338962,0.22969,0.222222,25.978436,0.530172,0.5


In [None]:
#Example of Indicators plotting
gen_width = 980
gen_height = 200

x_axis_values = list(fb_base_posts.index)

d_eco_sum_vals_th = list(fb_base_posts['d_eco_sum_th'])
d_osex_sum_vals_th = list(fb_base_posts['d_osex_sum_th'])
d_pol_sum_vals_th = list(fb_base_posts['d_pol_sum_th'])
d_mig_sum_vals_th = list(fb_base_posts['d_mig_sum_th'])
d_sex_sum_vals_th  = list(fb_base_posts['d_sex_sum_th'])
d_rac_sum_vals_th = list(fb_base_posts['d_rac_sum_th'])
d_dis_sum_vals_th = list(fb_base_posts['d_dis_sum_th'])

d_eco_mean_vals_th = list(fb_base_posts['d_eco_mean_th'])
d_osex_mean_vals_th = list(fb_base_posts['d_osex_mean_th'])
d_pol_mean_vals_th = list(fb_base_posts['d_pol_mean_th'])
d_mig_mean_vals_th = list(fb_base_posts['d_mig_mean_th'])
d_sex_mean_vals_th  = list(fb_base_posts['d_sex_mean_th'])
d_rac_mean_vals_th = list(fb_base_posts['d_rac_mean_th'])
d_dis_mean_vals_th = list(fb_base_posts['d_dis_mean_th'])

d_eco_median_vals_th = list(fb_base_posts['d_eco_median_th'])
d_osex_median_vals_th = list(fb_base_posts['d_osex_median_th'])
d_pol_median_vals_th = list(fb_base_posts['d_pol_median_th'])
d_mig_median_vals_th = list(fb_base_posts['d_mig_median_th'])
d_sex_median_vals_th  = list(fb_base_posts['d_sex_median_th'])
d_rac_median_vals_th = list(fb_base_posts['d_rac_median_th'])
d_dis_median_vals_th = list(fb_base_posts['d_dis_median_th'])
#===========================================
fig = make_subplots(rows=1, cols=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_eco_sum_vals_th,
    mode='lines', name='Economic', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_osex_sum_vals_th,
    mode='lines', name='Sex. Orient', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_pol_sum_vals_th,
    mode='lines', name='Politics', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_mig_sum_vals_th,
    mode='lines', name='Migration', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_sex_sum_vals_th,
    mode='lines', name='Sex', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_rac_sum_vals_th,
    mode='lines', name='Racial', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_dis_sum_vals_th,
    mode='lines', name='Disability', line = dict(width = 1)),
    row=1, col=1)

fig.update_layout(
    autosize=False,
    width=gen_width,
    height=gen_height,
    margin=dict(l=0, r=0, t=50, b=0),
    font=dict(size=10),
    title_text="Sum indicator for 7 disc with threshold",
    xaxis_title="Comment",
    yaxis_title="Value",
    legend_title="Discrimination",
)
fig.show()
#-------------------------------------------
fig = make_subplots(rows=1, cols=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_eco_mean_vals_th,
    mode='lines', name='Economic', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_osex_mean_vals_th,
    mode='lines', name='Sex. Orient', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_pol_mean_vals_th,
    mode='lines', name='Politics', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_mig_mean_vals_th,
    mode='lines', name='Migration', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_sex_mean_vals_th,
    mode='lines', name='Sex', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_rac_mean_vals_th,
    mode='lines', name='Racial', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_dis_mean_vals_th,
    mode='lines', name='Disability', line = dict(width = 1)),
    row=1, col=1)

fig.update_layout(
    autosize=False,
    width=gen_width,
    height=gen_height,
    margin=dict(l=0, r=0, t=50, b=0),
    font=dict(size=10),
    title_text="Mean indicator for 7 disc with threshold",
    xaxis_title="Comment",
    yaxis_title="Value",
    legend_title="Discrimination",
)
fig.show()
#------------------------------------------
fig = make_subplots(rows=1, cols=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_eco_median_vals_th,
    mode='lines', name='Economic', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_osex_median_vals_th,
    mode='lines', name='Sex. Orient', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_pol_median_vals_th,
    mode='lines', name='Politics', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_mig_median_vals_th,
    mode='lines', name='Migration', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_sex_median_vals_th,
    mode='lines', name='Sex', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_rac_median_vals_th,
    mode='lines', name='Racial', line = dict(width = 1)),
    row=1, col=1)
fig.append_trace(go.Scatter(
    x=x_axis_values, y=d_dis_median_vals_th,
    mode='lines', name='Disability', line = dict(width = 1)),
    row=1, col=1)

fig.update_layout(
    autosize=False,
    width=gen_width,
    height=gen_height,
    margin=dict(l=0, r=0, t=50, b=0),
    font=dict(size=10),
    title_text="Median indicator for 7 disc with threshold",
    xaxis_title="Comment",
    yaxis_title="Value",
    legend_title="Discrimination",
)

fig.show()