# Analyzer

> This class contains the main methods to make a polarization analysis

In [8]:
#| default_exp Analyzer

In [7]:
#| hide
from nbdev.showdoc import *

In [1]:
#| export
import pandas as pd
from unidecode import unidecode
import regex as re
from mexican_polarization.Loader import Loader 
from wordcloud import WordCloud
from nltk.corpus import stopwords
import matplotlib.pyplot as plt





In [13]:
#| export
class Analyzer:
    def __init__(self,
                 loader:Loader,
                 ):
        """Constructor of the Analyzer class"""
        if loader.processed_bool == False:
            self.methods_available = False
        else:
            self.methods_available = True
        self.loader = loader

    def moral_words_count(self, message_col: str) -> pd.DataFrame:
        """Returns a dataframe with the count of virtue and vice words in each message in the message_col column"""
        if not self.methods_available:
            print('You need to process the data first')
            return None
        
        vice_d = self.loader.vice_dict
        virtue_d = self.loader.virtue_dict
        df = self.loader.processed

        vice = set(vice_d.keys())
        virtue = set(virtue_d.keys())

        def count_words(text, set_of_words):
            words_in_text = text.lower().split()
            word_count = 0
            word_list = []
            for word in words_in_text:
                if word in set_of_words:
                    word_count += 1
                    word_list.append(word)
            return word_count, word_list

        # Agregar las columnas 'Vice words' y 'Virtue words' utilizando apply y la función count_words
        df[['Vice words count', 'Vice words']] = df[message_col].apply(lambda x: pd.Series(count_words(str(x), vice)))
        df[['Virtue words count', 'Virtue words']] = df[message_col].apply(lambda x: pd.Series(count_words(str(x), virtue)))

        df.sort_values(by=['Vice words count', 'Virtue words count'], ascending=False)

        df['Total words'] = df[message_col].apply(lambda x: len(x.split()))

        df = df[df['Total words'] != 0] # Remove rows with empty messages
        df = df[df['Total words'] > 3] # Remove rows with less than 3 words 

        df['Sum vice and virtue'] = df['Vice words count'] + df['Virtue words count']

        return df
    
    def get_moral_df(self,
                     df:pd.DataFrame): #DataFrame with the moral words count
        """Returns a dataframe with moral words in the message, and gives a series of ratios."""

        df_moral = df[(df ['Vice words count'] > 0) | (df['Virtue words count'] > 0)]
        df_moral['VVRate'] = df.apply(lambda row: (row['Virtue words count'] + row['Vice words count']) / row['Total words'], axis=1)
        df_moral['Vice Rate'] = df.apply(lambda row: row['Vice words count'] / row['Total words'], axis=1)
        df_moral['Virtue Rate'] = df.apply(lambda row: row['Virtue words count'] / row['Total words'], axis=1)

        df_moral['Category'] = df_moral.apply(lambda row: 'Vice' if row['Vice Rate'] > row['Virtue Rate'] else 'Virtue', axis=1)
        df_moral['Original Message'] = self.loader.original['Message'].loc[self.loader.original.index]

        df_moral = df_moral.round({'VVRate': 2, 'Vice Rate': 2, 'Virtue Rate': 2})
        df_moral = df_moral.sort_values(by=['VVRate'], ascending=False)
    
        return df_moral
    
    def moral_words_wc(self,
                       message_col:str, #Column with the messages
                       media_names:str = None, #Column with media names
                        media_list:list = None): 

        """Creates a wordcloud with the virtue and vice words in the message_col column"""
        if self.methods_available == False:
            print('You need to process the data first')
            return None
        df = self.loader.processed
        stop_words_es = set(stopwords.words('spanish'))
        morales = set()
        morales.update(self.loader.vice_dict.keys())
        morales.update(self.loader.virtue_dict.keys())

        if media_list:
            page_names = media_list
        else:   
            page_names = df[media_names].unique()


        for name in page_names:
            page_df = df[df[media_names] == name]
            
            # Remove rows with empty messages
            page_df = page_df[page_df[media_names].apply(lambda x: isinstance(x, str))]
            
            # Tokenize the messages and remove stop words
            messages = ' '.join([word for sentence in page_df[message_col] for word in sentence.split() if word.lower() not in stop_words_es])
            
            # Filter only the words that are in the moral words set
            moral_words = ' '.join([word for word in messages.split() if word.lower() in morales])

            # Create wordclouds for general words and moral words
            wordcloud_general = WordCloud(width=800, height=400).generate(messages)
            wordcloud_morales = WordCloud(width=800, height=400).generate(moral_words)

            # Show wordcloud for general words
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud_general, interpolation='bilinear')
            plt.title(f'Most Common Words in {name}')
            plt.axis("off")
            plt.show()

            # Show wordcloud for moral words
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud_morales, interpolation='bilinear')
            plt.title(f'Most Common Moral Words in {name}')
            plt.axis("off")
            plt.show()


In [None]:
show_doc(Analyzer.moral_words_count)

---

### Analyzer.moral_words_count

>      Analyzer.moral_words_count (message_col:str)

Returns a dataframe with the count of virtue and vice words in each message in the message_col column

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| message_col | str | Column with the messages |
| **Returns** | **DataFrame** | **DataFrame with extra counting columns** |

In [None]:
show_doc(Analyzer.moral_words_wc)

---

### Analyzer.moral_words_wc

>      Analyzer.moral_words_wc (message_col:str, media_names:str)

Creates a wordcloud with the virtue and vice words in the message_col column

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| message_col | str | Column with the messages |
| media_names | str | Column with media names |

In [19]:
loader = Loader(csv="/Users/gerardoguerrero/mexican-polarization/nbs/datalake/2024-01-28-13-18-08-CST-search-csv-export-FBpages-trenmaya.csv",vice_dict='vice2.txt',virtue_dict='virtue2.txt')

In [20]:
loader.process_csv(text_cols=['Page Name', 'Message'], num_cols=['Total Interactions'], date_cols=['Post Created'])

  df = pd.read_csv(self.csv)


In [21]:
analyzer = Analyzer(loader)
df1 = analyzer.moral_words_count('Message')


In [25]:
df1.head(100)

Unnamed: 0,Page Name,Message,Total Interactions,Post Created,Vice words count,Vice words,Virtue words count,Virtue words,Total words,Sum vice and virtue
0,nmas,durante este ano la sedena continuo la constru...,18.0,2023-12-31 16:02:53 CST,0,[],0,[],26,0
1,entre lineas,el presidente de colombia gustavo petro recono...,18.0,2023-12-31 16:02:34 CST,1,[neoliberales],0,[],39,1
2,la jornada maya,el trenmaya avanzara hacia la excelencia en el...,44.0,2023-12-31 16:02:23 CST,1,[aseguro],1,[servicio],15,2
5,politico mx,cuanto cuesta viajar en el tren maya de cancun...,433.0,2023-12-31 16:01:51 CST,0,[],0,[],12,0
6,la prensamx,nacional estrena amlo segundo tramo del tren m...,1.0,2023-12-31 16:01:36 CST,0,[],1,[nacional],32,1
...,...,...,...,...,...,...,...,...,...,...
112,el diario de ciudad victoria noticias,trenmaya se inauguro el tramo cancunpalenque a...,13.0,2023-12-31 14:40:08 CST,1,[proteccion],1,[seguridad],70,2
113,vanguardia,mexico trenmaya a las 630 de la madrugada amlo...,15.0,2023-12-31 14:40:00 CST,0,[],1,[primer],36,1
117,tvtodocom,amlo inaugura tramo de cancun a palenque en tr...,6.0,2023-12-31 14:36:47 CST,0,[],1,[nacional],26,1
120,sin filtros,sinfiltros amlo inaugura tramo de cancun a pal...,14.0,2023-12-31 14:36:12 CST,0,[],1,[nacional],27,1


In [8]:
df_original = df_original[['Page Name', 'Message', 'Total Interactions', 'Post Created']]

In [26]:
df2 = analyzer.get_moral_df(df1)
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_moral['VVRate'] = df.apply(lambda row: (row['Virtue words count'] + row['Vice words count']) / row['Total words'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_moral['Vice Rate'] = df.apply(lambda row: row['Vice words count'] / row['Total words'], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-v

Unnamed: 0,Page Name,Message,Total Interactions,Post Created,Vice words count,Vice words,Virtue words count,Virtue words,Total words,Sum vice and virtue,VVRate,Vice Rate,Virtue Rate,Category,Original Message
34823,defendemos mexico,el gran mentiroso estafador,88.0,2023-11-02 20:33:35 CST,2,"[mentiroso, estafador]",1,[gran],4,3,0.75,0.50,0.25,Vice,El gran mentiroso estafador
59823,que tranza la garbanza,recuperando lo perdido vendido robado,0.0,2023-08-10 17:53:22 CDT,3,"[perdido, vendido, robado]",0,[],5,3,0.60,0.60,0.00,Vice,"Recuperando lo perdido, vendido, robado......"
122468,por mexico ni amlo ni morena fanpage,el gran cinico niamlonimorena,250.0,2023-02-23 03:04:26 CST,1,[cinico],1,[gran],4,2,0.50,0.25,0.25,Virtue,🔴 El gran cínico... #niamlonimorena
125102,roberto rojo,la gran mentira del tren mata,7.0,2023-02-14 04:57:23 CST,2,"[mentira, mata]",1,[gran],6,3,0.50,0.33,0.17,Vice,La gran mentira del Tren Mata.
95491,el show de la georgia,listo un verdadero proyecto,3.0,2023-05-13 17:32:21 CDT,0,[],2,"[listo, verdadero]",4,2,0.50,0.00,0.50,Virtue,Listo 🆙👌🏽un verdadero proyecto ✍️
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102430,semanario linea roja,un modelo turistico sustentable el reto hacia ...,3.0,2023-04-24 23:31:18 CDT,0,[],1,[obligatorio],288,1,0.00,0.00,0.00,Virtue,"Un modelo turístico sustentable, el reto hacia..."
130935,amlo y la mananera de hoy,el presidente andres manuel lopez obrador afir...,41.0,2023-01-24 16:30:12 CST,0,[],1,[nacional],233,1,0.00,0.00,0.00,Virtue,El presidente Andrés Manuel López Obrador afir...
132743,bolillo noticias,vacantes tren maya requisitos para ocupar un p...,1.0,2023-01-20 12:23:26 CST,0,[],1,[mantener],282,1,0.00,0.00,0.00,Virtue,Vacantes Tren Maya: Requisitos para ocupar un ...
92677,reporterosmx,asi seran las estaciones y paraderos del tramo...,8.0,2023-05-20 15:21:04 CDT,0,[],1,[gran],211,1,0.00,0.00,0.00,Virtue,ASÍ SERÁN LAS ESTACIONES Y PARADEROS DEL TRAMO...


In [28]:
df_vice = df2[df2['Category'] == 'Vice']
df_vice2 = df_vice[['Page Name','Original Message','Total Interactions','Post Created','VVRate','Vice Rate','Virtue Rate','Category']]
df_vice2


Unnamed: 0,Page Name,Original Message,Total Interactions,Post Created,VVRate,Vice Rate,Virtue Rate,Category
34823,defendemos mexico,El gran mentiroso estafador,88.0,2023-11-02 20:33:35 CST,0.75,0.50,0.25,Vice
59823,que tranza la garbanza,"Recuperando lo perdido, vendido, robado......",0.0,2023-08-10 17:53:22 CDT,0.60,0.60,0.00,Vice
125102,roberto rojo,La gran mentira del Tren Mata.,7.0,2023-02-14 04:57:23 CST,0.50,0.33,0.17,Vice
32486,luis alonso garcia hernandez,ES UN CINICO MENTIROSO AMLO,0.0,2023-11-08 11:33:54 CST,0.40,0.40,0.00,Vice
31817,antipeje,"No son iguales, son peores...",120.0,2023-11-10 12:44:42 CST,0.40,0.40,0.00,Vice
...,...,...,...,...,...,...,...,...
126617,linea de fuego quintana roo,DETECTAN HASTA 80 CASOS DE LA ENFERMEDAD DE LA...,22.0,2023-02-09 09:32:51 CST,0.00,0.00,0.00,Vice
8374,novedades yucatan,😋DELICIOSA COMIDA YUCATECA OFRECERÁ RESTAURANT...,64.0,2023-12-17 20:00:01 CST,0.00,0.00,0.00,Vice
126652,taxi vigia,Detectan hasta 80 casos de la enfermedad de la...,38.0,2023-02-09 06:56:05 CST,0.00,0.00,0.00,Vice
43980,tnt news nuevo laredo,AVANCES DEL TREN MAYA REPORTADOS DURANTE LA CO...,0.0,2023-09-11 16:00:59 CDT,0.00,0.00,0.00,Vice


In [None]:
df_vice3 = df_vice2.head(500)
df_vice3

In [96]:
df_vice3.to_csv('vice_tagged_trenmaya.csv',index=False)

In [30]:
df_virtue = df2[df2['Category'] == 'Virtue']
df_virtue.sort_values(by=['Virtue Rate'], ascending=False, inplace=True)
df_virtue2 = df_virtue[['Page Name','Original Message','Total Interactions','Post Created','VVRate','Vice Rate','Virtue Rate','Category']]
df_virtue2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_virtue.sort_values(by=['Virtue Rate'], ascending=False, inplace=True)


Unnamed: 0,Page Name,Original Message,Total Interactions,Post Created,VVRate,Vice Rate,Virtue Rate,Category
95491,el show de la georgia,Listo 🆙👌🏽un verdadero proyecto ✍️,3.0,2023-05-13 17:32:21 CDT,0.50,0.0,0.50,Virtue
47226,politicamente incorrectos,Me parece excelente por seguridad 👌,4.0,2023-09-04 17:57:06 CDT,0.40,0.0,0.40,Virtue
56046,reventours san luis potosi,Que bonito es lo bonito,3.0,2023-08-22 21:34:51 CDT,0.40,0.0,0.40,Virtue
26425,morena new york comite 1,Muy buenos días mi comandante supremo 🫡🫡🫡,48.0,2023-11-29 07:22:52 CST,0.33,0.0,0.33,Virtue
44725,campeche en linea,HISTÓRICO PRIMER VIAJE DEL TREN MAYA🤩🚝👏#Noticias,,2023-09-09 11:21:29 CDT,0.33,0.0,0.33,Virtue
...,...,...,...,...,...,...,...,...
78195,diario lider online,*Bedolla revisa proyecto para activar vuelos d...,0.0,2023-06-30 13:45:55 CDT,0.01,0.0,0.00,Virtue
56168,goyo yupit digital,APRUEBAN DONACIÓN DE TERRENO PARA LA CONSTRUCC...,24.0,2023-08-22 13:10:41 CDT,0.01,0.0,0.00,Virtue
107025,campeche daily news,AVANZA PROGRAMA SEMBRANDO VIDA EN RUTA DEL TRE...,2.0,2023-04-10 15:41:45 CDT,0.01,0.0,0.00,Virtue
78196,periodico la region,*Bedolla revisa proyecto para activar vuelos d...,3.0,2023-06-30 13:43:02 CDT,0.01,0.0,0.00,Virtue


In [None]:
df_virtue3 = df_virtue2.head(500)
df_virtue3