In [2]:
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 10000
from unicodedata import normalize
from sklearn.preprocessing import MinMaxScaler
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import math

In [3]:
def normalizer(x, min_, max_, a, b):
    """Linearly rescale ``x`` from the range ``[min_, max_]`` to ``[a, b]``.

    Parameters
    ----------
    x : scalar, array, Series or DataFrame
        Value(s) to rescale; anything supporting arithmetic broadcasting.
    min_, max_ : scalar
        Bounds of the source range. ``min_`` maps to ``a`` and ``max_`` to ``b``.
    a, b : scalar
        Bounds of the target range.

    Returns
    -------
    Same kind of object as ``x``, with every value rescaled.

    Notes
    -----
    When ``min_ == max_`` the source range has zero width. Instead of dividing
    by zero (an exception for plain numbers, NaN/inf for pandas objects) the
    width is treated as 1, so ``x == min_`` maps to ``a``. This mirrors how
    scikit-learn's ``MinMaxScaler`` handles constant features.
    """
    span = max_ - min_
    # Only guard the scalar case; array-like bounds keep their original
    # element-wise behaviour.
    if np.isscalar(span) and span == 0:
        span = 1
    return ((b - a) * ((x - min_) / span)) + a

In [4]:
def remover_acentos(text):
    """Return ``text`` with accents removed and reduced to plain ASCII.

    The string is decomposed with Unicode NFKD so that accented letters split
    into a base letter plus combining marks; encoding to ASCII with
    ``'ignore'`` then drops the marks, along with any other character that has
    no ASCII representation.
    """
    decomposed = normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')

In [5]:
def processing_tweet(tweet):
    """Tokenize a tweet and count its non-stop-word tokens.

    Parameters
    ----------
    tweet : str
        Text of a single tweet.

    Returns
    -------
    (collections.Counter, int)
        A counter of the tokens that are not Portuguese stop words, and the
        total number of tokens (stop words included).
    """
    # NOTE(review): word_tokenize is called without a language argument while
    # the stop words are Portuguese - confirm the default tokenizer is intended.
    tokenized = word_tokenize(tweet)
    # A set gives constant-time membership tests; the original scanned the
    # whole stop-word list once per token.
    stop_words = set(stopwords.words('portuguese'))
    keywords = [word for word in tokenized if word not in stop_words]
    return Counter(keywords), len(tokenized)

In [6]:
def clean_dataset(df):
    """Strip links, punctuation and accents from the ``text`` column.

    Steps, applied in order to ``df['text']``:

    1. remove ``https://t.co/...`` short links;
    2. remove the characters ``" * ! : % ? # . / - ( ) @``;
    3. fold accented characters to plain ASCII (the same NFKD -> ASCII
       folding that ``remover_acentos`` performs on a single string).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``text``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``text`` column is overwritten in place.

    Notes
    -----
    Missing values in ``text`` are propagated unchanged by the ``.str``
    accessor instead of raising.

    NOTE(review): letter case is left untouched, while the radicals in
    ``radical_words`` are all lower-case, so capitalised occurrences will not
    match downstream - confirm whether lower-casing is wanted here.
    """
    text = df['text']

    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave links and punctuation in.
    text = text.str.replace(r'https://t\.co/[a-zA-Z0-9]+', "", regex=True)
    text = text.str.replace(r'["*!:%?#./\-()@]', "", regex=True)

    # Vectorised accent folding. The original assigned into the row objects
    # yielded by iterrows(), which may be copies, so the result could be lost.
    text = (
        text.str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('ascii')
    )

    df['text'] = text
    return df

In [64]:
# Word stems ("radicals") that mark a tweet as touching each policy area.
# They are matched as plain substrings of accent-stripped text, so short stems
# can also hit unrelated words (e.g. 'arte' is contained in 'parte').
radical_words = {
    'saude': [
        'saude', 'hosp', 'medic', 'remedio', 'doen', 'enferm',
    ],
    'seguranca': [
        'seguranca', 'polic', 'crim', 'violen', 'preso', 'presidi',
    ],
    'educacao': [
        'educac', 'professor', 'escola', 'ensino', 'alun', 'faculdade',
        'universi',
    ],
    'economia': [
        'economi', 'produ', 'mercado', 'comerci', 'industr', 'desenvolv',
        'terceiriz', 'setor', 'agro', 'agric',
        # NOTE(review): possibly meant 'divida' (debt) - confirm.
        'dividia',
    ],
    'cultura': [
        'cultura', 'turis', 'parque', 'museu', 'music', 'arte', 'cinema',
        'danc',
    ],
    'tecnologia': [
        'tecno', 'inovac', 'ciencia', 'cienti', 'conhecim', 'comput',
        'inform',
    ],
    'meio_ambiente': [
        'ambiente', 'florest', 'preserva', 'natur', 'desmatamento', 'polu',
        'clima', 'ecolog', 'saneamento', 'fauna', 'flora', 'reserva',
    ],
}

In [8]:
# Twitter handles of the candidates; each one is also the prefix of that
# candidate's '<handle>_tweets.csv' dataset file.
candidates_twitter = [
    'alvarodias_',
    'cirogomes',
    'geraldoalckmin',
    'GuilhermeBoulos',
    'Haddad_Fernando',
    'jairbolsonaro',
    'joaoamoedonovo',
    'MarinaSilva',
    'meirelles',
]

In [9]:
# Policy areas scored for every candidate, in reporting order.
areas = [
    'saude',
    'seguranca',
    'educacao',
    'economia',
    'cultura',
    'tecnologia',
    'meio_ambiente',
]

In [10]:
# Maps each Twitter handle listed in `candidates_twitter` to the short label
# used as that candidate's key in the result dictionaries and tables.
candidates_name = {
    'alvarodias_': 'alvaro',
    'cirogomes': 'ciro',
    'geraldoalckmin': 'alckmin',
    'GuilhermeBoulos': 'boulos',
    'Haddad_Fernando': 'haddad',
    'jairbolsonaro': 'bolsonaro',
    'joaoamoedonovo': 'amoedo',
    'MarinaSilva': 'marina',
    'meirelles': 'meirelles'
}

In [61]:
# Single scaler instance targeting the 1-5 range; the cells that call
# fit_transform on it re-fit it to whatever data they pass in.
min_max_scaler = MinMaxScaler(feature_range=(1, 5))

In [11]:
def get_candidate_count(cand):
    """Count keyword hits per policy area in one candidate's tweets.

    Reads ``datasets/<cand>_tweets.csv``, cleans it with ``clean_dataset`` and,
    for every area in ``radical_words``, adds one for each (radical, tweet)
    pair where the radical occurs as a substring of the tweet. A tweet that
    contains several radicals of the same area is therefore counted once per
    radical.

    Parameters
    ----------
    cand : str
        Twitter handle; used as the CSV file-name prefix.

    Returns
    -------
    dict
        Area name -> hit count, with one entry per key of ``radical_words``.
    """
    # Derive the areas from radical_words itself so the two cannot drift
    # apart (a hard-coded dict raised KeyError for any newly added area).
    candidate = dict.fromkeys(radical_words, 0)

    print(f'Candidate: {cand}')
    csv = cand + '_tweets.csv'
    df = pd.read_csv('datasets/' + csv)
    df = clean_dataset(df)

    # Collapse whitespace runs to single tabs once per tweet; the original
    # redid this for every radical even though it does not depend on it.
    combined_tweets = ['\t'.join(tweet.split()) for tweet in df['text']]

    for area, radical_area in radical_words.items():
        for radical in radical_area:
            candidate[area] += sum(radical in tweet for tweet in combined_tweets)

    return candidate

In [12]:
def get_candidate_tfidf(cand):
    """Compute a summed TF-IDF score per policy area for one candidate.

    Reads ``datasets/<cand>_tweets.csv`` and cleans it with ``clean_dataset``.
    Each tweet is treated as a document:

    * term frequency of an area = (occurrences of non-stop-word tokens that
      contain any of the area's radicals) / (total tokens in the tweet); a
      token containing several radicals of one area is counted once per
      radical;
    * inverse document frequency of an area =
      ``log10(number of tweets / number of tweets with tf > 0)``.

    The score of an area is the sum over all tweets of ``tf * idf``.

    Parameters
    ----------
    cand : str
        Twitter handle; used as the CSV file-name prefix.

    Returns
    -------
    dict
        Area name -> summed TF-IDF, one entry per item of ``areas``.
    """
    print(f'Candidate: {cand}')
    csv = cand + '_tweets.csv'
    df = pd.read_csv('datasets/' + csv)
    df = clean_dataset(df)

    total_of_words_per_tweet = []
    count_areas_per_tweet = []  # one {area: raw count} dict per tweet, in order

    for tweet in df['text']:
        count_words, total_of_words = processing_tweet(tweet)
        total_of_words_per_tweet.append(total_of_words)

        tf_areas = dict.fromkeys(areas, 0)
        for area, radical_area in radical_words.items():
            for radical in radical_area:
                hits = sum(value for word, value in count_words.items() if radical in word)
                tf_areas[area] = tf_areas.get(area, 0) + hits

        count_areas_per_tweet.append(tf_areas)

    n_tweets = len(df)
    total = pd.Series(total_of_words_per_tweet, index=df.index, dtype=float)

    tfidf = {}
    for a in areas:
        raw = pd.Series([counts[a] for counts in count_areas_per_tweet],
                        index=df.index, dtype=float)
        # A tweet with zero tokens yields 0/0 = NaN, which the final sum skips.
        tf = raw / total

        # If no tweet mentions the area every tf is zero, so the score is zero
        # whatever the idf; use 0.0 rather than dividing by zero.
        doc_freq = int((tf > 0).sum())
        idf = math.log10(n_tweets / doc_freq) if doc_freq else 0.0

        tfidf[a] = (tf * idf).sum()

    return tfidf

In [65]:
# Keyword-hit counts for every candidate, keyed by the candidate's short label.
# NOTE: `df_cand` is rebound to the TF-IDF results by a later cell, so the
# count cells that read it must run before that one.
df_cand = {}
for c in candidates_twitter:
    df_cand[candidates_name[c]] = get_candidate_count(c)

Candidate: alvarodias_
Candidate: cirogomes
Candidate: geraldoalckmin
Candidate: GuilhermeBoulos
Candidate: Haddad_Fernando
Candidate: jairbolsonaro
Candidate: joaoamoedonovo
Candidate: MarinaSilva
Candidate: meirelles


In [66]:
# The dict of dicts becomes one column per candidate; transpose so that rows
# are candidates and columns are areas.
df_count = pd.DataFrame(data=df_cand).T

In [67]:
df_count

Unnamed: 0,cultura,economia,educacao,meio_ambiente,saude,seguranca,tecnologia
alvaro,54,65,25,26,19,64,15
ciro,39,70,57,12,32,28,30
alckmin,53,127,64,22,77,115,27
boulos,55,99,74,15,47,133,22
haddad,31,91,87,17,14,52,17
bolsonaro,49,64,39,6,21,83,31
amoedo,164,62,45,22,12,48,27
marina,22,38,50,27,21,42,16
meirelles,46,118,33,14,12,43,124


In [68]:
# Smallest and largest count over the whole table (all candidates and areas),
# so that every cell is later normalised on one common scale.
# pandas reductions are used instead of the builtin min()/max(): they skip
# missing values, whereas the builtins give order-dependent results with NaN.
min_value = df_count.min().min()
max_value = df_count.max().max()

In [69]:
# Rescale every count to the 1-5 range using the table-wide min and max, so
# values stay comparable across candidates.
df_count_normalized = normalizer(df_count, min_value, max_value, 1, 5)

In [70]:
df_count_normalized

Unnamed: 0,cultura,economia,educacao,meio_ambiente,saude,seguranca,tecnologia
alvaro,2.21519,2.493671,1.481013,1.506329,1.329114,2.468354,1.227848
ciro,1.835443,2.620253,2.291139,1.151899,1.658228,1.556962,1.607595
alckmin,2.189873,4.063291,2.468354,1.405063,2.797468,3.759494,1.531646
boulos,2.240506,3.35443,2.721519,1.227848,2.037975,4.21519,1.405063
haddad,1.632911,3.151899,3.050633,1.278481,1.202532,2.164557,1.278481
bolsonaro,2.088608,2.468354,1.835443,1.0,1.379747,2.949367,1.632911
amoedo,5.0,2.417722,1.987342,1.405063,1.151899,2.063291,1.531646
marina,1.405063,1.810127,2.113924,1.531646,1.379747,1.911392,1.253165
meirelles,2.012658,3.835443,1.683544,1.202532,1.151899,1.936709,3.987342


In [71]:
# Rebuild the counts WITHOUT transposing (rows = areas, columns = candidates)
# and scale that frame, then transpose the result back to candidates-as-rows.
# NOTE(review): MinMaxScaler is documented to scale each column separately,
# which here would mean each candidate is rescaled on its own - confirm.
# NOTE: this rebinds `df_count` with the opposite orientation to the earlier
# cell, and reads `df_cand`, which must still hold the count results.
df_count = pd.DataFrame(data=df_cand)
df_count_scaled = pd.DataFrame(
    data=min_max_scaler.fit_transform(df_count.values),
    columns=df_count.columns,
    index=df_count.index
).T



In [72]:
df_count_scaled

Unnamed: 0,cultura,economia,educacao,meio_ambiente,saude,seguranca,tecnologia
alvaro,4.12,5.0,1.8,1.88,1.32,4.92,1.0
ciro,2.862069,5.0,4.103448,1.0,2.37931,2.103448,2.241379
alckmin,2.180952,5.0,2.6,1.0,3.095238,4.542857,1.190476
boulos,2.355932,3.847458,3.0,1.0,2.084746,5.0,1.237288
haddad,1.883117,5.0,4.792208,1.155844,1.0,2.974026,1.155844
bolsonaro,3.233766,4.012987,2.714286,1.0,1.779221,5.0,2.298701
amoedo,5.0,2.315789,1.868421,1.263158,1.0,1.947368,1.394737
marina,1.705882,3.588235,5.0,2.294118,1.588235,4.058824,1.0
meirelles,2.214286,4.785714,1.75,1.071429,1.0,2.107143,5.0


In [73]:
# TF-IDF scores for every candidate, keyed by the candidate's short label.
# NOTE: this overwrites the `df_cand` built by the keyword-count cell above.
df_cand = {}
for c in candidates_twitter:
    df_cand[candidates_name[c]] = get_candidate_tfidf(c)

Candidate: alvarodias_
Candidate: cirogomes
Candidate: geraldoalckmin
Candidate: GuilhermeBoulos
Candidate: Haddad_Fernando
Candidate: jairbolsonaro
Candidate: joaoamoedonovo
Candidate: MarinaSilva
Candidate: meirelles


In [74]:
# The dict of dicts becomes one column per candidate; transpose so that rows
# are candidates and columns are areas.
df_tfidf = pd.DataFrame(data=df_cand).T

In [75]:
df_tfidf

Unnamed: 0,cultura,economia,educacao,meio_ambiente,saude,seguranca,tecnologia
alvaro,4.853293,5.315886,2.552918,2.8809,2.312021,4.88956,1.714165
ciro,4.414601,5.412548,4.687122,1.321788,2.937164,2.782521,2.727035
alckmin,3.789179,7.240056,4.733266,2.077605,5.403103,6.763509,2.311402
boulos,4.478458,7.076449,6.14241,1.59556,3.842861,9.600947,2.120682
haddad,3.354658,6.352816,6.894184,2.10199,1.577179,4.113809,1.660293
bolsonaro,4.45852,5.961724,3.633312,0.904391,2.270752,5.810963,4.972414
amoedo,10.156667,5.395471,4.123103,2.354691,1.296814,4.27577,3.952851
marina,1.628511,2.668232,3.320282,2.083892,1.845769,3.154592,1.51867
meirelles,7.199673,14.091262,3.941678,1.955496,1.719706,5.19077,13.063275


In [76]:
# Table-wide extremes of the TF-IDF scores, then rescale every cell to 1-5 on
# that common scale.
# pandas reductions are used instead of the builtin min()/max(): they skip
# missing values, whereas the builtins give order-dependent results with NaN.
# NOTE: `min_value` / `max_value` are the same names the count cells use, so
# the count normalisation must not be re-run after this cell.
min_value = df_tfidf.min().min()
max_value = df_tfidf.max().max()
df_tfidf_normalized = normalizer(df_tfidf, min_value, max_value, 1, 5)

In [77]:
df_tfidf_normalized

Unnamed: 0,cultura,economia,educacao,meio_ambiente,saude,seguranca,tecnologia
alvaro,2.197828,2.338148,1.500051,1.599539,1.426979,2.208829,1.24563
ciro,2.064759,2.367468,2.147423,1.12661,1.616605,1.569697,1.552866
alckmin,1.875049,2.92181,2.161421,1.355873,2.364603,2.777258,1.426791
boulos,2.084129,2.872183,2.588859,1.209654,1.891332,3.637944,1.36894
haddad,1.743244,2.652682,2.816896,1.36327,1.204078,1.973519,1.229289
bolsonaro,2.078081,2.534051,1.827769,1.0,1.414461,2.48832,2.233962
amoedo,3.806511,2.362288,1.976338,1.439923,1.119034,2.022647,1.924696
marina,1.219649,1.535029,1.732817,1.35778,1.28555,1.682558,1.186331
meirelles,2.909561,5.0,1.921306,1.318834,1.247311,2.300196,4.688179


In [78]:
# Rebuild the TF-IDF table WITHOUT transposing (rows = areas, columns =
# candidates) and scale that frame, then transpose the result back to
# candidates-as-rows.
# NOTE(review): MinMaxScaler is documented to scale each column separately,
# which here would mean each candidate is rescaled on its own - confirm.
# NOTE: this rebinds `df_tfidf` with the opposite orientation to the earlier
# cell.
df_tfidf = pd.DataFrame(data=df_cand)
df_tfidf_scaled = pd.DataFrame(
    data= min_max_scaler.fit_transform(df_tfidf.values),
    columns=df_tfidf.columns,
    index=df_tfidf.index
).T

In [79]:
df_tfidf_scaled

Unnamed: 0,cultura,economia,educacao,meio_ambiente,saude,seguranca,tecnologia
alvaro,4.486253,5.0,1.931502,2.295753,1.663967,4.52653,1.0
ciro,4.024194,5.0,4.290668,1.0,2.579536,2.428324,2.37407
alckmin,2.326172,5.0,3.057675,1.0,3.576681,4.630759,1.181152
boulos,2.440479,3.7386,3.271895,1.0,2.122894,5.0,1.262384
haddad,2.337203,4.592727,5.0,1.394817,1.0,2.908315,1.062527
bolsonaro,3.81107,5.0,3.158387,1.0,2.080697,4.880758,4.217524
amoedo,5.0,2.85044,2.275998,1.477605,1.0,2.344924,2.199134
marina,1.243872,3.552295,5.0,2.254924,1.726236,4.632128,1.0
meirelles,2.771796,5.0,1.718413,1.076236,1.0,2.122273,4.667629
