# Create Target Feature

In [6]:
import PyPDF2
import unidecode
import pandas as pd
from collections import Counter
import csv
import os
import os.path
import random
import re
import datetime

### Create lists with all paths to all files

In [7]:
folders = ["ciclo_3",
           "ciclo_4",
           "ciclo_5",
           "edicoes_anteriores/sorteio_34",
           "edicoes_anteriores/sorteio_35",
           "edicoes_anteriores/sorteio_36",
           "edicoes_anteriores/sorteio_37",
           "edicoes_anteriores/sorteio_38",
           "edicoes_anteriores/sorteio_39",
           "edicoes_anteriores/sorteio_40"]

seq_folders = []
file_names = []
file_names_and_paths = []

for folder in folders:
    directory = '../programa_de_fiscalizacao_em_entes_federativos/' + folder
    
    number_of_files = len([name for name in os.listdir(directory) if os.path.isfile(os.path.join(directory, name))])
    
    for i in range(0, number_of_files):
        file_name_and_path = directory + "/" + os.listdir(directory)[i]
        if (".pdf" in file_name_and_path):
            seq_folders.append(folder)
            file_names.append(os.listdir(directory)[i])
            file_names_and_paths.append(file_name_and_path)

print('Example: \n' + file_names_and_paths[0:1][0])

Example: 
../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/8998-Santo Antônio de Jesus-BA.pdf


### Generate target feature for each report ('read' and summarised the polarity)

In [8]:
sentilex_database = pd.read_csv("../sentilex/99_create_initial_sentilex_database.csv",
                               sep = ";")

sentilex_database.adjective = sentilex_database.adjective.str.normalize('NFKD').\
                                str.encode('ascii', errors='ignore').str.decode('utf-8')

In [9]:
cities = pd.DataFrame()

print("List of reports read and summarised")

for file_number in range(0, len(file_names_and_paths)):
    folder = seq_folders[file_number]
    file_name = file_names[file_number]
    file_name_and_path = file_names_and_paths[file_number]
    print(str(datetime.datetime.now()) + ' ' + file_name_and_path)
    
    # read report using external library pdf miner and save in 'temp_report.txt'
    command_to_cmd = 'pdf2txt.py "' + file_name_and_path + '" > temp_report.txt'
    os.system(command_to_cmd)
    
    # read temporary file
    temporary_file = open('temp_report.txt', 'r')
    
    whole_text = ''
    
    for line in temporary_file:
        whole_text += line
    
    words = re.findall(r"[\w']+", unidecode.unidecode(re.sub('\d', ' ', whole_text).lower()))

    # create the frequencies
    words_freq = pd.DataFrame.from_dict(Counter(words), orient = 'index').reset_index()
    words_freq.columns = ['word', 'freq']
    words_freq['pct'] = words_freq['freq']/sum(words_freq.freq)

    # aggregate polarity
    words_freq_polarity = words_freq.merge(sentilex_database,
                                           left_on = "word",
                                           right_on = "adjective",
                                           how = "left").iloc[:, [0, 1, 2, 4]]
    
    words_freq_polarity_fill = words_freq_polarity.fillna(0)
    
    # summarise
    number_of_words = words_freq_polarity_fill.freq.sum()
    pct_pol_neg = words_freq_polarity_fill[words_freq_polarity_fill.polarity == -1].pct.sum()
    pct_pol_pos = words_freq_polarity_fill[words_freq_polarity_fill.polarity == 1].pct.sum()
    pct_pol_neu = words_freq_polarity_fill[words_freq_polarity_fill.polarity == 0].pct.sum()
    
    current_city = pd.DataFrame({"folder": folder,
                                 "file_name": file_name,
                                 "number_of_words": number_of_words,
                                 "pct_pol_neg": pct_pol_neg,
                                 "pct_pol_pos": pct_pol_pos,
                                 "pct_pol_neu": pct_pol_neu},
                                index = [0])
    
    cities = cities.append(current_city)

List of reports read and summarised
2019-06-05 21:39:32.528643 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/8998-Santo Antônio de Jesus-BA.pdf
2019-06-05 21:39:56.029051 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9024-Ulianópolis-PA.pdf
2019-06-05 21:40:36.182610 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9010-Aldeias Altas-MA.pdf
2019-06-05 21:41:26.127173 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9034-Paraíba do Sul-RJ.pdf
2019-06-05 21:41:41.033747 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9045-Governador Celso Ramos-SC.pdf
2019-06-05 21:41:44.086117 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9016-Pirajuba-MG.pdf
2019-06-05 21:41:51.088958 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9018-Naviraí-MS.pdf
2019-06-05 21:42:14.171958 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9015-Nova Lima-MG.pdf
2019-06-05 21:42:28.939082 ../programa_de_fiscalizacao_em_entes_federativos/ci

2019-06-05 22:08:09.539296 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10323-São Borja-RS.pdf
2019-06-05 22:08:17.520618 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10374-Jataí-GO.pdf
2019-06-05 22:09:01.266251 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10379-São Domingos-SC.pdf
2019-06-05 22:09:46.437324 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10407-Oeiras-PI.pdf
2019-06-05 22:10:41.809363 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10291-Criciúma-SC.pdf
2019-06-05 22:10:55.354109 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10383-Caldas Novas-GO.pdf
2019-06-05 22:11:21.479151 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10311-Manhuaçu-MG.pdf
2019-06-05 22:11:38.593587 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10289-Estância-SE.pdf
2019-06-05 22:12:15.006752 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10403-Tobias Barreto-SE.pdf
2019-06-05 22:13:06.546984 ../

2019-06-05 22:42:01.834264 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10388-Sertãozinho-SP.pdf
2019-06-05 22:42:09.382291 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10442-Viçosa-MG.pdf
2019-06-05 22:42:13.833237 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10286-Cascavel-PR.pdf
2019-06-05 22:42:42.949845 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10438-Icó-CE.pdf
2019-06-05 22:42:52.966012 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10287-Londrina-PR.pdf
2019-06-05 22:43:09.325680 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10312-Juiz de Fora-MG.pdf
2019-06-05 22:43:28.929470 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10324-Santa Rosa-RS.pdf
2019-06-05 22:43:37.211706 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10361-Rio Verde-GO.pdf
2019-06-05 22:44:08.736821 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10371-Aquidauana-MS.pdf
2019-06-05 22:44:51.756491 ../programa

2019-06-05 23:12:34.739470 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1872-Muçum-RS.pdf
2019-06-05 23:12:39.387157 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1875-Benedito Novo-SC.pdf
2019-06-05 23:12:46.230342 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1835-Itapagé-CE.pdf
2019-06-05 23:13:07.353116 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1883-Taubaté-SP.pdf
2019-06-05 23:13:10.367574 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1831-Lamarão-BA.pdf
2019-06-05 23:13:15.873665 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1882-Santa Albertina-SP.pdf
2019-06-05 23:13:21.057265 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1857-Manaíra-PB.pdf
2019-06-05 23:13:26.155661 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sortei

2019-06-05 23:24:28.342219 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1935-Sangão-SC.pdf
2019-06-05 23:24:34.510552 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1893-Horizonte-CE.pdf
2019-06-05 23:24:42.099598 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1905-Mesquita-MG.pdf
2019-06-05 23:24:49.107011 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1887-Morro do Chapéu-BA.pdf
2019-06-05 23:25:10.611823 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1906-Patrocínio-MG.pdf
2019-06-05 23:25:31.573416 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1931-Itaara-RS.pdf
2019-06-05 23:25:34.309926 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1909-Colíder-MT.pdf
2019-06-05 23:25:43.188782 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio

2019-06-05 23:35:27.563295 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2479-Condado-PE.pdf
2019-06-05 23:36:29.107243 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2481-Iranduba-AM.pdf
2019-06-05 23:36:38.048797 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2465-Balsas-MA.pdf
2019-06-05 23:37:01.998148 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2493-Arraial do Cabo-RJ.pdf
2019-06-05 23:37:18.107247 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2501-Pacoti-CE.pdf
2019-06-05 23:37:28.112574 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2461-Itacuruba-PE.pdf
2019-06-05 23:38:03.501057 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_37/2781-São Domingos-SE.pdf
2019-06-05 23:38:14.307067 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_

2019-06-05 23:48:46.510151 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_37/2861-Paraíso-SP.pdf
2019-06-05 23:48:49.589306 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_37/2801-Santa Rita do Pardo-MS.pdf
2019-06-05 23:48:59.480722 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/2957-Casa Nova-BA.pdf
2019-06-05 23:49:07.407537 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/2993-Perdigão-MG.pdf
2019-06-05 23:49:12.167338 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/3032-Terra Roxa-PR.pdf
2019-06-05 23:49:16.483698 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/2977-Guarani de Goiás-GO.pdf
2019-06-05 23:49:31.443669 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/3018-Limoeiro-PE.pdf
2019-06-05 23:49:46.050505 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_ant

2019-06-06 00:00:29.202626 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3146-Aurilândia-GO.pdf
2019-06-06 00:00:38.327421 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3234-Pardinho-SP.pdf
2019-06-06 00:00:42.992358 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3200-Petrópolis-RJ.pdf
2019-06-06 00:00:54.336212 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3232-Lavínia-SP.pdf
2019-06-06 00:00:58.483582 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3210-Coronel Pilar-RS.pdf
2019-06-06 00:01:01.915686 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3218-Três Barras-SC.pdf
2019-06-06 00:01:24.644676 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3190-Júlio Borges-PI.pdf
2019-06-06 00:01:40.669969 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anterio

2019-06-06 00:19:36.498728 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3346-Nuporanga-SP.pdf
2019-06-06 00:19:41.085018 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3324-Pinhalão-PR.pdf
2019-06-06 00:20:04.764621 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3366-Itambacuri-MG.pdf
2019-06-06 00:20:36.950945 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3436-Trombudo Central-SC.pdf
2019-06-06 00:20:47.125743 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3425-Capitão de Campos-PI.pdf
2019-06-06 00:21:06.386881 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3358-Tiros-MG.pdf
2019-06-06 00:21:20.000287 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3380-Cristinápolis-SE.pdf
2019-06-06 00:21:44.162026 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_an

In [10]:
os.remove("temp_report.txt")

In [11]:
cities.to_csv("../target_feature/01_create_target_feature.csv",
              sep=';',
              encoding='utf-8',
              index=False)