# Create Target Feature

In [38]:
import PyPDF2
import unidecode
import pandas as pd
from collections import Counter
import csv
import os
import os.path
import random
import re
import datetime

### Create lists with all paths to all files

In [39]:
# Sub-folders (relative to the reports root) that hold the PDF reports.
folders = ["ciclo_3",
           "ciclo_4",
           "ciclo_5",
           "edicoes_anteriores/sorteio_34",
           "edicoes_anteriores/sorteio_35",
           "edicoes_anteriores/sorteio_36",
           "edicoes_anteriores/sorteio_37",
           "edicoes_anteriores/sorteio_38",
           "edicoes_anteriores/sorteio_39",
           "edicoes_anteriores/sorteio_40"]

# Three parallel lists, one entry per PDF report found:
#   seq_folders[i]          -> sub-folder the report lives in
#   file_names[i]           -> bare file name
#   file_names_and_paths[i] -> relative path used to open the report
seq_folders = []
file_names = []
file_names_and_paths = []

for folder in folders:
    directory = '../programa_de_fiscalizacao_em_entes_federativos/' + folder

    # List the directory exactly once. Indexing a fresh os.listdir() call by a
    # count of regular files (as done previously) skips trailing files whenever
    # the directory also contains sub-directories, and re-lists the directory
    # on every iteration.
    # NOTE: os.listdir() returns entries in arbitrary (filesystem) order; the
    # order is kept as-is so downstream row order does not change.
    for name in os.listdir(directory):
        file_name_and_path = directory + "/" + name
        # Match on the extension (not a substring) and keep regular files only.
        if name.endswith(".pdf") and os.path.isfile(file_name_and_path):
            seq_folders.append(folder)
            file_names.append(name)
            file_names_and_paths.append(file_name_and_path)

# Show one discovered path as a sanity check (guarded: nothing may be found).
if file_names_and_paths:
    print('Example: \n' + file_names_and_paths[0])
else:
    print('No PDF reports found.')

Example: 
../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/8998-Santo Antônio de Jesus-BA.pdf


### Generate target feature for each report (read each report and summarise its polarity)

In [40]:
# Load the SentiLex lexicon (semicolon-separated CSV).
sentilex_database = pd.read_csv("../sentilex/99_01_sentilex_database.csv", sep=";")

# Strip accents from the adjectives: NFKD-decompose, drop every non-ASCII byte,
# then decode back to str. Presumably this lets the lexicon match the
# unidecode'd report words it is merged with later.
adjective_ascii = (
    sentilex_database["adjective"]
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)
sentilex_database["adjective"] = adjective_ascii

In [42]:
def _extract_report_text(pdf_path):
    """Convert one PDF report to text with pdfminer's ``pdf2txt.py`` and return it.

    The text is written to ``temp_report.txt`` in the working directory (the
    file is overwritten on every call and left in place for later clean-up).

    Raises:
        RuntimeError: if the ``pdf2txt.py`` command exits with a non-zero status.
    """
    # NOTE: the path is interpolated into a shell command. The names come from
    # a local directory listing, but a file name containing a double quote
    # would break the quoting.
    command_to_cmd = 'pdf2txt.py "' + pdf_path + '" > temp_report.txt'
    exit_status = os.system(command_to_cmd)
    if exit_status != 0:
        # Previously ignored: a failed conversion left an empty/truncated file
        # that was then summarised as if it were the whole report.
        raise RuntimeError('pdf2txt.py failed (status ' + str(exit_status) +
                           ') for ' + pdf_path)

    # pdf2txt.py writes UTF-8 by default, so do not rely on the locale encoding.
    # NOTE(review): confirm the default codec for the installed pdfminer version.
    with open('temp_report.txt', 'r', encoding='utf-8') as temporary_file:
        return temporary_file.read()


def _summarise_polarity(whole_text, lexicon):
    """Compute word frequencies of ``whole_text`` and aggregate lexicon polarity.

    Args:
        whole_text: full text of one report.
        lexicon: DataFrame with at least the columns ``adjective`` and
            ``polarity`` (polarity values -1, 0 and 1 are summarised).

    Returns:
        A tuple ``(summary, words_freq_polarity)`` where ``summary`` is a dict
        with ``number_of_words`` and the ``pct_pol_*`` shares, and
        ``words_freq_polarity`` is the per-word frame (word, freq, pct, polarity).
    """
    # Replace digits by spaces, lower-case, transliterate to ASCII, tokenise.
    words = re.findall(r"[\w']+", unidecode.unidecode(re.sub(r'\d', ' ', whole_text).lower()))

    # Explicit columns keep this working for a report with no extractable
    # words (e.g. an image-only PDF), which previously raised on column rename.
    words_freq = pd.DataFrame(list(Counter(words).items()),
                              columns=['word', 'freq']).astype({'freq': 'int64'})
    words_freq['pct'] = words_freq['freq'] / words_freq['freq'].sum()

    # Attach polarity by name rather than by column position.
    # NOTE(review): if `adjective` is not unique in the lexicon, this left
    # merge duplicates word rows and inflates freq/pct totals - verify.
    words_freq_polarity = words_freq.merge(lexicon[['adjective', 'polarity']],
                                           left_on="word",
                                           right_on="adjective",
                                           how="left")[['word', 'freq', 'pct', 'polarity']]

    polarity = words_freq_polarity['polarity']
    pct = words_freq_polarity['pct']
    summary = {"number_of_words": words_freq_polarity['freq'].sum(),
               "pct_pol_neg": pct[polarity == -1].sum(),
               "pct_pol_pos": pct[polarity == 1].sum(),
               "pct_pol_neu": pct[polarity == 0].sum(),
               "pct_pol_missing": pct[polarity.isna()].sum()}
    return summary, words_freq_polarity


# One record per report; the DataFrame is built once after the loop because
# DataFrame.append is quadratic in a loop and was removed in pandas 2.0.
city_records = []
words_freq_polarity = None

print("List of reports read and summarised")

for folder, file_name, file_name_and_path in zip(seq_folders, file_names, file_names_and_paths):
    print(str(datetime.datetime.now()) + ' ' + file_name_and_path)

    whole_text = _extract_report_text(file_name_and_path)
    summary, words_freq_polarity = _summarise_polarity(whole_text, sentilex_database)

    city_records.append({"folder": folder,
                         "file_name": file_name,
                         "number_of_words": summary["number_of_words"],
                         "pct_pol_neg": summary["pct_pol_neg"],
                         "pct_pol_pos": summary["pct_pol_pos"],
                         "pct_pol_neu": summary["pct_pol_neu"],
                         "pct_pol_missing": summary["pct_pol_missing"]})

cities = pd.DataFrame(city_records,
                      columns=["folder",
                               "file_name",
                               "number_of_words",
                               "pct_pol_neg",
                               "pct_pol_pos",
                               "pct_pol_neu",
                               "pct_pol_missing"])

# save last words_freq_polarity dataframe as an example
if words_freq_polarity is not None:
    words_freq_polarity.to_csv('temp_words_freq_polarity.csv',
                               sep=';',
                               encoding='utf-8',
                               index=False)


List of reports read and summarised
2019-08-03 10:22:00.855850 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/8998-Santo Antônio de Jesus-BA.pdf
2019-08-03 10:22:22.150012 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9024-Ulianópolis-PA.pdf
2019-08-03 10:22:49.557043 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9010-Aldeias Altas-MA.pdf
2019-08-03 10:23:28.138577 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9034-Paraíba do Sul-RJ.pdf
2019-08-03 10:23:38.701800 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9045-Governador Celso Ramos-SC.pdf
2019-08-03 10:23:41.355285 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9016-Pirajuba-MG.pdf
2019-08-03 10:23:47.821345 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9018-Naviraí-MS.pdf
2019-08-03 10:24:08.811497 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9015-Nova Lima-MG.pdf
2019-08-03 10:24:22.165094 ../programa_de_fiscalizacao_em_entes_federativos/ci

2019-08-03 10:43:55.991098 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10323-São Borja-RS.pdf
2019-08-03 10:44:01.525275 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10374-Jataí-GO.pdf
2019-08-03 10:44:32.015977 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10379-São Domingos-SC.pdf
2019-08-03 10:45:05.517659 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10407-Oeiras-PI.pdf
2019-08-03 10:45:42.209910 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10291-Criciúma-SC.pdf
2019-08-03 10:45:51.181025 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10383-Caldas Novas-GO.pdf
2019-08-03 10:46:09.476134 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10311-Manhuaçu-MG.pdf
2019-08-03 10:46:23.122377 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10289-Estância-SE.pdf
2019-08-03 10:46:55.736660 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10403-Tobias Barreto-SE.pdf
2019-08-03 10:47:42.682156 ../

2019-08-03 11:15:35.888693 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10388-Sertãozinho-SP.pdf
2019-08-03 11:15:43.456947 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10442-Viçosa-MG.pdf
2019-08-03 11:15:47.920498 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10286-Cascavel-PR.pdf
2019-08-03 11:16:17.567647 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10438-Icó-CE.pdf
2019-08-03 11:16:27.495732 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10287-Londrina-PR.pdf
2019-08-03 11:16:41.706578 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10312-Juiz de Fora-MG.pdf
2019-08-03 11:16:59.996769 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10324-Santa Rosa-RS.pdf
2019-08-03 11:17:07.867314 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10361-Rio Verde-GO.pdf
2019-08-03 11:17:37.500476 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10371-Aquidauana-MS.pdf
2019-08-03 11:18:18.017841 ../programa

2019-08-03 11:44:24.920831 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1872-Muçum-RS.pdf
2019-08-03 11:44:29.250138 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1875-Benedito Novo-SC.pdf
2019-08-03 11:44:35.726171 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1835-Itapagé-CE.pdf
2019-08-03 11:44:55.592914 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1883-Taubaté-SP.pdf
2019-08-03 11:44:58.558080 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1831-Lamarão-BA.pdf
2019-08-03 11:45:03.895406 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1882-Santa Albertina-SP.pdf
2019-08-03 11:45:08.794976 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1857-Manaíra-PB.pdf
2019-08-03 11:45:13.573966 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sortei

2019-08-03 11:55:37.006369 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1935-Sangão-SC.pdf
2019-08-03 11:55:42.703980 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1893-Horizonte-CE.pdf
2019-08-03 11:55:49.649580 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1905-Mesquita-MG.pdf
2019-08-03 11:55:56.089327 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1887-Morro do Chapéu-BA.pdf
2019-08-03 11:56:16.176442 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1906-Patrocínio-MG.pdf
2019-08-03 11:56:35.989295 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1931-Itaara-RS.pdf
2019-08-03 11:56:38.558302 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1909-Colíder-MT.pdf
2019-08-03 11:56:47.294594 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio

2019-08-03 12:06:17.150927 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2479-Condado-PE.pdf
2019-08-03 12:07:17.993314 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2481-Iranduba-AM.pdf
2019-08-03 12:07:27.065655 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2465-Balsas-MA.pdf
2019-08-03 12:07:51.037785 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2493-Arraial do Cabo-RJ.pdf
2019-08-03 12:08:07.052790 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2501-Pacoti-CE.pdf
2019-08-03 12:08:16.938468 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2461-Itacuruba-PE.pdf
2019-08-03 12:08:52.194180 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_37/2781-São Domingos-SE.pdf
2019-08-03 12:09:03.167799 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_

2019-08-03 12:19:36.954772 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_37/2861-Paraíso-SP.pdf
2019-08-03 12:19:40.206347 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_37/2801-Santa Rita do Pardo-MS.pdf
2019-08-03 12:19:50.224942 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/2957-Casa Nova-BA.pdf
2019-08-03 12:19:58.095377 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/2993-Perdigão-MG.pdf
2019-08-03 12:20:02.916560 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/3032-Terra Roxa-PR.pdf
2019-08-03 12:20:07.284355 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/2977-Guarani de Goiás-GO.pdf
2019-08-03 12:20:22.179992 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/3018-Limoeiro-PE.pdf
2019-08-03 12:20:36.522964 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_ant

2019-08-03 12:31:38.768704 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3146-Aurilândia-GO.pdf
2019-08-03 12:31:48.650665 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3234-Pardinho-SP.pdf
2019-08-03 12:31:53.496568 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3200-Petrópolis-RJ.pdf
2019-08-03 12:32:05.601277 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3232-Lavínia-SP.pdf
2019-08-03 12:32:10.582322 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3210-Coronel Pilar-RS.pdf
2019-08-03 12:32:14.423413 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3218-Três Barras-SC.pdf
2019-08-03 12:32:38.615603 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3190-Júlio Borges-PI.pdf
2019-08-03 12:32:55.898490 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anterio

2019-08-03 12:53:07.075684 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3346-Nuporanga-SP.pdf
2019-08-03 12:53:11.652870 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3324-Pinhalão-PR.pdf
2019-08-03 12:53:39.540510 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3366-Itambacuri-MG.pdf
2019-08-03 12:54:12.962710 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3436-Trombudo Central-SC.pdf
2019-08-03 12:54:24.126667 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3425-Capitão de Campos-PI.pdf
2019-08-03 12:54:46.829654 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3358-Tiros-MG.pdf
2019-08-03 12:55:02.771035 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3380-Cristinápolis-SE.pdf
2019-08-03 12:55:29.702256 ../programa_de_fiscalizacao_em_entes_federativos/edicoes_an

In [45]:
# Sanity check: number of PDF reports discovered (the loop above adds one row to `cities` per entry).
len(file_names_and_paths)

597

In [43]:
# Delete the scratch text file written during PDF conversion. Guarded so that
# re-running this cell (or running it before any report was converted) does
# not raise FileNotFoundError.
if os.path.exists("temp_report.txt"):
    os.remove("temp_report.txt")

In [44]:
# Persist the per-report polarity summary as a semicolon-separated UTF-8 CSV
# (row index omitted).
target_feature_path = "../target_feature/01_target_feature.csv"
cities.to_csv(target_feature_path, sep=';', encoding='utf-8', index=False)