# Create Target Feature

In [1]:
import PyPDF2
import unidecode
import pandas as pd
from collections import Counter
import csv
import os
import os.path
import random
import re
import datetime

### Create lists with all paths to all files

In [2]:
# Sub-folders (relative to the reports root) that hold the PDF reports.
folders = ["ciclo_3",
           "ciclo_4",
           "ciclo_5",
           "edicoes_anteriores/sorteio_34",
           "edicoes_anteriores/sorteio_35",
           "edicoes_anteriores/sorteio_36",
           "edicoes_anteriores/sorteio_37",
           "edicoes_anteriores/sorteio_38",
           "edicoes_anteriores/sorteio_39",
           "edicoes_anteriores/sorteio_40"]

REPORTS_ROOT = '../programa_de_fiscalizacao_em_entes_federativos/'

# Three parallel lists: entry i of each one describes the same PDF report.
seq_folders = []
file_names = []
file_names_and_paths = []

for folder in folders:
    directory = REPORTS_ROOT + folder

    # List the directory once and walk every entry. Indexing the listing with
    # a count of regular files only would skip trailing PDFs whenever the
    # directory also contains sub-directories.
    for name in os.listdir(directory):
        # Test the extension on the file name itself, not on the whole path.
        if not name.endswith(".pdf"):
            continue
        if not os.path.isfile(os.path.join(directory, name)):
            continue
        seq_folders.append(folder)
        file_names.append(name)
        file_names_and_paths.append(directory + "/" + name)

# Show one path as a sanity check; stay silent if no PDF was found.
if file_names_and_paths:
    print('Example: \n' + file_names_and_paths[0])

Example: 
../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/8998-Santo Antônio de Jesus-BA.pdf


### Generate the target feature for each report (read the text and summarise its polarity)

In [3]:
# Load the semicolon-separated sentilex database.
sentilex_database = pd.read_csv("../sentilex/99_01_sentilex_database.csv", sep=";")

# Reduce the adjectives to plain ASCII: decompose accented characters (NFKD),
# drop every non-ASCII code point, then decode the bytes back to str.
adjective_ascii = (
    sentilex_database["adjective"]
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
)
sentilex_database["adjective"] = adjective_ascii

In [5]:
# Restrict the run to a single report by replacing the full list of paths.
# NOTE(review): this looks like a leftover debugging override - confirm whether
# the full list should be processed instead. seq_folders and file_names are not
# narrowed here, so this path is paired with the first entry of those lists.
single_report_path = '../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/8998-Santo Antônio de Jesus-BA.pdf'
file_names_and_paths = [single_report_path]

In [7]:
# Column order of the per-report summary table.
SUMMARY_COLUMNS = ["folder",
                   "file_name",
                   "number_of_words",
                   "pct_pol_neg",
                   "pct_pol_pos",
                   "pct_pol_neu",
                   "pct_pol_missing"]


def summarise_polarity(whole_text, sentilex_database):
    """Count the words of a report and summarise their polarity.

    Parameters
    ----------
    whole_text : str
        Full text extracted from one report.
    sentilex_database : pandas.DataFrame
        Lexicon with at least the columns 'adjective' and 'polarity'.

    Returns
    -------
    (pandas.DataFrame, dict)
        The word table (columns 'word', 'freq', 'pct', 'polarity') and a dict
        with 'number_of_words' plus the share of words whose polarity is
        -1, 1, 0 or missing.
    """
    # Digits become spaces, text is lower-cased and transliterated to ASCII,
    # then split into runs of word characters / apostrophes.
    words = re.findall(r"[\w']+", unidecode.unidecode(re.sub(r'\d', ' ', whole_text).lower()))

    # Build the frequency table from (word, count) pairs so that an empty
    # text yields an empty table instead of failing on the column names.
    words_freq = pd.DataFrame(list(Counter(words).items()), columns=['word', 'freq'])
    words_freq['pct'] = words_freq['freq'] / words_freq['freq'].sum()

    # Attach the polarity; words absent from the lexicon keep a missing value.
    # NOTE(review): if the lexicon lists an adjective more than once, the left
    # merge repeats that word and inflates the sums below - confirm uniqueness.
    words_freq_polarity = words_freq.merge(sentilex_database,
                                           left_on="word",
                                           right_on="adjective",
                                           how="left")
    # Select by name rather than by position so that the column order of the
    # lexicon does not matter.
    words_freq_polarity = words_freq_polarity[['word', 'freq', 'pct', 'polarity']]

    polarity = words_freq_polarity['polarity']
    pct = words_freq_polarity['pct']
    summary = {"number_of_words": words_freq_polarity['freq'].sum(),
               "pct_pol_neg": pct[polarity == -1].sum(),
               "pct_pol_pos": pct[polarity == 1].sum(),
               "pct_pol_neu": pct[polarity == 0].sum(),
               "pct_pol_missing": pct[polarity.isna()].sum()}
    return words_freq_polarity, summary


city_records = []
words_freq_polarity = None

print("List of reports read and summarised")

for folder, file_name, file_name_and_path in zip(seq_folders, file_names, file_names_and_paths):
    print(str(datetime.datetime.now()) + ' ' + file_name_and_path)

    # Extract the text with the external pdfminer script into 'temp_report.txt'.
    # NOTE(review): the command goes through the shell, so a file name that
    # contains a double quote or other shell metacharacters would break it.
    command_to_cmd = 'pdf2txt.py "' + file_name_and_path + '" > temp_report.txt'
    exit_status = os.system(command_to_cmd)
    if exit_status != 0:
        # Do not summarise an empty or truncated extraction.
        raise RuntimeError('pdf2txt.py failed (status ' + str(exit_status) +
                           ') for ' + file_name_and_path)

    # Read the temporary file and close it right away, so it can be deleted
    # afterwards. pdf2txt.py is expected to write UTF-8 (its default codec).
    with open('temp_report.txt', 'r', encoding='utf-8') as temporary_file:
        whole_text = temporary_file.read()

    words_freq_polarity, summary = summarise_polarity(whole_text, sentilex_database)

    record = {"folder": folder, "file_name": file_name}
    record.update(summary)
    city_records.append(record)

# Build the table once; growing a DataFrame row by row is quadratic and
# DataFrame.append no longer exists in pandas 2.x.
cities = pd.DataFrame(city_records, columns=SUMMARY_COLUMNS)

# save last words_freq_polarity dataframe as an example
if words_freq_polarity is not None:
    words_freq_polarity.to_csv('temp_words_freq_polarity.csv',
                               sep=';',
                               encoding='utf-8',
                               index=False)


List of reports read and summarised
2019-08-02 21:48:31.160329 ../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/8998-Santo Antônio de Jesus-BA.pdf


In [10]:
# Delete the scratch text file written during PDF extraction. The existence
# check keeps this cell safe to re-run, or to run when no report was processed.
if os.path.exists("temp_report.txt"):
    os.remove("temp_report.txt")

In [11]:
# Write the per-report summary as a semicolon-separated, UTF-8 encoded CSV
# without the DataFrame index.
target_feature_path = "../target_feature/01_target_feature.csv"
cities.to_csv(target_feature_path, index=False, encoding='utf-8', sep=';')