# Import and handle SentiLex-PT02 database

In [247]:
# import libraries
import csv
import pandas as pd

In [274]:
# read csv file: it has no header row, so the two columns are named explicitly
sentiLexRaw = pd.read_csv("SentiLex-flex-PT02.txt", header = None)
sentiLexRaw.columns = ["adjective", "description"]

# extract "polarity" from "description": take the 4th ';'-separated field and
# keep the text that follows its '='.
# The patterns are raw strings without backslashes: ';' and '=' need no
# escaping in a regex, and '\;' / '\=' inside an ordinary string literal are
# invalid escape sequences (a warning today, an error in a future Python).
polarityField = sentiLexRaw.description.str.split(r';+').str[3]
polarity = pd.DataFrame(polarityField.str.split(r'=+').str[1])

# keep only the word and its polarity, then remove duplicates
# (rows are aligned on the original index, so no positional column picking is needed)
sentiLexDatabase = pd.DataFrame({"adjective": sentiLexRaw.adjective,
                                 "polarity": polarity.iloc[:, 0]}).drop_duplicates()

# select only polarities in [-1, 0, 1]
# (polarity values are kept as strings, exactly as they were parsed from the file)
polarities = ["-1", "0", "1"]
sentiLexDatabase = sentiLexDatabase[sentiLexDatabase.polarity.isin(polarities)]

# Randomly select reports for improving SentiLex-PT02

In [286]:
# import libraries
import os
import os.path
import random

In [277]:
# sub-folders (relative to the reports root folder) from which one report each is drawn
folders = ["ciclo_3",
           "ciclo_4",
           "edicoes_anteriores/sorteio_34",
           "edicoes_anteriores/sorteio_35",
           "edicoes_anteriores/sorteio_36",
           "edicoes_anteriores/sorteio_37",
           "edicoes_anteriores/sorteio_38",
           "edicoes_anteriores/sorteio_39",
           "edicoes_anteriores/sorteio_40"]

# one "<directory>/<file name>" entry per folder, in the same order as `folders`
fileNamesAndPaths = []

for folder in folders:
    directory = '../programa_de_fiscalizacao_em_entes_federativos/' + folder

    # List the directory once and keep only regular files, so that the count
    # used to draw the index and the list that is indexed are the same thing
    # (indexing the raw listing could hit a sub-directory).
    # NOTE(review): os.listdir returns entries in arbitrary order, so the pick
    # may differ between machines despite the fixed seed; sorting the listing
    # would make it portable but would change the reports already selected.
    filesInDirectory = [name for name in os.listdir(directory)
                        if os.path.isfile(os.path.join(directory, name))]

    # The last 3 files of the listing are never eligible.
    # NOTE(review): presumably each folder holds 3 non-report files - confirm.
    numberOfFiles = len(filesInDirectory) - 3
    if numberOfFiles <= 0:
        raise ValueError("Not enough files to draw a report from: " + directory)

    # The seed is reset for every folder, so each folder uses the same random
    # fraction of its own file count; kept as is so the selection is unchanged.
    random.seed(7)
    # random.uniform may return its upper end-point because of floating-point
    # rounding, hence the clamp to the last eligible index.
    randomFileNumber = min(int(random.uniform(0, numberOfFiles)), numberOfFiles - 1)

    fileNameAndPath = directory + "/" + filesInDirectory[randomFileNumber]
    fileNamesAndPaths.append(fileNameAndPath)

# Import reports, collect unique words and save words not in SentiLex-PT02

In [278]:
# import libraries
import PyPDF2
import unidecode
import pandas as pd
from collections import Counter
import csv

In [287]:
print("List of reports read to improve SentiLex database")

# make sure the output folder exists before the first CSV is written
os.makedirs("improving_senti_lex", exist_ok = True)

# `folders` and `fileNamesAndPaths` are parallel lists: one selected report per folder
for folder, filename in zip(folders, fileNamesAndPaths):
    print(filename)

    # every word found in the report, in reading order (duplicates included)
    words = []

    # open the pdf inside a `with` block so the file handle is always closed
    with open(filename, 'rb') as file:
        # create a pdf reader object
        # NOTE(review): PdfFileReader / numPages / getPage / extractText look like
        # the legacy PyPDF2 API (believed removed in PyPDF2 3.0) - confirm the
        # installed version before upgrading.
        fileReader = PyPDF2.PdfFileReader(file)

        # iterate all pages
        for i in range(fileReader.numPages):
            page = unidecode.unidecode(fileReader.getPage(i).extractText().lower())

            # A word is a maximal run of alphabetic characters: every other
            # character is turned into a blank and the page is split on blanks.
            # Words are collected per page, so the last word of a page is no
            # longer glued to the first word of the next page.
            words.extend("".join(letter if letter.isalpha() else " " for letter in page).split())

    # unique words in order of first appearance; building the column from a
    # Series also works for a report without any extractable word
    wordsUnique = pd.DataFrame({"adjective": pd.Series(words, dtype = object).unique()})

    # NOTE(review): report words were ASCII-folded by unidecode, while the
    # lexicon entries are matched as loaded - presumably accented lexicon
    # entries never match and end up listed as missing; confirm.
    wordsWithPolarity = wordsUnique.merge(sentiLexDatabase, on = "adjective", how = "left")

    # save the words that have no polarity in SentiLex-PT02, one file per folder
    wordsWithoutPolarity = wordsWithPolarity[wordsWithPolarity.polarity.isnull()]
    wordsWithoutPolarity.to_csv("improving_senti_lex/" + folder.replace("/", "_") + ".csv",
                                sep = ';',
                                encoding = 'utf-8',
                                index = False)


List of reports read to improve SentiLex database
../programa_de_fiscalizacao_em_entes_federativos/ciclo_3/9037-Poço Branco-RN.pdf
../programa_de_fiscalizacao_em_entes_federativos/ciclo_4/10321-Uruguaiana-RS.pdf
../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_34/1837-São Mateus-ES.pdf
../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_35/1906-Patrocínio-MG.pdf
../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_36/2483-Pontal do Paraná-PR.pdf
../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_37/2871-São José do Sul-RS.pdf
../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_38/2975-Presidente Kennedy-ES.pdf
../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_39/3179-São Domingos do Araguaia-PA.pdf
../programa_de_fiscalizacao_em_entes_federativos/edicoes_anteriores/sorteio_40/3390-Goianésia do Pará-PA.pdf


In [282]:
# save the cleaned lexicon: semicolon-separated, UTF-8 encoded, without the row index
csvOptions = {"sep": ';', "encoding": 'utf-8', "index": False}
sentiLexDatabase.to_csv("0_create_sentilex_database.csv", **csvOptions)