# Processing of Party Text Programs to get Word Usage Count

## Import the relevant libraries

In [3]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [4]:
Partidos = ["BE", "PAN", "PCP", "L", "CH", "PS", "IL", "PEV", "AD"]

In [5]:
ToExclude = pd.read_csv('2Exclude.txt')
ToExclude

Unnamed: 0,2Exclude
0,bloco
1,pp
2,vii
3,três
4,seis
...,...
172,t r
173,v capítulo
174,xi
175,projecto lei


## Go through text and count word usage paragraph by paragraph

In [6]:
def CountSequences(words,n):
#Given a list, returns a dictionary mapping each n-element sequence tuple to its number of occurrences in the list.
# Initialize all counts implicitly to 0.
    countDict = defaultdict(int)

    for i in range(len(words)-n+1):
        key = tuple(words[i:i+n])
        countDict[key] = countDict[key] + 1
    return countDict

In [8]:
FullWordList = pd.DataFrame(columns = ['Word', 'Partido'])
ProcessedInput = pd.DataFrame(columns = ['Word', 'Partido', 'Q_Words', 'Count'])            

for Partido in ["PAN"]: #Partidos:
    print(Partido+'.txt')
    
    with open(Partido+'.txt', encoding='utf-8-sig') as fp:
        
        line = fp.readline()

        while line:
            
            line = line.lower()
            
            for ToExcludeWord in ToExclude['2Exclude']:
                line = line.replace(" "+ToExcludeWord+" ", " ")
                if line[0:len(ToExcludeWord)+1] == ToExcludeWord+" ":
                    line = line[len(ToExcludeWord)+1:len(line)]
                if line[len(line)-len(ToExcludeWord)-2:len(line)-1] == " "+ToExcludeWord:
                    line = line[0:len(line)-len(ToExcludeWord)-2]+"\n"
            
            line = line.strip()
            Words = line.split()

            for Group in range (3):
                ToExport = CountSequences(Words,Group+1)
                ToExportKeys = ToExport.keys()
                for i in ToExportKeys:
                    ProcessedInput = ProcessedInput.append({'Word': ' '.join(i), 
                                                            'Partido' : Partido, 
                                                            'Q_Words' : Group+1, 
                                                            'Count': ToExport[i]},
                                                          ignore_index = True)
                    
            line = fp.readline()
        
    fp.close()
    
ProcessedInput

PAN.txt


Unnamed: 0,Word,Partido,Q_Words,Count
0,cuidar,PAN,1,1
1,pessoas,PAN,1,1
2,cuidar pessoas,PAN,2,1
3,defender,PAN,1,1
4,animais,PAN,1,1
...,...,...,...,...
49731,organização judiciária tenha,PAN,3,1
49732,judiciária tenha conta,PAN,3,1
49733,tenha conta especificidades,PAN,3,1
49734,conta especificidades regiões,PAN,3,1


## Save as previous process is very slow

In [10]:
ProcessedInput.to_csv('Processed PAN.txt', index=True, mode='w', header=True, encoding='utf-8-sig')

## Consolidation of Multiple Runs

In [11]:
# Code to concatenate previous processing (as that is very slow execution pre-processing may by done in chunks)
ProcessedPast = pd.read_csv('Processed AD+BE+CH+IL+L+PAN+PCP+PS.txt')
FullWordList = pd.concat([ProcessedInput,ProcessedPast])
FullWordList.to_csv('Processed AD+BE+CH+IL+L+PAN+PCP+PS.txt', index=True, mode='w', header=True, encoding='utf-8-sig')
FullWordList


Unnamed: 0.2,Word,Partido,Q_Words,Count,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Unnamed: 0.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1,Unnamed: 0.1.1.1.1.1.1.1.1
0,cuidar,PAN,1,1,,,,,,,,,
1,pessoas,PAN,1,1,,,,,,,,,
2,cuidar pessoas,PAN,2,1,,,,,,,,,
3,defender,PAN,1,1,,,,,,,,,
4,animais,PAN,1,1,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
671571,mundo mais justo,PCP,3,1,598265.0,602116.0,538404.0,534553.0,538459.0,349403.0,273043.0,164802.0,164802.0
671572,mais justo pacífico,PCP,3,1,598266.0,602117.0,538405.0,534554.0,538460.0,349404.0,273044.0,164803.0,164803.0
671573,justo pacífico desenvolvido,PCP,3,1,598267.0,602118.0,538406.0,534555.0,538461.0,349405.0,273045.0,164804.0,164804.0
671574,pacífico desenvolvido sustentável,PCP,3,1,598268.0,602119.0,538407.0,534556.0,538462.0,349406.0,273046.0,164805.0,164805.0
