# Processing of Party Text Programs to get Word Usage Count

## Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [189]:
Partidos = np.empty(10, dtype=object)
Partidos[0]="CDS"
Partidos[1]="PAN"
Partidos[2]="PCP"
Partidos[3]="Livre"
Partidos[4]="CH"
Partidos[5]="PS"
Partidos[6]="IL"
Partidos[7]="PEV"
Partidos[8]="BE"
Partidos[9]="PSD"

In [190]:
ToExclude = pd.read_csv('2Exclude.txt')
ToExclude

Unnamed: 0,2Exclude
0,pp
1,vii
2,três
3,seis
4,/
...,...
121,qual
122,quer
123,seja
124,tal


## Go through text and count word usage paragraph by paragraph

In [191]:
def CountSequences(words,n):
#Given a list, returns a dictionary mapping each n-element sequence tuple to its number of occurrences in the list.
# Initialize all counts implicitly to 0.
    countDict = defaultdict(int)

    for i in range(len(words)-n+1):
        key = tuple(words[i:i+n])
        countDict[key] = countDict[key] + 1
    return countDict

In [192]:
FullWordList = pd.DataFrame(columns = ['Word', 'Partido'])
ProcessedInput = pd.DataFrame(columns = ['Word', 'Partido', 'Q_Words', 'Count'])            

for Partido in range (1):
    print(Partidos[Partido]+'.txt')
    
    with open(Partidos[Partido]+'.txt', encoding='utf-8-sig') as fp:
        
        line = fp.readline()

        while line:
            
            line = line.lower()
            
            for ToExcludeWord in ToExclude['2Exclude']:
                line = line.replace(" "+ToExcludeWord+" ", " ")
                if line[0:len(ToExcludeWord)+1] == ToExcludeWord+" ":
                    line = line[len(ToExcludeWord)+1:len(line)]
                if line[len(line)-len(ToExcludeWord)-2:len(line)-1] == " "+ToExcludeWord:
                    line = line[0:len(line)-len(ToExcludeWord)-2]+"\n"
            
            line = line.strip()
            Words = line.split()

            for Group in range (3):
                ToExport = CountSequences(Words,Group+1)
                ToExportKeys = ToExport.keys()
                for i in ToExportKeys:
                    ProcessedInput = ProcessedInput.append({'Word': '  '.join(i), 
                                                            'Partido' : Partidos[Partido], 
                                                            'Q_Words' : Group+1, 
                                                            'Count': ToExport[i]},
                                                          ignore_index = True)
                    
            line = fp.readline()
        
    fp.close()
    
ProcessedInput

CDS.txt


Unnamed: 0,Word,Partido,Q_Words,Count
0,compromisso,CDS,1,1
1,envolve,CDS,1,1
2,compromisso envolve,CDS,2,1
3,i,CDS,1,1
4,defesa,CDS,1,1
...,...,...,...,...
4596,transferência competências meios,CDS,3,1
4597,competências meios municípios,CDS,3,1
4598,central,CDS,1,1
4599,lisboa,CDS,1,1


## Save as previous process is very slow

In [193]:
ProcessedInput.to_csv('Processed CDS.txt', index=True, mode='w', header=True, encoding='utf-8-sig')

## Nice place to restart run if data was saved

In [12]:
ProcessedPast = pd.read_csv('Processed CH+BE+PCP+Livre+PSD+PS+IL+PAN+CDS.txt')
FullWordList = ProcessedPast

In [13]:
# Code to concatenate previous processing (as that is very slow execution my by done in chunks)
#FullWordList = pd.concat([ProcessedInput,ProcessedPast])
#FullWordList.to_csv('Processed CH+BE+PCP+Livre+PSD+PS+IL+PAN+CDS.txt', index=True, mode='w', header=True, encoding='utf-8-sig')



## Nice place to restart run if data was saved

## Calculate Party Program word size

In [14]:
PartidoStats = FullWordList[FullWordList.Q_Words==1.0].groupby(['Partido']).count()
PartidoStats.rename(columns={"Q_Words": "Count_Words"}, inplace=True)
PartidoStats.drop('Word', axis = 1, inplace=True)
PartidoStats.drop('Count', axis = 1, inplace=True)
PartidoStats

Unnamed: 0_level_0,Unnamed: 0,Count_Words,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1
Partido,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BE,38763,38763,38763,38763,38763,38763
CDS,1876,1876,0,0,0,0
CH,2309,2309,2309,2309,2309,2309
IL,102905,102905,102905,102905,0,0
Livre,20294,20294,20294,20294,20294,20294
PAN,31879,31879,31879,0,0,0
PCP,28417,28417,28417,28417,28417,28417
PS,32349,32349,32349,32349,32349,0
PSD,38165,38165,38165,38165,38165,0


## Synonyms are use to consolidate words with same "meaning"

In [15]:
Synonyms = pd.read_csv('Synonyms.txt')
Synonyms

Unnamed: 0,Word,Word2
0,direita direita,direita
1,familiares,família
2,familiar,família
3,devem ser,deve ser
4,últimos,último
...,...,...
157,têm,tem
158,todas,todo
159,todos,todo
160,trabalho,trabalhar


In [16]:
WordList = FullWordList.merge(Synonyms, on=['Word'], how='left', indicator=True)
WordList

Unnamed: 0.2,Unnamed: 0,Word,Partido,Q_Words,Count,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Word2,_merge
0,0,compromisso,CDS,1,1,,,,,,left_only
1,1,envolve,CDS,1,1,,,,,,left_only
2,2,compromisso envolve,CDS,2,1,,,,,,left_only
3,3,i,CDS,1,1,,,,,,left_only
4,4,defesa,CDS,1,1,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...
823037,817305,resposta situações temporárias,Livre,3,1,728770.0,445928.0,247585.0,247585.0,,left_only
823038,817306,voltar,Livre,1,1,728771.0,445929.0,247586.0,247586.0,,left_only
823039,817307,descarregar,Livre,1,1,728772.0,445930.0,247587.0,247587.0,,left_only
823040,817308,programa,Livre,1,1,728773.0,445931.0,247588.0,247588.0,,left_only


In [17]:
def GetSynonym(a, b):
    if pd.isna(b):
        return a
    else:
        return b

In [18]:
WordList['Word'] = WordList.apply(lambda x: GetSynonym(x['Word'], x['Word2']), axis = 1)
WordList.drop('_merge', axis = 1, inplace=True)
WordList.drop('Word2', axis = 1, inplace=True)
WordList.head(20)

Unnamed: 0.2,Unnamed: 0,Word,Partido,Q_Words,Count,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1
0,0,compromisso,CDS,1,1,,,,
1,1,envolve,CDS,1,1,,,,
2,2,compromisso envolve,CDS,2,1,,,,
3,3,i,CDS,1,1,,,,
4,4,defesa,CDS,1,1,,,,
5,5,intransigente,CDS,1,1,,,,
6,6,vida,CDS,1,1,,,,
7,7,desde,CDS,1,1,,,,
8,8,defesa intransigente,CDS,2,1,,,,
9,9,intransigente vida,CDS,2,1,,,,


In [19]:
WordListCount = WordList.groupby(['Partido', 'Word'])['Count'].sum().reset_index(name="Count")
WordListMin = WordList.groupby(['Partido', 'Word'])['Q_Words'].min().reset_index(name="Q_Words")
WordListCount = WordListCount.merge(WordListMin, on=['Partido', 'Word'], how='left', indicator=True)
WordListCount.drop('_merge', axis = 1, inplace=True)
WordListCount

Unnamed: 0,Partido,Word,Count,Q_Words
0,BE,%,15,1
1,BE,% 272,1,2
2,BE,% despesas,1,2
3,BE,% despesas consumo,1,3
4,BE,% pib,1,2
...,...,...,...,...
504586,PSD,€ 50 m€,2,3
504587,PSD,€ combate,1,2
504588,PSD,€ combate à,1,3
504589,PSD,€ empresas,2,2


## Criteria for word down-selection - density and min count

In [20]:
#Data = [[1.0,int(3000), int(150), 1],[2.0,int(2000), int(50), 2],[3.0,int(1500), int(25),4]]
Data = [[1.0,int(4000), int(225), 1],[2.0,int(1650), int(45), 2],[3.0,int(1250), int(25),4]]
WordCriteria = pd.DataFrame(Data, columns = ['Q_Words', 'Min_Density', 'Min_Count', 'Weight'])
WordCriteria

Unnamed: 0,Q_Words,Min_Density,Min_Count,Weight
0,1.0,4000,225,1
1,2.0,1650,45,2
2,3.0,1250,25,4


In [21]:
WordListDensity = WordListCount.merge(PartidoStats, on=['Partido'], how='left', indicator=True)
WordListDensity = WordListDensity.merge(WordCriteria, on=['Q_Words'], how='left')
WordListDensity['Density'] = WordListDensity.apply(lambda x: x['Count']*1000000/x['Count_Words'], axis = 1)
WordListDensity

Unnamed: 0.2,Partido,Word,Count,Q_Words,Unnamed: 0,Count_Words,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,_merge,Min_Density,Min_Count,Weight,Density
0,BE,%,15,1,38763,38763,38763,38763,38763,38763,both,4000,225,1,386.966953
1,BE,% 272,1,2,38763,38763,38763,38763,38763,38763,both,1650,45,2,25.797797
2,BE,% despesas,1,2,38763,38763,38763,38763,38763,38763,both,1650,45,2,25.797797
3,BE,% despesas consumo,1,3,38763,38763,38763,38763,38763,38763,both,1250,25,4,25.797797
4,BE,% pib,1,2,38763,38763,38763,38763,38763,38763,both,1650,45,2,25.797797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504586,PSD,€ 50 m€,2,3,38165,38165,38165,38165,38165,0,both,1250,25,4,52.404035
504587,PSD,€ combate,1,2,38165,38165,38165,38165,38165,0,both,1650,45,2,26.202018
504588,PSD,€ combate à,1,3,38165,38165,38165,38165,38165,0,both,1250,25,4,26.202018
504589,PSD,€ empresas,2,2,38165,38165,38165,38165,38165,0,both,1650,45,2,52.404035


In [22]:
SelectedWordsPartido = WordListDensity[(WordListDensity['Density'] >  WordListDensity['Min_Density']) | (WordListDensity['Count'] >  WordListDensity['Min_Count'])]
SelectedWordsPartido.sort_values(by=['Density'],ascending=False)

Unnamed: 0.2,Partido,Word,Count,Q_Words,Unnamed: 0,Count_Words,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,_merge,Min_Density,Min_Count,Weight,Density
68682,CDS,compromisso,32,1,1876,1876,0,0,0,0,both,4000,225,1,17057.569296
261363,Livre,proposta,345,1,20294,20294,20294,20294,20294,20294,both,4000,225,1,17000.098551
76431,CH,social,37,1,2309,2309,2309,2309,2309,2309,both,4000,225,1,16024.252923
73077,CH,direita,36,1,2309,2309,2309,2309,2309,2309,both,4000,225,1,15591.165006
356275,PCP,mais,430,1,28417,28417,28417,28417,28417,28417,both,4000,225,1,15131.787310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190817,IL,profissionais saúde,52,2,102905,102905,102905,102905,0,0,both,1650,45,2,505.320441
182401,IL,podem ser,46,2,102905,102905,102905,102905,0,0,both,1650,45,2,447.014236
125790,IL,ensino técnico profissional,40,3,102905,102905,102905,102905,0,0,both,1250,25,4,388.708032
210684,IL,serviço nacional saúde,26,3,102905,102905,102905,102905,0,0,both,1250,25,4,252.660221


In [23]:
ToExport = SelectedWordsPartido.copy()
#ToExport.drop('Q_Words', axis = 1, inplace=True)
ToExport['Weight_Count'] = ToExport['Count'] * ToExport['Weight']
ToExport['Weight_Density'] = ToExport['Density'] * ToExport['Weight']
#ToExport.drop('Count_Words', axis = 1, inplace=True)
ToExport.drop('_merge', axis = 1, inplace=True)
ToExport.drop('Unnamed: 0', axis = 1, inplace=True)
ToExport.drop('Unnamed: 0.1', axis = 1, inplace=True)
ToExport.drop('Min_Density', axis = 1, inplace=True)
ToExport.drop('Min_Count', axis = 1, inplace=True)
ToExport.sort_values(by=['Density'],ascending=False)

Unnamed: 0,Partido,Word,Count,Q_Words,Count_Words,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Weight,Density,Weight_Count,Weight_Density
68682,CDS,compromisso,32,1,1876,0,0,0,1,17057.569296,32,17057.569296
261363,Livre,proposta,345,1,20294,20294,20294,20294,1,17000.098551,345,17000.098551
76431,CH,social,37,1,2309,2309,2309,2309,1,16024.252923,37,16024.252923
73077,CH,direita,36,1,2309,2309,2309,2309,1,15591.165006,36,15591.165006
356275,PCP,mais,430,1,28417,28417,28417,28417,1,15131.787310,430,15131.787310
...,...,...,...,...,...,...,...,...,...,...,...,...
190817,IL,profissionais saúde,52,2,102905,102905,0,0,2,505.320441,104,1010.640882
182401,IL,podem ser,46,2,102905,102905,0,0,2,447.014236,92,894.028473
125790,IL,ensino técnico profissional,40,3,102905,102905,0,0,4,388.708032,160,1554.832127
210684,IL,serviço nacional saúde,26,3,102905,102905,0,0,4,252.660221,104,1010.640882


Basic count to check if parties are reasonable represented on selected words

In [24]:
ToExport.groupby(['Partido','Q_Words'])['Word'].count()

Partido  Q_Words
BE       1          16
         2           3
CDS      1          29
         2           4
         3           1
CH       1          32
         2           7
         3           4
IL       1          44
         2          16
         3           3
Livre    1          21
         2           2
         3           2
PAN      1          17
         2           2
         3           4
PCP      1          14
PS       1          20
         2           3
PSD      1          19
Name: Word, dtype: int64

In [25]:
ToExport.to_csv('Word Density by Partido.txt', index=True, mode='w', header=True, encoding='utf-8-sig')

In [26]:
ToExclude = pd.read_csv('2Exclude.txt')
ToExport = pd.read_csv('Word Density by Partido.txt')
ToExport

Unnamed: 0.1,Unnamed: 0,Partido,Word,Count,Q_Words,Count_Words,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,Weight,Density,Weight_Count,Weight_Density
0,9278,BE,bloco,156,1,38763,38763,38763,38763,1,4024.456311,156,4024.456311
1,15099,BE,criar,164,1,38763,38763,38763,38763,1,4230.838686,164,4230.838686
2,18893,BE,deve ser,50,2,38763,38763,38763,38763,2,1289.889843,100,2579.779687
3,19554,BE,direito,235,1,38763,38763,38763,38763,1,6062.482264,235,6062.482264
4,25084,BE,euros,160,1,38763,38763,38763,38763,1,4127.647499,160,4127.647499
...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,496178,PSD,sistema,184,1,38165,38165,38165,0,1,4821.171230,184,4821.171230
259,496916,PSD,social,236,1,38165,38165,38165,0,1,6183.676143,236,6183.676143
260,498911,PSD,tem,181,1,38165,38165,38165,0,1,4742.565178,181,4742.565178
261,503048,PSD,à,332,1,38165,38165,38165,0,1,8699.069828,332,8699.069828


In [27]:
SelectedWords = ToExport.groupby(['Word'])['Word'].count().reset_index(name="Count")
SelectedWords.drop('Count', axis = 1, inplace=True)
SelectedWords = SelectedWords[~SelectedWords['Word'].isin(ToExclude['2Exclude'])]
SelectedWords

Unnamed: 0,Word
0,abandono escolar
1,acesso
2,administração pública
3,animal
4,apoio
...,...
118,todo
119,trabalhar
121,é
122,é essencial


In [28]:
ToExport2 = SelectedWords.merge(WordListDensity, on=['Word'], how='left', indicator=False)
PartidoCount = ToExport2.groupby(['Partido'])['Partido'].count().reset_index(name="Count_Different")
ToExport2 = PartidoCount.merge(ToExport2, on=['Partido'], how='left', indicator=False)
ToExport2

Unnamed: 0.2,Partido,Count_Different,Word,Count,Q_Words,Unnamed: 0,Count_Words,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,_merge,Min_Density,Min_Count,Weight,Density
0,BE,101,abandono escolar,4,2,38763,38763,38763,38763,38763,38763,both,1650,45,2,103.191187
1,BE,101,acesso,101,1,38763,38763,38763,38763,38763,38763,both,4000,225,1,2605.577484
2,BE,101,administração pública,14,2,38763,38763,38763,38763,38763,38763,both,1650,45,2,361.169156
3,BE,101,animal,42,1,38763,38763,38763,38763,38763,38763,both,4000,225,1,1083.507468
4,BE,101,apoio,111,1,38763,38763,38763,38763,38763,38763,both,4000,225,1,2863.555452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,PSD,106,todo,149,1,38165,38165,38165,38165,38165,0,both,4000,225,1,3904.100616
888,PSD,106,trabalhar,73,1,38165,38165,38165,38165,38165,0,both,4000,225,1,1912.747282
889,PSD,106,é,394,1,38165,38165,38165,38165,38165,0,both,4000,225,1,10323.594917
890,PSD,106,é essencial,6,2,38165,38165,38165,38165,38165,0,both,1650,45,2,157.212105


## Cross Pivot so that words are columns (dimensions)

In [29]:
DensityPivot = ToExport2.pivot(index=['Partido', 'Count_Words', 'Count_Different'],columns='Word', values='Density')
DensityPivot = DensityPivot.fillna(0)
DensityPivot.to_csv('Data for Clustering - Density.txt', index=True, mode='w', header=True, encoding='utf-8-sig')
DensityPivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Word,abandono escolar,acesso,administração pública,animal,apoio,atingir objetivos desenvolvimento,através,aumento,bem,bem estar,...,sobre,social,socialismo,também,tem,todo,trabalhar,é,é essencial,é necessário
Partido,Count_Words,Count_Different,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
BE,38763,101,103.191187,2605.577484,361.169156,1083.507468,2863.555452,0.0,1754.250187,2631.375281,1547.867812,283.775766,...,3431.106983,6243.066842,0.0,2012.228156,3921.265124,4127.647499,3998.658515,8719.655341,51.595594,438.562547
CDS,1876,82,0.0,1066.098081,0.0,0.0,4264.392324,0.0,533.049041,2132.196162,1599.147122,1066.098081,...,2665.245203,6396.588486,0.0,1599.147122,1599.147122,7995.735608,1599.147122,5863.539446,533.049041,0.0
CH,2309,86,0.0,2165.439584,0.0,0.0,1299.263751,0.0,2165.439584,2165.439584,3464.703335,0.0,...,1732.351667,16024.252923,7362.494586,433.087917,433.087917,2598.527501,1299.263751,5630.142919,0.0,0.0
IL,102905,108,524.755843,3167.970458,1098.100189,437.296536,1613.138331,0.0,1826.927749,2332.24819,2167.047277,194.354016,...,3488.654584,3294.300568,29.153102,2895.874836,4888.003498,3955.104222,1992.128662,11282.25062,553.908945,719.109859
Livre,20294,102,0.0,3449.295358,788.410368,2463.782399,3252.192766,0.0,5814.526461,1379.718143,3794.224894,1773.923327,...,3055.090174,6356.558589,49.275648,1921.750271,1823.198975,7687.001084,4730.462206,3991.327486,147.826944,591.307776
PAN,31879,103,94.105838,2572.22623,564.635026,9096.897644,3293.704319,972.42699,2446.75178,1976.222592,4297.499922,2101.697042,...,2603.594843,3732.864895,0.0,1693.905079,3105.492644,7748.047304,2666.332068,5803.193325,345.054738,501.897801
PCP,28417,104,70.380406,2287.363198,1126.086498,0.0,3167.118274,0.0,3624.590914,2604.075026,2005.841574,0.0,...,1618.74934,5982.334518,0.0,1513.178731,3976.492944,3413.449696,2041.031777,9958.827462,175.951015,527.853046
PS,32349,100,30.912857,3184.024236,1916.597113,123.651427,5564.314198,0.0,4853.318495,2658.505673,2287.551393,247.302853,...,1576.55569,6151.658475,0.0,2318.464249,1143.775696,4915.144208,3462.239946,4204.148505,340.041423,247.302853
PSD,38165,106,26.202018,1807.939211,1100.484737,288.222193,3118.040089,0.0,3196.646142,2174.767457,4035.110704,445.434298,...,1755.535176,6183.676143,0.0,3458.666317,4742.565178,3904.100616,1912.747282,10323.594917,157.212105,733.656492


In [30]:
CountPivot = ToExport2.pivot(index='Partido',columns='Word', values='Count')
CountPivot = CountPivot.fillna(0)
CountPivot.to_csv('Data for Clustering - Count.txt', index=True, mode='w', header=True, encoding='utf-8-sig')
CountPivot

Word,abandono escolar,acesso,administração pública,animal,apoio,atingir objetivos desenvolvimento,através,aumento,bem,bem estar,...,sobre,social,socialismo,também,tem,todo,trabalhar,é,é essencial,é necessário
Partido,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BE,4.0,101.0,14.0,42.0,111.0,0.0,68.0,102.0,60.0,11.0,...,133.0,242.0,0.0,78.0,152.0,160.0,155.0,338.0,2.0,17.0
CDS,0.0,2.0,0.0,0.0,8.0,0.0,1.0,4.0,3.0,2.0,...,5.0,12.0,0.0,3.0,3.0,15.0,3.0,11.0,1.0,0.0
CH,0.0,5.0,0.0,0.0,3.0,0.0,5.0,5.0,8.0,0.0,...,4.0,37.0,17.0,1.0,1.0,6.0,3.0,13.0,0.0,0.0
IL,54.0,326.0,113.0,45.0,166.0,0.0,188.0,240.0,223.0,20.0,...,359.0,339.0,3.0,298.0,503.0,407.0,205.0,1161.0,57.0,74.0
Livre,0.0,70.0,16.0,50.0,66.0,0.0,118.0,28.0,77.0,36.0,...,62.0,129.0,1.0,39.0,37.0,156.0,96.0,81.0,3.0,12.0
PAN,3.0,82.0,18.0,290.0,105.0,31.0,78.0,63.0,137.0,67.0,...,83.0,119.0,0.0,54.0,99.0,247.0,85.0,185.0,11.0,16.0
PCP,2.0,65.0,32.0,0.0,90.0,0.0,103.0,74.0,57.0,0.0,...,46.0,170.0,0.0,43.0,113.0,97.0,58.0,283.0,5.0,15.0
PS,1.0,103.0,62.0,4.0,180.0,0.0,157.0,86.0,74.0,8.0,...,51.0,199.0,0.0,75.0,37.0,159.0,112.0,136.0,11.0,8.0
PSD,1.0,69.0,42.0,11.0,119.0,0.0,122.0,83.0,154.0,17.0,...,67.0,236.0,0.0,132.0,181.0,149.0,73.0,394.0,6.0,28.0


In [31]:
FinalPivot = DensityPivot.merge(CountPivot, on=['Partido'], how='left', indicator=True)
FinalPivot.drop('_merge', axis = 1, inplace=True)
FinalPivot = FinalPivot.fillna(0)
FinalPivot

Word,abandono escolar_x,acesso_x,administração pública_x,animal_x,apoio_x,atingir objetivos desenvolvimento_x,através_x,aumento_x,bem_x,bem estar_x,...,sobre_y,social_y,socialismo_y,também_y,tem_y,todo_y,trabalhar_y,é_y,é essencial_y,é necessário_y
Partido,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BE,103.191187,2605.577484,361.169156,1083.507468,2863.555452,0.0,1754.250187,2631.375281,1547.867812,283.775766,...,133.0,242.0,0.0,78.0,152.0,160.0,155.0,338.0,2.0,17.0
CDS,0.0,1066.098081,0.0,0.0,4264.392324,0.0,533.049041,2132.196162,1599.147122,1066.098081,...,5.0,12.0,0.0,3.0,3.0,15.0,3.0,11.0,1.0,0.0
CH,0.0,2165.439584,0.0,0.0,1299.263751,0.0,2165.439584,2165.439584,3464.703335,0.0,...,4.0,37.0,17.0,1.0,1.0,6.0,3.0,13.0,0.0,0.0
IL,524.755843,3167.970458,1098.100189,437.296536,1613.138331,0.0,1826.927749,2332.24819,2167.047277,194.354016,...,359.0,339.0,3.0,298.0,503.0,407.0,205.0,1161.0,57.0,74.0
Livre,0.0,3449.295358,788.410368,2463.782399,3252.192766,0.0,5814.526461,1379.718143,3794.224894,1773.923327,...,62.0,129.0,1.0,39.0,37.0,156.0,96.0,81.0,3.0,12.0
PAN,94.105838,2572.22623,564.635026,9096.897644,3293.704319,972.42699,2446.75178,1976.222592,4297.499922,2101.697042,...,83.0,119.0,0.0,54.0,99.0,247.0,85.0,185.0,11.0,16.0
PCP,70.380406,2287.363198,1126.086498,0.0,3167.118274,0.0,3624.590914,2604.075026,2005.841574,0.0,...,46.0,170.0,0.0,43.0,113.0,97.0,58.0,283.0,5.0,15.0
PS,30.912857,3184.024236,1916.597113,123.651427,5564.314198,0.0,4853.318495,2658.505673,2287.551393,247.302853,...,51.0,199.0,0.0,75.0,37.0,159.0,112.0,136.0,11.0,8.0
PSD,26.202018,1807.939211,1100.484737,288.222193,3118.040089,0.0,3196.646142,2174.767457,4035.110704,445.434298,...,67.0,236.0,0.0,132.0,181.0,149.0,73.0,394.0,6.0,28.0


In [32]:
FinalPivot.to_csv('Data for Clustering.txt', index=True, mode='w', header=True, encoding='utf-8-sig')