# Processing of Party Text Programs to get Word Usage Count

## Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
Partidos = ["BE", "PAN", "PCP", "L", "CH", "PS", "IL", "PEV", "AD"]

In [3]:
ToExclude = pd.read_csv('2Exclude.txt')
ToExclude

Unnamed: 0,2Exclude
0,bloco
1,pp
2,vii
3,três
4,seis
...,...
125,cdu
126,ad
127,º
128,ª


## Go through text and count word usage paragraph by paragraph

In [4]:
def CountSequences(words,n):
#Given a list, returns a dictionary mapping each n-element sequence tuple to its number of occurrences in the list.
# Initialize all counts implicitly to 0.
    countDict = defaultdict(int)

    for i in range(len(words)-n+1):
        key = tuple(words[i:i+n])
        countDict[key] = countDict[key] + 1
    return countDict

In [6]:
FullWordList = pd.DataFrame(columns = ['Word', 'Partido'])
ProcessedInput = pd.DataFrame(columns = ['Word', 'Partido', 'Q_Words', 'Count'])            

for Partido in ["PEV", "Livre"]: #Partidos:
    print(Partido+'.txt')
    
    with open(Partido+'.txt', encoding='utf-8-sig') as fp:
        
        line = fp.readline()

        while line:
            
            line = line.lower()
            
            for ToExcludeWord in ToExclude['2Exclude']:
                line = line.replace(" "+ToExcludeWord+" ", " ")
                if line[0:len(ToExcludeWord)+1] == ToExcludeWord+" ":
                    line = line[len(ToExcludeWord)+1:len(line)]
                if line[len(line)-len(ToExcludeWord)-2:len(line)-1] == " "+ToExcludeWord:
                    line = line[0:len(line)-len(ToExcludeWord)-2]+"\n"
            
            line = line.strip()
            Words = line.split()

            for Group in range (3):
                ToExport = CountSequences(Words,Group+1)
                ToExportKeys = ToExport.keys()
                for i in ToExportKeys:
                    ProcessedInput = ProcessedInput.append({'Word': ' '.join(i), 
                                                            'Partido' : Partido, 
                                                            'Q_Words' : Group+1, 
                                                            'Count': ToExport[i]},
                                                          ignore_index = True)
                    
            line = fp.readline()
        
    fp.close()
    
ProcessedInput

PEV.txt


Livre.txt


Unnamed: 0,Word,Partido,Q_Words,Count
0,manifesto,PEV,1,1
1,ecologista,PEV,1,1
2,manifesto ecologista,PEV,2,1
3,recuperar,PEV,1,1
4,voz,PEV,1,1
...,...,...,...,...
83248,janeiro,Livre,1,1
83249,28 janeiro,Livre,2,1
83250,www,Livre,1,1
83251,partidolivre,Livre,1,1


## Save as previous process is very slow

In [7]:
ProcessedInput.to_csv('Processed PEV+Livre.txt', index=True, mode='w', header=True, encoding='utf-8-sig')

## Nice place to restart run if data was saved

In [12]:
ProcessedPast = pd.read_csv('Processed BE+PAN+PCP+L+CH+PS+IL+PEV+AD.txt')
FullWordList = ProcessedPast

In [8]:
# Code to concatenate previous processing (as that is very slow execution pre-processing may by done in chunks)
ProcessedPast = pd.read_csv('Processed BE+PCP+IL.txt')
FullWordList = pd.concat([ProcessedInput,ProcessedPast])
FullWordList.to_csv('Processed BE+PCP+IL+PEV+Livre.txt', index=True, mode='w', header=True, encoding='utf-8-sig')
FullWordList


Unnamed: 0.2,Word,Partido,Q_Words,Count,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1
0,manifesto,PEV,1,1,,,
1,ecologista,PEV,1,1,,,
2,manifesto ecologista,PEV,2,1,,,
3,recuperar,PEV,1,1,,,
4,voz,PEV,1,1,,,
...,...,...,...,...,...,...,...
349403,mundo mais justo,PCP,3,1,273043.0,164802.0,164802.0
349404,mais justo pacífico,PCP,3,1,273044.0,164803.0,164803.0
349405,justo pacífico desenvolvido,PCP,3,1,273045.0,164804.0,164804.0
349406,pacífico desenvolvido sustentável,PCP,3,1,273046.0,164805.0,164805.0


## Calculate Party Program word size

In [9]:
PartidoStats = FullWordList[FullWordList.Q_Words==1.0].groupby(['Partido']).count()
PartidoStats.rename(columns={"Q_Words": "Count_Words"}, inplace=True)
PartidoStats.drop('Word', axis = 1, inplace=True)
PartidoStats.drop('Count', axis = 1, inplace=True)
PartidoStats

Unnamed: 0_level_0,Count_Words,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1
Partido,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BE,36826,36826,36826,36826
IL,38357,38357,38357,0
Livre,35481,0,0,0
PCP,22598,22598,22598,22598
PEV,1410,0,0,0


## Synonyms are use to consolidate words with same "meaning"

In [34]:
SynonymsCustom = pd.read_csv('Synonyms Custom.txt')
SynonymsDictionary = pd.read_csv('Synonyms Dictionary.txt')

SynonymsDictionaryFiltered = SynonymsDictionary.loc[SynonymsDictionary['Word2']!=""]
SynonymsDictionaryFiltered = SynonymsDictionary.loc[SynonymsDictionaryFiltered['Word2'].notnull()]
SynonymsDictionaryFiltered = SynonymsDictionaryFiltered.loc[SynonymsDictionaryFiltered['Word2']!=SynonymsDictionaryFiltered["Word"]]
Synonyms = SynonymsCustom.append(SynonymsDictionaryFiltered, ignore_index=True)
Synonyms

Unnamed: 0,Word,Word2
0,€,euros
1,união europeia,ue
2,acção social,ação social
3,através do,através de
4,aumento da,aumento de
...,...,...
6580,zee,ZEE
6581,zees,ZEE
6582,zelam,zelar
6583,zonas,zona


In [36]:
WordList = FullWordList.merge(Synonyms, on=['Word'], how='left', indicator=True)
WordList

Unnamed: 0.2,Word,Partido,Q_Words,Count,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Word2,_merge
0,manifesto,PEV,1,1,,,,,left_only
1,ecologista,PEV,1,1,,,,,left_only
2,manifesto ecologista,PEV,2,1,,,,,left_only
3,recuperar,PEV,1,1,,,,,left_only
4,voz,PEV,1,1,,,,,left_only
...,...,...,...,...,...,...,...,...,...
432662,mundo mais justo,PCP,3,1,273043.0,164802.0,164802.0,,left_only
432663,mais justo pacífico,PCP,3,1,273044.0,164803.0,164803.0,,left_only
432664,justo pacífico desenvolvido,PCP,3,1,273045.0,164804.0,164804.0,,left_only
432665,pacífico desenvolvido sustentável,PCP,3,1,273046.0,164805.0,164805.0,,left_only


In [37]:
def GetSynonym(a, b):
    if pd.isna(b):
        return a
    else:
        return b

In [38]:
WordList['Word'] = WordList.apply(lambda x: GetSynonym(x['Word'], x['Word2']), axis = 1)
WordList.drop('_merge', axis = 1, inplace=True)
WordList.drop('Word2', axis = 1, inplace=True)
WordList.head(20)

Unnamed: 0.2,Word,Partido,Q_Words,Count,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1
0,manifesto,PEV,1,1,,,
1,ecologista,PEV,1,1,,,
2,manifesto ecologista,PEV,2,1,,,
3,recuperar,PEV,1,1,,,
4,voz,PEV,1,1,,,
5,ecologista,PEV,1,1,,,
6,parlamento,PEV,1,1,,,
7,recuperar voz,PEV,2,1,,,
8,voz ecologista,PEV,2,1,,,
9,ecologista parlamento,PEV,2,1,,,


In [39]:
WordListCount = WordList.groupby(['Partido', 'Word'])['Count'].sum().reset_index(name="Count")
WordListMin = WordList.groupby(['Partido', 'Word'])['Q_Words'].min().reset_index(name="Q_Words")
WordListCount = WordListCount.merge(WordListMin, on=['Partido', 'Word'], how='left', indicator=True)
WordListCount.drop('_merge', axis = 1, inplace=True)
WordListCount

Unnamed: 0,Partido,Word,Count,Q_Words
0,BE,#NAME?,8,1
1,BE,% p,1,2
2,BE,% p ib,1,3
3,BE,% realmente,1,2
4,BE,% realmente executado,1,3
...,...,...,...,...
296699,PEV,é voto,1,2
296700,PEV,é voto cdu!,1,3
296701,PEV,órgão,1,1
296702,PEV,órgãos colegiais,1,2


## Criteria for word down-selection - density and min count

In [152]:
#Data = [[1.0,int(4000), int(225), 1],[2.0,int(1650), int(45), 2],[3.0,int(1250), int(25),4],[4.0,int(900), int(15),7]]
Data = [[1.0,int(4000), int(10), 1],[2.0,int(525), int(10), 2],[3.0,int(70), int(8),7],[4.0,int(7), int(5),5]]
WordCriteria = pd.DataFrame(Data, columns = ['Q_Words', 'Min_Density', 'Min_Count', 'Weight'])
WordCriteria

Unnamed: 0,Q_Words,Min_Density,Min_Count,Weight
0,1.0,4050,10,1
1,2.0,525,10,2
2,3.0,70,8,7
3,4.0,3,5,5


In [153]:
WordListDensity = WordListCount.merge(PartidoStats, on=['Partido'], how='left', indicator=True)
WordListDensity = WordListDensity.merge(WordCriteria, on=['Q_Words'], how='left')
WordListDensity['Density'] = WordListDensity.apply(lambda x: x['Count']*1000000/x['Count_Words'], axis = 1)
#WordListDensity

In [154]:
SelectedWordsPartido = WordListDensity[(WordListDensity['Density'] >  WordListDensity['Min_Density']) & (WordListDensity['Count'] >  WordListDensity['Min_Count'])]
SelectedWordsPartido.sort_values(by=['Density'],ascending=False)

Unnamed: 0.2,Partido,Word,Count,Q_Words,Count_Words,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,_merge,Min_Density,Min_Count,Weight,Density
295409,PEV,não,23,1,1410,0,0,0,both,4050,10,1,16312.056738
142261,IL,mais,543,1,38357,38357,38357,0,both,4050,10,1,14156.477305
279235,PCP,público,287,1,22598,22598,22598,22598,both,4050,10,1,12700.238959
188554,IL,é,477,1,38357,38357,38357,0,both,4050,10,1,12435.800506
174598,IL,ser,454,1,38357,38357,38357,0,both,4050,10,1,11836.170712
...,...,...,...,...,...,...,...,...,...,...,...,...,...
267661,PCP,micro pequenas médias empresas,6,4,22598,22598,22598,22598,both,3,5,5,265.510222
162816,IL,profissionais liberais trabalhadores independe...,10,4,38357,38357,38357,0,both,3,5,5,260.708606
43058,BE,inquérito condições vida,9,3,36826,36826,36826,36826,both,70,8,7,244.392549
136920,IL,instituições ensino superior,9,3,38357,38357,38357,0,both,70,8,7,234.637745


In [155]:
ToExport = SelectedWordsPartido.copy()
#ToExport.drop('Q_Words', axis = 1, inplace=True)
ToExport['Weight_Count'] = ToExport['Count'] * ToExport['Weight']
ToExport['Weight_Density'] = ToExport['Density'] * ToExport['Weight']
#ToExport.drop('Count_Words', axis = 1, inplace=True)
ToExport.drop('_merge', axis = 1, inplace=True)
ToExport.drop('Unnamed: 0', axis = 1, inplace=True)
ToExport.drop('Unnamed: 0.1', axis = 1, inplace=True)
ToExport.drop('Min_Density', axis = 1, inplace=True)
ToExport.drop('Min_Count', axis = 1, inplace=True)
ToExport.sort_values(by=['Density'],ascending=False)

Unnamed: 0,Partido,Word,Count,Q_Words,Count_Words,Unnamed: 0.1.1,Weight,Density,Weight_Count,Weight_Density
295409,PEV,não,23,1,1410,0,1,16312.056738,23,16312.056738
142261,IL,mais,543,1,38357,0,1,14156.477305,543,14156.477305
279235,PCP,público,287,1,22598,22598,1,12700.238959,287,12700.238959
188554,IL,é,477,1,38357,0,1,12435.800506,477,12435.800506
174598,IL,ser,454,1,38357,0,1,11836.170712,454,11836.170712
...,...,...,...,...,...,...,...,...,...,...
267661,PCP,micro pequenas médias empresas,6,4,22598,22598,5,265.510222,30,1327.551111
162816,IL,profissionais liberais trabalhadores independe...,10,4,38357,0,5,260.708606,50,1303.543030
43058,BE,inquérito condições vida,9,3,36826,36826,7,244.392549,63,1710.747841
136920,IL,instituições ensino superior,9,3,38357,0,7,234.637745,63,1642.464218


Basic count to check if parties are reasonable represented on selected words

In [156]:
ToExport.groupby(['Partido','Q_Words'])['Word'].count()

Partido  Q_Words
BE       1          12
         2          10
         3           4
         4           1
IL       1          14
         2          17
         3           5
         4           1
Livre    1          14
         2          13
         3           2
PCP      1          20
         2          27
         3           2
         4           2
PEV      1           3
Name: Word, dtype: int64

In [157]:
ToExport.to_csv('Word Density by Partido.txt', index=True, mode='w', header=True, encoding='utf-8-sig')

Another Nice Place to Restart - last minute update to word exclusion

In [105]:
ToExclude = pd.read_csv('2Exclude.txt')
ToExport = pd.read_csv('Word Density by Partido.txt')
ToExport#[['Count']>10]

Unnamed: 0.1,Unnamed: 0,Partido,Word,Count,Q_Words,Count_Words,Unnamed: 0.1.1,Weight,Density,Weight_Count,Weight_Density
0,711,BE,2023 catarina reis oliveira,3,4,36826,36826,7,81.464183,21,570.249280
1,3917,BE,administração pública,33,2,36826,36826,2,896.106012,66,1792.212024
2,4044,BE,adoção crianças mais velhas,3,4,36826,36826,7,81.464183,21,570.249280
3,8609,BE,at g lance 2023,4,4,36826,36826,7,108.618911,28,760.332374
4,12537,BE,carta social mt s,3,4,36826,36826,7,81.464183,21,570.249280
...,...,...,...,...,...,...,...,...,...,...,...
213,292036,PCP,vida profissional vida familiar,3,4,22598,22598,7,132.755111,21,929.285778
214,292843,PCP,é,121,1,22598,22598,1,5354.456147,121,5354.456147
215,294798,PEV,garantir,15,1,1410,0,1,10638.297872,15,10638.297872
216,295409,PEV,não,23,1,1410,0,1,16312.056738,23,16312.056738


In [59]:
SelectedWords = ToExport.groupby(['Word'])['Word'].count().reset_index(name="Count")
SelectedWords.drop('Count', axis = 1, inplace=True)
SelectedWords = SelectedWords[~SelectedWords['Word'].isin(ToExclude['2Exclude'])]
SelectedWords

Unnamed: 0,Word
0,(língua gestual línguas
1,"0,43% novos contratos"
2,"0,43%, vez quase"
4,1% orçamento estado
5,"1,5ºc estabelecido acordo"
...,...
1758,é também
1759,é voto cdu!
1760,órgãos colegiais eleitos
1761,último


In [28]:
ToExport2 = SelectedWords.merge(WordListDensity, on=['Word'], how='left', indicator=False)
PartidoCount = ToExport2.groupby(['Partido'])['Partido'].count().reset_index(name="Count_Different")
ToExport2 = PartidoCount.merge(ToExport2, on=['Partido'], how='left', indicator=False)
ToExport2

Unnamed: 0.2,Partido,Count_Different,Word,Count,Q_Words,Unnamed: 0,Count_Words,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Unnamed: 0.1.1.1.1,_merge,Min_Density,Min_Count,Weight,Density
0,BE,101,abandono escolar,4,2,38763,38763,38763,38763,38763,38763,both,1650,45,2,103.191187
1,BE,101,acesso,101,1,38763,38763,38763,38763,38763,38763,both,4000,225,1,2605.577484
2,BE,101,administração pública,14,2,38763,38763,38763,38763,38763,38763,both,1650,45,2,361.169156
3,BE,101,animal,42,1,38763,38763,38763,38763,38763,38763,both,4000,225,1,1083.507468
4,BE,101,apoio,111,1,38763,38763,38763,38763,38763,38763,both,4000,225,1,2863.555452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,PSD,106,todo,149,1,38165,38165,38165,38165,38165,0,both,4000,225,1,3904.100616
888,PSD,106,trabalhar,73,1,38165,38165,38165,38165,38165,0,both,4000,225,1,1912.747282
889,PSD,106,é,394,1,38165,38165,38165,38165,38165,0,both,4000,225,1,10323.594917
890,PSD,106,é essencial,6,2,38165,38165,38165,38165,38165,0,both,1650,45,2,157.212105


## Cross Pivot so that words are columns (dimensions)

In [29]:
DensityPivot = ToExport2.pivot(index=['Partido', 'Count_Words', 'Count_Different'],columns='Word', values='Density')
DensityPivot = DensityPivot.fillna(0)
DensityPivot.to_csv('Data for Clustering - Density.txt', index=True, mode='w', header=True, encoding='utf-8-sig')
DensityPivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Word,abandono escolar,acesso,administração pública,animal,apoio,atingir objetivos desenvolvimento,através,aumento,bem,bem estar,...,sobre,social,socialismo,também,tem,todo,trabalhar,é,é essencial,é necessário
Partido,Count_Words,Count_Different,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
BE,38763,101,103.191187,2605.577484,361.169156,1083.507468,2863.555452,0.0,1754.250187,2631.375281,1547.867812,283.775766,...,3431.106983,6243.066842,0.0,2012.228156,3921.265124,4127.647499,3998.658515,8719.655341,51.595594,438.562547
CDS,1876,82,0.0,1066.098081,0.0,0.0,4264.392324,0.0,533.049041,2132.196162,1599.147122,1066.098081,...,2665.245203,6396.588486,0.0,1599.147122,1599.147122,7995.735608,1599.147122,5863.539446,533.049041,0.0
CH,2309,86,0.0,2165.439584,0.0,0.0,1299.263751,0.0,2165.439584,2165.439584,3464.703335,0.0,...,1732.351667,16024.252923,7362.494586,433.087917,433.087917,2598.527501,1299.263751,5630.142919,0.0,0.0
IL,102905,108,524.755843,3167.970458,1098.100189,437.296536,1613.138331,0.0,1826.927749,2332.24819,2167.047277,194.354016,...,3488.654584,3294.300568,29.153102,2895.874836,4888.003498,3955.104222,1992.128662,11282.25062,553.908945,719.109859
Livre,20294,102,0.0,3449.295358,788.410368,2463.782399,3252.192766,0.0,5814.526461,1379.718143,3794.224894,1773.923327,...,3055.090174,6356.558589,49.275648,1921.750271,1823.198975,7687.001084,4730.462206,3991.327486,147.826944,591.307776
PAN,31879,103,94.105838,2572.22623,564.635026,9096.897644,3293.704319,972.42699,2446.75178,1976.222592,4297.499922,2101.697042,...,2603.594843,3732.864895,0.0,1693.905079,3105.492644,7748.047304,2666.332068,5803.193325,345.054738,501.897801
PCP,28417,104,70.380406,2287.363198,1126.086498,0.0,3167.118274,0.0,3624.590914,2604.075026,2005.841574,0.0,...,1618.74934,5982.334518,0.0,1513.178731,3976.492944,3413.449696,2041.031777,9958.827462,175.951015,527.853046
PS,32349,100,30.912857,3184.024236,1916.597113,123.651427,5564.314198,0.0,4853.318495,2658.505673,2287.551393,247.302853,...,1576.55569,6151.658475,0.0,2318.464249,1143.775696,4915.144208,3462.239946,4204.148505,340.041423,247.302853
PSD,38165,106,26.202018,1807.939211,1100.484737,288.222193,3118.040089,0.0,3196.646142,2174.767457,4035.110704,445.434298,...,1755.535176,6183.676143,0.0,3458.666317,4742.565178,3904.100616,1912.747282,10323.594917,157.212105,733.656492


In [30]:
CountPivot = ToExport2.pivot(index='Partido',columns='Word', values='Count')
CountPivot = CountPivot.fillna(0)
CountPivot.to_csv('Data for Clustering - Count.txt', index=True, mode='w', header=True, encoding='utf-8-sig')
CountPivot

Word,abandono escolar,acesso,administração pública,animal,apoio,atingir objetivos desenvolvimento,através,aumento,bem,bem estar,...,sobre,social,socialismo,também,tem,todo,trabalhar,é,é essencial,é necessário
Partido,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BE,4.0,101.0,14.0,42.0,111.0,0.0,68.0,102.0,60.0,11.0,...,133.0,242.0,0.0,78.0,152.0,160.0,155.0,338.0,2.0,17.0
CDS,0.0,2.0,0.0,0.0,8.0,0.0,1.0,4.0,3.0,2.0,...,5.0,12.0,0.0,3.0,3.0,15.0,3.0,11.0,1.0,0.0
CH,0.0,5.0,0.0,0.0,3.0,0.0,5.0,5.0,8.0,0.0,...,4.0,37.0,17.0,1.0,1.0,6.0,3.0,13.0,0.0,0.0
IL,54.0,326.0,113.0,45.0,166.0,0.0,188.0,240.0,223.0,20.0,...,359.0,339.0,3.0,298.0,503.0,407.0,205.0,1161.0,57.0,74.0
Livre,0.0,70.0,16.0,50.0,66.0,0.0,118.0,28.0,77.0,36.0,...,62.0,129.0,1.0,39.0,37.0,156.0,96.0,81.0,3.0,12.0
PAN,3.0,82.0,18.0,290.0,105.0,31.0,78.0,63.0,137.0,67.0,...,83.0,119.0,0.0,54.0,99.0,247.0,85.0,185.0,11.0,16.0
PCP,2.0,65.0,32.0,0.0,90.0,0.0,103.0,74.0,57.0,0.0,...,46.0,170.0,0.0,43.0,113.0,97.0,58.0,283.0,5.0,15.0
PS,1.0,103.0,62.0,4.0,180.0,0.0,157.0,86.0,74.0,8.0,...,51.0,199.0,0.0,75.0,37.0,159.0,112.0,136.0,11.0,8.0
PSD,1.0,69.0,42.0,11.0,119.0,0.0,122.0,83.0,154.0,17.0,...,67.0,236.0,0.0,132.0,181.0,149.0,73.0,394.0,6.0,28.0


In [31]:
FinalPivot = DensityPivot.merge(CountPivot, on=['Partido'], how='left', indicator=True)
FinalPivot.drop('_merge', axis = 1, inplace=True)
FinalPivot = FinalPivot.fillna(0)
FinalPivot

Word,abandono escolar_x,acesso_x,administração pública_x,animal_x,apoio_x,atingir objetivos desenvolvimento_x,através_x,aumento_x,bem_x,bem estar_x,...,sobre_y,social_y,socialismo_y,também_y,tem_y,todo_y,trabalhar_y,é_y,é essencial_y,é necessário_y
Partido,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BE,103.191187,2605.577484,361.169156,1083.507468,2863.555452,0.0,1754.250187,2631.375281,1547.867812,283.775766,...,133.0,242.0,0.0,78.0,152.0,160.0,155.0,338.0,2.0,17.0
CDS,0.0,1066.098081,0.0,0.0,4264.392324,0.0,533.049041,2132.196162,1599.147122,1066.098081,...,5.0,12.0,0.0,3.0,3.0,15.0,3.0,11.0,1.0,0.0
CH,0.0,2165.439584,0.0,0.0,1299.263751,0.0,2165.439584,2165.439584,3464.703335,0.0,...,4.0,37.0,17.0,1.0,1.0,6.0,3.0,13.0,0.0,0.0
IL,524.755843,3167.970458,1098.100189,437.296536,1613.138331,0.0,1826.927749,2332.24819,2167.047277,194.354016,...,359.0,339.0,3.0,298.0,503.0,407.0,205.0,1161.0,57.0,74.0
Livre,0.0,3449.295358,788.410368,2463.782399,3252.192766,0.0,5814.526461,1379.718143,3794.224894,1773.923327,...,62.0,129.0,1.0,39.0,37.0,156.0,96.0,81.0,3.0,12.0
PAN,94.105838,2572.22623,564.635026,9096.897644,3293.704319,972.42699,2446.75178,1976.222592,4297.499922,2101.697042,...,83.0,119.0,0.0,54.0,99.0,247.0,85.0,185.0,11.0,16.0
PCP,70.380406,2287.363198,1126.086498,0.0,3167.118274,0.0,3624.590914,2604.075026,2005.841574,0.0,...,46.0,170.0,0.0,43.0,113.0,97.0,58.0,283.0,5.0,15.0
PS,30.912857,3184.024236,1916.597113,123.651427,5564.314198,0.0,4853.318495,2658.505673,2287.551393,247.302853,...,51.0,199.0,0.0,75.0,37.0,159.0,112.0,136.0,11.0,8.0
PSD,26.202018,1807.939211,1100.484737,288.222193,3118.040089,0.0,3196.646142,2174.767457,4035.110704,445.434298,...,67.0,236.0,0.0,132.0,181.0,149.0,73.0,394.0,6.0,28.0


In [32]:
FinalPivot.to_csv('Data for Clustering.txt', index=True, mode='w', header=True, encoding='utf-8-sig')