### Bibliotecas

In [1]:
import pandas as pd
from unidecode import unidecode

### Abertura do CSV

In [13]:
raw_df = pd.read_csv('data')
df = raw_df.copy()
df.head()

Unnamed: 0,palavra,tf,dicts,corpus,title,dmap
0,aa,418667,0.9999,1.0,0.0155,cdhjloqrs
1,aabora,17,0.9999,0.0,0.0,cdfghijkq
2,aaboras,0,0.01,0.0,0.0,cfj
3,aac,62309,0.9089,0.0002,0.0,cdghjk
4,aacima,72,0.001,0.0,0.0,lq


### Ordenação por frequência

In [3]:
df = df.sort_values(by='tf', ascending=False)
df.head()

Unnamed: 0,palavra,tf,dicts,corpus,title,dmap
720864,de,2249801797,1.0,1.0,0.2008,abcdghijklmopqst
2105974,que,986271766,1.0,1.0,0.2004,abcdfghijklmnopqst
995709,do,678930810,1.0,1.0,0.2103,abcdghijklmnopqrst
1025648,em,617333684,1.0,0.9997,0.3331,abcdghijklmnopqrst
717689,da,572609964,1.0,1.0,0.1684,bcdghijklmopt


### Filtro das 1000 palavras mais frequentes

In [8]:
df = df.head(1000)
df

Unnamed: 0,palavra,tf,dicts,corpus,title,dmap
720864,de,2249801797,1.0000,1.0000,0.2008,abcdghijklmopqst
2105974,que,986271766,1.0000,1.0000,0.2004,abcdfghijklmnopqst
995709,do,678930810,1.0000,1.0000,0.2103,abcdghijklmnopqrst
1025648,em,617333684,1.0000,0.9997,0.3331,abcdghijklmnopqrst
717689,da,572609964,1.0000,1.0000,0.1684,bcdghijklmopt
...,...,...,...,...,...,...
2173805,redes,4294427,0.9999,0.8874,0.0015,bcdfghijp
1890368,ofertas,4292032,0.9900,0.4268,0.3331,bcdfhij
2200806,rei,4278026,1.0000,0.9997,0.3340,abcdfghijklmnopqrst
178897,altura,4275602,1.0000,0.4268,0.0000,abcdfghijklmnoqrst


### Remoção das palavras com acento ou números

In [7]:
df = df.dropna()
df = df[df['palavra'].str.isalpha()]
df = df[df['palavra'] == df['palavra'].apply(unidecode)]
df['palavra'] = df['palavra'].str.lower()
df.head()


Unnamed: 0,palavra,tf,dicts,corpus,title,dmap
720864,de,2249801797,1.0,1.0,0.2008,abcdghijklmopqst
2105974,que,986271766,1.0,1.0,0.2004,abcdfghijklmnopqst
995709,do,678930810,1.0,1.0,0.2103,abcdghijklmnopqrst
1025648,em,617333684,1.0,0.9997,0.3331,abcdghijklmnopqrst
717689,da,572609964,1.0,1.0,0.1684,bcdghijklmopt


### Adição da coluna length

In [10]:
df['length'] = df['palavra'].apply(len)
df.head()

Unnamed: 0,palavra,tf,dicts,corpus,title,dmap,length
720864,de,2249801797,1.0,1.0,0.2008,abcdghijklmopqst,2
2105974,que,986271766,1.0,1.0,0.2004,abcdfghijklmnopqst,3
995709,do,678930810,1.0,1.0,0.2103,abcdghijklmnopqrst,2
1025648,em,617333684,1.0,0.9997,0.3331,abcdghijklmnopqrst,2
717689,da,572609964,1.0,1.0,0.1684,bcdghijklmopt,2


### Ordenação por length

In [11]:
df = df.sort_values(by='length', ascending=True)
df.head()

Unnamed: 0,palavra,tf,dicts,corpus,title,dmap,length
250822,ao,160692754,1.0,0.9999,0.2028,abcdghijklmnopst,2
1915948,ou,159571855,1.0,0.9989,0.3331,abcdghijklmopst,2
1334420,eu,129990465,1.0,0.9994,0.3506,abcdghijklmoqs,2
2583982,vi,7575803,0.999,0.9994,0.0032,bcdghijm,2
720864,de,2249801797,1.0,1.0,0.2008,abcdghijklmopqst,2


### Exportação apra tabela lua

In [12]:
with open('words.lua', 'w') as lua_file:
  lua_file.write('local words = {\n')
  for _, row in df.iterrows():
    lua_file.write(f"    {{word = '{row['palavra']}', length = {row['length']}}},\n")
  lua_file.write('}\n')
  lua_file.write('return words\n')