In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install transformers numpy nltk ipywidgets tqdm pandas

In [187]:
import json
import os
import urllib.request

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

# Fetch every NLTK resource the notebook relies on (already-present packages
# are reported as up-to-date by NLTK and skipped).
for corpus_name in ('words', 'wordnet', 'omw-1.4', 'names'):
    nltk.download(corpus_name)

# Sanity check: report how many entries the English word list contains.
word_list_size = len(nltk.corpus.words.words())
print(f"nltk words corpus: {word_list_size}")

nltk words corpus: 236736


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


In [238]:
# Source of the GPT-2 vocabulary and the local files derived from it.
vocab_url = 'https://huggingface.co/openai-community/gpt2/raw/main/vocab.json'
vocab_json_path = 'data/vocab.json'
vocab_csv_path = 'data/vocab.csv'
vocab_txt_path = 'data/vocab.txt'

# The output directory is not guaranteed to exist on a fresh checkout;
# create it up front so the writes below cannot fail on a missing folder.
for output_path in (vocab_json_path, vocab_csv_path, vocab_txt_path):
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

# Download inside a `with` block so the HTTP response is always closed.
with urllib.request.urlopen(vocab_url) as response:
    vocabUrlfile = response.read()
json_data = json.loads(vocabUrlfile)

# Keep an unmodified copy of the raw download next to the derived files.
with open(vocab_json_path, 'wb') as output:
    output.write(vocabUrlfile)

# vocab.json maps token -> id; turn it into a two-column frame.
df = pd.DataFrame.from_dict(json_data, orient='index')
df = df.reset_index()
df.columns = ['token', 'id']
df.to_csv(vocab_csv_path, index=False)

# Token-only listing, one token per line (CSV quoting rules still apply).
df['token'].to_csv(vocab_txt_path, index=False, header=False)

print(df)

               token     id
0                  !      0
1                  "      1
2                  #      2
3                  $      3
4                  %      4
...              ...    ...
50252       Ġregress  50252
50253      ĠCollider  50253
50254    Ġinformants  50254
50255         Ġgazed  50255
50256  <|endoftext|>  50256

[50257 rows x 2 columns]


In [241]:
# Reference vocabularies used to categorise each token.
english_dictionary = {word.lower() for word in nltk.corpus.words.words()}
common_name_dictionary = set(nltk.corpus.names.words('male.txt')) | set(nltk.corpus.names.words('female.txt'))
# Add lower-cased variants so the names can match the lower-cased tokens below.
common_name_dictionary.update({item.lower() for item in common_name_dictionary})

lemmatizer = WordNetLemmatizer()

# Remove every "Ġ" character and lower-case the token. `regex=False` makes the
# literal replacement explicit instead of relying on the pandas-version-dependent
# default of the `regex` parameter.
df['tokenCleaned'] = df['token'].str.replace("Ġ", "", regex=False).str.lower()

# One boolean column per category; a token may fall into several of them.
df['isDigit'] = df['tokenCleaned'].str.isdigit()
# A token counts as a word when its noun or verb lemma is in the NLTK word list.
df['isWord'] = df['tokenCleaned'].map(lambda word: (lemmatizer.lemmatize(word, pos='n') in english_dictionary or lemmatizer.lemmatize(word, pos='v') in english_dictionary))
df['isInWordNet'] = df['tokenCleaned'].map(lambda word: len(nltk.corpus.wordnet.synsets(word)) > 0)
df['isName'] = df['tokenCleaned'].isin(common_name_dictionary)
df['isNonAscii'] = ~df['tokenCleaned'].apply(lambda x: x.isascii())
# Uncategorized = matched none of the categories above.
df['isUncategorized'] = ~(df['isDigit'] | df['isWord'] | df['isInWordNet'] | df['isName'] | df['isNonAscii'])
# Marks every repeat of a cleaned token after its first occurrence. This is
# computed before the sort below, so "first" refers to the current row order.
df['isDuplicated'] = df.duplicated(subset='tokenCleaned', keep='first')


# Order rows by the length of the cleaned token (shortest first).
df = df.sort_values(by='tokenCleaned', key=lambda x: x.str.len())

def save_to_txt(df, category, filename, basePath="data/"):
    """Write the non-duplicated cleaned tokens selected by ``category`` to a file.

    Args:
        df: DataFrame with a ``tokenCleaned`` column and a boolean
            ``isDuplicated`` column.
        category: Boolean Series, indexed like ``df``, selecting the rows to keep.
        filename: Name of the output file inside ``basePath``.
        basePath: Output directory, with or without a trailing separator.
            It is created if it does not exist.

    Side effects:
        Prints ``filename`` and the number of tokens written, then writes the
        tokens one per line (no header, no index) to ``basePath/filename``.
    """
    # Select rows and column in a single .loc call instead of chained indexing.
    df_part = df.loc[category & ~df['isDuplicated'], 'tokenCleaned']
    print(filename, len(df_part))
    if basePath:
        os.makedirs(basePath, exist_ok=True)
    # os.path.join works whether or not basePath ends with a separator; plain
    # string concatenation produced e.g. "dataisDigit.txt" for basePath="data".
    # NOTE: to_csv applies CSV quoting, so tokens containing quotes or commas
    # are written in quoted form.
    df_part.to_csv(os.path.join(basePath, filename), index=False, header=False)

# Write one de-duplicated token list per category; each file is named after
# its column. (The original cell exported isUncategorized.txt twice.)
for column in ('isUncategorized', 'isDigit', 'isWord', 'isInWordNet', 'isName', 'isNonAscii'):
    save_to_txt(df, df[column], column + '.txt')
# words.txt: ASCII-only tokens that matched at least one category.
save_to_txt(df, ~df['isUncategorized'] & ~df['isNonAscii'], 'words.txt')

# Full table with every category flag, one JSON record per token.
df.to_json('data/vocab-categorized.json', orient='records')

# These totals are taken over all rows, duplicates included, so the
# uncategorized count is larger than the number of lines in isUncategorized.txt.
print()
print("Duplicated count:", df['isDuplicated'].sum())
print("Uncategorized count:", df['isUncategorized'].sum())


isUncategorized.txt 9353
isDigit.txt 1097
isWord.txt 19817
isInWordNet.txt 20128
isName.txt 1676
isNonAscii.txt 674
isUncategorized.txt 9353
words.txt 22917

Dublicated count: 17313
Uncategorized count: 11869
