In [2]:
%load_ext autoreload
%autoreload 2

In [10]:
import utils
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import os
import json

freq_langs = defaultdict(float)
num_cores = os.cpu_count() 
divide_batch = num_cores if num_cores is not None else 16

In [4]:

vocabulary = set()

config = utils.DocProcessingConfig(
    lemmatize=True,
    stopwords=True,
    lowercase=True,
    remove_punctuation=True,
)


with open(utils.CORPUS_PATH, "r") as f:
    raw_docs = json.load(f)
    # raw_docs = ijson.items(f, "item")
    for raw_doc in raw_docs:
        freq_langs[raw_doc['lang']] += 1

total_docs = int(sum(freq_langs.values()))
print("Total docs: ", total_docs)
print("\n## Raw frequencies")
for lang, freq in freq_langs.items():
    print(f'{lang}: {freq}')

print("\n## Normalized frequencies")
for lang, freq in freq_langs.items():
    freq_langs[lang] = freq * 100  / total_docs

for lang, freq in freq_langs.items():
    print(f'{lang}: {freq:.2f}%')



Total docs:  268022

## Raw frequencies
en: 207363.0
fr: 10676.0
de: 10992.0
es: 11019.0
it: 11250.0
ko: 7893.0
ar: 8829.0

## Normalized frequencies
en: 77.37%
fr: 3.98%
de: 4.10%
es: 4.11%
it: 4.20%
ko: 2.94%
ar: 3.29%


In [7]:
for lang in utils.Lang:
    vocab =  utils.load(f".cache/vocab_test_{lang.value}.pkl")
    print(lang.value)
    print(len(vocab))
    print()


de
604444

fr
514244

es
433406

it
624928

ar
175670

en
2161857

ko
350877



In [8]:
i = 0
en_vocab =  utils.load(f".cache/vocab_test_en.pkl")
for word in en_vocab:
    i+= 1
    if i < 2000:
        continue
    
    print(word)
    if i == 50:
        break


supramaximal
trichobathra
franchized
fgth
gojunen
pondattithan
daterie
commencan
rotenhof
overlorded
sebkhat
onwisconsin
kexts
leano
maadhu
selenocosmia
qiangbang
douw
unlife
marzuoli
hashimbekov
nitidum
seilbek
simmerud
puratam
sdiy
annoyd
roslanbek
bergum
dissesion
akkol
lamphong
kpeve
nsacp
kullager
rastell
maydaa
pamenter
tholekavu
bugut
zamas
stigmatella
pylas
wpwx
otthild
nupeland
manetphyan
tonggok
labonair
fas
raychikhlag
gracilispora
lepismatina
acf
nonispa
suntherland
cryptogamist
falavigna
holem
colorwash
pannacotta
haier
shaper
necabettin
redlink
flugboot
sagarbarh
urtaca
rekla
boreviulisoma
unjami
zenturm
metaocaml
francescoi
kayalar
babineaux
qet
amblema
qunqirat
serraescuderio
chandrabati
greywater
libental
munras
marupuru
lahowitz
flieder
dalaraeus
mcbath
kawachiensis
metcha
argusia
eremostachy
kurtoxin
nevse
smogorzewo
minutia
twyhynde
dtbcht
dakhni
paraso


In [16]:
weird = set(list(en_vocab)[:50])

config = utils.DocProcessingConfig(True, True, True, True, True, True)
with ProcessPoolExecutor(max_workers=num_cores) as executor:
    for batch in tqdm(utils.batch_load_documents(executor, divide_batch, config=config, lang=utils.Lang.ENGLISH)):
        for doc in batch:
            words = set(doc.getTokens())
            if any(word in words for word in weird):
                print(doc.getTokens())
        


2it [00:37, 18.71s/it]

['japanese', 'television', 'series', 'metal', 'hero', 'series', 'franchise', 'sequel', 'juukou', 'fighter', 'take', 'place', 'year', 'precede', 'fighter', 'series', 'kabuto', 'air', 'action', 'footage', 'prop', 'beetleborgs', 'metallix', 'series', 'plot', 'year', 'destruction', 'jamahl', 'earth', 'return', 'peace', 'earth', 'academia', 'scientific', 'research', 'organization', 'takuya', 'kai', 'work', 'sage', 'guru', 'new', 'generation', 'insect', 'armor', 'event', 'threat', 'earth', 'threat', 'realize', 'cosmo', 'academia', 'exploration', 'submarine', 'come', 'fissure', 'ocean', 'floor', 'rise', 'huge', 'fly', 'fortress', 'ancient', 'tribe', 'melzard', 'clan', 'rest', 'millennium', 'seek', 'destroy', 'mankind', 'matriarch', 'mother', 'melzard', 'send', 'old', 'son', 'raija', 'dezzle', 'lead', 'attack', 'raija', 'elebamamoth', 'guru', 'infuse', 'neo', 'insect', 'armor', 'insect', 'power', 'create', 'command', 'voicer', 'link', 'human', 'armor', 'kengo', 'tachibana', 'run', 'ayukawa', '

3it [00:56, 18.93s/it]

['article', 'royal', 'italian', 'army', 'regio', 'esercito', 'participate', 'second', 'world', 'war', 'royal', 'italian', 'army', 'reform', 'exist', 'royal', 'army', 'start', 'unification', 'italy', 'risorgimento', 'formation', 'kingdom', 'italy', 'regno', 'end', 'dissolution', 'monarchy', 'royal', 'army', 'precede', 'individual', 'army', 'independent', 'italian', 'state', 'follow', 'italian', 'army', 'esercito', 'italiano', 'italian', 'republic', 'repubblica', 'italiana', 'organization', 'italian', 'army', 'world', 'war', 'royal', 'army', 'nominal', 'commander', 'chief', 'italian', 'royal', 'army', 'majesty', 'king', 'vittorio', 'emanuele', 'iii', 'commander', 'chief', 'italian', 'armed', 'force', 'vittorio', 'emanuele', 'command', 'royal', 'air', 'force', 'regia', 'aeronautica', 'royal', 'navy', 'regia', 'marina', 'reality', 'king', 'military', 'responsibility', 'assume', 'italian', 'prime', 'minister', 'benito', 'mussolini', 'mussolini', 'supreme', 'command', 'comando', 'supremo', '

4it [01:15, 18.83s/it]

['kong', 'animate', 'series', 'animate', 'television', 'series', 'follow', 'king', 'kong', 'title', 'character', 'base', 'film', 'series', 'production', 'bkn', 'international', 'ellipse', 'animation', 'premiere', 'france', 'network', 'kong', 'create', 'compete', 'godzilla', 'series', 'produce', 'bkn', 'air', 'syndication', 'plot', 'king', 'kong', 'fall', 'death', 'empire', 'state', 'building', 'shoot', 'biplane', 'scientist', 'name', 'lorna', 'jenkin', 'take', 'dna', 'sample', 'recreate', 'clone', 'counterpart', 'original', 'kong', 'help', 'dna', 'grandson', 'jason', 'year', 'later', 'lorna', 'jenkin', 'send', 'mail', 'grandson', 'friend', 'eric', 'tan', 'tannenbaum', 'invite', 'kong', 'home', 'kong', 'island', 'know', 'university', 'professor', 'ramone', 'porta', 'tamper', 'mail', 'get', 'invite', 'meet', 'native', 'girl', 'lua', 'group', 'take', 'lorna', 'jenkin', 'lab', 'ramone', 'true', 'color', 'show', 'get', 'access', 'cyber', 'link', 'technology', 'ramone', 'use', 'steal', 'prim

5it [01:34, 18.79s/it]

['follow', 'event', 'occur', 'march', 'march', 'tuesday', 'nasa', 'establish', 'office', 'life', 'science', 'work', 'exobiology', 'base', 'joshua', 'lederberg', 'idea', 'space', 'vehicle', 'sterilize', 'mission', 'order', 'prevent', 'possibility', 'contamination', 'outer', 'space', 'earth', 'microbe', 'march', 'wednesday', 'visit', 'montevideo', 'president', 'united', 'states', 'people', 'fall', 'victim', 'tear', 'gas', 'uruguayan', 'police', 'disperse', 'riot', 'university', 'student', 'dwight', 'eisenhower', 'host', 'newly', 'inaugurate', 'uruguayan', 'president', 'benito', 'nardone', 'see', 'rub', 'eye', 'motorcade', 'pass', 'shortly', 'gas', 'lufthansa', 'german', 'national', 'airline', 'enter', 'jet', 'age', 'flight', 'bear', 'hector', 'calma', 'filipino', 'basketball', 'player', 'manila', 'die', 'taczak', 'polish', 'general', 'march', 'thursday', 'pope', 'john', 'xxiii', 'elevate', 'seven', 'bishop', 'college', 'cardinal', 'roman', 'catholic', 'church', 'bring', 'number', 'member

6it [01:53, 18.99s/it]

['antonio', 'ortega', 'december', 'january', 'spanish', 'sculptor', 'know', 'andalusian', 'imagery', 'biography', 'antonio', 'ortega', 'bear', 'ayamonte', 'county', 'huelva', 'december', 'teenager', 'show', 'restless', 'passion', 'innate', 'ability', 'sculpture', 'produce', 'self', 'teach', 'work', 'year', 'later', 'show', 'master', 'mariano', 'benlliure', 'appear', 'typical', 'mature', 'sculptor', 'carry', 'study', 'madrid', 'attend', 'escuela', 'bellas', 'arte', 'san', 'fernando', 'study', 'sculpture', 'design', 'teaching', 'excellent', 'teacher', 'mariano', 'benlliure', 'capuz', 'manuel', 'benedito', 'juan', 'adsuara', 'work', 'stage', 'production', 'year', 'approach', 'castilian', 'imagery', 'study', 'valladolid', 'find', 'mentor', 'gregorio', 'work', 'workshop', 'san', 'cristobal', 'street', 'huelva', 'share', 'painter', 'pedro', 'soon', 'workshop', 'informal', 'school', 'artist', 'atheneum', 'art', 'humanity', 'frequent', 'artist', 'live', 'pass', 'huelva', 'poet', 'journalist', 

7it [02:11, 18.51s/it]

['king', 'kong', 'american', 'pre', 'code', 'adventure', 'fantasy', 'horror', 'monster', 'film', 'direct', 'produce', 'merian', 'cooper', 'ernest', 'schoedsack', 'screenplay', 'james', 'ashmore', 'creelman', 'ruth', 'rise', 'develop', 'idea', 'conceive', 'cooper', 'edgar', 'wallace', 'star', 'fay', 'wray', 'robert', 'armstrong', 'bruce', 'cabot', 'tell', 'story', 'giant', 'ape', 'dub', 'kong', 'attempt', 'possess', 'beautiful', 'young', 'woman', 'feature', 'stop', 'motion', 'animation', 'willis', 'music', 'score', 'max', 'steiner', 'entry', 'king', 'kong', 'franchise', 'king', 'kong', 'open', 'new', 'york', 'city', 'march', 'rave', 'review', 'rank', 'rotten', 'tomato', 'great', 'horror', 'film', 'time', 'sixth', 'great', 'film', 'time', 'deem', 'culturally', 'historically', 'aesthetically', 'significant', 'library', 'congress', 'select', 'preservation', 'national', 'film', 'registry', 'sequel', 'title', 'son', 'kong', 'fast', 'track', 'release', 'year', 'film', 'follow', 'decade', 'inc

10it [03:17, 19.72s/it]


KeyboardInterrupt: 

In [3]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
db_path = 'tf_idf_and_idf.db'  # Replace with your actual database path
conn = sqlite3.connect(db_path)

# Get a cursor object
cursor = conn.cursor()

# Retrieve the names of all tables in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

# Print the names of all tables
print("Tables in the database:")
for table_name in tables:
    print(table_name[0])  # table_name is a tuple, get the first element

# Load each table into a DataFrame and show the first five rows of a specific column
specific_column_name = 'doc_id'  # Replace with your actual column name
for table_name in tables:
    if table_name[0].startswith('idf'):
        continue
    table_name = table_name[0]  # Extract the table name from the tuple
    df = pd.read_sql_query(f"SELECT * FROM {table_name}", conn)  # Load table into DataFrame
    
    # Show the first five rows of the specific column if it exists
    if specific_column_name in df.columns:
        print(f"\nFirst five rows of '{specific_column_name}' in table '{table_name}':")
        print(df[specific_column_name].head())
    else:
        print(f"Column '{specific_column_name}' does not exist in table '{table_name}'.")

# Close the cursor and connection
cursor.close()
conn.close()

Tables in the database:
tf_idf_de
tf_idf_fr
tf_idf_es
tf_idf_it
tf_idf_ar
tf_idf_en
tf_idf_ko
idf

First five rows of 'doc_id' in table 'tf_idf_de':
Series([], Name: doc_id, dtype: object)

First five rows of 'doc_id' in table 'tf_idf_fr':
Series([], Name: doc_id, dtype: object)

First five rows of 'doc_id' in table 'tf_idf_es':
Series([], Name: doc_id, dtype: object)

First five rows of 'doc_id' in table 'tf_idf_it':
Series([], Name: doc_id, dtype: object)

First five rows of 'doc_id' in table 'tf_idf_ar':
Series([], Name: doc_id, dtype: object)

First five rows of 'doc_id' in table 'tf_idf_en':
0    doc-en-11447
1     doc-en-9696
2     doc-en-4033
3    doc-en-10997
4     doc-en-3366
Name: doc_id, dtype: object

First five rows of 'doc_id' in table 'tf_idf_ko':
Series([], Name: doc_id, dtype: object)
