In [1]:
import pyarrow.dataset as ds
import os
import pandas as pd
from tqdm import tqdm

from spacy.tokens import DocBin
import json
import pickle
import re
from collections import Counter
from multiprocessing import Pool

available_processors = os.cpu_count() - 1

In [2]:
def doc_clean(doc, remove_stopwords=True):
    cleaned_doc = list()
    for i, token in enumerate(doc):
        if token.is_punct:
            continue
        elif remove_stopwords == True and token.is_stop:
            continue
        elif token.pos_ == "SPACE":
            continue
        else:
            cleaned_doc.append(token)
    return cleaned_doc

In [3]:
def get_docbin_from_tg_record_id(x, d2db):
    try:
        dt = d2db[x]["docbin"]
        return dt
        
    except:
        dt = None
        return dt
        

In [4]:
def get_docbin_line_from_tg_record_id(x, d2db):
    try:
        dt = d2db[x]["line"]
        return dt
        
    except:
        dt = None
        return dt
        

In [5]:
def get_sample_lines_tg(df, doc2docbin_path):
    d2db = json.load(open(doc2docbin_path, 'r'))
    df['docbin'] = df.record_id.apply(lambda x: get_docbin_from_tg_record_id(x, d2db))
    df['docbin_line'] = df.record_id.apply(lambda x: get_docbin_line_from_tg_record_id(x, d2db))
    
    grouped = df.groupby('docbin')['docbin_line'].apply(lambda x: list(set(x)))
    sample = [(docbin, lines) for docbin, lines in grouped.items()]
    print(f"Generated sample with {len(sample)} items")
    return sample

In [6]:
def docbin_counter(docbin, query_docs, nlp, remove_stopwords = True, entities = False):
    import spacy
    nlp = spacy.load('nl_core_news_sm')
    
    docbin_subtotal = Counter()
    db = DocBin().from_disk(docbin)
    docs = list(db.get_docs(nlp.vocab))
    
    print('processing docbin_counter')
    if entities == True:
        for line in query_docs:
            doc = docs[int(line)]
            doc_counter = Counter([(token[0], token[1]) for token in doc_clean_entities(doc, remove_stopwords = remove_stopwords)])
            docbin_subtotal.update(doc_counter)
            
    elif entities == False:
        for line in query_docs:
            doc = docs[int(line)]
            doc_counter = Counter([(token.lemma_, token.pos_) for token in doc_clean(doc, remove_stopwords=remove_stopwords)])
            docbin_subtotal.update(doc_counter)
            #print(f'docbin subtotal = {docbin_subtotal}') PRINTS (TERM : LABEL) : COUNT
    return docbin_subtotal
    


In [7]:
import spacy

def docbin_counter_onecore(sample, remove_stopwords = True, entities = False, docbin_folder = "./r_TheRedPill_docbins/", file_prefix = "r_TheRedPill", workers = available_processors):
    counters = None
    
    args = [(docbin_folder + "/" + file_prefix + "_docbin_" + str(int(docbin)) + ".db", 
             query_docs, 
             remove_stopwords, 
             entities
            ) for docbin, query_docs in sample]

    for a in args:
        counters = docbin_counter(a[0],a[1],spacy.load('nl_core_news_sm') ,a[2],a[3])
        break
    
    total = Counter()
    
    for counter in counters:
        total.update(counter)
        
    
    return counters


In [8]:
def fdist2table(fdist, savename = "./heads_of_state_fdist.xlsx"):
    
    keys = list(fdist.keys())
    records = [(key[0], key[1], fdist[key]) for key in keys]
    
    tb = pd.DataFrame.from_records(records, columns = ["word", "label", "count"])
    tb = tb.loc[(tb["label"] != "SPACE") & (tb["label"] != "PUNCT")] # taking out some unnecessary parts of speech
    tb.to_excel(savename, index = False, engine = 'xlsxwriter')
    print("Frequency distribution saved!")
    return tb

In [9]:

filepath = 'S:\\ERP Raw Data\\pien\\THESIS\\docbins'
filepath_parquets = 'S:\\ERP Raw Data\\pien\\THESIS\\parquets'
filepath_fdists = 'S:\\ERP Raw Data\\pien\\THESIS\\fdists'

for file in os.listdir(filepath):

    file_base = file.removesuffix('_docbins')
    
    #paths
    docbin_folder = os.path.join(filepath, file)
    d2db_path = os.path.join(docbin_folder, f"{file_base}_doc2docbin.json")
    parq_direct = os.path.join(filepath_parquets, f'{file_base}.pqt')
    
    smpl = get_sample_lines_tg(pd.read_parquet(parq_direct), d2db_path)
    num_workers = os.cpu_count()
        
    print(f"Processing {file}. Please wait as this may take a while...")
    fdist = docbin_counter_onecore( #docbin_counter_multicore
            smpl,
            remove_stopwords = True,
            entities = False,
            docbin_folder = docbin_folder,
            file_prefix = file_base,
            workers = num_workers
        )

    print(f'Frequency distribution of {file} created.')
        # logic to save files
    

    new_folder = os.path.join(filepath_fdists, f'{file_base}_fdist')
    os.makedirs(new_folder, exist_ok=True)

    savename = f'frequency_distribution_{file_base}'

# save a pickle (with .fdist extension) and an excel file
    pkl_of = os.path.join(new_folder, savename + ".fdist")
    xls_of = os.path.join(new_folder, savename + ".xlsx")

    # save fdist using pickle
    with open(pkl_of, 'wb') as fdist_out:
        pickle.dump(fdist, fdist_out)
    # save fdist using excel
    fdist_df = fdist2table(fdist, xls_of)

    # create preview for user
    fdist_df = fdist_df[fdist_df["count"] >= 50]
    print("Dataframe Preview (count >= 50)")
    print(fdist_df)
    

Generated sample with 5 items
Processing FVDNL_docbins. Please wait as this may take a while...
processing docbin_counter
Frequency distribution of FVDNL_docbins created.
Frequency distribution saved!
Dataframe Preview (count >= 50)
Generated sample with 5 items
Processing Gerrit_Brendel_docbins. Please wait as this may take a while...
processing docbin_counter
Frequency distribution of Gerrit_Brendel_docbins created.
Frequency distribution saved!
Dataframe Preview (count >= 50)
Generated sample with 7 items
Processing InfodefenseNED_docbins. Please wait as this may take a while...
processing docbin_counter
Frequency distribution of InfodefenseNED_docbins created.
Frequency distribution saved!
Dataframe Preview (count >= 50)
Generated sample with 24 items
Processing Mariba2puntnul_docbins. Please wait as this may take a while...
processing docbin_counter
Frequency distribution of Mariba2puntnul_docbins created.
Frequency distribution saved!
Dataframe Preview (count >= 50)
Generated sam