### The following packages were installed:

Other packages, such as `spacy`, had already been installed. This code was run in a virtual environment on my MacBook Pro (MBP) with Python 3.9 and Conda.

English stopwords are used.

For reproducibility: the pickles can simply be reloaded at the end of the code, because lemmatizing takes a long time.

## st3f: removes drop_lowvalue_words_parallel

In [1]:
import csv, gensim, glob, importlib, json, mpfiles, os, pickle, pyLDAvis, re, spacy, string, sys, warnings
import numpy as np, pandas as pd, gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel, TfidfModel
from gensim.test.utils import datapath
from nltk.corpus import stopwords
import pyLDAvis.gensim_models
from gensim.test.utils import datapath
import matplotlib.pyplot as plt
import multiprocessing as mp
from functools import partial
from gensim.corpora import MmCorpus

# Re-execute the local helper module so that edits made to it since the first
# import are picked up without restarting the kernel.
importlib.reload(mpfiles)

# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Make the parent directory importable.
# NOTE(review): this runs after `import mpfiles` above, so it cannot influence
# that import.
basedir = '../'
sys.path.append(basedir)

# Rebinds the imported name `stopwords` to the value returned by
# `stopwords.words("english")`; from here on `stopwords` no longer refers to
# the nltk corpus object.
stopwords = stopwords.words("english")
# Length threshold passed to filter_corpus as `min10kwords`.
min10kwords = 200
filename_list = []

def gen_words(texts):
    """Tokenize every document with ``gensim.utils.simple_preprocess``.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        A list holding one token list per input document, in input order.
        Each document is processed with ``deacc=True``.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

def gen_corpus(id2word, data_words):
    """Convert tokenized documents into bag-of-words vectors.

    Args:
        id2word: Object exposing ``doc2bow(tokens)`` (e.g. a gensim
            ``Dictionary``).
        data_words: Iterable of token lists, one per document.

    Returns:
        A list with the ``doc2bow`` result of every document, in input order.
    """
    corpus = []
    # Count from 1 so the progress message reports the number of documents
    # that have actually been added (the previous counter was one ahead).
    for n_done, text in enumerate(data_words, start=1):
        corpus.append(id2word.doc2bow(text))
        if n_done % 500 == 0:
            print('Text entered corpus:', n_done)
    return corpus

def create_textsmp(yr):
    """Process every 1A text file of year ``yr`` with multiprocessing pools.

    For each of the four quarters a new ``mp.Pool`` sized to
    ``mp.cpu_count()`` is started and one ``mpfiles.process_file`` job is
    submitted per ``*.txt`` file found in that quarter's folder. Results are
    collected only after all quarters have been submitted and joined.

    Args:
        yr: Year used to build the input folder path.

    Returns:
        ``(texts, filename_list)``: two parallel lists in submission order.
        Each worker result is unpacked as ``(filename, doc)``.
    """
    print("Creating texts")
    jobs = []
    submitted = 0
    for qtr in (1, 2, 3, 4):
        pool = mp.Pool(mp.cpu_count())
        print(f"Pool started with {mp.cpu_count()} cores. Qtr {qtr}")
        quarter_pattern = f'/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/1A files/{yr}/Q{qtr}/*.txt'
        for path in glob.glob(quarter_pattern):
            jobs.append(pool.apply_async(mpfiles.process_file, args=(path,)))
            submitted += 1
            if submitted % 500 == 0:
                print(f"Created {submitted} texts so far")
        # Wait for this quarter's workers before starting the next pool.
        pool.close()
        pool.join()

    texts = []
    filename_list = []
    for job in jobs:
        filename, doc = job.get()
        filename_list.append(filename)
        texts.append(doc)
    return texts, filename_list

def lemmatization(old_texts, selection, yr, num_processes=mp.cpu_count()):
    """Lemmatize the selected documents in parallel and pickle the results.

    Args:
        old_texts: Sequence of documents, indexable by the values found in
            ``selection['order_in_cik']``.
        selection: Table with an ``'order_in_cik'`` column naming the
            documents to keep; it is pickled alongside the output.
        yr: Year used in the output folder and file names.
        num_processes: Size of the worker pool.

    Returns:
        The list of ``mpfiles.lemmatize_text`` results, one per selected
        document, in selection order.
    """
    keep_positions = selection['order_in_cik']
    print("Starting lemmatization")
    pool = mp.Pool(num_processes)
    chosen_texts = [old_texts[pos] for pos in keep_positions]
    jobs = []
    for n_submitted, text in enumerate(chosen_texts, start=1):
        jobs.append(pool.apply_async(mpfiles.lemmatize_text, args=(text,)))
        # Report progress every 500 submissions.
        if n_submitted % 500 == 0:
            print(f"Lemmatized {n_submitted} texts so far")
    pool.close()
    pool.join()
    texts_out = [job.get() for job in jobs]

    path = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/lemmatized_texts/{yr}"
    # Create the output directory on first use.
    if not os.path.exists(path):
        os.makedirs(path)

    # Persist the lemmatized texts first, then the selection that produced them.
    outputs = (
        (f"lemmatized_texts{yr}.pkl", texts_out),
        (f"lem_filter{yr}.pkl", selection),
    )
    for pkl_name, payload in outputs:
        with open(os.path.join(path, pkl_name), "wb") as f:
            pickle.dump(payload, f)

    return texts_out

def create_crosswalks(filename_list, yr):
    """Write the crosswalk between file names and list positions for ``yr``.

    The position of every entry in ``filename_list`` is outer-merged on
    ``"filename"`` with the four quarterly ``firmdict`` CSV files of the
    year, and the merged table is written to the ``cp2idx`` folder.

    Args:
        filename_list: List of file paths; only the part after the last
            ``'/'`` is used as the merge key.
        yr: Year used in the input and output file names.

    Returns:
        None.
    """
    print(f"Creating cross walks for year {yr}")
    base_names = [path.rsplit('/', 1)[-1] for path in filename_list]
    positions = list(range(len(filename_list)))
    fn2idx = pd.DataFrame({"idx": positions, "filename": base_names})

    quarterly = [
        pd.read_csv(f'/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/firmdict/{yr}Q{qtr}.csv')
        for qtr in (1, 2, 3, 4)
    ]
    fn2cp = pd.concat(quarterly)

    crosswalk = pd.merge(fn2idx, fn2cp, on="filename", how="outer")
    crosswalk.to_csv(f'/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/cp2idx/{yr}.csv')
    return None

def nwords_cdf(texts):
    """Save two plots of the empirical CDF of ``len(text)`` over ``texts``.

    The first figure restricts the x axis to ``[0, 1000]``; the second uses
    the default axis range. The figure is cleared after each save.

    Args:
        texts: Iterable of sized objects, one per document.

    Returns:
        None.
    """
    lengths = np.sort([len(text) for text in texts])
    cum_prob = np.arange(1, len(lengths) + 1) / len(lengths)

    # (title, x-axis limits or None, output file) for each figure, in order.
    figures = (
        (
            'Cumulative Distribution of Number of Words in each 1A for 2022, detail',
            (0, 1000),
            '/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/text/cdf_words_zoom.png',
        ),
        (
            'Cumulative Distribution of Number of Words in each 1A for 2022',
            None,
            '/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/text/cdf_words.png',
        ),
    )
    for title, x_limits, out_file in figures:
        plt.plot(lengths, cum_prob)
        plt.xlabel('Number of Words')
        plt.ylabel('Cumulative Probability')
        plt.title(title)
        if x_limits is not None:
            plt.xlim(*x_limits)
        plt.grid(True)
        plt.savefig(out_file)
        plt.clf()  # Start the next figure from a clean canvas
    return None

def remove_stopwords(texts):
    """Tokenize each document and drop stopwords.

    Args:
        texts: Iterable of documents; each is converted with ``str()`` and
            tokenized with ``simple_preprocess``.

    Returns:
        A list with one token list per document, excluding every token found
        in the module-level ``stopwords`` collection.
    """
    # The stopword list lives in the module-level name `stopwords`; the
    # previously referenced `stop_words` is never defined and raised NameError.
    # A set makes each membership test O(1).
    stop_set = set(stopwords)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_multigrams(lemmatized_texts, min_count, threshold, scoring):
    """Tokenize the texts and merge frequent collocations into phrases.

    Args:
        lemmatized_texts: Iterable of documents passed to ``gen_words``.
        min_count: Forwarded to ``gensim.models.Phrases``.
        threshold: Forwarded to ``gensim.models.Phrases`` (higher threshold,
            fewer phrases).
        scoring: Forwarded to ``gensim.models.Phrases``.

    Returns:
        A list with one phrased token sequence per document.
    """
    print("Generating words using gensim.utils.simple_preprocess...")
    tokenized_docs = gen_words(lemmatized_texts)

    print("Creating bigram phrases...")
    phrase_options = {
        "min_count": min_count,
        "threshold": threshold,
        "scoring": scoring,
        "connector_words": gensim.models.phrases.ENGLISH_CONNECTOR_WORDS,
    }
    bigram_model = gensim.models.Phrases(tokenized_docs, **phrase_options)
    print("Creating trigram phrases...")
    trigram_model = gensim.models.Phrases(bigram_model[tokenized_docs], **phrase_options)

    # Phraser wrappers are the faster way to apply trained phrase models.
    apply_bigram = gensim.models.phrases.Phraser(bigram_model)
    apply_trigram = gensim.models.phrases.Phraser(trigram_model)

    print("Making bigrams and trigrams...")
    bigram_docs = [apply_bigram[doc] for doc in tokenized_docs]
    # NOTE(review): the bigram model is applied a second time here, on input
    # that has already been through it, before the trigram model runs.
    # Kept as-is to preserve existing output - confirm this is intended.
    phrased_docs = [apply_trigram[apply_bigram[doc]] for doc in bigram_docs]

    print('Bigrams and Trigrams created')

    return phrased_docs

def make_id2word(data_bigrams_trigrams, pathname, no_below, no_above, keep_n):
    """Build, prune and save the token dictionary.

    Args:
        data_bigrams_trigrams: Tokenized documents used to build the
            ``corpora.Dictionary``.
        pathname: Destination passed to ``save_as_text``.
        no_below: Forwarded to ``filter_extremes``.
        no_above: Forwarded to ``filter_extremes``.
        keep_n: Forwarded to ``filter_extremes``.

    Returns:
        The filtered dictionary.
    """
    pruning = {"no_below": no_below, "no_above": no_above, "keep_n": keep_n}
    dictionary = corpora.Dictionary(data_bigrams_trigrams)
    dictionary.filter_extremes(**pruning)
    dictionary.save_as_text(pathname)
    return dictionary

def bow_texts(texts, id2word):
    """Build the bag-of-words corpus and a TF-IDF model fitted on it.

    Args:
        texts: Iterable of token lists.
        id2word: Dictionary exposing ``doc2bow``; also passed to
            ``TfidfModel``.

    Returns:
        ``(corpus, tfidf)``: the list of ``doc2bow`` results and the
        ``TfidfModel`` built from that list.
    """
    corpus = list(map(id2word.doc2bow, texts))
    tfidf_model = TfidfModel(corpus, id2word=id2word)
    print('Corpus and Tfidf created')
    return corpus, tfidf_model

def drop_lowvalue_words_parallel(corpus, tfidf, low_value, words_missing_in_tfidf):
    """Apply ``mpfiles.droplowvwords_chunk`` to the corpus in parallel chunks.

    Args:
        corpus: Sliceable sequence of bag-of-words documents.
        tfidf: Forwarded to ``mpfiles.droplowvwords_chunk``.
        low_value: Forwarded to ``mpfiles.droplowvwords_chunk``.
        words_missing_in_tfidf: Forwarded to the chunk worker; ``None`` is
            treated as an empty list. (Previously this argument was ignored
            and an empty list was always sent.)

    Returns:
        A flat list with the per-chunk results concatenated in corpus order.
        An empty corpus yields an empty list without starting a pool.
    """
    print("Dropping low value words.")
    n_docs = len(corpus)
    if n_docs == 0:
        # Nothing to do; also avoids range() with a zero step below.
        return []

    if words_missing_in_tfidf is None:
        words_missing_in_tfidf = []

    # Leave one core free, but never ask for fewer than one worker.
    num_processes = max(1, mp.cpu_count() - 1)
    # A corpus shorter than the worker count would otherwise give a chunk
    # size of 0, which makes range() raise ValueError.
    chunk_size = max(1, n_docs // num_processes)
    chunks = [corpus[i:i + chunk_size] for i in range(0, n_docs, chunk_size)]

    worker = partial(
        mpfiles.droplowvwords_chunk,
        tfidf=tfidf,
        low_value=low_value,
        words_missing_in_tfidf=words_missing_in_tfidf,
    )
    pool = mp.Pool(processes=num_processes)
    try:
        results = pool.map(worker, chunks)
    finally:
        # Release the workers even if a chunk raised.
        pool.close()
        pool.join()

    # Concatenate the per-chunk results.
    return [bow for chunk in results for bow in chunk]

def make_topicmap2(lda_model, topics_per_doc, yr, ciks_to_keep, modelname):
    """Tabulate per-document topic probabilities and save them as CSV.

    Args:
        lda_model: Model whose ``num_topics`` fixes the number of
            ``topic_<i>`` columns.
        topics_per_doc: List with, per document, a sequence of
            ``(topic_id, probability)`` tuples. Topics absent from a
            document get probability ``0.0``.
        yr: Value stored in the ``year`` column. When it is a list, the
            output file name carries its minimum and maximum.
        ciks_to_keep: Object whose ``.values`` fills the ``CIK`` column.
        modelname: Name of the output sub-folder.

    Returns:
        The DataFrame that was written to disk.
    """
    n_topics = lda_model.num_topics
    cik_values = ciks_to_keep.values
    column_names = [f"topic_{t}" for t in range(n_topics)]
    columns = {name: [] for name in column_names}
    # Topic id with the largest probability in each document.
    dominant_topic = [max(doc, key=lambda pair: pair[1])[0] for doc in topics_per_doc]

    for doc in topics_per_doc:
        # Probabilities reported for this document, keyed by column name.
        reported = {f"topic_{pair[0]}": pair[1] for pair in doc}
        for name in column_names:
            columns[name].append(reported.get(name, 0.0))

    table = pd.DataFrame.from_dict(columns)
    table['max_topic'] = dominant_topic
    table['CIK'] = cik_values
    table['year'] = yr

    out_dir = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/{modelname}"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if isinstance(yr, list):
        csv_name = f"topic_map_{min(yr)}_{max(yr)}.csv"
    else:
        csv_name = f"topic_map_{yr}.csv"
    table.to_csv(f"{out_dir}/{csv_name}", index=False)

    return table
    
    
def concatenate_topic_maps(myfolder):
    """Stack every ``topic_map*.csv`` file found in ``myfolder``.

    Args:
        myfolder: Directory to scan (non-recursively).

    Returns:
        The ``pd.concat`` of all matching CSV files, in ``os.listdir`` order.
    """
    frames = []
    for entry in os.listdir(myfolder):
        if entry.startswith('topic_map') and entry.endswith('.csv'):
            frames.append(pd.read_csv(os.path.join(myfolder, entry)))
    return pd.concat(frames)
# import importlib
# importlib.reload(mpfiles)
    
def filter_corpus(texts, filename_list, cequity_mapper, yr, min10kwords):
    """Select the documents that pass the length and ``crit_ALL`` filters.

    Args:
        texts: Sequence of documents; ``len(text)`` is compared with
            ``min10kwords``.
        filename_list: File paths parallel to ``texts``. The CIK is the run
            of digits between a ``'/'`` and the following ``'_'``.
        cequity_mapper: DataFrame with at least the columns ``cik``,
            ``year`` and ``crit_ALL``.
        yr: Year of the documents. Years after 2020 use the 2020 rows of
            ``cequity_mapper``.
        min10kwords: Strict lower bound on ``len(text)``.

    Returns:
        ``(selection, idxs_to_keep, ciks_to_keep)``: the filtered table
        (first row per CIK) and its ``order_in_cik`` and ``cik`` columns.
        If no document qualifies, all three are empty.

    Raises:
        AttributeError: If a file name does not match the CIK pattern.
    """
    text_length = [len(text) for text in texts]
    cik = [int(re.search(r'/(\d+)_', fn).group(1)) for fn in filename_list]

    # NOTE(review): years after 2020 reuse the 2020 mapper rows - presumably
    # because the mapper ends in 2020; confirm against the input file.
    mapper_year = yr if yr <= 2020 else 2020
    cequity_mapper = cequity_mapper[cequity_mapper['year'] == mapper_year]

    order_in_cik = list(range(len(cik)))
    stats_texts = pd.DataFrame({"order_in_cik": order_in_cik, "cik": cik, "text_length": text_length})
    fullfilter = pd.merge(stats_texts, cequity_mapper, on="cik", how="inner")

    fullfilter['crit_LEN'] = fullfilter['text_length'] > min10kwords
    fullfilter['crit_ALL'] = fullfilter['crit_ALL'] == 1
    # Combine the masks as a boolean Series. Assigning a plain list loses the
    # bool dtype when the merge is empty, and the mask is then treated as a
    # list of column labels instead of a row filter.
    fullfilter['crit_ALL2'] = fullfilter['crit_ALL'] & fullfilter['crit_LEN']

    selection = fullfilter[fullfilter['crit_ALL2']]
    selection = selection.drop_duplicates(subset="cik", keep="first")
    idxs_to_keep = selection['order_in_cik']
    ciks_to_keep = selection['cik']

    return selection, idxs_to_keep, ciks_to_keep

The following command creates the crosswalk between filename and corpus index:

In [2]:
# Collect, for every year from 2006 to 2022, the number of entries stored in
# that year's lemmatized-texts pickle.
vec = []
for yr in range(2006,2023):
    file_path = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/lemmatized_texts/{yr}/lemmatized_texts{yr}.pkl"
    with open(file_path, 'rb') as f:
        lemmatized_texts = pickle.load(f)
        # One count per year, appended in chronological order.
        vec.append(len(lemmatized_texts))

In [3]:
def lemmat_counts():
    """Write a CSV with the number of lemmatized texts stored for each year.

    For every year from 2006 to 2022 the year's ``lemmatized_texts`` pickle
    is loaded and its length recorded; the ``Year``/``Count`` table is saved
    to the ``descriptive`` output folder.

    Returns:
        None.
    """
    years = list(range(2006, 2023))
    counts = []
    for year in years:
        pkl_path = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/lemmatized_texts/{year}/lemmatized_texts{year}.pkl"
        with open(pkl_path, 'rb') as handle:
            counts.append(len(pickle.load(handle)))

    summary = pd.DataFrame({'Year': years, 'Count': counts})
    # Save the summary table as CSV.
    summary.to_csv('/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/descriptive/lemmat_counts.csv', index=False)

In [4]:
# len_vec = [len(text) for text in texts]
# sorted_len_vec = np.sort(len_vec)
# y = np.arange(1, len(sorted_len_vec) + 1) / len(sorted_len_vec)

# plt.plot(sorted_len_vec, y)
# plt.xlabel('Number of Words')
# plt.ylabel('Cumulative Probability')
# plt.title('Cumulative Distribution of Number of Words in each 1A')
# plt.xlim(0,1000)
# plt.grid(True)
# plt.show()

In [None]:
# ---------------------------------------------------------------------------
# End-to-end pipeline: (optionally) lemmatize the raw texts, build or load the
# dictionary and bag-of-words corpus, then train or load one LDA model per
# entry of `numtopiclist` and export its topic map.
# ---------------------------------------------------------------------------

# --- Switches: which artefacts are rebuilt and which are loaded from disk ---
cpsrc = 'full'
remake_id2word = False
remake_lemmatization = False
remake_ldamodel = True
remake_multiyear_topic_map = True
remake_corpus = False
show_model = True

# --- Dictionary / phrase-model parameters (also encoded in `dicname`) ---
no_below = 5
no_above = 0.8
keep_n = None
yearlist = [year for year in range(2006, 2023, 1)]
min_count = 10
thr = 10
scorfun = "default"
#numtopiclist = [8, 10]
numtopiclist = [3,5,6,8,10]

# --- Derived names and paths ---
dicname = f"dic{cpsrc}mc{min_count}thr{str(thr).replace('.', '_')}{scorfun[:3]}nob{no_below}noa{str(no_above).replace('.', '_')}"
dicpath = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/id2word/{dicname}.txt"
corpuspath = f'/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/corpora/{dicname}/corpus_full.mm'
cequity_mapper = pd.read_csv("/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/input/cequity_mapper.csv")

if remake_lemmatization:
    for yr in yearlist:
        print(f"Starting year {yr}")
        texts, filename_list = create_textsmp(yr)
        # Lemmatized texts still follow order in filename_list
        selection, idxs_to_keep, ciks_to_keep = filter_corpus(texts, filename_list, cequity_mapper, yr, min10kwords)
        lemmatized_texts = lemmatization(texts, selection, yr)
        # NOTE(review): this glob has no Q{qtr} sub-folder, unlike the one in
        # create_textsmp, so `filelist` may not match `filename_list` (the
        # list the corpus positions refer to) - confirm which one the
        # crosswalk should use.
        filelist = glob.glob(f'/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/1A files/{yr}/*.txt')
        create_crosswalks(filelist, yr)

if remake_corpus:
    # The phrased texts and the per-document bookkeeping are needed to build
    # the corpus whether or not the dictionary itself is rebuilt, so they are
    # loaded unconditionally. (Previously they were only defined when
    # remake_id2word was True, and the other branch failed with NameError.)
    lemmatized_texts = []
    yr_vec = []
    idx_parts = []
    cik_parts = []
    for yr in yearlist:
        file_path = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/lemmatized_texts/{yr}/lemmatized_texts{yr}.pkl"
        # Load the file using pickle
        with open(file_path, 'rb') as f:
            lemmatized_texts.extend(pickle.load(f))
        filter_path = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/lemmatized_texts/{yr}/lem_filter{yr}.pkl"
        with open(filter_path, 'rb') as f:
            selection = pickle.load(f)
        idx_parts.append(selection['order_in_cik'])
        cik_parts.append(selection['cik'])
        yr_vec.extend([yr] * len(selection['cik']))
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    idxs_to_keep = pd.concat(idx_parts) if idx_parts else pd.Series(dtype="int64")
    ciks_to_keep = pd.concat(cik_parts) if cik_parts else pd.Series(dtype="int64")

    os.makedirs(f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/corpora/{dicname}/", exist_ok=True)
    corpus_info = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/corpora/{dicname}/corpus_info.pkl"
    with open(corpus_info, "wb") as f:
        pickle.dump((idxs_to_keep, ciks_to_keep, yr_vec), f)

    data_bigrams_trigrams = make_multigrams(lemmatized_texts, min_count = min_count, threshold = thr, scoring = scorfun)
    if remake_id2word:
        id2word = make_id2word(data_bigrams_trigrams, dicpath, no_below, no_above, keep_n)
        print(f'I have just created a dictionary with length {len(id2word)}. What do you think?')
    else:
        id2word = corpora.Dictionary.load_from_text(dicpath)

    corpus, tfidf = bow_texts(data_bigrams_trigrams, id2word)
    #corpus = drop_lowvalue_words_parallel(corpus, tfidf, 0.03, [])
    MmCorpus.serialize(corpuspath, corpus)
else:
    # Reuse the corpus, its bookkeeping and the dictionary saved by an
    # earlier run.
    corpus_info = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/corpora/{dicname}/corpus_info.pkl"
    with open(corpus_info, "rb") as f:
        idxs_to_keep, ciks_to_keep, yr_vec = pickle.load(f)
    corpus = MmCorpus(corpuspath)
    id2word = corpora.Dictionary.load_from_text(dicpath)

if remake_multiyear_topic_map:
    for num_topics in numtopiclist:
        modelname = f"{dicname}_{num_topics}t"
        modelpath = datapath(f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/code/step_by_step/{modelname}")
        if remake_ldamodel:
            print(f"Making model {modelname}...")
            lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=num_topics,
                                               random_state=100, passes = 10) #update_every=1,chunksize=100,passes=10,#alpha="auto"
            lda_model.save(modelpath)
            topics_per_doc = [lda_model.get_document_topics(doc) for doc in corpus]
            print(f"Model {modelname} created.")
            #         pyLDAvis.enable_notebook()
            #         vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds = "mmds", R = 50)
            #         vis
        else:
            print(f"Retrieving model {modelname}...")
            lda_model = LdaModel.load(modelpath)
            topics_per_doc = [lda_model[unseen_doc] for unseen_doc in corpus]
        if show_model:
            pyLDAvis.enable_notebook()
            vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds = "mmds", R = 50)
            # NOTE(review): a bare expression inside a loop is not rendered
            # by the notebook; `vis` holds the last prepared view afterwards.
            vis
        # yr_vec is the per-document list of years, so the CSV name carries
        # the first and last year.
        make_topicmap2(lda_model, topics_per_doc, yr_vec, ciks_to_keep, modelname)


Making model dicfullmc10thr10defnob5noa0_8_3t...


In [4]:
# Re-declare the pipeline configuration so this cell can be run on its own,
# then load the saved dictionary and report its size.
cpsrc = 'full'
remake_id2word = False
remake_lemmatization = False
remake_ldamodel = True
remake_multiyear_topic_map = True
remake_corpus = False
show_model = True
no_below = 5
no_above = 0.8
keep_n = None
yearlist = [year for year in range(2006, 2023, 1)]
min_count = 10
thr = 10
scorfun = "default"
#numtopiclist = [8, 10]
numtopiclist = [3,5,6,8,10]
# The dictionary name encodes the corpus source and the phrase/filter
# parameters, with '.' replaced by '_' in the numeric parts.
dicname = f"dic{cpsrc}mc{min_count}thr{str(thr).replace('.', '_')}{scorfun[:3]}nob{no_below}noa{str(no_above).replace('.', '_')}"
dicpath = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/id2word/{dicname}.txt"
id2word = corpora.Dictionary.load_from_text(dicpath)
# Last expression of the cell: the notebook displays the dictionary length.
len(id2word)

102850

In [25]:
# Render the interactive pyLDAvis view for the `lda_model`, `corpus` and
# `id2word` currently held in the notebook namespace.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds = "mmds", R = 50)
# Last expression of the cell, so the notebook displays it.
vis

In [None]:
# Stand-alone cell: rebuild the names of a saved model (here with 5 topics),
# load the model, corpus and dictionary from disk, and display them.
cpsrc = 'full'
remake_id2word = False
remake_lemmatization = False
remake_ldamodel = True
remake_multiyear_topic_map = True
remake_corpus = False
yearlist = [year for year in range(2006, 2023, 1)]
min_count = 10
thr = 10
scorfun = "default"
num_topics = 5
no_below = 5
no_above = 0.8
keep_n = None
# Same naming scheme as the pipeline cell, so the paths below point at the
# artefacts that cell saved for these parameter values.
dicname = f"dic{cpsrc}mc{min_count}thr{str(thr).replace('.', '_')}{scorfun[:3]}nob{no_below}noa{str(no_above).replace('.', '_')}"
#f"dic{cpsrc}mc{min_count}thr{str(thr).replace('.', '_')}_{scorfun}_flt"
modelname = f"{dicname}_{num_topics}t"
modelpath = datapath(f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/code/step_by_step/{modelname}")
dicpath = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/id2word/{dicname}.txt"
corpuspath = f'/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/corpora/{dicname}/corpus_full.mm' 

# Load the three saved artefacts.
lda_model = LdaModel.load(modelpath)
corpus = MmCorpus(corpuspath)
id2word = corpora.Dictionary.load_from_text(dicpath)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds = "mmds", R = 50)
# Last expression of the cell, so the notebook displays it.
vis

# Shortcut: if you have the corpus, id2word, lda_model stored somewhere

In [9]:
# Load a single-year corpus, the dictionary and a saved model, then display
# the model.
# NOTE(review): `dicname` is not set in this cell (the assignment below is
# commented out), so the value left by a previously run cell is used.
yr = 2022
nt = 6
#dicname = "dic2022mc5thr10_default_flt"
corpuspath = f'/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/corpora/{dicname}/corpus_{yr}.mm' 
dicpath = f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/output/id2word/{dicname}.txt"
corpus = MmCorpus(corpuspath)
id2word = corpora.Dictionary.load_from_text(dicpath)
# NOTE(review): the model name is hard-coded and does not use `dicname`, so
# the model may have been trained with a different dictionary than the one
# loaded above - confirm they match.
modelname = f"dicfullmc5thr10_default_flt_{nt}t"
#modelname = f"dic2022mc5thr0_5_npmi_flt_{nt}t"

modelpath = datapath(f"/Users/pedrovallocci/Documents/PhD (local)/Research/By Topic/Measuring knowledge capital risk/code/step_by_step/{modelname}")

lda_model = LdaModel.load(modelpath)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds = "mmds", R = 50)
# Last expression of the cell, so the notebook displays it.
vis