In [1]:
## imports
import pandas as pd
from gensim.models import Word2Vec
from sklearn.utils import resample
import numpy as np
import json
import matplotlib.pyplot as plt
import pickle
import re
import numpy as np
import subprocess

In [None]:
def preprocess_text(text):
    """
    Turn one raw text field into a list of lowercase, letters-only tokens.

    - Missing values (anything pd.isna reports as missing) and empty strings give []
    - The value is stringified and lowercased
    - Every character that is not an ASCII letter or whitespace becomes a space,
      so digits and punctuation act as token separators
    - Tokens of one or two characters are discarded

    The returned flat token list is what text_to_sentences later cuts into
    fixed-length windows when the per-year corpus is built.
    """
    # Nothing to tokenize for missing or empty input
    if pd.isna(text) or text == '':
        return []

    lowered = str(text).lower()

    # Digits/punctuation -> spaces, then squeeze whitespace runs down to single spaces
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()

    # Keep only words longer than two characters
    return [word for word in collapsed.split() if len(word) > 2]

def text_to_sentences(tokens, sentence_length=100):
    """

    - Cut a flat token list into consecutive, non-overlapping windows of
      `sentence_length` tokens (the last window may be shorter)
    - Windows with fewer than 10 tokens are dropped
    - Returns a list of token lists

    """
    # Lazily slice the token list window by window
    windows = (
        tokens[start:start + sentence_length]
        for start in range(0, len(tokens), sentence_length)
    )

    # Enforce the minimum window size of 10 tokens
    return [window for window in windows if len(window) >= 10]

def create_word2vec_data(df, year_column='year', text_columns=['item_1A', 'item_1', 'item_7', 'item_15', 'item_8'],
                         output_dir='CORPUS', sentence_length=100):
    """

    - Convert tabular data to Word2Vec format for each year
    - Word2Vec format is a list of lists: Inner list is words in a sentence, outer list is all sentences in a year (time slice)
    - One pickle file per year is written to `<output_dir>/<year>_sentences.pkl`

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document, with a year column and one column per text field.
    year_column : str
        Name of the column holding the year used to group rows.
    text_columns : list of str
        Text columns to tokenize. Names that are not columns of `df` are skipped.
        The default list is only read, never mutated.
    output_dir : str
        Directory that receives the pickle files; created if it does not exist.
    sentence_length : int
        Window size forwarded to text_to_sentences.

    Returns
    -------
    None. Results are written to disk and progress is printed.

    """
    import os  # function-scope import so this cell does not depend on edits to the import cell

    # Create the target directory before any work is done; otherwise a missing
    # folder would only surface as FileNotFoundError after the first year was processed.
    os.makedirs(output_dir, exist_ok=True)

    # Resolve once which requested text columns exist instead of re-checking per row
    available_columns = [col for col in text_columns if col in df.columns]

    # Get unique years
    years = sorted(df[year_column].unique())

    for year in years:
        print(f"Processing {year}...")

        # Filter to this year
        year_df = df[df[year_column] == year]

        # Collect all text for this year (row by row, columns in the order given)
        all_sentences = []

        for _, row in year_df.iterrows():
            for col in available_columns:
                if pd.notna(row[col]):
                    # Preprocess the text
                    tokens = preprocess_text(row[col])

                    if len(tokens) > 0:
                        # Split into fixed-length sentences
                        all_sentences.extend(text_to_sentences(tokens, sentence_length=sentence_length))

        print(f"  {year}: {len(all_sentences)} sentences, ~{sum(len(s) for s in all_sentences):,} tokens")

        # Save as pickle file (preserves list structure)
        output_path = os.path.join(output_dir, f'{year}_sentences.pkl')
        with open(output_path, 'wb') as f:
            pickle.dump(all_sentences, f)

        print(f"  Saved: {year}_sentences.pkl")

# Load the filtered 10-K table from CSV (path is relative to the notebook's working directory)
df = pd.read_csv('sp500_edgar_10k_columns_filtered.csv')

# Column configuration passed to create_word2vec_data; adjust to match the CSV header
year_column = 'year'  # column used to group rows into one corpus per year
text_columns = ['item_1A', 'item_1', 'item_7', 'item_15', 'item_8']  # text fields to tokenize (presumably 10-K item sections — confirm against the CSV)

# Build and pickle one sentence corpus per year (written under CORPUS/ by create_word2vec_data)
create_word2vec_data(df, year_column=year_column, text_columns=text_columns)

In [None]:
# Function to load sentences for Word2Vec (use this in your training code)
def load_sentences(year):
    """

    - Read back the pickled corpus for one year from CORPUS/<year>_sentences.pkl
    - Returns whatever object was pickled there (the list of token lists
      written by create_word2vec_data)

    """
    corpus_path = f'CORPUS/{year}_sentences.pkl'

    # NOTE: unpickling can execute arbitrary code; only load files this project produced
    with open(corpus_path, 'rb') as handle:
        return pickle.load(handle)

In [None]:

def train_and_bootstrap(corpus, year, N_ITERATIONS):

    """

    - Bootstrap the cosine similarity between 'privacy' and a fixed set of context words
    - For each iteration: reload the saved base model for `year`, train it for 5 more
      epochs on a resample (with replacement) of `corpus`, then record the similarities
    - Adapted from Rodman (2020)

    Parameters
    ----------
    corpus : list of list of str
        Sentences (token lists) for the year, as returned by load_sentences.
    year : int or str
        Selects 'BASE_MODELS/base_model_{year}.model' and labels progress messages.
    N_ITERATIONS : int
        Number of bootstrap replicates.

    Returns
    -------
    pandas.DataFrame
        One row per bootstrap iteration, one column per context word.

    Notes
    -----
    The base model is reloaded at the start of every iteration so each replicate
    starts from the same weights. Training a single model object across all
    iterations would make replicate i depend on every earlier resample.
    A word missing from the model vocabulary is not handled here; gensim is
    expected to raise KeyError from `similarity` in that case.

    """

    target_word = 'privacy'

    # Column headers of the result; row indices indicate the bootstrap iteration
    context_words = [
        'reputation',
        'harm',
        'liability',
        'obligation',
        'litigation',
        'legislation',
        'respect',
        'invest',
        'uphold',
        'protect',
        'empower',
        'trust',
    ]

    base_model_path = f'BASE_MODELS/base_model_{year}.model'

    cos_similarity_data = {word: [] for word in context_words}

    for i in range(N_ITERATIONS): # bootstrapping

        # Fresh copy of the base model so replicates do not accumulate training
        model = Word2Vec.load(base_model_path)

        # Sample the whole corpus WITH replacement
        sentence_samples = resample(corpus, replace=True, n_samples=len(corpus))

        # Continue training on the bootstrapped sample
        model.train(sentence_samples, total_examples = len(sentence_samples), epochs = 5)

        # Recalculate similarity scores for this replicate
        for word in context_words:
            cos_similarity_data[word].append(model.wv.similarity(target_word, word))

        print(f'Finished iteration {i}/{N_ITERATIONS} of year: {year}')

    return pd.DataFrame(cos_similarity_data)

For each year, I load the corresponding `.pkl` file from the `CORPUS` directory, then train and save a "base" Word2Vec model. The `train_and_bootstrap` function defined above then reloads that saved model and returns the bootstrapped similarity scores, which are written to a CSV file in the `RESULTS` directory.

In [11]:
#### 2010

# Load the pickled 2010 corpus (list of token lists) from CORPUS/
sentences_2010 = load_sentences(2010)

print(f'Number of sentences in 2010: {len(sentences_2010)}')

# Train the base model for 2010. Per the gensim docs, sg=1 selects skip-gram and
# hs=0 with negative=5 selects negative sampling — confirm for the installed version.
base_model_2010 = Word2Vec(sentences_2010, vector_size = 300, min_count = 5, epochs = 10, 
                     sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

# Persist the base model; train_and_bootstrap loads it back from this path
base_model_2010.save('BASE_MODELS/base_model_2010.model')

print('Saved base model for year: 2010')

# 50 bootstrap iterations -> DataFrame with one row of similarity scores per iteration
results_2010 = train_and_bootstrap(sentences_2010, 2010, 50)

# Write the bootstrap results to the RESULTS directory
results_2010.to_csv("RESULTS/2010_results.csv")

Number of sentences in 2010: 103651
Saved base model for year: 2010
Finished iteration 0/50 of year: 2010
Finished iteration 1/50 of year: 2010
Finished iteration 2/50 of year: 2010
Finished iteration 3/50 of year: 2010
Finished iteration 4/50 of year: 2010
Finished iteration 5/50 of year: 2010
Finished iteration 6/50 of year: 2010
Finished iteration 7/50 of year: 2010
Finished iteration 8/50 of year: 2010
Finished iteration 9/50 of year: 2010
Finished iteration 10/50 of year: 2010
Finished iteration 11/50 of year: 2010
Finished iteration 12/50 of year: 2010
Finished iteration 13/50 of year: 2010
Finished iteration 14/50 of year: 2010
Finished iteration 15/50 of year: 2010
Finished iteration 16/50 of year: 2010
Finished iteration 17/50 of year: 2010
Finished iteration 18/50 of year: 2010
Finished iteration 19/50 of year: 2010
Finished iteration 20/50 of year: 2010
Finished iteration 21/50 of year: 2010
Finished iteration 22/50 of year: 2010
Finished iteration 23/50 of year: 2010
Finish

In [None]:
#### 2011

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2011 corpus (list of token lists) from CORPUS/
    sentences_2011 = load_sentences(2011)

    print(f'Number of sentences in 2011: {len(sentences_2011)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2011 = Word2Vec(sentences_2011, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2011.save('BASE_MODELS/base_model_2011.model')

    print('Saved base model for year: 2011')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2011 = train_and_bootstrap(sentences_2011, 2011, 50)

    results_2011.to_csv("RESULTS/2011_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Python(94744) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Number of sentences in 2011: 104988
Saved base model for year: 2011
Finished iteration 0/50 of year: 2011
Finished iteration 1/50 of year: 2011
Finished iteration 2/50 of year: 2011
Finished iteration 3/50 of year: 2011
Finished iteration 4/50 of year: 2011
Finished iteration 5/50 of year: 2011
Finished iteration 6/50 of year: 2011
Finished iteration 7/50 of year: 2011
Finished iteration 8/50 of year: 2011
Finished iteration 9/50 of year: 2011
Finished iteration 10/50 of year: 2011
Finished iteration 11/50 of year: 2011
Finished iteration 12/50 of year: 2011
Finished iteration 13/50 of year: 2011
Finished iteration 14/50 of year: 2011
Finished iteration 15/50 of year: 2011
Finished iteration 16/50 of year: 2011
Finished iteration 17/50 of year: 2011
Finished iteration 18/50 of year: 2011
Finished iteration 19/50 of year: 2011
Finished iteration 20/50 of year: 2011
Finished iteration 21/50 of year: 2011
Finished iteration 22/50 of year: 2011
Finished iteration 23/50 of year: 2011
Finish

In [5]:
#### 2012

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2012 corpus (list of token lists) from CORPUS/
    sentences_2012 = load_sentences(2012)

    print(f'Number of sentences in 2012: {len(sentences_2012)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2012 = Word2Vec(sentences_2012, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2012.save('BASE_MODELS/base_model_2012.model')

    print('Saved base model for year: 2012')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2012 = train_and_bootstrap(sentences_2012, 2012, 50)

    results_2012.to_csv("RESULTS/2012_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Number of sentences in 2012: 109317
Saved base model for year: 2012
Finished iteration 0/50 of year: 2012
Finished iteration 1/50 of year: 2012
Finished iteration 2/50 of year: 2012
Finished iteration 3/50 of year: 2012
Finished iteration 4/50 of year: 2012
Finished iteration 5/50 of year: 2012
Finished iteration 6/50 of year: 2012
Finished iteration 7/50 of year: 2012
Finished iteration 8/50 of year: 2012
Finished iteration 9/50 of year: 2012
Finished iteration 10/50 of year: 2012
Finished iteration 11/50 of year: 2012
Finished iteration 12/50 of year: 2012
Finished iteration 13/50 of year: 2012
Finished iteration 14/50 of year: 2012
Finished iteration 15/50 of year: 2012
Finished iteration 16/50 of year: 2012
Finished iteration 17/50 of year: 2012
Finished iteration 18/50 of year: 2012
Finished iteration 19/50 of year: 2012
Finished iteration 20/50 of year: 2012
Finished iteration 21/50 of year: 2012
Finished iteration 22/50 of year: 2012
Finished iteration 23/50 of year: 2012
Finish

In [6]:
#### 2013

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2013 corpus (list of token lists) from CORPUS/
    sentences_2013 = load_sentences(2013)

    print(f'Number of sentences in 2013: {len(sentences_2013)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2013 = Word2Vec(sentences_2013, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2013.save('BASE_MODELS/base_model_2013.model')

    print('Saved base model for year: 2013')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2013 = train_and_bootstrap(sentences_2013, 2013, 50)

    results_2013.to_csv("RESULTS/2013_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Number of sentences in 2013: 110577
Saved base model for year: 2013
Finished iteration 0/50 of year: 2013
Finished iteration 1/50 of year: 2013
Finished iteration 2/50 of year: 2013
Finished iteration 3/50 of year: 2013
Finished iteration 4/50 of year: 2013
Finished iteration 5/50 of year: 2013
Finished iteration 6/50 of year: 2013
Finished iteration 7/50 of year: 2013
Finished iteration 8/50 of year: 2013
Finished iteration 9/50 of year: 2013
Finished iteration 10/50 of year: 2013
Finished iteration 11/50 of year: 2013
Finished iteration 12/50 of year: 2013
Finished iteration 13/50 of year: 2013
Finished iteration 14/50 of year: 2013
Finished iteration 15/50 of year: 2013
Finished iteration 16/50 of year: 2013
Finished iteration 17/50 of year: 2013
Finished iteration 18/50 of year: 2013
Finished iteration 19/50 of year: 2013
Finished iteration 20/50 of year: 2013
Finished iteration 21/50 of year: 2013
Finished iteration 22/50 of year: 2013
Finished iteration 23/50 of year: 2013
Finish

In [7]:
#### 2014

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2014 corpus (list of token lists) from CORPUS/
    sentences_2014 = load_sentences(2014)

    print(f'Number of sentences in 2014: {len(sentences_2014)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2014 = Word2Vec(sentences_2014, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2014.save('BASE_MODELS/base_model_2014.model')

    print('Saved base model for year: 2014')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2014 = train_and_bootstrap(sentences_2014, 2014, 50)

    results_2014.to_csv("RESULTS/2014_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Number of sentences in 2014: 110296
Saved base model for year: 2014
Finished iteration 0/50 of year: 2014
Finished iteration 1/50 of year: 2014
Finished iteration 2/50 of year: 2014
Finished iteration 3/50 of year: 2014
Finished iteration 4/50 of year: 2014
Finished iteration 5/50 of year: 2014
Finished iteration 6/50 of year: 2014
Finished iteration 7/50 of year: 2014
Finished iteration 8/50 of year: 2014
Finished iteration 9/50 of year: 2014
Finished iteration 10/50 of year: 2014
Finished iteration 11/50 of year: 2014
Finished iteration 12/50 of year: 2014
Finished iteration 13/50 of year: 2014
Finished iteration 14/50 of year: 2014
Finished iteration 15/50 of year: 2014
Finished iteration 16/50 of year: 2014
Finished iteration 17/50 of year: 2014
Finished iteration 18/50 of year: 2014
Finished iteration 19/50 of year: 2014
Finished iteration 20/50 of year: 2014
Finished iteration 21/50 of year: 2014
Finished iteration 22/50 of year: 2014
Finished iteration 23/50 of year: 2014
Finish

In [8]:
#### 2015

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2015 corpus (list of token lists) from CORPUS/
    sentences_2015 = load_sentences(2015)

    print(f'Number of sentences in 2015: {len(sentences_2015)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2015 = Word2Vec(sentences_2015, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2015.save('BASE_MODELS/base_model_2015.model')

    print('Saved base model for year: 2015')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2015 = train_and_bootstrap(sentences_2015, 2015, 50)

    results_2015.to_csv("RESULTS/2015_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Number of sentences in 2015: 108935
Saved base model for year: 2015
Finished iteration 0/50 of year: 2015
Finished iteration 1/50 of year: 2015
Finished iteration 2/50 of year: 2015
Finished iteration 3/50 of year: 2015
Finished iteration 4/50 of year: 2015
Finished iteration 5/50 of year: 2015
Finished iteration 6/50 of year: 2015
Finished iteration 7/50 of year: 2015
Finished iteration 8/50 of year: 2015
Finished iteration 9/50 of year: 2015
Finished iteration 10/50 of year: 2015
Finished iteration 11/50 of year: 2015
Finished iteration 12/50 of year: 2015
Finished iteration 13/50 of year: 2015
Finished iteration 14/50 of year: 2015
Finished iteration 15/50 of year: 2015
Finished iteration 16/50 of year: 2015
Finished iteration 17/50 of year: 2015
Finished iteration 18/50 of year: 2015
Finished iteration 19/50 of year: 2015
Finished iteration 20/50 of year: 2015
Finished iteration 21/50 of year: 2015
Finished iteration 22/50 of year: 2015
Finished iteration 23/50 of year: 2015
Finish

In [9]:
#### 2016

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2016 corpus (list of token lists) from CORPUS/
    sentences_2016 = load_sentences(2016)

    print(f'Number of sentences in 2016: {len(sentences_2016)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2016 = Word2Vec(sentences_2016, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2016.save('BASE_MODELS/base_model_2016.model')

    print('Saved base model for year: 2016')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2016 = train_and_bootstrap(sentences_2016, 2016, 50)

    results_2016.to_csv("RESULTS/2016_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Number of sentences in 2016: 110639
Saved base model for year: 2016
Finished iteration 0/50 of year: 2016
Finished iteration 1/50 of year: 2016
Finished iteration 2/50 of year: 2016
Finished iteration 3/50 of year: 2016
Finished iteration 4/50 of year: 2016
Finished iteration 5/50 of year: 2016
Finished iteration 6/50 of year: 2016
Finished iteration 7/50 of year: 2016
Finished iteration 8/50 of year: 2016
Finished iteration 9/50 of year: 2016
Finished iteration 10/50 of year: 2016
Finished iteration 11/50 of year: 2016
Finished iteration 12/50 of year: 2016
Finished iteration 13/50 of year: 2016
Finished iteration 14/50 of year: 2016
Finished iteration 15/50 of year: 2016
Finished iteration 16/50 of year: 2016
Finished iteration 17/50 of year: 2016
Finished iteration 18/50 of year: 2016
Finished iteration 19/50 of year: 2016
Finished iteration 20/50 of year: 2016
Finished iteration 21/50 of year: 2016
Finished iteration 22/50 of year: 2016
Finished iteration 23/50 of year: 2016
Finish

In [10]:
#### 2017

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2017 corpus (list of token lists) from CORPUS/
    sentences_2017 = load_sentences(2017)

    print(f'Number of sentences in 2017: {len(sentences_2017)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2017 = Word2Vec(sentences_2017, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2017.save('BASE_MODELS/base_model_2017.model')

    print('Saved base model for year: 2017')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2017 = train_and_bootstrap(sentences_2017, 2017, 50)

    results_2017.to_csv("RESULTS/2017_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Number of sentences in 2017: 112799
Saved base model for year: 2017
Finished iteration 0/50 of year: 2017
Finished iteration 1/50 of year: 2017
Finished iteration 2/50 of year: 2017
Finished iteration 3/50 of year: 2017
Finished iteration 4/50 of year: 2017
Finished iteration 5/50 of year: 2017
Finished iteration 6/50 of year: 2017
Finished iteration 7/50 of year: 2017
Finished iteration 8/50 of year: 2017
Finished iteration 9/50 of year: 2017
Finished iteration 10/50 of year: 2017
Finished iteration 11/50 of year: 2017
Finished iteration 12/50 of year: 2017
Finished iteration 13/50 of year: 2017
Finished iteration 14/50 of year: 2017
Finished iteration 15/50 of year: 2017
Finished iteration 16/50 of year: 2017
Finished iteration 17/50 of year: 2017
Finished iteration 18/50 of year: 2017
Finished iteration 19/50 of year: 2017
Finished iteration 20/50 of year: 2017
Finished iteration 21/50 of year: 2017
Finished iteration 22/50 of year: 2017
Finished iteration 23/50 of year: 2017
Finish

In [11]:
#### 2018

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2018 corpus (list of token lists) from CORPUS/
    sentences_2018 = load_sentences(2018)

    print(f'Number of sentences in 2018: {len(sentences_2018)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2018 = Word2Vec(sentences_2018, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2018.save('BASE_MODELS/base_model_2018.model')

    print('Saved base model for year: 2018')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2018 = train_and_bootstrap(sentences_2018, 2018, 50)

    results_2018.to_csv("RESULTS/2018_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Number of sentences in 2018: 113592
Saved base model for year: 2018
Finished iteration 0/50 of year: 2018
Finished iteration 1/50 of year: 2018
Finished iteration 2/50 of year: 2018
Finished iteration 3/50 of year: 2018
Finished iteration 4/50 of year: 2018
Finished iteration 5/50 of year: 2018
Finished iteration 6/50 of year: 2018
Finished iteration 7/50 of year: 2018
Finished iteration 8/50 of year: 2018
Finished iteration 9/50 of year: 2018
Finished iteration 10/50 of year: 2018
Finished iteration 11/50 of year: 2018
Finished iteration 12/50 of year: 2018
Finished iteration 13/50 of year: 2018
Finished iteration 14/50 of year: 2018
Finished iteration 15/50 of year: 2018
Finished iteration 16/50 of year: 2018
Finished iteration 17/50 of year: 2018
Finished iteration 18/50 of year: 2018
Finished iteration 19/50 of year: 2018
Finished iteration 20/50 of year: 2018
Finished iteration 21/50 of year: 2018
Finished iteration 22/50 of year: 2018
Finished iteration 23/50 of year: 2018
Finish

In [12]:
#### 2019

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2019 corpus (list of token lists) from CORPUS/
    sentences_2019 = load_sentences(2019)

    print(f'Number of sentences in 2019: {len(sentences_2019)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2019 = Word2Vec(sentences_2019, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2019.save('BASE_MODELS/base_model_2019.model')

    print('Saved base model for year: 2019')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2019 = train_and_bootstrap(sentences_2019, 2019, 50)

    results_2019.to_csv("RESULTS/2019_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Python(35449) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Number of sentences in 2019: 116063
Saved base model for year: 2019
Finished iteration 0/50 of year: 2019
Finished iteration 1/50 of year: 2019
Finished iteration 2/50 of year: 2019
Finished iteration 3/50 of year: 2019
Finished iteration 4/50 of year: 2019
Finished iteration 5/50 of year: 2019
Finished iteration 6/50 of year: 2019
Finished iteration 7/50 of year: 2019
Finished iteration 8/50 of year: 2019
Finished iteration 9/50 of year: 2019
Finished iteration 10/50 of year: 2019
Finished iteration 11/50 of year: 2019
Finished iteration 12/50 of year: 2019
Finished iteration 13/50 of year: 2019
Finished iteration 14/50 of year: 2019
Finished iteration 15/50 of year: 2019
Finished iteration 16/50 of year: 2019
Finished iteration 17/50 of year: 2019
Finished iteration 18/50 of year: 2019
Finished iteration 19/50 of year: 2019
Finished iteration 20/50 of year: 2019
Finished iteration 21/50 of year: 2019
Finished iteration 22/50 of year: 2019
Finished iteration 23/50 of year: 2019
Finish

In [13]:
#### 2020

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2020 corpus (list of token lists) from CORPUS/
    sentences_2020 = load_sentences(2020)

    print(f'Number of sentences in 2020: {len(sentences_2020)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2020 = Word2Vec(sentences_2020, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2020.save('BASE_MODELS/base_model_2020.model')

    print('Saved base model for year: 2020')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2020 = train_and_bootstrap(sentences_2020, 2020, 50)

    results_2020.to_csv("RESULTS/2020_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Python(35798) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Number of sentences in 2020: 115809
Saved base model for year: 2020
Finished iteration 0/50 of year: 2020
Finished iteration 1/50 of year: 2020
Finished iteration 2/50 of year: 2020
Finished iteration 3/50 of year: 2020
Finished iteration 4/50 of year: 2020
Finished iteration 5/50 of year: 2020
Finished iteration 6/50 of year: 2020
Finished iteration 7/50 of year: 2020
Finished iteration 8/50 of year: 2020
Finished iteration 9/50 of year: 2020
Finished iteration 10/50 of year: 2020
Finished iteration 11/50 of year: 2020
Finished iteration 12/50 of year: 2020
Finished iteration 13/50 of year: 2020
Finished iteration 14/50 of year: 2020
Finished iteration 15/50 of year: 2020
Finished iteration 16/50 of year: 2020
Finished iteration 17/50 of year: 2020
Finished iteration 18/50 of year: 2020
Finished iteration 19/50 of year: 2020
Finished iteration 20/50 of year: 2020
Finished iteration 21/50 of year: 2020
Finished iteration 22/50 of year: 2020
Finished iteration 23/50 of year: 2020
Finish

In [14]:
#### 2021

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2021 corpus (list of token lists) from CORPUS/
    sentences_2021 = load_sentences(2021)

    print(f'Number of sentences in 2021: {len(sentences_2021)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2021 = Word2Vec(sentences_2021, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2021.save('BASE_MODELS/base_model_2021.model')

    print('Saved base model for year: 2021')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2021 = train_and_bootstrap(sentences_2021, 2021, 50)

    results_2021.to_csv("RESULTS/2021_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Python(79614) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Number of sentences in 2021: 121463
Saved base model for year: 2021
Finished iteration 0/50 of year: 2021
Finished iteration 1/50 of year: 2021
Finished iteration 2/50 of year: 2021
Finished iteration 3/50 of year: 2021
Finished iteration 4/50 of year: 2021
Finished iteration 5/50 of year: 2021
Finished iteration 6/50 of year: 2021
Finished iteration 7/50 of year: 2021
Finished iteration 8/50 of year: 2021
Finished iteration 9/50 of year: 2021
Finished iteration 10/50 of year: 2021
Finished iteration 11/50 of year: 2021
Finished iteration 12/50 of year: 2021
Finished iteration 13/50 of year: 2021
Finished iteration 14/50 of year: 2021
Finished iteration 15/50 of year: 2021
Finished iteration 16/50 of year: 2021
Finished iteration 17/50 of year: 2021
Finished iteration 18/50 of year: 2021
Finished iteration 19/50 of year: 2021
Finished iteration 20/50 of year: 2021
Finished iteration 21/50 of year: 2021
Finished iteration 22/50 of year: 2021
Finished iteration 23/50 of year: 2021
Finish

In [15]:
#### 2022

# Run the macOS `caffeinate -i` helper alongside this long-running cell
caffeinate_process = subprocess.Popen(['caffeinate', '-i'])

try:
    # Load the pickled 2022 corpus (list of token lists) from CORPUS/
    sentences_2022 = load_sentences(2022)

    print(f'Number of sentences in 2022: {len(sentences_2022)}')

    # Train and persist the base model that train_and_bootstrap loads back from disk
    base_model_2022 = Word2Vec(sentences_2022, vector_size = 300, min_count = 5, epochs = 10,
                         sg = 1, hs = 0, negative = 5, window = 10, workers = 4)

    base_model_2022.save('BASE_MODELS/base_model_2022.model')

    print('Saved base model for year: 2022')

    # 50 bootstrap iterations -> one row of similarity scores per iteration
    results_2022 = train_and_bootstrap(sentences_2022, 2022, 50)

    results_2022.to_csv("RESULTS/2022_results.csv")
finally:
    # Stop the helper even if loading/training raises or the cell is interrupted;
    # otherwise the caffeinate process would be left running. wait() reaps the child.
    caffeinate_process.terminate()
    caffeinate_process.wait()

Python(30710) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Number of sentences in 2022: 119083
Saved base model for year: 2022
Finished iteration 0/50 of year: 2022
Finished iteration 1/50 of year: 2022
Finished iteration 2/50 of year: 2022
Finished iteration 3/50 of year: 2022
Finished iteration 4/50 of year: 2022
Finished iteration 5/50 of year: 2022
Finished iteration 6/50 of year: 2022
Finished iteration 7/50 of year: 2022
Finished iteration 8/50 of year: 2022
Finished iteration 9/50 of year: 2022
Finished iteration 10/50 of year: 2022
Finished iteration 11/50 of year: 2022
Finished iteration 12/50 of year: 2022
Finished iteration 13/50 of year: 2022
Finished iteration 14/50 of year: 2022
Finished iteration 15/50 of year: 2022
Finished iteration 16/50 of year: 2022
Finished iteration 17/50 of year: 2022
Finished iteration 18/50 of year: 2022
Finished iteration 19/50 of year: 2022
Finished iteration 20/50 of year: 2022
Finished iteration 21/50 of year: 2022
Finished iteration 22/50 of year: 2022
Finished iteration 23/50 of year: 2022
Finish