# Word2Vec Comparison: 2010 vs. 2020 Economic Abstracts

## Imports and Setup

In [None]:
import logging

import gensim
import numpy as np
import pandas as pd
from scipy.linalg import orthogonal_procrustes

# Configure the root logger: INFO level, "timestamp : level : message" format.
# NOTE(review): presumably enabled so gensim's training progress is visible — confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Load and Prepare Data

In [None]:
def load_and_prepare_data(file_path):
    """Load a headerless CSV of documents and return them as token lists.

    The first column of the file is treated as one document per row; each
    document is split on whitespace. Rows whose first column is missing or
    contains no tokens are skipped.

    Args:
        file_path: Path of the CSV file. It is read with ``header=None``, so
            the first line is treated as data, not as column names.

    Returns:
        list[list[str]]: One list of tokens per non-empty document, in file
        order.
    """
    df = pd.read_csv(file_path, header=None)
    # Drop missing cells *before* casting to str; otherwise NaN is stringified
    # and enters the corpus as the bogus one-word document ["nan"].
    text_data = df[0].dropna().astype(str)
    # Rows that tokenize to [] (e.g. whitespace only) carry no training signal
    # and would only inflate the document count.
    sentences = [tokens for tokens in (text.split() for text in text_data) if tokens]
    print(f"Loaded {len(sentences)} documents from {file_path}")
    return sentences

# Load both datasets: one tokenized corpus per year, each read from its own CSV.
# The relative paths are resolved against the current working directory.
sentences_2010 = load_and_prepare_data("2010 -cleaned_abstracts.csv")
sentences_2020 = load_and_prepare_data("2020 -cleaned_abstracts.csv")

## Train Word2Vec Models

In [None]:
# Common parameters for both models, so the two embeddings differ only in the
# corpus they were trained on.
params = {
    'vector_size': 300,  # embedding dimensionality
    'window': 10,        # context window size
    'min_count': 2,      # ignore words seen fewer than 2 times
    'workers': 10,       # training threads
    'sg': 1,             # 1 = skip-gram (0 would be CBOW)
    'epochs': 30,        # passes over the corpus
}

# Passing the corpus to the constructor builds the vocabulary AND runs the
# whole `epochs` training schedule. Do not follow it with a separate
# `.train(...)` call: that would train a second time on top of the first run
# and restart the learning-rate decay.

# Train 2010 model
model_2010 = gensim.models.Word2Vec(sentences_2010, **params)

# Train 2020 model
model_2020 = gensim.models.Word2Vec(sentences_2020, **params)

## Comparison Functions

In [None]:
def compare_word_across_years(word, model1, model2, year1="2010", year2="2020"):
    """Print how a word's embedding and neighbourhood differ between two models.

    Independently trained embedding models have unrelated coordinate systems,
    so the cosine between their raw vectors is not meaningful. The first
    model's space is therefore rotated onto the second's with an orthogonal
    Procrustes fit over the vocabulary the two models share, and the cosine
    similarity is taken after that alignment.

    Args:
        word: The word to analyse.
        model1: First trained model; its vectors are read through ``.wv``.
        model2: Second trained model, with the same vector dimensionality.
        year1: Label used for ``model1`` in the printed report.
        year2: Label used for ``model2`` in the printed report.

    Returns:
        float | None: The aligned cosine similarity, or ``None`` (after
        printing a notice) when the word is missing from either vocabulary.
    """
    wv1, wv2 = model1.wv, model2.wv

    # Only the lookups can raise KeyError, so only they are guarded.
    try:
        vec1 = wv1[word]
        vec2 = wv2[word]
    except KeyError:
        print(f"Word '{word}' not in vocabulary for one or both models")
        return None

    # Fit the rotation on length-normalised vectors of the shared vocabulary so
    # that large-norm words do not dominate the fit.
    shared = [w for w in wv1.index_to_key if w in wv2.key_to_index]
    source = wv1[shared]
    target = wv2[shared]
    source = source / np.maximum(np.linalg.norm(source, axis=1, keepdims=True), 1e-12)
    target = target / np.maximum(np.linalg.norm(target, axis=1, keepdims=True), 1e-12)
    rotation, _ = orthogonal_procrustes(source, target)

    aligned = vec1 @ rotation
    denom = np.linalg.norm(aligned) * np.linalg.norm(vec2)
    similarity = float(np.dot(aligned, vec2) / denom) if denom else float("nan")

    print(f"\nAnalysis of word: '{word}'")
    print(f"Cosine similarity between {year1} and {year2}: {similarity:.4f}")

    # Neighbour lists are computed inside each model's own space, so they need
    # no alignment.
    print(f"\nTop 10 neighbors in {year1} model:")
    for neighbor, sim in wv1.most_similar(word, topn=10):
        print(f"{neighbor}: {sim:.3f}")

    print(f"\nTop 10 neighbors in {year2} model:")
    for neighbor, sim in wv2.most_similar(word, topn=10):
        print(f"{neighbor}: {sim:.3f}")

    return similarity

## Compare Key Economic Terms

In [None]:
# Terms whose 2010 and 2020 embeddings are compared.
key_terms = ["market", "volatility", "liquidity", "risk", "innovation"]

# Print the cross-year similarity and both neighbour lists for each term.
for term in key_terms:
    compare_word_across_years(term, model_2010, model_2020)

## Summary Analysis

In [None]:
def calculate_all_similarities(terms, model1, model2):
    """Rank terms by how similar their embeddings are across two models.

    Independently trained embedding models have unrelated coordinate systems,
    so the first model's space is rotated onto the second's (orthogonal
    Procrustes over the shared vocabulary) before cosine similarities are
    taken. The rotation is fitted once per call and reused for every term.

    Args:
        terms: Iterable of words to compare.
        model1: First trained model; its vectors are read through ``.wv``.
        model2: Second trained model, with the same vector dimensionality.

    Returns:
        list[tuple[str, float | None]]: ``(term, similarity)`` pairs sorted by
        similarity, highest first. Terms missing from either vocabulary have
        similarity ``None`` and always come last.
    """
    wv1, wv2 = model1.wv, model2.wv

    # Fit the rotation on length-normalised vectors of the shared vocabulary.
    shared = [w for w in wv1.index_to_key if w in wv2.key_to_index]
    rotation = None
    if shared:
        source = wv1[shared]
        target = wv2[shared]
        source = source / np.maximum(np.linalg.norm(source, axis=1, keepdims=True), 1e-12)
        target = target / np.maximum(np.linalg.norm(target, axis=1, keepdims=True), 1e-12)
        rotation, _ = orthogonal_procrustes(source, target)

    similarities = {}
    for term in terms:
        in_both = term in wv1.key_to_index and term in wv2.key_to_index
        if rotation is None or not in_both:
            similarities[term] = None
            continue
        aligned = wv1[term] @ rotation
        other = wv2[term]
        denom = np.linalg.norm(aligned) * np.linalg.norm(other)
        similarities[term] = float(np.dot(aligned, other) / denom) if denom else float("nan")

    # Rank only the terms that have a score and append the missing ones, rather
    # than sorting them with a -1 sentinel: -1 is itself a valid cosine value.
    found = [(term, sim) for term, sim in similarities.items() if sim is not None]
    missing = [(term, sim) for term, sim in similarities.items() if sim is None]
    sorted_similarities = sorted(found, key=lambda item: item[1], reverse=True) + missing

    print("\nSummary of Cosine Similarities:")
    for term, sim in sorted_similarities:
        if sim is not None:
            print(f"{term}: {sim:.4f}")
        else:
            print(f"{term}: Not in vocabulary")
    return sorted_similarities

# Print the summary for the key terms and keep the sorted (term, similarity) pairs.
similarity_results = calculate_all_similarities(key_terms, model_2010, model_2020)