In [1]:
# Loading

import polars as pl
import glob
import re
import os
from pathlib import Path

# Function to read books and create initial dataframe
def load_corpus(directory_path, period_label):
    book_data = []
    
    # Get all text files in directory
    for filepath in glob.glob(f"{directory_path}/*.txt"):
        book_id = Path(filepath).stem
        
        # Read file in chunks to handle large files
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
            
        book_data.append({
            "book_id": book_id,
            "period": period_label,
            "full_text": text,
            "word_count": len(text.split())
        })
    
    # Create polars DataFrame
    return pl.from_dicts(book_data)

# Load books from both periods
high_inequality_df = load_corpus("data/pre/", "high")
low_inequality_df = load_corpus("data/post/", "low")

# Combine into one DataFrame
corpus_df = pl.concat([high_inequality_df, low_inequality_df])

# Show summary statistics
print(corpus_df.group_by("period").agg(
    pl.len().alias("book_count"),
    pl.sum("word_count").alias("total_words")
))

shape: (2, 3)
┌────────┬────────────┬─────────────┐
│ period ┆ book_count ┆ total_words │
│ ---    ┆ ---        ┆ ---         │
│ str    ┆ u32        ┆ i64         │
╞════════╪════════════╪═════════════╡
│ low    ┆ 2          ┆ 148805      │
│ high   ┆ 7          ┆ 1324014     │
└────────┴────────────┴─────────────┘


In [2]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Define economic vocabulary (expand this based on your research)
economic_terms = {
    "wealth": ["wealth", "rich", "fortune", "affluent", "prosperity", "opulence", "luxury"],
    "poverty": ["poverty", "poor", "destitute", "hunger", "squalor", "misery", "beggar"],
    "upper_class": ["aristocrat", "nobleman", "gentleman", "lord", "lady", "elite"],
    "lower_class": ["worker", "laborer", "servant", "peasant", "commoner"]
}

# Flatten the terms list for easier searching
all_terms = []
term_categories = {}
for category, terms in economic_terms.items():
    for term in terms:
        all_terms.append(term)
        term_categories[term] = category

# Function to extract sentences containing economic terms
def extract_economic_sentences(text):
    sentences = sent_tokenize(text)
    economic_sentences = []
    
    for sentence in sentences:
        sentence_lower = sentence.lower()
        found_terms = []
        
        for term in all_terms:
            # Use word boundary for exact match
            pattern = r'\b' + term + r'\b'
            if re.search(pattern, sentence_lower):
                found_terms.append(term)
        
        if found_terms:
            economic_sentences.append({
                "sentence": sentence,
                "terms": found_terms,
                "categories": [term_categories[term] for term in found_terms]
            })
    
    return economic_sentences

# Apply extraction (warning: this can be memory intensive for large texts)
def process_books(df):
    # Process in smaller batches if needed
    results = []
    
    for row in df.iter_rows(named=True):
        book_sentences = extract_economic_sentences(row["full_text"])
        for sentence_data in book_sentences:
            results.append({
                "book_id": row["book_id"],
                "period": row["period"],
                "sentence": sentence_data["sentence"],
                "terms": sentence_data["terms"],
                "categories": sentence_data["categories"]
            })
    
    return pl.from_dicts(results)

# Process in chunks to avoid memory issues
sentence_df = process_books(corpus_df)

[nltk_data] Downloading package punkt to /home/m9o8/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Calculate term frequencies by period and category
term_freq_df = (sentence_df
    .with_columns([
        pl.col("terms").list.len().alias("term_count")
    ])
    .explode("terms", "categories")
    .group_by(["period", "categories", "terms"])
    .agg([
        pl.len().alias("frequency")
    ])
    .sort(["period", "categories", "frequency"], descending=[False, False, True])
)

# Implement sentiment analysis on the sentences
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Function to get sentiment scores
def get_sentiment(text):
    scores = sid.polarity_scores(text)
    return scores['compound']

# Apply sentiment analysis with Polars
sentiment_df = sentence_df.with_columns([
    pl.col("sentence").map_elements(get_sentiment).alias("sentiment_score")
])

# Aggregate sentiment by period and category
sentiment_by_category = (sentiment_df
    .explode("categories")
    .group_by(["period", "categories"])
    .agg([
        pl.mean("sentiment_score").alias("avg_sentiment"),
        pl.std("sentiment_score").alias("std_sentiment"),
        pl.len().alias("sentence_count")
    ])
    .sort(["period", "categories"])
)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/m9o8/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  sentiment_df = sentence_df.with_columns([


## Classifier

In [4]:
# Prepare data for classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report

# Group sentences by book to get book-level features
book_sentences = (sentence_df
    .group_by(["book_id", "period"])
    .agg([
        pl.col("sentence").implode().flatten().alias("sentences")
    ])
)

# Join all sentences from each book
book_sentences = book_sentences.with_columns([
    pl.col("sentences").list.join(" ").alias("text")
])

# Convert to Python lists for sklearn
texts = book_sentences["text"].to_list()
labels = book_sentences["period"].to_list()

# Create TF-IDF features
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english'
)
X = vectorizer.fit_transform(texts)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Get most important features
feature_names = vectorizer.get_feature_names_out()
feature_importance = clf.feature_importances_
features_df = pl.DataFrame({
    "feature": feature_names,
    "importance": feature_importance
}).sort("importance", descending=True).head(20)

              precision    recall  f1-score   support

        high       0.50      1.00      0.67         1
         low       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Visuals

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot term frequency differences between periods
def plot_term_differences():
    # Pivot the data
    plot_data = (term_freq_df
        .pivot(
            values="frequency",
            index=["terms", "categories"],
            on="period"
        )
        .with_columns([
            (pl.col("high") - pl.col("low")).alias("difference")
        ])
        .sort("difference", descending=True)
    )
    
    # Extract top 15 terms for plotting
    top_terms = plot_data.head(15)
    
    # Create plot
    plt.figure(figsize=(12, 8))
    
    # Convert to Pandas for easier plotting (Polars plotting still developing)
    pd_data = top_terms.to_pandas().set_index("terms")
    
    # Plot
    pd_data[["high", "low"]].plot(kind="barh", ax=plt.gca())
    plt.title("Top 15 Economic Terms by Period Difference")
    plt.xlabel("Frequency")
    plt.ylabel("Terms")
    plt.tight_layout()
    plt.savefig("imgs/term_frequency_comparison.svg")
    plt.close()

# Plot sentiment analysis results
def plot_sentiment_analysis():
    # Convert to Pandas for plotting
    pd_data = sentiment_by_category.to_pandas()
    
    plt.figure(figsize=(10, 6))
    
    # Create grouped bar chart
    sns.barplot(
        x="categories", 
        y="avg_sentiment", 
        hue="period",
        data=pd_data
    )
    
    plt.title("Average Sentiment by Economic Category and Period")
    plt.xlabel("Economic Category")
    plt.ylabel("Average Sentiment Score")
    plt.tight_layout()
    plt.savefig("imgs/sentiment_by_category.svg")
    plt.close()

# Create visualizations
plot_term_differences()
plot_sentiment_analysis()

## Word Embeddings for Economic Context

In [6]:
import polars as pl
import numpy as np
from gensim.models import Word2Vec
import spacy

# Load SpaCy for better tokenization and parsing
nlp = spacy.load("en_core_web_sm", disable=["ner"])

nlp.max_length = 4000000

def preprocess_for_embeddings(text):
    """Process text for embedding training, preserving sentence boundaries"""
    doc = nlp(text)
    sentences = []
    for sent in doc.sents:
        # Keep only content words, convert to lowercase
        tokens = [token.lemma_.lower() for token in sent 
                 if not token.is_stop and not token.is_punct and token.is_alpha]
        if tokens:
            sentences.append(tokens)
    return sentences

# Process books and train period-specific embeddings
def train_period_embeddings(df, period):
    all_sentences = []
    
    # Process each book
    for row in df.filter(pl.col("period") == period).iter_rows(named=True):
        # Process in chunks for very large texts
        processed = preprocess_for_embeddings(row["full_text"])
        all_sentences.extend(processed)
    
    # Train Word2Vec model on this period's corpus
    model = Word2Vec(sentences=all_sentences, vector_size=100, window=5, min_count=5, workers=4)
    model.save(f"embeddings_{period}.model")
    return model

# Train models for each period
high_inequality_model = train_period_embeddings(corpus_df, "high")
low_inequality_model = train_period_embeddings(corpus_df, "low")

# Find economic concept spaces through semantic similarity
def explore_economic_concepts(model, seed_terms=["rich", "poor", "money", "wealth"]):
    """Expand economic vocabulary through embeddings"""
    economic_space = {}
    for term in seed_terms:
        if term in model.wv:
            similar_terms = model.wv.most_similar(term, topn=20)
            economic_space[term] = similar_terms
    return economic_space

# Compare economic vocabularies between periods
high_economic_space = explore_economic_concepts(high_inequality_model)
low_economic_space = explore_economic_concepts(low_inequality_model)

# Find words with largest shifts in economic associations
def compare_word_associations(word, model1, model2, topn=10):
    """Compare how a word's associations differ between periods"""
    if word not in model1.wv or word not in model2.wv:
        return None
    
    assoc1 = set([w for w, _ in model1.wv.most_similar(word, topn=topn)])
    assoc2 = set([w for w, _ in model2.wv.most_similar(word, topn=topn)])
    
    return {
        "unique_to_model1": list(assoc1 - assoc2),
        "unique_to_model2": list(assoc2 - assoc1),
        "common": list(assoc1 & assoc2)
    }

## Topic Modeling Approach

In [7]:
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# Extract sentences by book
def get_book_sentences(df):
    books_data = []
    for row in df.iter_rows(named=True):
        doc = nlp(row["full_text"])
        sentences = [sent.text for sent in doc.sents]
        books_data.append({
            "book_id": row["book_id"],
            "period": row["period"],
            "sentences": sentences
        })
    return pl.from_dicts(books_data)

# Preprocess for topic modeling
def preprocess_for_topics(sentences):
    processed_docs = []
    for sent in sentences:
        doc = nlp(sent)
        # Keep lemmatized forms of content words
        tokens = [token.lemma_.lower() for token in doc 
                 if not token.is_stop and not token.is_punct 
                 and token.is_alpha and len(token.text) > 2]
        if tokens:
            processed_docs.append(tokens)
    return processed_docs

# Apply topic modeling to find economic themes
def extract_economic_topics(book_sentences_df, num_topics=10):
    all_docs = []
    book_ids = []
    periods = []
    
    for row in book_sentences_df.iter_rows(named=True):
        processed = preprocess_for_topics(row["sentences"])
        all_docs.extend(processed)
        book_ids.extend([row["book_id"]] * len(processed))
        periods.extend([row["period"]] * len(processed))
    
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(all_docs)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in all_docs]
    
    # Train LDA model
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=10,
        alpha='auto',
        random_state=42
    )
    
    # Get dominant topic for each document
    doc_topics = []
    for i, doc_bow in enumerate(corpus):
        top_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
        top_topic = max(top_topics, key=lambda x: x[1])
        
        doc_topics.append({
            "book_id": book_ids[i],
            "period": periods[i],
            "top_topic_id": top_topic[0],
            "top_topic_prob": top_topic[1],
            "all_topics": dict(top_topics)
        })
    
    #doc_topics_df = pl.from_dicts(doc_topics)
    
    # Extract topic words for interpretation
    topics_words = {}
    for topic_id in range(num_topics):
        topics_words[topic_id] = [word for word, prob in lda_model.show_topic(topic_id, topn=20)]
    
    return doc_topics, topics_words, lda_model, corpus, dictionary

In [8]:
doc_topics, topic_words, lda_model, corpus, dictionary = extract_economic_topics(get_book_sentences(corpus_df))