In [None]:
import os, re, math, collections, itertools, json
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import warnings
from collections import Counter, defaultdict
import nltk
import csv 
import csv, re, warnings
from pathlib import Path

In [None]:
HERE = _dh[-1]
PAPERS_OCRS = HERE / "gallica_data"  # Should be downloaded from Gallica and placed in the repo directory
OUT_DIR = HERE / "context_analysis"

PosixPath('/home/malakrch/DSAI_project/gallica_data')

In [None]:
try:
    FR_STOPWORDS = set(nltk.corpus.stopwords.words('french'))
except LookupError:
    nltk.download('stopwords')
    FR_STOPWORDS = set(nltk.corpus.stopwords.words('french'))


In [None]:

YEAR_START, YEAR_END = 1870, 1940     # years of interest

# mapping “journal title” ‑> folder‑name
repo_titles = {
      "L'Action française": "L_Action_francaise",
      "L'Aurore": "L_Aurore",
      "Le Constitutionnel": "Le_Constitutionnel",
      "La Croix": "La_Croix",
      "Figaro : journal non politique": "Le_Figaro",
      "Le Populaire": "Le_Populaire",
      "L'Humanité": "L_Humanite",
      "Le Temps": "Le_Temps",
      #"Le Petit Journal": "Le_Petit_Journal",
      #"Le Petit Parisien": "Le_Petit_Parisien",
      #"La Justice": "La_Justice"
}


ANARCHIST_TOKEN_RE = re.compile(r'anarchist(?:e|es)?|anarchisme', re.IGNORECASE)

FAMILY_REGEX = {
    'anarchist': re.compile(r'anarchist(?:e|es)?|anarchisme|anarchie', re.IGNORECASE),
    'communis'  : re.compile(r'communis\w*',                   re.IGNORECASE),
    'socialis'  : re.compile(r'socialis\w*',                   re.IGNORECASE),
    'revolution': re.compile(r'r[eé]volution\w*',              re.IGNORECASE),
}

WINDOW_SIZE = 5     #number tokens for context windows
TOKEN_RE = re.compile(r'[a-zàâçéèêëîïôûùüÿñæœ]+', re.IGNORECASE)


In [8]:
def window_tokens(tokens, w=5):
    """Yield context tokens around every anarchist hit (stop-words removed)."""
    for i, tok in enumerate(tokens):
        if ANARCHIST_TOKEN_RE.fullmatch(tok):
            left  = max(0, i - w)
            right = min(len(tokens), i + w + 1)
            for t in tokens[left:right]:
                if t != tok and t not in FR_STOPWORDS:
                    yield t

In [None]:
global_counts       = Counter()                                
ctx_counts          = defaultdict(lambda: defaultdict(Counter))   
anarchist_hit_counts= defaultdict(lambda: defaultdict(int))       

for title, folder in repo_titles.items():
    paper_dir = PAPERS_OCRS / folder
    if not paper_dir.exists():
        warnings.warn(f"Missing directory {paper_dir}")
        continue

    for year in range(YEAR_START, YEAR_END + 1):
        year_dir = paper_dir / str(year)
        if not year_dir.exists():
            continue

        for path in year_dir.glob("*.txt"):
            try:
                txt = path.read_text(encoding="utf-8", errors="ignore").lower()
            except Exception as e:
                warnings.warn(f"Could not read {path}: {e}")
                continue

            tokens = TOKEN_RE.findall(txt)

            #total word count for the year
            global_counts.update(tokens)

            #anarchist hits frequency
            a_hits = sum(1 for t in tokens if FAMILY_REGEX['anarchist'].fullmatch(t))
            if a_hits:
                anarchist_hit_counts[title][year] += a_hits
                
                #words found in context of anarchist hits
                ctx_counts[title][year].update(window_tokens(tokens))
            

In [None]:
N = sum(global_counts.values())

out_path = OUT_DIR / "pmi_context_words_by_paper_year.csv"
with out_path.open("w", newline="", encoding="utf-8") as fout:
    writer = csv.writer(fout)
    writer.writerow(["paper","year","word","ctx_freq","PMI"])

    for paper, year_counters in ctx_counts.items():
        for year, counter in year_counters.items():
            c_x = anarchist_hit_counts[paper][year]         
            if c_x == 0:
                continue

            for word, c_xy in counter.items():
                c_y = global_counts[word]
                if c_y < 5 or c_xy < 3:                      # skip very rare words
                    continue
                pmi = math.log2((c_xy * N) / (c_x * c_y))
                writer.writerow([paper, year, word, c_xy, round(pmi, 3)])


In [None]:
full = pd.read_csv(OUT_DIR/"pmi_context_words_by_paper_year.csv")

#take the 10 rows with the largest PMI inside every (paper, year) group
top10 = (
    full.sort_values(["paper", "year", "PMI"], ascending=[True, True, False])
        .groupby(["paper", "year"], as_index=False)
        .head(10)               
        .reset_index(drop=True) 
)

top10.to_csv(OUT_DIR / "top10_context_words_by_paper_year.csv", index=False)

print(top10.head(20))



                             paper  year       word  ctx_freq    PMI
0   Figaro : journal non politique  1871          p         4  3.495
1   Figaro : journal non politique  1871  slategray         4  1.783
2   Figaro : journal non politique  1871      color         4  1.783
3   Figaro : journal non politique  1871       span         7  1.590
4   Figaro : journal non politique  1871      style         3  1.368
5   Figaro : journal non politique  1872  slategray         4  2.783
6   Figaro : journal non politique  1872       span         7  2.590
7   Figaro : journal non politique  1872      style         3  2.368
8   Figaro : journal non politique  1872      color         3  2.368
9   Figaro : journal non politique  1874       span         3  2.368
10  Figaro : journal non politique  1876  slategray         5  2.882
11  Figaro : journal non politique  1876       span         8  2.560
12  Figaro : journal non politique  1876      style         4  2.560
13  Figaro : journal non politique

In [None]:
for title, folder in repo_titles.items():
    paper_dir = PAPERS_OCRS / folder
    if not paper_dir.exists():
        warnings.warn(f"Missing directory: {paper_dir}")
        continue

    out_path  = HERE / "context_analysis" / f"{folder}_frequencies.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with out_path.open("w", newline="", encoding="utf-8") as fout:
        writer = csv.writer(fout)
        writer.writerow([
            "paper", "year",
            "anarchist_hits", "communis_hits",
            "socialis_hits", "revolution_hits"
        ])

        for year in range(YEAR_START, YEAR_END + 1):
            year_dir = paper_dir / str(year)
            if not year_dir.exists():
                continue

            for path in year_dir.glob("*.txt"):
                try:
                    txt = path.read_text(encoding="utf-8", errors="ignore").lower()
                except Exception as e:
                    warnings.warn(f"Could not read {path}: {e}")
                    continue

                
                counts = {
                    fam: len(rx.findall(txt))
                    for fam, rx in FAMILY_REGEX.items()
                }

                writer.writerow([
                    title, year,
                    counts["anarchist"],
                    counts["communis"],
                    counts["socialis"],
                    counts["revolution"],
                ])
