# IMPORTS

In [1]:
import pandas as pd
import numpy as np

In [None]:
%run scrape2.py

In [2]:
from clean_minimal import clean_minimal

# Nettoie tout ce qui existe dans txt/ vers txt_clean/
clean_minimal(in_dir="txt", out_dir="txt_clean")

✓ cleaned → txt_clean/1987/05/1987-05-cl.txt
✓ cleaned → txt_clean/1987/05/1987-05-sf.txt
✓ cleaned → txt_clean/1987/05/1987-05-ph.txt
✓ cleaned → txt_clean/1987/05/1987-05-at.txt
✓ cleaned → txt_clean/1987/05/1987-05-kc.txt
✓ cleaned → txt_clean/1987/05/1987-05-ch.txt
✓ cleaned → txt_clean/1987/05/1987-05-su.txt
✓ cleaned → txt_clean/1987/05/1987-05-mi.txt
✓ cleaned → txt_clean/1987/05/1987-05-da.txt
✓ cleaned → txt_clean/1987/05/1987-05-ny.txt
✓ cleaned → txt_clean/1987/05/1987-05-bo.txt
✓ cleaned → txt_clean/1987/05/1987-05-ri.txt
✓ cleaned → txt_clean/1987/05/1987-05-sl.txt
✓ cleaned → txt_clean/1987/03/1987-03-ch.txt
✓ cleaned → txt_clean/1987/03/1987-03-su.txt
✓ cleaned → txt_clean/1987/03/1987-03-kc.txt
✓ cleaned → txt_clean/1987/03/1987-03-sl.txt
✓ cleaned → txt_clean/1987/03/1987-03-cl.txt
✓ cleaned → txt_clean/1987/03/1987-03-ri.txt
✓ cleaned → txt_clean/1987/03/1987-03-mi.txt
✓ cleaned → txt_clean/1987/03/1987-03-bo.txt
✓ cleaned → txt_clean/1987/03/1987-03-ny.txt
✓ cleaned 

### on construit le dataframe de base : 13 colonnes pour chacun des rapports

In [2]:
import os
import glob

def build_beigebook_df(in_dir="txt_clean"):
    # 13 colonnes: 12 districts + national (su)
    cols = ["at","bo","ch","cl","da","kc","mi","ny","ph","ri","sf","sl","su"]
    buckets = {}  # key: "YYYY-MM"  -> value: dict {col: text}

    # Parcourt tous les fichiers .../YYYY/MM/YYYY-MM-<region>.txt
    pattern = os.path.join(in_dir, "*", "*", "*.txt")
    for path in glob.glob(pattern):
        # Exemples de path: txt_clean/2018/06/2018-06-ny.txt
        fname = os.path.basename(path)           # "2018-06-ny.txt"
        parts = fname[:-4].split("-")            # ["2018","06","ny"]
        if len(parts) < 3:
            continue
        year, month, region = parts[0], parts[1], parts[2]
        if region not in cols:
            # Sécurité : on ignore les fichiers inattendus
            continue

        yyyymm = f"{year}-{month}"
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        if yyyymm not in buckets:
            buckets[yyyymm] = {}
        buckets[yyyymm][region] = text

    # Construction du DataFrame
    df = pd.DataFrame.from_dict(buckets, orient="index", columns=cols)
    # Trie les lignes par date et met un index datetime (1er du mois)
    df.index = pd.to_datetime(df.index + "-01", format="%Y-%m-%d")
    df = df.sort_index()
    df.index.name = "date"
    return df

In [3]:
df = build_beigebook_df(in_dir="txt_clean")

### dataframe avec les taux

In [5]:
import pandas_datareader.data as web

# --- Période complète ---
start, end = "1970-01-01", "2025-12-31"

# --- 1️. Charger les séries FRED ---
# Avant 2008 : DFEDTAR (taux cible unique)
# Depuis 2008 : DFEDTARU (upper), DFEDTARL (lower)
print("Chargement des séries depuis FRED...")

pre = web.DataReader("DFEDTAR", "fred", start, "2008-12-31").rename(columns={"DFEDTAR": "target_rate"})
upper = web.DataReader("DFEDTARU", "fred", "2008-01-01", end).rename(columns={"DFEDTARU": "target_upper"})
lower = web.DataReader("DFEDTARL", "fred", "2008-01-01", end).rename(columns={"DFEDTARL": "target_lower"})

# --- 2️. Calculer le midpoint (moyenne du range) ---
if not upper.empty and not lower.empty:
    mid = (upper["target_upper"] + lower["target_lower"]) / 2
    mid.name = "target_mid"
else:
    mid = pd.Series(dtype=float, name="target_mid")

# --- 3️. Fusionner tout ---
# On garde DFEDTAR jusqu'à 2008, puis le midpoint ensuite
rates = pd.concat([pre["target_rate"], mid], axis=1)

# Combiner proprement : utiliser target_rate si dispo, sinon target_mid
rates["fed_target"] = rates["target_rate"].combine_first(rates["target_mid"])

# --- 4. Combiner le tout ---
df_taux = rates.join([upper, lower])
df_taux = df_taux.sort_index()

# --- 6. Nettoyage final ---
df_taux["fed_target"] = df_taux["fed_target"].ffill()
df_taux = df_taux[["fed_target", "target_upper", "target_lower"]]

Chargement des séries depuis FRED...


In [6]:
print("Données chargées avec succès !")
print(f"Plage temporelle : {df_taux.index.min().date()} → {df_taux.index.max().date()}")
print(f"Nombre d’observations : {len(df_taux):,}")
df_taux

Données chargées avec succès !
Plage temporelle : 1982-09-27 → 2025-11-08
Nombre d’observations : 15,749


Unnamed: 0_level_0,fed_target,target_upper,target_lower
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1982-09-27,10.250,,
1982-09-28,10.250,,
1982-09-29,10.250,,
1982-09-30,10.250,,
1982-10-01,10.000,,
...,...,...,...
2025-11-04,3.875,4.0,3.75
2025-11-05,3.875,4.0,3.75
2025-11-06,3.875,4.0,3.75
2025-11-07,3.875,4.0,3.75


### il faut récupérer les dates des meetings

In [7]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

HDRS = {"User-Agent": "Mozilla/5.0 (compatible; FOMCDateBuilder/1.0)"}

# --- 1) Historique 1970–2019 : parse les pages "fomchistoricalYYYY.htm" ---
def fetch_historical_year(year: int) -> list[pd.Timestamp]:
    url = f"https://www.federalreserve.gov/monetarypolicy/fomchistorical{year}.htm"
    r = requests.get(url, headers=HDRS, timeout=20)
    if r.status_code != 200:
        return []
    soup = BeautifulSoup(r.text, "html.parser")

    # Les titres ressemblent à "January 12 Meeting - 1971"
    # On capte "Month Day - YEAR"
    rx = re.compile(r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}).*?\b" + str(year) + r"\b", re.I)
    dates = []
    for tag in soup.find_all(text=rx):
        m = rx.search(tag.strip())
        if not m:
            continue
        dstr = f"{m.group(1)} {m.group(2)}, {year}"
        try:
            dates.append(pd.to_datetime(dstr))
        except Exception:
            pass
    return sorted(set(dates))

# --- 2) 2020–2025 : parse le calendrier consolidé (les blocs "January 30–31, 2024") ---
def fetch_calendars_2020_2025() -> list[pd.Timestamp]:
    url = "https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm"
    r = requests.get(url, headers=HDRS, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    dates = []
    # Exemple texte: "January 30–31, 2024" → on prend le **dernier jour** (jour du communiqué)
    rx = re.compile(r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2})(?:[–-](\d{1,2}))?,\s*(20\d{2})")
    for tag in soup.find_all(text=rx):
        txt = " ".join(tag.strip().split())
        m = rx.search(txt)
        if not m:
            continue
        month, d1, d2, year = m.group(1), int(m.group(2)), m.group(3), int(m.group(4))
        day = int(d2) if d2 else d1  # 2e jour si réunion sur deux jours
        dstr = f"{month} {day}, {year}"
        try:
            dates.append(pd.to_datetime(dstr))
        except Exception:
            pass

    # 2020 inclut aussi des réunions extraordinaires (p.ex. 3/3, 3/15)
    # On complète via les communiqués FOMC 2020 si besoin (facultatif).
    return sorted(set(dates))

def build_fomc_meetings(year_start=1970, year_end=2025) -> pd.DataFrame:
    all_dates = []
    # 1970–2019 via "historical"
    for y in range(year_start, min(2019, year_end) + 1):
        all_dates += fetch_historical_year(y)
    # 2020–2025 via "calendars"
    if year_end >= 2020:
        all_dates += fetch_calendars_2020_2025()

    df_meet = pd.DataFrame({"date": sorted(set(d for d in all_dates if year_start <= d.year <= year_end))})
    return df_meet

# -- build & save --
fomc_meetings = build_fomc_meetings(1970, 2025)
fomc_meetings.to_csv("fomc_meetings.csv", index=False)
print(len(fomc_meetings), "meeting dates")
fomc_meetings.head(12), fomc_meetings.tail(12)

  for tag in soup.find_all(text=rx):


778 meeting dates


  for tag in soup.find_all(text=rx):


(         date
 0  1970-01-15
 1  1970-02-10
 2  1970-03-10
 3  1970-04-07
 4  1970-05-05
 5  1970-05-26
 6  1970-06-23
 7  1970-07-21
 8  1970-08-18
 9  1970-09-15
 10 1970-10-20
 11 1970-11-17,
           date
 766 2024-07-03
 767 2024-08-21
 768 2024-10-09
 769 2024-11-26
 770 2025-01-08
 771 2025-02-19
 772 2025-04-09
 773 2025-05-28
 774 2025-07-09
 775 2025-08-20
 776 2025-10-08
 777 2025-10-29)

### on récupère aussi les dates des beige books

In [None]:
import os, re, time, glob
import pandas as pd
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BASE = "https://www.minneapolisfed.org/beige-book-reports/"
UA = "Mozilla/5.0 (compatible; BeigeBookDateScraper/1.0)"

DATE_RX  = re.compile(r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s*\d{4}")

def make_session():
    s = requests.Session()
    s.headers.update({"User-Agent": UA})
    retry = Retry(
        total=3, backoff_factor=0.5,
        status_forcelist=(500, 502, 503, 504),
        allowed_methods=frozenset(["GET"]),
        raise_on_status=False,
    )
    s.mount("https://", HTTPAdapter(max_retries=retry))
    return s

def _soup(html: str):
    try:
        import html5lib  # noqa
        parser = "html5lib"
    except Exception:
        parser = "html.parser"
    return BeautifulSoup(html, parser)

def extract_date_from_html(html: str):
    soup = _soup(html)
    container = (soup.find("div", class_="col-sm-12 col-lg-8 offset-lg-1")
                 or soup.find("main") or soup.find("article") or soup.body)
    text = container.get_text("\n", strip=True) if container else soup.get_text(" ", strip=True)
    m = DATE_RX.search(text)
    if m:
        try:
            return pd.to_datetime(m.group(0))
        except Exception:
            return None
    # petit fallback: regarder les 10 premières lignes
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()][:10]
    for ln in lines:
        m = DATE_RX.search(ln)
        if m:
            try:
                return pd.to_datetime(m.group(0))
            except Exception:
                return None
    return None

def fetch_date_for_month(session, year, month, candidate_regions):
    """Essaie au plus 2-3 URLs: su -> national-summary -> 1er district disponible localement."""
    # 1) national
    for slug in ("su", "national-summary"):
        url = f"{BASE}{year}/{year}-{month:02d}-{slug}"
        r = session.get(url, timeout=15)
        if r.status_code == 200:
            dt = extract_date_from_html(r.text)
            if dt is not None:
                return {"date": dt.normalize(), "year": year, "month": month, "source_url": url}
    # 2) un seul district
    if candidate_regions:
        region = candidate_regions[0]
        url = f"{BASE}{year}/{year}-{month:02d}-{region}"
        r = session.get(url, timeout=15)
        if r.status_code == 200:
            dt = extract_date_from_html(r.text)
            if dt is not None:
                return {"date": dt.normalize(), "year": year, "month": month, "source_url": url}
    return None

def list_published_months_from_disk(in_dir="txt"):
    """
    Détecte les (year, month) où tu as au moins UN fichier non vide => publication réelle.
    Retourne dict[(year,month)] = [liste de régions présentes].
    """
    present = {}
    pattern = os.path.join(in_dir, "*", "*", "*.txt")
    for path in glob.glob(pattern):
        # path ex: txt/1971/07/1971-07-ny.txt
        base = os.path.basename(path)
        parts = base[:-4].split("-")
        if len(parts) < 3:
            continue
        year, month, region = parts[0], parts[1], parts[2]
        try:
            y, m = int(year), int(month)
        except:
            continue
        # ignore fichiers vides
        try:
            if os.path.getsize(path) == 0:
                continue
        except:
            continue
        present.setdefault((y, m), []).append(region)
    return present

def build_beige_book_calendar_fast(in_dir="txt", start_year=1970, end_year=2025, max_workers=8, polite_sleep=0.0):
    """
    Super rapide: ne frappe que les mois réellement publiés (déduits du disque),
    et tente au plus 2-3 URLs par mois (su, national-summary, 1er district trouvé).
    """
    # 1) détecter mois publiés localement
    published = list_published_months_from_disk(in_dir)
    tasks = []
    sess = make_session()

    # 2) préparer les jobs filtrés par période
    for (y, m), regions in published.items():
        if not (start_year <= y <= end_year):
            continue
        # ordonner les régions pour avoir un district consistant
        regions_sorted = sorted(set(regions))
        tasks.append((y, m, regions_sorted))

    rows = []
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = {
            ex.submit(fetch_date_for_month, sess, y, m, regs): (y, m)
            for (y, m, regs) in tasks
        }
        for fut in as_completed(futs):
            y, m = futs[fut]
            try:
                res = fut.result()
                if res:
                    rows.append(res)
            except Exception:
                # on ignore l'erreur et continue
                pass
            if polite_sleep:
                time.sleep(polite_sleep)

    df_pub = pd.DataFrame(rows).drop_duplicates(subset=["date"]).sort_values("date").reset_index(drop=True)
    return df_pub

In [None]:
df_bb_calendar = build_beige_book_calendar_fast(in_dir="txt", start_year=1970, end_year=2025, max_workers=12)
df_bb_calendar.to_csv("beige_book_publications.csv", index=False)

In [10]:
df_bb_calendar

Unnamed: 0,date,year,month,source_url
0,1970-05-20,1970,5,https://www.minneapolisfed.org/beige-book-repo...
1,1970-06-17,1970,6,https://www.minneapolisfed.org/beige-book-repo...
2,1970-07-15,1970,7,https://www.minneapolisfed.org/beige-book-repo...
3,1970-08-12,1970,8,https://www.minneapolisfed.org/beige-book-repo...
4,1970-09-09,1970,9,https://www.minneapolisfed.org/beige-book-repo...
...,...,...,...,...
476,2025-04-23,2025,4,https://www.minneapolisfed.org/beige-book-repo...
477,2025-06-04,2025,6,https://www.minneapolisfed.org/beige-book-repo...
478,2025-07-16,2025,7,https://www.minneapolisfed.org/beige-book-repo...
479,2025-09-03,2025,9,https://www.minneapolisfed.org/beige-book-repo...


### on associe aux dates des beige books les taux

In [11]:
import pandas as pd

# --- Créer la colonne unifiée de taux cible ---
df_taux["target_mid"] = (df_taux["target_upper"] + df_taux["target_lower"]) / 2
df_taux["policy_rate"] = df_taux["fed_target"].combine_first(df_taux["target_mid"])

# --- S'assurer que les index sont bien datés ---
df_taux = df_taux.sort_index()
df_taux.index = pd.to_datetime(df_taux.index)
df_bb_calendar["date"] = pd.to_datetime(df_bb_calendar["date"])

# --- Pour chaque Beige Book : on veut le taux APRES sa publication ---
# Donc on cherche la première date de df_taux STRICTEMENT postérieure à df_bb_calendar["date"]

def find_next_rate_date(bb_date, taux_index):
    future_dates = taux_index[taux_index > bb_date]
    return future_dates.min() if len(future_dates) > 0 else pd.NaT

# Appliquer sur tout le DataFrame
df_bb_calendar["next_rate_date"] = df_bb_calendar["date"].apply(lambda d: find_next_rate_date(d, df_taux.index))

# --- Récupérer les taux correspondants ---
df_bb_calendar["fed_rate"] = df_bb_calendar["next_rate_date"].map(df_taux["policy_rate"])

# --- Nettoyage final ---
df_bb_final = df_bb_calendar.dropna(subset=["fed_rate"]).reset_index(drop=True)
df_bb_final = df_bb_final[["date", "year", "month", "fed_rate"]]

In [12]:
df_bb_final

Unnamed: 0,date,year,month,fed_rate
0,1970-05-20,1970,5,10.250
1,1970-06-17,1970,6,10.250
2,1970-07-15,1970,7,10.250
3,1970-08-12,1970,8,10.250
4,1970-09-09,1970,9,10.250
...,...,...,...,...
476,2025-04-23,2025,4,4.375
477,2025-06-04,2025,6,4.375
478,2025-07-16,2025,7,4.375
479,2025-09-03,2025,9,4.375


### tableau final

In [13]:
df_travail = df.copy()

assert len(df_travail) == len(df_bb_final), "Les deux DataFrames n'ont pas le même nombre de lignes !"

df_travail["fed_rate"] = df_bb_final["fed_rate"].values
df_travail

Unnamed: 0_level_0,at,bo,ch,cl,da,kc,mi,ny,ph,ri,sf,sl,su,fed_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1970-05-01,The mood of our directors varies from pessimis...,Discussions Monday (May 18) and last Friday wi...,In published statements and private conversati...,Economic activity in the District has been dis...,Continued inflation is the major concern of El...,Consensus Based on Discussions with Members of...,Although indications of softening in the Ninth...,"In the Second Federal Reserve District, the Bo...",The business slowdown in the regional economy ...,"Production, Sales, and Inventories District su...",General There is no consistent pattern of view...,A select group of knowledgeable persons in the...,This initial report of economic conditions in ...,10.250
1970-06-01,Summary of Findings Sixth District directors h...,Seven members of our Board of Directors and th...,Businessmen and economists in the Seventh Dist...,Economic conditions in this District appear to...,Economic conditions in the Eleventh District r...,The general pace of activity in the District i...,Business and labor leaders in the Ninth Federa...,Remarks by leading bankers and businessmen in ...,"In the Third Federal Reserve District, views o...","Information obtained in the Fifth District, pr...",This report is based upon a survey of Head Off...,Based on discussions with our Branch and Main ...,Comments on economic conditions in the twelve ...,10.250
1970-07-01,According to reports from directors and others...,The general tenor of the comments offered by o...,Businessmen and economists in the Seventh Dist...,Economic activity in the Fourth District impro...,Views of head office board members and visitin...,The overall economic picture in the Tenth Dist...,The softening in the Ninth Federal Reserve Dis...,"In the Second Federal Reserve District, Direct...",This report is based on comments of our Board ...,"Information obtained in the Fifth District, th...","Overall, the views of our directors have not c...",There has been some dampening of optimism in r...,"Current comment by businessmen and bankers, as...",10.250
1970-08-01,This report is based on a special poll made of...,Cautious optimism over the prospects for a sus...,Confidence in the overall stability of the fin...,The limited data available for July suggest th...,The responses of corporate executives of 15 of...,The effects of the national economic recession...,Ninth District business economists feel that t...,Sentiment was mixed among Federal Reserve Bank...,Intelligence for this report was obtained from...,Surveys of businessmen and bankers in the Fift...,Twelfth District businessmen are talking incre...,Expectations of an early upturn in business ac...,The consensus of the reports by the twelve Fed...,10.250
1970-09-01,"The consensus of directors, bankers, and busin...",Commercial banking conditions in the First Dis...,The almost universal view among informed obser...,Signs of recovery in the District's economy ar...,Data for this report were gathered by telephon...,Business and economic conditions remain reason...,Economic conditions in the Ninth Federal Reser...,Opinions were mixed among Federal Reserve bank...,Confusion is the most prevalent mood in the Th...,Surveys of businessmen and bankers in the Fift...,There is a somewhat more optimistic tone to th...,A few scattered signs of improvement in econom...,The reports in this Redbook are more optimisti...,10.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-01,Summary of Economic Activity The Sixth Distric...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,"Summary of Economic Activity On balance, conta...",Summary of Economic Activity Growth in the Ele...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Business activity...,"Summary of Economic Activity On balance, the F...",Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,This document summarizes comments received fro...,4.375
2025-06-01,Summary of Economic Activity The Sixth Distric...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Contacts' reports...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Overall activity ...,Summary of Economic Activity Ninth District ec...,Summary of Economic Activity Economic activity...,"Summary of Economic Activity On balance, busin...",Summary of Economic Activity The Fifth Distric...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,Louis based on information collected on or bef...,4.375
2025-07-01,Summary of Economic Activity The economy of th...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Contacts' reports...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Ninth District ec...,Summary of Economic Activity Economic activity...,"Summary of Economic Activity On balance, busin...",Summary of Economic Activity The Fifth Distric...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,This document summarizes comments received fro...,4.375
2025-09-01,Summary of Economic Activity The Sixth Distric...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Fourth District c...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Business activity...,Summary of Economic Activity The Fifth Distric...,Summary of Economic Activity Economic activity...,Summary of Economic Activity Economic activity...,This document summarizes comments received fro...,4.375


# SCORE DE SENTIMENT - LM

In [None]:
# 1️. Import des librairies
import pysentiment2 as ps
from tqdm import tqdm
tqdm.pandas()

# 2️. Charger le modèle Loughran–McDonald intégré
lm = ps.LM()

# 3️. Fonction de calcul du score de sentiment
def lm_sentiment_score(text):
    """
    Calcule un score de sentiment normalisé à partir du lexique Loughran–McDonald :
    (Positive - Negative) / (Positive + Negative)
    """
    tokens = lm.tokenize(str(text))
    score = lm.get_score(tokens)
    total = score["Positive"] + score["Negative"]
    if total == 0:
        return 0
    return (score["Positive"] - score["Negative"]) / total

# 4️. Colonnes textuelles du Beige Book
text_cols = ['at','bo','ch','cl','da','kc','mi','ny','ph','ri','sf','sl','su']

# 5️. Création de df_dic avec 13 scores de sentiment + fed_rate
df_dic = pd.DataFrame(index=df_travail.index)

for col in text_cols:
    df_dic[col] = df_travail[col].progress_apply(lm_sentiment_score)

df_dic["fed_rate"] = df_travail["fed_rate"]

100%|██████████| 481/481 [00:06<00:00, 77.45it/s]
100%|██████████| 481/481 [00:06<00:00, 73.24it/s] 
100%|██████████| 481/481 [00:06<00:00, 70.96it/s]
100%|██████████| 481/481 [00:06<00:00, 72.12it/s]
100%|██████████| 481/481 [00:06<00:00, 72.41it/s] 
100%|██████████| 481/481 [00:06<00:00, 75.77it/s]
100%|██████████| 481/481 [00:06<00:00, 79.15it/s] 
100%|██████████| 481/481 [00:06<00:00, 77.05it/s] 
100%|██████████| 481/481 [00:06<00:00, 76.15it/s] 
100%|██████████| 481/481 [00:06<00:00, 74.89it/s] 
100%|██████████| 481/481 [00:05<00:00, 84.34it/s] 
100%|██████████| 481/481 [00:05<00:00, 87.25it/s] 
100%|██████████| 481/481 [00:08<00:00, 55.15it/s] 


In [23]:
df_dic

Unnamed: 0_level_0,at,bo,ch,cl,da,kc,mi,ny,ph,ri,sf,sl,su,fed_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1970-05-01,0.185185,-0.354839,-0.640000,-0.310345,-0.395349,-0.190476,-0.136364,-0.391304,-0.714286,-0.652174,-0.333333,-0.290323,-0.379310,10.250
1970-06-01,-0.257143,-0.513514,-0.560976,-0.500000,-0.454545,0.028571,-0.757576,-0.642857,-0.916667,-0.675676,-0.396226,-0.250000,-0.803922,10.250
1970-07-01,-0.357143,-0.483871,-0.675676,-0.421053,-0.153846,-0.225806,-0.148936,-0.541667,-0.866667,-0.446809,-0.627907,-0.217391,-0.695652,10.250
1970-08-01,-0.517241,-0.083333,-0.277778,-0.466667,-0.837838,-0.482759,-0.272727,-0.166667,-0.391304,-0.142857,-0.757576,-0.405405,-0.485714,10.250
1970-09-01,-0.590909,-0.368421,-0.200000,-0.483871,-0.222222,0.000000,-0.181818,-0.125000,-0.142857,-0.090909,-0.151515,-0.357143,-0.478261,10.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-01,-0.562500,-0.481481,-0.304348,-0.210526,-0.478261,-0.384615,-0.254902,-0.403509,-0.746032,-0.346939,-0.244444,-0.473684,-0.672727,4.375
2025-06-01,-0.466667,-0.362319,-0.366667,-0.294118,-0.587302,-0.291667,-0.043478,-0.523810,-0.450980,-0.533333,-0.419355,-0.395349,-0.576923,4.375
2025-07-01,-0.703704,0.080000,-0.396226,-0.416667,-0.569231,-0.222222,0.056604,-0.549296,-0.612903,-0.250000,-0.296296,-0.260870,-0.512195,4.375
2025-09-01,-0.696970,-0.043478,-0.047619,-0.405405,-0.381818,-0.160000,-0.333333,-0.437500,-0.244444,-0.076923,-0.344828,-0.181818,-0.304348,4.375


# SCORE DE SENTIMENT - FINBERT

In [None]:
#!pip install transformers torch --upgrade

In [15]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ---------- 1) Charger FinBERT ----------
MODEL_NAME = "ProsusAI/finbert"   # labels: negative / neutral / positive
tokenizer  = AutoTokenizer.from_pretrained(MODEL_NAME)
model      = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
device     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ---------- 2) Utilitaires: chunking + inférence ----------
def _chunk_ids(input_ids, max_len=510):
    """Coupe une séquence d’IDs en segments <= max_len (hors [CLS]/[SEP])."""
    return [input_ids[i:i+max_len] for i in range(0, len(input_ids), max_len)] or [[]]

@torch.inference_mode()
def finbert_pos_neg_probs(text: str, batch_size: int = 8) -> tuple[float, float]:
    """
    Retourne (P(pos), P(neg)) pour un texte arbitraire.
    - Chunking si >512 tokens
    - Moyenne des probabilités sur les chunks
    """
    if not isinstance(text, str) or not text.strip():
        return (np.nan, np.nan)

    # encode en ids bruts (sans tokens spéciaux)
    ids = tokenizer.encode(text, add_special_tokens=False)
    pieces = _chunk_ids(ids, max_len=510)  # 510 + [CLS]+[SEP] = 512

    probs_list = []
    for i in range(0, len(pieces), batch_size):
        batch = pieces[i:i+batch_size]
        # ajoute [CLS] ... [SEP]
        input_ids = [[tokenizer.cls_token_id] + p + [tokenizer.sep_token_id] for p in batch]
        attn      = [[1]*len(x) for x in input_ids]

        # padding
        maxL = max(len(x) for x in input_ids)
        input_ids = [x + [tokenizer.pad_token_id]*(maxL-len(x)) for x in input_ids]
        attn      = [a + [0]*(maxL-len(a)) for a in attn]

        tens_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
        tens_att = torch.tensor(attn,       dtype=torch.long, device=device)

        logits = model(input_ids=tens_ids, attention_mask=tens_att).logits
        probs  = torch.softmax(logits, dim=-1).detach().cpu().numpy()  # shape (B,3)
        probs_list.append(probs)

    probs_all = np.vstack(probs_list) if probs_list else np.zeros((1,3))

    # identification des colonnes via id2label (robuste)
    # ProsusAI/finbert: 0=negative, 1=neutral, 2=positive
    id2label = {i: model.config.id2label[i].lower() for i in range(model.config.num_labels)}
    neg_idx = [k for k,v in id2label.items() if "neg" in v][0]
    pos_idx = [k for k,v in id2label.items() if "pos" in v][0]

    p_pos = float(probs_all[:, pos_idx].mean())
    p_neg = float(probs_all[:, neg_idx].mean())
    return (p_pos, p_neg)

def finbert_ratio_score(text: str) -> float:
    """Calcule (P(pos)-P(neg))/(P(pos)+P(neg)); NaN si dénominateur nul."""
    p_pos, p_neg = finbert_pos_neg_probs(text)
    denom = p_pos + p_neg
    if not np.isfinite(denom) or denom <= 0:
        return np.nan
    return float((p_pos - p_neg) / denom)

# ---------- 3) Construire df_finB ----------
text_cols = ["at","bo","ch","cl","da","kc","mi","ny","ph","ri","sf","sl","su"]

# sécurité: on ne garde que les colonnes présentes
text_cols = [c for c in text_cols if c in df_travail.columns]

df_finB = pd.DataFrame(index=df_travail.index)

# applique le score pour chaque colonne texte
for col in text_cols:
    print(f"Scoring FinBERT → {col} ...")
    df_finB[col] = df_travail[col].fillna("").apply(finbert_ratio_score)

# ajoute Y en fin de tableau
df_finB["fed_rate"] = df_travail["fed_rate"].astype(float)

  from .autonotebook import tqdm as notebook_tqdm
Token indices sequence length is longer than the specified maximum sequence length for this model (702 > 512). Running this sequence through the model will result in indexing errors


Scoring FinBERT → at ...
Scoring FinBERT → bo ...
Scoring FinBERT → ch ...
Scoring FinBERT → cl ...
Scoring FinBERT → da ...
Scoring FinBERT → kc ...
Scoring FinBERT → mi ...
Scoring FinBERT → ny ...
Scoring FinBERT → ph ...
Scoring FinBERT → ri ...
Scoring FinBERT → sf ...
Scoring FinBERT → sl ...
Scoring FinBERT → su ...


In [18]:
df_finB

Unnamed: 0_level_0,at,bo,ch,cl,da,kc,mi,ny,ph,ri,sf,sl,su,fed_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1970-05-01,-0.076165,-0.950561,-0.975317,-0.968160,-0.019191,-0.962179,-0.960608,-0.442674,-0.958776,-0.968031,-0.910795,-0.899071,-0.974909,10.250
1970-06-01,-0.899751,-0.905267,-0.941071,-0.943294,-0.977316,0.080377,-0.981936,-0.460625,-0.966221,-0.960963,-0.957204,-0.914762,-0.966129,10.250
1970-07-01,0.002493,-0.540178,-0.976643,-0.962548,-0.221561,-0.902567,-0.972912,-0.799590,0.113514,-0.958365,-0.434773,-0.919647,-0.963973,10.250
1970-08-01,-0.931182,-0.369597,-0.913416,-0.922800,-0.956641,-0.963019,-0.935646,-0.044426,-0.934241,-0.932929,-0.972783,-0.849449,-0.968553,10.250
1970-09-01,-0.924825,-0.908914,-0.939495,-0.898278,-0.798448,-0.680111,-0.925098,-0.502383,-0.972451,-0.792651,-0.666252,-0.973053,-0.903389,10.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-01,-0.956507,-0.764116,-0.773956,-0.878935,-0.952048,-0.827919,-0.972541,-0.965588,-0.951316,-0.959150,-0.896596,-0.944738,-0.916547,4.375
2025-06-01,-0.835518,-0.840169,-0.606826,-0.674603,-0.693922,-0.883314,-0.835574,-0.895713,-0.519515,-0.891957,-0.801872,-0.960940,-0.950817,4.375
2025-07-01,-0.819406,-0.281352,-0.888502,-0.841776,-0.849361,-0.656892,-0.843242,-0.937077,-0.875205,-0.746250,-0.914505,-0.965249,-0.769031,4.375
2025-09-01,-0.956153,-0.658661,-0.421579,-0.794651,-0.660176,-0.732652,-0.929020,-0.561177,-0.666421,-0.615728,-0.916967,-0.886356,-0.906879,4.375
