# Library imports

In [None]:
import pandas as pd
import numpy as np

# Data import

In [None]:
import pathlib

# Collect every .txt file located directly inside the transcripts folder.
transcript_dir = pathlib.Path('/content/drive/MyDrive/Colab Notebooks/ressources/senat_parse/')
filename_list = list(transcript_dir.glob('*.txt'))

In [None]:
# Number of paths collected in filename_list.
len(filename_list)

2374

# Single intervention parsing

In [None]:
# Debug scope: work on the first collected file only.
filename = filename_list[0]

In [None]:
def clean_text(input_string):
  """Return *input_string* with line breaks turned into spaces and outer whitespace removed."""
  # A single translation pass maps both "\n" and "\r" to a space.
  line_breaks_to_spaces = str.maketrans({'\n': ' ', '\r': ' '})
  return input_string.translate(line_breaks_to_spaces).strip()

In [None]:
def parse_filename_html_to_df(filename):
  """Parse one HTML transcript file into a DataFrame of interventions.

  The date is read from the first run of eight digits (YYYYMMDD) found in
  the file path. Every ``<div class="intervenant">`` element of the document
  becomes one row.

  Args:
    filename: Path (string or path-like) of the HTML file to parse.

  Returns:
    A ``(did_work, complete_debate)`` pair. When at least one intervention
    is found, ``did_work`` is True and ``complete_debate`` is a DataFrame
    with the columns ``date``, ``title``, ``speaker_name``,
    ``speaker_quality``, ``speaker_link`` and ``speaker_intervention``.
    When none is found, both values are False.

  Raises:
    ValueError: If no eight-digit date can be found in ``filename``.
  """
  # Imported here so the function does not depend on an earlier cell having
  # brought these names into scope.
  import re
  from bs4 import BeautifulSoup

  # Get the date from the file name.
  date_match = re.search(r'[0-9]{4}[0-9]{2}[0-9]{2}', str(filename))
  if date_match is None:
    raise ValueError(f"no YYYYMMDD date found in file name: {filename}")
  debate_date = pd.to_datetime(date_match.group(), format='%Y%m%d')

  # Read with an explicit encoding so the result does not depend on the
  # platform's default locale.
  with open(filename, encoding='utf-8') as file:
    soup = BeautifulSoup(file.read(), 'html.parser')

  # Turn each intervention into one row.
  row_list = []
  for intervention in soup.find_all("div", {"class": "intervenant"}):

    # Speaker section: name and link both live in the "orateur_nom" span.
    speaker_info = intervention.find("span", {"class": "orateur_nom"})
    if speaker_info is not None:
      speaker_name = clean_text(speaker_info.get_text())
      link_tag = speaker_info.find("a", href=True)
    else:
      speaker_name = "name not found"
      link_tag = None
    speaker_link = link_tag['href'] if link_tag is not None else 'N/A'

    quality_tag = intervention.find("span", {"class": "orateur_qualite"})
    speaker_quality = clean_text(quality_tag.get_text()) if quality_tag is not None else "N/A"

    # Intervention content: prefer a direct <p> child that holds a single
    # string; otherwise fall back to the text of the whole block.
    paragraph = intervention.find('p', string=True, recursive=False)
    raw_text = paragraph.get_text() if paragraph is not None else intervention.get_text()
    intervention_text = clean_text(raw_text)

    row_list.append([speaker_name, speaker_quality, speaker_link, intervention_text])

  # Early double return: nothing to build a table from.
  if not row_list:
    return False, False

  parsed_intervention = pd.DataFrame(
    row_list,
    columns=['speaker_name', 'speaker_quality', 'speaker_link', 'speaker_intervention'],
  )

  # Add the file-level metadata to every record.
  title_tag = soup.find('title')
  parsed_intervention['title'] = title_tag.get_text() if title_tag is not None else 'N/A'
  parsed_intervention['date'] = debate_date

  # Reorder columns so the metadata comes first.
  parsed_intervention = parsed_intervention[
    ['date', 'title', 'speaker_name', 'speaker_quality', 'speaker_link', 'speaker_intervention']
  ]

  return True, parsed_intervention

# parse_filename_html_to_df returns a (did_work, result) pair; unpack it so
# that parsed_intervention holds the parsed result rather than the whole tuple.
did_work, parsed_intervention = parse_filename_html_to_df(filename)

In [None]:
# Show the result of parsing the single debug file.
parsed_intervention

# Complete parse of all interventions

In [None]:
from tqdm.auto import tqdm

In [None]:
# Parse every collected file and keep only the tables that were produced.
debate_df_list = []
for filename in tqdm(filename_list):
  did_work, complete_debate = parse_filename_html_to_df(filename)
  if not did_work:
    # Skip files for which the parser reported a failure.
    continue
  debate_df_list.append(complete_debate)

  0%|          | 0/2374 [00:00<?, ?it/s]

In [None]:
# Stack the per-file tables into a single DataFrame.
# NOTE(review): each table keeps its own row labels, so labels repeat across
# files (e.g. 0-4 and 20-23 in the displayed output) — consider
# ignore_index=True if unique row labels are wanted; confirm with consumers
# of the exported CSV first.
complete_debate_database = pd.concat(debate_df_list)

In [None]:
# Display the concatenated interventions.
complete_debate_database

Unnamed: 0,date,title,speaker_name,speaker_quality,speaker_link,speaker_intervention
0,2016-05-03,Séance du 3 mai 2016 (compte rendu intégral de...,M. le président.,,/senateur/larcher_gerard86034e.html,(La séance est ouverte à quinze heures quinze.)
1,2016-05-03,Séance du 3 mai 2016 (compte rendu intégral de...,M. le président,,/senateur/larcher_gerard86034e.html,Il n’y a pas d’observation ?…
2,2016-05-03,Séance du 3 mai 2016 (compte rendu intégral de...,M. le président.,,/senateur/larcher_gerard86034e.html,"Avant de passer au scrutin, je vais donner la ..."
3,2016-05-03,Séance du 3 mai 2016 (compte rendu intégral de...,M. le président.,,/senateur/larcher_gerard86034e.html,"La parole est à Mme Corinne Bouchoux, pour le ..."
4,2016-05-03,Séance du 3 mai 2016 (compte rendu intégral de...,Mme Corinne Bouchoux.,,/senateur/bouchoux_corinne11029k.html,"Sur la base d’une méthode originale, les avis ..."
...,...,...,...,...,...,...
20,2008-01-09,Séance du 9 janvier 2008 (compte rendu intégra...,"Mme Christine Lagarde,",ministre.,,"Le troisième constat, c'est que la voie du « r..."
21,2008-01-09,Séance du 9 janvier 2008 (compte rendu intégra...,"M. Nicolas About,",président de la commission des affaires sociales.,/senfic/about_nicolas95004y.html,"M. Nicolas About, président de la commission d..."
22,2008-01-09,Séance du 9 janvier 2008 (compte rendu intégra...,"Mme Christine Lagarde,",ministre.,,Tout cela doit permettre de mettre en place ra...
23,2008-01-09,Séance du 9 janvier 2008 (compte rendu intégra...,M. le président.,,/senfic/gouteyron_adrien78006d.html,M. le président. La parole est à Mme le rappor...


# Export to csv

Since the number of samples is reasonable, we export the data as a single CSV file.

In [None]:
# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
output_path = pathlib.Path('/content/drive/MyDrive/Colab Notebooks/ressources/senat_parse/complete_database/complete_debate_database.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
# Semicolon-separated, UTF-8 encoded export of the full table.
complete_debate_database.to_csv(output_path, sep=';', encoding='utf-8')