In [1]:
# Smoke test: print a fixed string to confirm that cells execute.
print("hello")

hello


In [2]:
# pandas pour lire le fichier CSV
import pandas as pd
# re pour découper le texte en phrases
import re

In [3]:
# Peek at the first 3 raw lines of the file (format sanity check only).
with open("discours_US.csv", "r", encoding="utf-8", errors="replace") as f:
    apercu = [f.readline() for _ in range(3)]
for ligne in apercu:
    # readline() keeps the trailing newline, so print() leaves a blank line after it.
    print(ligne)

"speaker"	"text"	"date"	"descr"	"link"

"CLINTON"	": I'm getting ready for a lot of things, a lot of things. I'm getting ready to do something, too. I'm running for president. Americans have fought their way back from tough economic times, but the deck is still stacked in favor of those at the top. Everyday Americans need a champion and I want to be that champion. So you can do more than just get by, you can get ahead and stay ahead, because when families are strong, America is strong. So I'm hitting the road to earn your vote, because it's your time and I hope you'll join me on this journey."	"April 12, 2015"	"Video Remarks Announcing Candidacy for President"	"http://www.presidency.ucsb.edu/ws/index.php?pid=110028"

"CLINTON"	"[ ] : I'll be graduating in May, and on graduation I'll be heading to Annapolis, the naval academy, to major in systems engineering. All right, good luck. Really? Year and a half. Wow. That's great! OK. Great. Thanks. Thank you. I'll introduce... That's good. Th

In [3]:
# Load the tab-separated file; rows that cannot be parsed are skipped
# (on_bad_lines="skip") instead of raising.
df = pd.read_csv(
    "discours_US.csv",
    sep="\t",
    on_bad_lines="skip",
    engine="python",
)
# Preview the first 5 rows
df.head(n=5)


Unnamed: 0,speaker,text,date,descr,link
0,CLINTON,": I'm getting ready for a lot of things, a lot...","April 12, 2015",Video Remarks Announcing Candidacy for President,http://www.presidency.ucsb.edu/ws/index.php?pi...
1,CLINTON,"[ ] : I'll be graduating in May, and on gradua...","April 14, 2015",Remarks in a Question and Answer Session at Ki...,http://www.presidency.ucsb.edu/ws/index.php?pi...
2,CLINTON,"So, congratulations on this new poll number in...","October 16, 2015",Interview with Jake tapper of CNN,http://www.presidency.ucsb.edu/ws/index.php?pi...
3,CLINTON,"Thank you, Madam Secretary. This is a big inte...","January 5, 2016",Interview with Chris Matthews of MSNBC,http://www.presidency.ucsb.edu/ws/index.php?pi...
4,CLINTON,Wow! What a night. An unbelievable night. What...,"February 1, 2016",Remarks in Des Moines Following the Iowa Caucus,http://www.presidency.ucsb.edu/ws/index.php?pi...


In [4]:
# Count the number of rows (speeches) per value of the "speaker" column
df["speaker"].value_counts()


speaker
CLINTON    13
TRUMP      12
Name: count, dtype: int64

In [5]:
# importer mes classes (TD4/TD5)
from TD4_5_corpus import Corpus
from TD4_5_document import Document

# Create the corpus named "discours_US" (empty at this point; it is filled
# sentence by sentence in a later cell).
corpus = Corpus("discours_US")


In [6]:
# Split a text into sentences
def decouper_en_phrases(texte):
    """Split a text into sentences on runs of ``.``, ``!`` and ``?``.

    The split is purely punctuation-based, so abbreviations and decimals
    (e.g. ``"3.5"``) are cut at their dots as well.

    Args:
        texte: The text to split. Non-string values are converted with
            ``str()``. ``None`` and float NaN (e.g. a missing cell in a
            DataFrame) are treated as "no text".

    Returns:
        list[str]: The sentences, stripped of surrounding whitespace, without
        their terminating punctuation and without empty fragments. An empty
        list when there is no text.
    """
    # A missing value must not be stringified: str(None) / str(nan) would
    # otherwise come back as the bogus one-word sentences "None" / "nan".
    if texte is None or (isinstance(texte, float) and texte != texte):
        return []
    morceaux = (morceau.strip() for morceau in re.split(r"[.!?]+", str(texte)))
    return [morceau for morceau in morceaux if morceau]


In [7]:
# Turn every sentence of every speech into its own Document in the corpus.
# NOTE(review): the corpus is created in an earlier cell, so re-running this
# cell alone presumably appends the same sentences a second time — confirm.
for _, row in df.iterrows():
    auteur = row["speaker"]
    # Date and link are the same for all sentences of one speech: compute once.
    date_discours = str(row.get("date", ""))
    url_discours = str(row.get("link", ""))
    for p in decouper_en_phrases(row["text"]):
        corpus.add_document(
            Document(
                titre="phrase_discours",
                auteur=auteur,
                date=date_discours,
                url=url_discours,
                texte=p,
            )
        )
# Display a summary of the corpus
corpus

Corpus(discours_US, ndoc=4173, naut=2)

In [8]:
# Try the corpus search on the pattern "freedom" and print the first 3 results
print("TD6 search :")
print(corpus.search(r"freedom")[:3])
# Try the concordancer on the same pattern and preview the first rows.
# NOTE(review): contexte=25 is presumably the number of characters of context
# kept on each side of the match — confirm in TD4_5_corpus.
print("\nTD6 concorde :")
corpus.concorde(r"freedom", contexte=25).head()

TD6 search :
['This happens every time there is a Freedom of Information Act request', 'replace this disaster with reforms that give you choice and freedom and control in healthcare – at a much lower cost', ' a plan to provide every disadvantaged child in America the freedom to choose the private, public, magnet or religious school o']

TD6 concorde :


Unnamed: 0,doc_id,contexte_gauche,motif_trouve,contexte_droit
0,206,ns every time there is a,Freedom,of Information Act reque
1,1478,that give you choice and,freedom,and control in healthcar
2,2641,ged child in America the,freedom,"to choose the private, p"
3,2651,religious and political,freedom,for the Cuban people
4,2656,"a, I will push to expand",freedom,for all of our people


In [9]:
# importer le moteur
from TD7_moteur_recherche import SearchEngine

# Build the search engine on top of the corpus filled above
moteur = SearchEngine(corpus)


100%|██████████| 4173/4173 [00:00<00:00, 44539.07it/s]


In [10]:
# Query the engine for "freedom"; the second argument (5) is presumably the
# number of top-ranked results to return — confirm in TD7_moteur_recherche.
moteur.search("freedom", 5)


Unnamed: 0,doc_id,score,titre,auteur,date,url
0,2656,0.478868,phrase_discours,TRUMP,"September 16, 2016",http://www.presidency.ucsb.edu/ws/index.php?pi...
1,206,0.394399,phrase_discours,CLINTON,"October 16, 2015",http://www.presidency.ucsb.edu/ws/index.php?pi...
2,2651,0.373987,phrase_discours,TRUMP,"September 16, 2016",http://www.presidency.ucsb.edu/ws/index.php?pi...
3,1478,0.319747,phrase_discours,TRUMP,"August 18, 2016",http://www.presidency.ucsb.edu/ws/index.php?pi...
4,2894,0.299965,phrase_discours,TRUMP,"October 22, 2016",http://www.presidency.ucsb.edu/ws/index.php?pi...


In [11]:
# widgets pour interface
import ipywidgets as widgets

# display pour afficher dans notebook
from IPython.display import display


In [None]:
# Heading shown above the search form
titre = widgets.HTML("<h3 style='margin:0'>Moteur de recherche</h3>")

# Free-text field for the query
champ = widgets.Text(
    placeholder="ex : freedom america",
    description="Mots clés :",
    layout=widgets.Layout(width="520px"),
)

# Slider for the number of results to request (1 to 30, default 5)
slider = widgets.IntSlider(
    value=5, min=1, max=30, step=1,
    description="Top K :",
    continuous_update=False,
    layout=widgets.Layout(width="520px"),
)

# Button used to launch the search
bouton = widgets.Button(
    description="Rechercher",
    button_style="primary",
    icon="search",
    layout=widgets.Layout(width="160px"),
)

# Bordered, fixed-size output area (vertical overflow set to "auto") for the results
out = widgets.Output(
    layout=widgets.Layout(
        border="1px solid #ccc", padding="10px",
        width="760px", height="320px",
        overflow_y="auto",
    )
)


In [13]:
# Click handler for the search button
def clique_bouton(b):
    """Run the search for the current query and show the hits in ``out``.

    Reads the query from ``champ`` and the number of results from ``slider``,
    calls ``moteur.search`` and displays the rows whose score is strictly
    positive. An empty query only prints a hint.

    Args:
        b: Argument supplied by the click event; not used.
    """
    # Wipe the previous results before showing anything new.
    out.clear_output()
    with out:
        requete = champ.value.strip()
        if not requete:
            print("Écris une requête (ex : freedom)")
            return
        resultats = moteur.search(requete, slider.value)
        # Drop rows with a non-positive score (only when a score column exists).
        if "score" in resultats.columns:
            resultats = resultats[resultats["score"] > 0]
        display(resultats)


# Register the handler on the button
bouton.on_click(clique_bouton)


In [14]:
# Stack the interface top to bottom: heading, query field, slider + button, results
ui = widgets.VBox(
    children=[
        titre,
        widgets.HBox(children=[champ]),
        widgets.HBox(children=[slider, bouton]),
        out,
    ]
)

# Render the interface
display(ui)


VBox(children=(HTML(value="<h3 style='margin:0'>Moteur de recherche</h3>"), HBox(children=(Text(value='', desc…