# **Module de Machine Learning**

# **Partie 2 : Web Scraping**

### Librairies

In [1]:
from bs4 import BeautifulSoup
import requests
import csv

### Script de récupération de données financières (Financial Times)

#### Récupération d'une page web

In [2]:
# Download the Financial Times "Global Economy" page.
# An explicit timeout is passed because requests otherwise waits indefinitely
# on an unresponsive server, which would hang the whole notebook.
response = requests.get("https://www.ft.com/global-economy", timeout=10)
# Stop here on an HTTP error status (4xx/5xx) so that the following cells
# never parse an error page by mistake.
response.raise_for_status()

response
# <Response [200]> means the request succeeded and `response` holds the web
# page for this URL. An error status such as 404 now raises an exception above
# instead of being displayed.

<Response [200]>

#### Parsing du contenu de la page web

In [3]:
# Parse the raw bytes of the downloaded page with the 'html.parser' backend.
page_bytes = response.content
soup = BeautifulSoup(page_bytes, 'html.parser')

# The <title> element is a quick check that the Financial Times page was
# actually retrieved.
soup.title

<title>Global Economy | Financial Times</title>

### Isolation des données qui nous intéressent

In [4]:
# Class attribute carried by the <a> elements we want to collect.
HEADLINE_LINK_CLASS = "js-teaser-heading-link"

# Gather every matching link from the parsed page and display the result.
tag_a = soup.find_all("a", class_=HEADLINE_LINK_CLASS)
tag_a

[<a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/3641177a-d0bc-40ce-845c-0eea4d608f41">FirstFT: Options trading surges as turmoil looms</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/9e61dfe9-cf18-4ad1-8837-864925069fa6">Global economy fends off geopolitical and banking threats</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/cee3a43a-c549-4856-8c7b-d8152deb61ac">Milestones and meetings</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/46d56c75-57d5-44ef-812c-5e58865f0179">If tech is driving the ‘productivity bandwagon’, it’s time to hit the brakes</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/8af4ed19-4cf5-40d6-9e94-6e0abb591556">Ambition is needed to rebuild trust in multilateralism</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/c744d5d2-72b2-43be-a105-2f5d364da281">Will

In [5]:
# Print the child nodes of every collected link except the last two:
# those two entries do not contain finance-related sentences.
for link in tag_a[:-2]:
  print(link.contents)

['FirstFT: Options trading surges as turmoil looms']
['Global economy fends off geopolitical and banking threats']
['Milestones and meetings']
['If tech is driving the ‘productivity bandwagon’, it’s time to hit the brakes']
['Ambition is needed to rebuild trust in multilateralism']
['Will US inflation continue to slow?']
['US data raises hopes Fed’s efforts to curb inflation are working']
['US jobs growth slowed in March as Fed put brakes on economy']
['Russia’s foreign minister threatens to scrap Ukraine grain deal']
['FirstFT: US jobs market shows signs of cooling']
['The battle for UK businesses to hold down customer price rises']
['South Korean biotech companies seek to diversify from China as US tensions rise']
['Germany seeks to limit Brussels’ scope on national debt reduction plans']
['Global economy set for years of weak growth, IMF chief Georgieva warns']
['FirstFT: Hedge funds cash in on banking chaos ']
['When do banking failures matter for the economy?']
['India’s central b

In [6]:
# Gather the headlines into a table: one header row, then one single-cell row
# per headline. The last two links are skipped, as in the display loop earlier.
#
# get_text() is used rather than .contents because .contents is the tag's own
# list of child nodes: a headline with nested markup would produce several
# cells (and so several CSV columns), an empty tag would produce an empty row,
# and editing that list later would modify the parsed document itself.
# get_text() always yields exactly one plain string per headline.
phrases_web = [['Phrases']]
for tag in tag_a[:-2]:
  phrases_web.append([tag.get_text()])
print("ok")

ok


In [7]:
# Print each row to confirm the headlines were stored in the table.
for row in phrases_web:
  print(row)

['Phrases']
['FirstFT: Options trading surges as turmoil looms']
['Global economy fends off geopolitical and banking threats']
['Milestones and meetings']
['If tech is driving the ‘productivity bandwagon’, it’s time to hit the brakes']
['Ambition is needed to rebuild trust in multilateralism']
['Will US inflation continue to slow?']
['US data raises hopes Fed’s efforts to curb inflation are working']
['US jobs growth slowed in March as Fed put brakes on economy']
['Russia’s foreign minister threatens to scrap Ukraine grain deal']
['FirstFT: US jobs market shows signs of cooling']
['The battle for UK businesses to hold down customer price rises']
['South Korean biotech companies seek to diversify from China as US tensions rise']
['Germany seeks to limit Brussels’ scope on national debt reduction plans']
['Global economy set for years of weak growth, IMF chief Georgieva warns']
['FirstFT: Hedge funds cash in on banking chaos ']
['When do banking failures matter for the economy?']
['India

### Écriture des données dans un fichier csv

In [8]:
type(phrases_web[0][0])
# Check the type of the stored phrases.
# NOTE(review): index [0][0] is the header cell 'Phrases', which is a plain
# string literal, so this does not inspect a scraped phrase;
# phrases_web[1][0] would -- confirm the intent.

str

In [9]:
# The scraped cells come from BeautifulSoup rather than being built-in strings.
# Convert the first cell of every row to str, in place, to avoid potential
# problems when the rows are written to CSV.
for row in phrases_web:
  row[0] = str(row[0])
type(phrases_web[0][0])

str

In [10]:
# Save the table to disk as a UTF-8, comma-separated file, one line per row.
# newline='' leaves line-ending handling to the csv module.
with open("../data/web.csv", "w", newline='', encoding='utf-8') as out_file:
  csv.writer(out_file, delimiter=',').writerows(phrases_web)
  print("ok")

ok


### Récupération des données et du modèle

In [11]:
import pandas as pan
from joblib import load

In [12]:
# Load the scraped phrases back from the CSV file written earlier in this
# notebook, then display the resulting DataFrame.
dataframe_web = pan.read_csv("../data/web.csv")
dataframe_web

Unnamed: 0,Phrases
0,FirstFT: Options trading surges as turmoil looms
1,Global economy fends off geopolitical and bank...
2,Milestones and meetings
3,If tech is driving the ‘productivity bandwagon...
4,Ambition is needed to rebuild trust in multila...
5,Will US inflation continue to slow?
6,US data raises hopes Fed’s efforts to curb inf...
7,US jobs growth slowed in March as Fed put brak...
8,Russia’s foreign minister threatens to scrap U...
9,FirstFT: US jobs market shows signs of cooling


In [13]:
# Load the model saved in Part 1.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading -- only load files that come from a trusted source.
model = load("model.logiR")

### Prédictions sur les données récupérées sur le web

In [14]:
# Display the loaded object to check what was restored from disk.
model

In [15]:
# Run the loaded model on the "Phrases" column and display its predictions.
phrases = dataframe_web["Phrases"]
predictions = model.predict(phrases)
predictions

array(['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral',
       'positive', 'negative', 'positive', 'negative', 'neutral',
       'positive', 'neutral', 'positive', 'neutral', 'neutral', 'neutral',
       'neutral', 'neutral', 'positive', 'positive', 'positive',
       'positive', 'neutral', 'neutral'], dtype=object)

pandas.core.series.Series