# **Module de Machine Learning**

# **Partie 2  : Web Scraping**

### Librairies

In [12]:
from bs4 import BeautifulSoup
import requests
import csv

### Script de récupération de données financières (Financial Times)

#### Récupération d'une page web

In [2]:
response = requests.get("https://www.ft.com/global-economy")

response
# Si response renvoie <Response 200> la requête à bien fonctionner et la variable response contient la page web correspondant à l'url.
# En revanche <Response 404> indique une erreur.

<Response [200]>

#### Parsing du contenu de la page web

In [3]:
soup = BeautifulSoup(response.content, 'html.parser')

soup.title
# Normalement, on devrait avoir récupérer les données du site du Financial Times

<title>Global Economy | Financial Times</title>

### Isolation des données qui nous intéressent

In [4]:
tag_a = soup.find_all("a", class_="js-teaser-heading-link")
tag_a

[<a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/6890be22-9280-460a-bd2a-3c45f2cf531f">BoE’s chief economist hints at May interest rate rise</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/92d95586-f1eb-4148-ae32-1864f7deeb43">Waging war on trade will be costly</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/95745636-2d21-46aa-b0f1-6bda1c0fdd0b">Personal inflation calculator: what is your inflation rate?</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/b7340b0f-7919-40d6-9f93-604f005e6551">US job openings fall to lowest level in almost two years</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/535de452-e27d-4b8b-84bf-58ab4e910e86">Opec isn’t scaring anyone</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/b757a212-6898-45c4-9cb4-d2d851b4a70e">FirstFT: Trump prepares to face cha

In [5]:
for i in range(len(tag_a) - 2):
  print(tag_a[i].contents)
# On retire 2 car on ne veut pas récupérer les 2 dernières lignes qui ne 
# contiennent pas de phrases à caractère financier

['BoE’s chief economist hints at May interest rate rise']
['Waging war on trade will be costly']
['Personal inflation calculator: what is your inflation rate?']
['US job openings fall to lowest level in almost two years']
['Opec isn’t scaring anyone']
['FirstFT: Trump prepares to face charges']
['How Spain has taken on the problem of precarious work  ']
['China Inc keen on setting up shop in the US despite tensions']
['Surprise cut by Opec+ fuels optimism for oil companies']
['Israel political crisis could cut 2.8% a year from GDP, central bank warns']
['High inflation boosts public finances, IMF says']
['Europe’s aversion to anti-coercion']
['FirstFT: Oil prices surge']
['China’s ports dominance undermines western aims to loosen trade ties']
['The financial turmoil is not over']
['How China is winning the race for Africa’s lithium']
['Peace on Earth postponed']
['Iraqi authorities reach preliminary deal to resume oil exports to Turkey']
['UK financial system ‘locking out’ growing numb

In [6]:
phrases_web = []
for j in range(len(tag_a) - 2):
  phrases_web.append(tag_a[j].contents)
print("ok")
# Ici on rassemble les phrases qu'on vient d'identifier dans un tableau

ok


In [7]:
for k in range(len(phrases_web)):
  print(phrases_web[k])
# On vérifie que nos phrases sont bien dans le tableau

['BoE’s chief economist hints at May interest rate rise']
['Waging war on trade will be costly']
['Personal inflation calculator: what is your inflation rate?']
['US job openings fall to lowest level in almost two years']
['Opec isn’t scaring anyone']
['FirstFT: Trump prepares to face charges']
['How Spain has taken on the problem of precarious work  ']
['China Inc keen on setting up shop in the US despite tensions']
['Surprise cut by Opec+ fuels optimism for oil companies']
['Israel political crisis could cut 2.8% a year from GDP, central bank warns']
['High inflation boosts public finances, IMF says']
['Europe’s aversion to anti-coercion']
['FirstFT: Oil prices surge']
['China’s ports dominance undermines western aims to loosen trade ties']
['The financial turmoil is not over']
['How China is winning the race for Africa’s lithium']
['Peace on Earth postponed']
['Iraqi authorities reach preliminary deal to resume oil exports to Turkey']
['UK financial system ‘locking out’ growing numb

### Ecriture des données dans un fichier csv

In [8]:
type(phrases_web[0][0])
# On vérifie le type de nos phrases

bs4.element.NavigableString

In [9]:
for l in range(len(phrases_web)):
  phrases_web[l][0] = str(phrases_web[l][0])
type(phrases_web[0][0])
# Le type de nos phrases est un type propre à la librairie BeautifulSoup.
# On convertit les phrases en objet de type string pour éviter d'éventuels problème lors de l'écriture en csv.

str

In [10]:
with open("../data/web.csv", "w", newline='', encoding='utf-8') as csvfile:
  writer = csv.writer(csvfile, delimiter=',')
  writer.writerows(phrases_web)
  print("ok")

ok


### Récupération des données et du modèle

In [13]:
import pandas as pan
from joblib import load

In [14]:
# Récupération des phrases scrapées
dataframe_web = pan.read_csv(../data/web.csv)
dataframe_web

Unnamed: 0,BoE’s chief economist hints at May interest rate rise
0,Waging war on trade will be costly
1,Personal inflation calculator: what is your in...
2,US job openings fall to lowest level in almost...
3,Opec isn’t scaring anyone
4,FirstFT: Trump prepares to face charges
5,How Spain has taken on the problem of precario...
6,China Inc keen on setting up shop in the US de...
7,Surprise cut by Opec+ fuels optimism for oil c...
8,Israel political crisis could cut 2.8% a year ...
9,"High inflation boosts public finances, IMF says"


In [17]:
# Récupération du modèle de la Partie 1
model = load(model.logiR)