# **Module de Machine Learning**

# **Partie 2  : Web Scraping**

### Librairies

In [1]:
from bs4 import BeautifulSoup
import requests
import csv

### Script de récupération de données financières (Financial Times)

#### Récupération d'une page web

In [2]:
response = requests.get("https://www.ft.com/global-economy")

response
# Si response renvoie <Response 200> la requête à bien fonctionner et la variable response contient la page web correspondant à l'url.
# En revanche <Response 404> indique une erreur.

<Response [200]>

#### Parsing du contenu de la page web

In [3]:
soup = BeautifulSoup(response.content, 'html.parser')

soup.title
# Normalement, on devrait avoir récupérer les données du site du Financial Times

<title>Global Economy | Financial Times</title>

### Isolation des données qui nous intéressent

In [4]:
tag_a = soup.find_all("a", class_="js-teaser-heading-link")
tag_a

[<a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/0c42c8c9-4922-4c4e-882c-2be3b24a72a6">FirstFT: Russia takes passports of senior officials to stop defections and leaks </a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/a7c2b2f6-e106-436d-a599-cfcc69d18b05">China’s ports dominance undermines western aims to loosen trade ties</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/8951f7fe-43af-49d9-8641-a0d79bbd5bda">The financial turmoil is not over</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/02d6f35d-e646-40f7-894c-ffcc6acd9b25">How China is winning the race for Africa’s lithium</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/44420b37-8fbb-410d-bc4f-081537b2ee38">Peace on Earth postponed</a>,
 <a class="js-teaser-heading-link" data-trackable="heading-link" href="/content/b6f01166-919c-4372-9bca-88f0e67f44c3">Iraqi 

In [5]:
for i in range(len(tag_a) - 2):
  print(tag_a[i].contents)
# On retire 2 car on ne veut pas récupérer les 2 dernières lignes qui ne 
# contiennent pas de phrases à caractère financier

['FirstFT: Russia takes passports of senior officials to stop defections and leaks ']
['China’s ports dominance undermines western aims to loosen trade ties']
['The financial turmoil is not over']
['How China is winning the race for Africa’s lithium']
['Peace on Earth postponed']
['Iraqi authorities reach preliminary deal to resume oil exports to Turkey']
['UK financial system ‘locking out’ growing numbers of people']
['Canada warns US against waging ‘carbon subsidy war’']
['Will the pace of US hiring slow? ']
['Car loan cost surge pressures manufacturers to reinstate discounts']
['Europe’s foothold slips in Africa']
['China escalates tech battle with review of US chipmaker Micron ']
['Inflation falls ease pressure on central banks']
['Turkish banks: unorthodox approach to inflation fighting will take a toll']
['We are living through a trillion-dollar rebalancing']
['UAE cites ‘sanctions risks’ as it cancels licence for Russia’s MTS Bank ']
['UK government threatened with legal action 

In [6]:
phrases_web = []
for j in range(len(tag_a) - 2):
  phrases_web.append(tag_a[j].contents)
print("ok")
# Ici on rassemble les phrases qu'on vient d'identifier dans un tableau

ok


In [7]:
for k in range(len(phrases_web)):
  print(phrases_web[k])
# On vérifie que nos phrases sont bien dans le tableau

['FirstFT: Russia takes passports of senior officials to stop defections and leaks ']
['China’s ports dominance undermines western aims to loosen trade ties']
['The financial turmoil is not over']
['How China is winning the race for Africa’s lithium']
['Peace on Earth postponed']
['Iraqi authorities reach preliminary deal to resume oil exports to Turkey']
['UK financial system ‘locking out’ growing numbers of people']
['Canada warns US against waging ‘carbon subsidy war’']
['Will the pace of US hiring slow? ']
['Car loan cost surge pressures manufacturers to reinstate discounts']
['Europe’s foothold slips in Africa']
['China escalates tech battle with review of US chipmaker Micron ']
['Inflation falls ease pressure on central banks']
['Turkish banks: unorthodox approach to inflation fighting will take a toll']
['We are living through a trillion-dollar rebalancing']
['UAE cites ‘sanctions risks’ as it cancels licence for Russia’s MTS Bank ']
['UK government threatened with legal action 

### Ecriture des données dans un fichier csv

In [8]:
type(phrases_web[0][0])
# On vérifie le type de nos phrases

bs4.element.NavigableString

In [9]:
for l in range(len(phrases_web)):
  phrases_web[l][0] = str(phrases_web[l][0])
type(phrases_web[0][0])
# Le type de nos phrases est un type propre à la librairie BeautifulSoup.
# On convertit les phrases en objet de type string pour éviter d'éventuels problème lors de l'écriture en csv.

str

In [10]:
with open("../data/web.csv", "w", newline='', encoding='utf-8') as csvfile:
  writer = csv.writer(csvfile, delimiter=',')
  writer.writerows(phrases_web)
  print("ok")

FileNotFoundError: ignored

### Récupération des données et du modèle

In [None]:
import pandas as pan
from joblib import load

In [None]:
# Récupération des phrases scrapées
dataframe_web = pan.read_csv(../data/web.csv)
dataframe_web

In [None]:
# Récupération du modèle de la Partie 1
model = load(model.logiR)