In [3]:
from collections import namedtuple

import pandas as pd
from bs4 import BeautifulSoup
import re
import requests
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Scrape MEP plenary-speech pages

In [19]:
def fetch_results(html):
    """Collect a (name, link) pair for every result anchor in a page.

    Parameters
    ----------
    html : str
        HTML markup to parse.

    Returns
    -------
    list of tuple
        One ``(name, href)`` tuple per ``<a class="t-y">`` element, where
        ``name`` is the text of the first ``<span class="t-item">`` inside
        the anchor and ``href`` is the anchor's ``href`` attribute.
    """
    anchors = BeautifulSoup(html, 'html.parser').find_all('a', class_='t-y')

    # Report how many result anchors were found.
    print(len(anchors))

    return [
        (anchor.find('span', class_='t-item').text, anchor['href'])
        for anchor in anchors
    ]


def count_a_tags(page_source_html):
    """Return the number of ``<a class="t-y">`` elements in the given HTML."""
    parsed = BeautifulSoup(page_source_html, 'html.parser')
    return len(parsed.find_all('a', class_='t-y'))


def extract_content_from_url(url, verbose = False, timeout = 30):
    """Download a page and return the text of its ``contents`` elements.

    Parameters
    ----------
    url : str
        Address of the page to download.
    verbose : bool, optional
        When true, print the text of every matching element as it is read.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled server would
        block the calling loop indefinitely.

    Returns
    -------
    str or None
        The text of every element whose class is ``contents``, each followed
        by a newline and concatenated in document order (an empty string when
        nothing matches). ``None`` when the request raises a
        ``requests`` exception or the status code is not 200; a message is
        printed in both cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    # Anything other than 200 is treated as a failed download.
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Note: the class name is "contents" (plural).
    content_elements = soup.find_all(class_="contents")

    pieces = []
    for element in content_elements:
        if verbose:
            print(element.text)
        pieces.append(element.text + "\n")

    return "".join(pieces)



def find_date_from_url(url_string):
    """Extract a ``YYYY-MM-DD`` date from the last path segment of a URL.

    The date must be preceded by a hyphen in the segment. The outcome is
    also printed.

    Parameters
    ----------
    url_string : str
        URL whose final ``/``-separated segment is searched.

    Returns
    -------
    str or None
        The first matching date, or ``None`` when the segment contains none.
    """
    # Only the part after the last "/" is searched.
    last_segment = url_string.split("/")[-1]

    match = re.search(r'-(\d{4}-\d{2}-\d{2})', last_segment)

    if match is None:
        print("No match found for Date")
        # Return explicitly: there is no date to hand back, and falling
        # through to an unassigned local would raise UnboundLocalError.
        return None

    date = match.group(1)
    print(f'Date: {date}')
    return date


def find_language_from_url(url_string):
    """Extract the two-letter upper-case code that precedes ``.html`` in a URL.

    Only the last path segment is searched. The outcome is also printed.

    Parameters
    ----------
    url_string : str
        URL whose final ``/``-separated segment is searched.

    Returns
    -------
    str or None
        The two capital letters directly before ``.html``, or ``None`` when
        the segment does not contain such a pattern.
    """
    # Only the part after the last "/" is searched.
    last_segment = url_string.split("/")[-1]

    match = re.search(r'([A-Z]{2})\.html', last_segment)

    if match is None:
        print("No match found for Language")
        # Return explicitly: there is no code to hand back, and falling
        # through to an unassigned local would raise UnboundLocalError.
        return None

    lan = match.group(1)
    print(f'Language: {lan}')
    return lan

In [6]:
# Plenary-speeches page of the MEP to scrape.
href = (
    'https://www.europarl.europa.eu/meps/en/124867/BRANDO_BENIFEI'
    '/main-activities/plenary-speeches#detailedcardmep'
)

In [42]:
# Initialize the Chrome WebDriver and open the page stored in `href`.
driver = webdriver.Chrome()
driver.get(href)

The chromedriver version (116.0.5845.96) detected in PATH at /usr/local/bin/chromedriver might not be compatible with the detected chrome version (117.0.5938.149); currently, chromedriver 117.0.5938.149 is recommended for chrome 117.*, so it is advised to delete the driver in PATH and retry


In [43]:
# Agree on cookies to allow further action.
# An immediate find_element raises if the consent button has not been added
# to the page yet, so wait (up to 10 s) until it is clickable.
cookie_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CLASS_NAME, 'epjs_agree'))
)

actions = ActionChains(driver)
actions.move_to_element(cookie_button).click().perform()

In [44]:
# Find all the interventions made by the MEP by clicking the "load more"
# button until it stops being available.

MAX_LOAD_MORE_CLICKS = 200  # safety cap so the loop always terminates
MAX_IDLE_POLLS = 3          # consecutive polls with a hidden button before giving up
LOAD_WAIT_SECONDS = 5       # pause before each poll to let the page update

load_more_button = driver.find_element(By.CLASS_NAME, 'europarl-expandable-async-loadmore')
actions = ActionChains(driver)

cnt = 0
idle_polls = 0

while cnt < MAX_LOAD_MORE_CLICKS and idle_polls < MAX_IDLE_POLLS:

    time.sleep(LOAD_WAIT_SECONDS)

    try:

        if load_more_button.is_displayed():

            idle_polls = 0
            cnt += 1

            actions.move_to_element(load_more_button).click().perform()
            print('Iteration :', cnt)
            print('Fetched ', count_a_tags(driver.page_source))

        else:
            # The button may only be hidden temporarily; poll a few more
            # times instead of sleeping through a fixed number of iterations.
            idle_polls += 1

    except Exception as e:
        # Any failure while querying or clicking the button ends the
        # pagination; report the exception type so the cause is visible.
        print(f"An error occurred ({type(e).__name__}); stopping pagination")
        break

if cnt >= MAX_LOAD_MORE_CLICKS:
    print(f"Stopped after {MAX_LOAD_MORE_CLICKS} clicks; the list may be incomplete")


results = fetch_results(driver.page_source)

Iteration : 1
Fetched  10
Iteration : 2
Fetched  20
Iteration : 3
Fetched  30
Iteration : 4
Fetched  40
Iteration : 5
Fetched  50
Iteration : 6
Fetched  60
Iteration : 7
Fetched  70
Iteration : 8
Fetched  80
Iteration : 9
Fetched  90
Iteration : 10
Fetched  100
Iteration : 11
Fetched  110
Iteration : 12
Fetched  120
Iteration : 13
Fetched  130
Iteration : 14
Fetched  140
An error occurred
145


In [45]:
# Name of the MEP whose page was scraped; prepended to every result.
mp = 'Brando Benifei'

# Re-read the page source and turn each (name, href) pair into (mp, name, href).
results = [(mp, *pair) for pair in fetch_results(driver.page_source)]

145


In [48]:
# Record type for one speech: "MP", "Date", "Language", "Title", "Url", "Content".
mp_list = namedtuple("MpActivities", "MP Date Language Title Url Content")


# Build one record per scraped result by looping through the list.
namedtuples_list = []

for result in results:

    # Pause between requests.
    time.sleep(2)

    mp, title, url = result

    # Download the speech text, then read language and date from the URL.
    content = extract_content_from_url(url, verbose=False)
    language_code = find_language_from_url(url)
    date = find_date_from_url(url)

    # Positional arguments follow the field order declared above.
    named_tuple = mp_list(mp, date, language_code, title, url, content)
    namedtuples_list.append(named_tuple)




Language: IT
Date: 2023-10-04
Language: IT
Date: 2023-10-03
Language: IT
Date: 2023-10-02
Language: IT
Date: 2023-09-13
Language: IT
Date: 2023-09-12
Language: IT
Date: 2023-09-11
Language: EN
Date: 2023-09-11
Language: EN
Date: 2023-07-13
Language: IT
Date: 2023-07-12
Language: IT
Date: 2023-07-11
Language: IT
Date: 2023-06-15
Language: EN
Date: 2023-06-14
Language: IT
Date: 2023-06-13
Language: EN
Date: 2023-06-13
Language: IT
Date: 2023-06-13
Language: IT
Date: 2023-06-13
Language: IT
Date: 2023-05-31
Language: IT
Date: 2023-05-10
Language: IT
Date: 2023-05-10
Language: IT
Date: 2023-04-20
Language: IT
Date: 2023-04-19
Language: IT
Date: 2023-04-18
Language: EN
Date: 2023-04-18
Language: EN
Date: 2023-03-29
Language: IT
Date: 2023-03-29
Language: IT
Date: 2023-03-16
Language: IT
Date: 2023-03-15
Language: IT
Date: 2023-03-15
Language: EN
Date: 2023-03-15
Language: IT
Date: 2023-03-14
Language: IT
Date: 2023-02-14
Language: IT
Date: 2023-02-14
Language: IT
Date: 2023-02-14
Language: 

In [53]:
# Show the speech text stored in the first row.
# NOTE(review): `res_df` is only created in a later cell (In [63]); on a fresh
# kernel this cell fails unless that cell has been run first.
res_df.iloc[0]['Content']

'\xa0\xa0Brando Benifei (S&D).   – Signor Presidente, onorevoli colleghi, ieri in quest\'Aula abbiamo commemorato il decimo anniversario della tragedia di Lampedusa: 368 uomini, donne e bambini che hanno perso la vita nel tentativo di cercare una vita migliore.\nFa rabbia pensare che un accordo sul pacchetto migrazione e asilo sia ancora bloccato perché alcuni ritengono che la presenza di navi delle ONG sia un incentivo per l\'immigrazione, mentre i dati ci dicono che non è così. Ciononostante, esponenti di Fratelli d\'Italia, in quest\'Aula, su questi temi, ieri hanno detto cose vergognose.\nDopo il naufragio, quel naufragio terribile che ricordiamo, c\'è stata Mare Nostrum, una missione di ricerca e soccorso, che va ripristinata adesso come missione europea. Invece, dopo la tragedia di Cutro il governo italiano ha criminalizzato chi salva vite in mare e ha istituito un "pizzo" per uscire dai CPR, questo sì, un "pizzo di Stato".\nServe tempo, dicono. Tempo non ce n\'è più, dopo dieci 

In [63]:
# Turn the collected records into a table and save it as "<mp>.csv" without the index column.
res_df = pd.DataFrame(data=namedtuples_list)
res_df.to_csv(path_or_buf=f"{mp}.csv", index=False)

In [66]:
# Show the speech text stored in the row at position 10.
res_df.iloc[10]['Content']

"\xa0\xa0Brando Benifei (S&D).   – Signora Presidente, onorevoli colleghi, il fiume Po completamente asciutto, fiumi e torrenti scomparsi e assenza di pioggia per mesi. L'Italia questa primavera appariva esattamente così. Solo un mese fa, invece, un altro scenario completamente diverso: l'Emilia-Romagna sott'acqua, fiumi esondati, frane, morti, sfollati. Questo non è un paradosso. Il cambiamento climatico sta alterando i cicli idrologici, allungando i periodi di siccità alternati da frequenti periodi di fortissime piogge. \nCosì non va bene, dobbiamo agire. Oggi si voterà la risoluzione sulla Giornata mondiale per le vittime del cambiamento climatico e tra due giorni si celebra la Giornata mondiale contro la desertificazione e la siccità. Questi temi sono delle priorità del Green Deal europeo e dobbiamo continuare quindi a lavorare per la strategia per l'adattamento ai cambiamenti climatici, così come su tutte le proposte della Commissione. \nDobbiamo essere efficaci, realisti, concret

# Text analysis using ChatGPT

In [67]:
# Enable the autoreload extension in mode 2 so edited modules are re-imported
# before code is executed.
%load_ext autoreload
%autoreload 2

# Extend the module search path with parent directories.
# NOTE(review): '/..' is an absolute path (it resolves to the filesystem
# root), unlike the two relative entries below — possibly a typo; confirm.
import sys 
sys.path.append('/..')
sys.path.append('../..')
sys.path.append('..')

import json
import os
from dotenv import load_dotenv, find_dotenv
import openai


# Load environment variables from the .env file located by find_dotenv(), then
# read the OpenAI API key from the environment (None if the variable is unset).
_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv("OPENAI_API_KEY")



In [68]:
def get_completion(prompt, model="gpt-3.5-turbo", temperature=0):
    """Send ``prompt`` as a single user message and return the reply text.

    Parameters
    ----------
    prompt : str
        Text placed in the ``content`` of the one user message.
    model : str, optional
        Model name passed to ``openai.ChatCompletion.create``.
    temperature : float, optional
        Degree of randomness of the model's output.

    Returns
    -------
    The ``"content"`` entry of the first choice's message.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [125]:
# Prompt asking the model for topics and a sentiment per topic.
# NOTE: this is an f-string, so `delimiter` and `content` are interpolated once,
# when this cell runs. Both names must already exist in the kernel, and
# assigning new values to them afterwards does not change `prompt`.
prompt = f"""

    You are provided the text of a politician speech at the 
    European Parliament. The text is delimited by {delimiter} You are asked 
    to extract at least three topics from the text. 
    Provide only one word to describe each topic. For each topic provide a sentiment


    {delimiter}
    {content}
    {delimiter}

    Provide the result in english. 

    Provide the result in Json format.

        """

In [115]:
# Ask the model for topics and sentiment on a random sample of speeches.

# The prompt must be rebuilt for every speech. A prompt built once before the
# loop keeps whatever text it was created with, so every request would carry
# the same speech regardless of the row being processed.
PROMPT_TEMPLATE = """

    You are provided the text of a politician speech at the 
    European Parliament. The text is delimited by {delimiter} You are asked 
    to extract at least three topics from the text. 
    Provide only one word to describe each topic. For each topic provide a sentiment


    {delimiter}
    {content}
    {delimiter}

    Provide the result in english. 

    Provide the result in Json format.

        """

SAMPLE_SIZE = 10
SAMPLE_SEED = 42  # fixed so the same speeches are drawn on every run

delimiter = "####"

# Never ask for more rows than the table holds.
samp = res_df.sample(n=min(SAMPLE_SIZE, len(res_df)), random_state=SAMPLE_SEED)


for _, speech in samp.iterrows():

    print(speech.Title)

    content = speech.Content
    prompt = PROMPT_TEMPLATE.format(delimiter=delimiter, content=content)

    res = get_completion(prompt)
    print(res)

    print("--------------")

Whitewashing of the anti-European extreme right in the EU (topical debate)
Topics:
1. Fascism - Sentiment: Negative
2. Russia - Sentiment: Neutral
3. Opposition - Sentiment: Positive

Result in English:
{
  "topics": [
    {
      "topic": "Fascism",
      "sentiment": "Negative"
    },
    {
      "topic": "Russia",
      "sentiment": "Neutral"
    },
    {
      "topic": "Opposition",
      "sentiment": "Positive"
    }
  ]
}

Result in JSON format:
{
  "topics": [
    {
      "topic": "Fascism",
      "sentiment": "Negative"
    },
    {
      "topic": "Russia",
      "sentiment": "Neutral"
    },
    {
      "topic": "Opposition",
      "sentiment": "Positive"
    }
  ]
}
--------------
New developments in allegations of corruption and foreign interference, including those related to Morocco, and the need to increase transparency, integrity and accountability in the European institutions (debate)
{
  "topics": [
    {
      "topic": "Scandal",
      "sentiment": "Negative"
    },
 

In [116]:
# Show the raw text currently held in `res` (assigned last by the sampling loop above).
res

'{\n  "topics": [\n    {\n      "word": "FEAD",\n      "sentiment": "positive"\n    },\n    {\n      "word": "poverty",\n      "sentiment": "negative"\n    },\n    {\n      "word": "inclusion",\n      "sentiment": "positive"\n    }\n  ]\n}'

In [119]:
import json

def parse_json_string(json_string):
    """Parse a JSON document, returning ``None`` instead of raising on failure.

    Parameters
    ----------
    json_string : str, bytes or bytearray
        Text to parse. Any other type (for example ``None``) is treated as a
        parsing failure.

    Returns
    -------
    object or None
        The decoded Python object, or ``None`` when the input is not valid
        JSON or is not a string-like value. A message describing the problem
        is printed in that case.
    """
    try:
        return json.loads(json_string)
    except (json.JSONDecodeError, TypeError) as e:
        # json.loads raises TypeError for non-string input such as None;
        # handle it like malformed JSON so callers only have to check for None.
        print(f"Error parsing JSON: {e}")
        return None
    
# Parse the raw text held in `res`; `js` is None when it is not valid JSON.
js = parse_json_string(res)


# Disabled snippet that would pretty-print the parsed object.
# NOTE(review): it refers to `parsed_json`, which is not assigned anywhere in
# the visible notebook; the parsed object is stored in `js`.
# if parsed_json is not None:
#     # Print the parsed JSON object
#     print(json.dumps(parsed_json, indent=2))

In [124]:
# Show the entries under the "topics" key of the parsed response.
# Raises TypeError if parsing failed and `js` is None.
js['topics']

[{'word': 'FEAD', 'sentiment': 'positive'},
 {'word': 'poverty', 'sentiment': 'negative'},
 {'word': 'inclusion', 'sentiment': 'positive'}]

In [70]:
# Send the current value of `prompt` to the model and keep the raw reply text.
res = get_completion(prompt)

In [71]:
# Show the raw reply.
# NOTE(review): the saved output below is a recipe in Italian wrapped in "####",
# not topics for a speech, so it was presumably produced with a different
# prompt; re-run the cell to refresh it.
res

'####\n{\n  "procedimento": "1. Portare a ebollizione una pentola. Nel frattempo...",\n  "ingredienti": [\n    {\n      "nome": "spaghetti",\n      "quantità": "320g"\n    },\n    {\n      "nome": "guanciale",\n      "quantità": "150g"\n    }\n  ],\n  "porzioni": 4\n}\n####'