# ESG-Driven Portfolio Optimization: Personalized Factor Investing Data Analysis

In [26]:
import pandas as pd
import requests
import json
from PIL import Image
import transformers
import os
import datetime
from lxml import etree
import sqlite3

### I. Data import by web scraping

##### 1) Tickers

In [27]:
# Yahoo Finance stock-list pages to scrape, keyed by the short category code
# that main() later stores in the `category` column of the database:
#   MA = most active, TN = trending, GA = gainers, TL = losers
URLs = {
    'MA': "https://finance.yahoo.com/markets/stocks/most-active/",
    'TN': "https://finance.yahoo.com/markets/stocks/trending/",
    'GA': "https://finance.yahoo.com/markets/stocks/gainers/",
    'TL': "https://finance.yahoo.com/markets/stocks/losers/"
}

def fetch_and_parse(url, timeout=10):
    """Download a Yahoo Finance list page and extract ticker strings from it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list holding every second non-empty text found in elements whose
        ``class`` attribute contains the token ``yf-138ga19`` (positions
        0, 2, 4, ...). The list is empty when the page cannot be parsed or
        nothing matches.
    """
    page = requests.get(url, timeout=timeout).content
    tree = etree.HTML(page)
    # etree.HTML can hand back None for an empty document; treat that as
    # "nothing scraped" instead of failing on `None.xpath`.
    if tree is None:
        return []

    # Whole-token class match: padding with spaces prevents matching class
    # names that merely contain 'yf-138ga19' as a substring.
    nodes = tree.xpath("//*[contains(concat(' ', @class, ' '), concat(' ', 'yf-138ga19', ' '))]")
    cleaned = [node.text.strip() for node in nodes if node.text and node.text.strip()]

    # Keep the even positions only.
    # NOTE(review): presumably the matches alternate ticker / other text --
    # verify against the live page markup.
    return cleaned[::2]

def data_exists_for_today(conn, date):
    """Report whether ``stock_categories`` already holds rows for ``date``.

    Args:
        conn: Open ``sqlite3`` connection.
        date: Date string to look up in the ``date`` column.

    Returns:
        True if at least one row with that date exists. False otherwise,
        including when the ``stock_categories`` table has not been created
        yet (e.g. on a brand-new database file).
    """
    cursor = conn.cursor()
    try:
        # A fresh database has no table yet; querying it directly would raise
        # sqlite3.OperationalError ("no such table"), so check the schema first.
        cursor.execute(
            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'stock_categories'"
        )
        if cursor.fetchone() is None:
            return False

        cursor.execute("SELECT COUNT(*) FROM stock_categories WHERE date = ?", (date,))
        return cursor.fetchone()[0] > 0
    finally:
        cursor.close()

def main():
    """Scrape today's ticker lists, export them to CSV and store them in SQLite.

    Steps:
      1. Open ``stock_data.db`` in the current working directory and make
         sure the ``stock_categories`` table exists.
      2. Return early if rows for today's date are already stored.
      3. Scrape every page listed in ``URLs``, build a ticker x category
         0/1 indicator DataFrame and write it to ``stock_categories.csv``.
      4. Insert one ``(date, ticker, category, timestamp)`` row per scraped
         ticker, then print the stored 'MA' tickers for today as a check.

    The database connection is closed on every exit path, including errors
    raised while scraping or writing.
    """
    current_date = datetime.date.today().isoformat()
    db_path = os.path.join(os.getcwd(), 'stock_data.db')

    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Create the table before anything queries it; otherwise the
        # "already exists" check fails on a brand-new database file.
        cursor.execute('''
        CREATE TABLE IF NOT EXISTS stock_categories
        (date TEXT, ticker TEXT, category TEXT, timestamp TEXT)
        ''')
        conn.commit()

        # Check if data for today already exists
        if data_exists_for_today(conn, current_date):
            print(f"Data for {current_date} already exists. Skipping insertion.")
            return

        # Fetch and parse data for each category
        data = {category: fetch_and_parse(url) for category, url in URLs.items()}

        # One row per unique ticker, one 0/1 column per category
        all_tickers = set().union(*data.values())
        df = pd.DataFrame(index=sorted(all_tickers), columns=list(URLs))
        for category, tickers in data.items():
            df[category] = df.index.isin(tickers).astype(int)

        # Reset index to make 'ticker' a column
        df = df.reset_index().rename(columns={'index': 'ticker'})

        # Display the first few rows of the DataFrame
        print(df.head())

        # Save the DataFrame to a CSV file
        csv_path = os.path.join(os.getcwd(), 'stock_categories.csv')
        df.to_csv(csv_path, index=False)
        print(f"Data saved to {csv_path}")

        # One database row per (category, ticker) pair, all sharing one timestamp
        current_timestamp = datetime.datetime.now().isoformat()
        data_to_insert = [
            (current_date, ticker, category, current_timestamp)
            for category, tickers in data.items()
            for ticker in tickers
        ]

        # Insert data into database
        query = '''
        INSERT INTO stock_categories (date, ticker, category, timestamp)
        VALUES (?, ?, ?, ?)
        '''
        cursor.executemany(query, data_to_insert)
        conn.commit()

        print(f"Data inserted successfully into {db_path}")

        # Check content of database (the same connection sees the committed rows)
        query = """
        SELECT DISTINCT ticker
        FROM stock_categories
        WHERE date = ?
          AND category = 'MA'
        ORDER BY ticker;
        """
        cursor.execute(query, (current_date,))

        # Fetch and print the results
        results = cursor.fetchall()
        print(f"\nTickers in the 'MA' category for {current_date}:")
        for row in results:
            print(row[0])
    finally:
        # Closing the connection also releases any cursor created from it.
        conn.close()

# Run the scrape-and-store pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

Data for 2024-10-24 already exists. Skipping insertion.


In [28]:
# Inspect the database: list today's tickers in the 'MA' category.
# The ISO-formatted calendar date is the lookup key.
today = datetime.date.today().isoformat()

# Open the SQLite file that lives in the current working directory.
# `conn` and `cursor` are left open on purpose: later cells reuse them.
db_path = os.path.join(os.getcwd(), 'stock_data.db')
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Distinct 'MA' tickers stored for that date, in alphabetical order.
query = """
SELECT DISTINCT ticker
FROM stock_categories
WHERE date = ?
  AND category = 'MA'
ORDER BY ticker;
"""
# sqlite3's Cursor.execute returns the cursor itself, so the fetch can be chained.
results = cursor.execute(query, (today,)).fetchall()

# Each result row is a 1-tuple; print its only field.
print(f"\nTickers in the 'MA' category for {today}:")
for row in results:
    print(row[0])


Tickers in the 'MA' category for 2024-10-24:
AAL
AAPL
AGNC
ALTM
AMZN
BAC
BBD
CLSK
DJT
F
GRAB
INTC
IONQ
KO
LCID
MARA
NIO
NVDA
OKLO
PLTR
PTON
SNAP
SOFI
T
TSLA


In [29]:
# Run a one-row query purely to populate cursor.description.
cursor.execute("SELECT * FROM stock_categories LIMIT 1")

# Each description entry is a tuple whose first field is the column name.
column_names = [column[0] for column in cursor.description]

print("Column names:", column_names)

Column names: ['date', 'ticker', 'category', 'timestamp']


In [30]:
# Collect every distinct ticker stored in the database (as 1-tuples).
tickers = cursor.execute("SELECT DISTINCT ticker FROM stock_categories").fetchall()

# For each ticker, look up the distinct categories it was recorded under.
for ticker in tickers:
    specific_ticker = ticker[0]  # unwrap the 1-tuple

    categories = cursor.execute(
        "SELECT DISTINCT category FROM stock_categories WHERE ticker = ?",
        (specific_ticker,),
    ).fetchall()

    # Report the ticker followed by its flattened category list.
    print(f"Ticker: {specific_ticker}")
    print("Categories:", [category[0] for category in categories])

Ticker: NVDA
Categories: ['MA']
Ticker: T
Categories: ['MA', 'TN', 'GA']
Ticker: TSLA
Categories: ['MA', 'TN']
Ticker: GRAB
Categories: ['MA']
Ticker: DJT
Categories: ['MA', 'GA']
Ticker: MARA
Categories: ['MA']
Ticker: AAPL
Categories: ['MA']
Ticker: LCID
Categories: ['MA']
Ticker: INTC
Categories: ['MA']
Ticker: AGNC
Categories: ['MA']
Ticker: SOFI
Categories: ['MA']
Ticker: NIO
Categories: ['MA']
Ticker: OKLO
Categories: ['MA', 'TL']
Ticker: F
Categories: ['MA']
Ticker: PLTR
Categories: ['MA']
Ticker: CLSK
Categories: ['MA']
Ticker: PTON
Categories: ['MA', 'TN', 'GA']
Ticker: SNAP
Categories: ['MA']
Ticker: IONQ
Categories: ['MA']
Ticker: BBD
Categories: ['MA']
Ticker: BAC
Categories: ['MA']
Ticker: AAL
Categories: ['MA']
Ticker: ALTM
Categories: ['MA']
Ticker: AMZN
Categories: ['MA']
Ticker: KO
Categories: ['MA']
Ticker: IBM
Categories: ['TN']
Ticker: LRCX
Categories: ['TN']
Ticker: NOW
Categories: ['TN']
Ticker: SAVE
Categories: ['TN']
Ticker: QS
Categories: ['TN']
Ticker: VKTX
Ca

##### 2) Market capitalization 

In [None]:
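# Target element on the quote page: <span class="label yf-mrt107">Market Cap (intraday)</span>
# TODO: retrieve the market capitalization from a JSON source instead of scraping the HTML.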



##### 3) Sustainability 

In [None]:
# Example markup for the Total ESG Risk Score: <div class="scoreRank yf-y3c2sq"><h4 class="border yf-y3c2sq">23.9</h4> <span class="yf-y3c2sq">44th percentile</span></div>


In [35]:
def fetch_and_parse_sust(url_sust, timeout=10):
    """Download a Yahoo Finance sustainability page and extract its score texts.

    Args:
        url_sust: Address of the sustainability page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list holding every second non-empty text found in elements whose
        ``class`` attribute contains the token ``yf-y3c2sq`` (positions
        0, 2, 4, ...). The list is empty when the page cannot be parsed or
        nothing matches.
    """
    page = requests.get(url_sust, timeout=timeout).content
    tree = etree.HTML(page)
    # etree.HTML can hand back None for an empty document; treat that as
    # "nothing scraped" instead of failing on `None.xpath`.
    if tree is None:
        return []

    # Whole-token class match: padding with spaces prevents matching class
    # names that merely contain 'yf-y3c2sq' as a substring.
    nodes = tree.xpath("//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'yf-y3c2sq', ' ' ))]")
    cleaned = [node.text.strip() for node in nodes if node.text and node.text.strip()]

    # Keep the even positions only.
    # NOTE(review): presumably the matches alternate score / percentile
    # text -- verify against the live page markup.
    return cleaned[::2]

In [36]:
# Sample sustainability page (NVDA) used to try out the scraper.
url = 'https://finance.yahoo.com/quote/NVDA/sustainability/'

# Scrape the page; an empty list means nothing matched.
sustainability_data = fetch_and_parse_sust(url)

if not sustainability_data:
    print("No data was fetched from the URL.")
else:
    # Wrap the scraped texts in a single-column DataFrame and show it.
    df = pd.DataFrame(sustainability_data, columns=['Sustainability Metric'])
    print(df)

No data was fetched from the URL.


In [31]:
import requests
from bs4 import BeautifulSoup

# Connect to the sustainability database FIRST, so the table below is created
# in the same database the rows are later written to (and read back from).
conn = sqlite3.connect('sustainability.db')
cursor = conn.cursor()

# Create the sustainability_data table with appropriate data types
cursor.execute('''
CREATE TABLE IF NOT EXISTS sustainability_data (
    ticker TEXT PRIMARY KEY,
    total_esg_risk_score REAL,
    environmental_risk_score REAL,
    social_risk_score REAL,
    governance_risk_score REAL
)
''')


def fetch_sustainability_data(ticker, timeout=10):
    """Scrape the four ESG risk scores for ``ticker`` from Yahoo Finance.

    Args:
        ticker: Symbol inserted into the sustainability page URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``total_esg_risk_score``,
        ``environmental_risk_score``, ``social_risk_score`` and
        ``governance_risk_score``. Each value is a float, or None when the
        label is not on the page, no ``<span>`` follows it, or the span text
        is not a number.
    """
    url = f'https://finance.yahoo.com/quote/{ticker}/sustainability/'
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    def get_score(label):
        # `string=` is the current spelling of the deprecated `text=` keyword.
        element = soup.find(string=label)
        if element is None:
            return None
        value_node = element.find_next('span')
        if value_node is None:
            return None
        try:
            return float(value_node.get_text(strip=True))
        except ValueError:
            # The span held non-numeric text; report the score as missing
            # rather than aborting the whole run.
            return None

    return {
        'total_esg_risk_score': get_score('Total ESG Risk Score'),
        'environmental_risk_score': get_score('Environmental Risk Score'),
        'social_risk_score': get_score('Social Risk Score'),
        'governance_risk_score': get_score('Governance Risk Score')
    }


tickers = ['NVDA', 'AAPL', 'GOOGL']  # Example tickers

for ticker in tickers:
    data = fetch_sustainability_data(ticker)
    # Compare with None explicitly: a genuine score of 0.0 is falsy and a
    # plain truthiness test would wrongly discard it.
    if all(value is not None for value in data.values()):
        # Upsert: insert a new row or overwrite the scores of an existing ticker.
        cursor.execute('''
        INSERT INTO sustainability_data (ticker, total_esg_risk_score, environmental_risk_score, social_risk_score, governance_risk_score)
        VALUES (?, ?, ?, ?, ?)
        ON CONFLICT(ticker) DO UPDATE SET
            total_esg_risk_score=excluded.total_esg_risk_score,
            environmental_risk_score=excluded.environmental_risk_score,
            social_risk_score=excluded.social_risk_score,
            governance_risk_score=excluded.governance_risk_score
        ''', (ticker,
              data['total_esg_risk_score'],
              data['environmental_risk_score'],
              data['social_risk_score'],
              data['governance_risk_score']))

# Persist the upserts; without a commit they are lost when the connection closes.
conn.commit()


  element = soup.find(text=label)


In [32]:
# Read back the full contents of the sustainability_data table.
rows = cursor.execute('SELECT * FROM sustainability_data').fetchall()

# The first field of each cursor.description entry is the column name.
column_names = [column[0] for column in cursor.description]

# Header first, then one line per stored row.
print("Column Names:", column_names)

for row in rows:
    print(row)

Column Names: ['ticker', 'total_esg_risk_score', 'environmental_risk_score', 'social_risk_score', 'governance_risk_score']


In [None]:
# Close the cursor and connection currently bound to `cursor` / `conn`
# (most recently assigned from 'sustainability.db' earlier in this notebook).
# NOTE: closing a sqlite3 connection does not commit pending changes.
cursor.close()
conn.close()