# ESG-Driven Portfolio Optimization: Personalized Factor Investing Data Analysis

In [1]:
import pandas as pd
import requests
import json
from PIL import Image
import transformers
import os
import datetime
from lxml import etree
import sqlite3

  from .autonotebook import tqdm as notebook_tqdm


### I. Data import by web scraping

##### 1) Tickers

In [2]:
# Yahoo Finance screener pages to scrape, keyed by the short category code
# that is later stored alongside each ticker.
URLs = dict(
    MA="https://finance.yahoo.com/markets/stocks/most-active/",  # most active
    TN="https://finance.yahoo.com/markets/stocks/trending/",  # trending
    GA="https://finance.yahoo.com/markets/stocks/gainers/",  # gainers
    TL="https://finance.yahoo.com/markets/stocks/losers/",  # losers
)

def fetch_and_parse(url, timeout=10):
    """Download a page and extract ticker symbols from its tagged elements.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        list[str]: Every other non-blank text value (starting with the first)
        taken from elements carrying the ``yf-138ga19`` CSS class. Empty when
        nothing matches.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of quietly parsing an error page into an
    # empty ticker list.
    response.raise_for_status()

    tree = etree.HTML(response.content)
    if tree is None:
        # NOTE(review): etree.HTML can hand back None instead of a tree
        # (presumably for bodies with no parseable markup - confirm); treat
        # that the same as "no matching elements".
        return []

    nodes = tree.xpath("//*[contains(concat(' ', @class, ' '), concat(' ', 'yf-138ga19', ' '))]")
    # node.text is None for elements without leading text; drop those and
    # whitespace-only values.
    cleaned = [node.text.strip() for node in nodes if node.text and node.text.strip()]
    # NOTE(review): only even positions are kept - presumably the matched
    # elements alternate between a ticker and another field; verify against
    # the page markup, which is tied to a generated class name.
    return cleaned[::2]

def data_exists_for_today(conn, date):
    """Report whether ``stock_categories`` already holds rows for ``date``.

    Args:
        conn: Open ``sqlite3`` connection.
        date: Value to match against the table's ``date`` column.

    Returns:
        bool: True when at least one row has that date; False when there is
        none or when the ``stock_categories`` table has not been created yet.
    """
    cursor = conn.cursor()
    try:
        # On a brand-new database the table does not exist yet; querying it
        # would raise sqlite3.OperationalError, so look it up in the schema
        # catalogue first.
        cursor.execute(
            "SELECT 1 FROM sqlite_master "
            "WHERE type IN ('table', 'view') AND name = 'stock_categories'"
        )
        if cursor.fetchone() is None:
            return False

        # One matching row is enough to answer the question; no need to count.
        cursor.execute(
            "SELECT 1 FROM stock_categories WHERE date = ? LIMIT 1", (date,)
        )
        return cursor.fetchone() is not None
    finally:
        cursor.close()

def main():
    """Scrape today's ticker lists and store them once per calendar day.

    Steps:
      1. Open ``stock_data.db`` in the current working directory and make
         sure the ``stock_categories`` table exists.
      2. Stop early if rows for today's date are already stored.
      3. Fetch the tickers for every entry in ``URLs``.
      4. Build a ticker x category 0/1 membership table, print its head and
         write it to ``stock_categories.csv`` in the working directory.
      5. Insert one ``(date, ticker, category, timestamp)`` row per scraped
         ticker and print today's 'MA' tickers as a sanity check.

    The database connection is closed on every exit path, including errors.
    """
    current_date = datetime.date.today().isoformat()
    db_path = os.path.join(os.getcwd(), 'stock_data.db')

    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    try:
        # Create the table before anything reads from it; otherwise the
        # "already scraped today?" check fails on a brand-new database.
        conn.execute('''
        CREATE TABLE IF NOT EXISTS stock_categories
        (date TEXT, ticker TEXT, category TEXT, timestamp TEXT)
        ''')

        # Check if data for today already exists
        if data_exists_for_today(conn, current_date):
            print(f"Data for {current_date} already exists. Skipping insertion.")
            return

        # Fetch and parse data for each category
        data = {category: fetch_and_parse(url) for category, url in URLs.items()}

        # Union of every ticker seen in any category, sorted for a stable row order
        ticker_index = pd.Index(sorted(set().union(*data.values())))

        # One 0/1 column per category marking which tickers appear in it
        df = pd.DataFrame(
            {
                category: ticker_index.isin(tickers).astype(int)
                for category, tickers in data.items()
            },
            index=ticker_index,
        )

        # Reset index to make 'ticker' a column
        df = df.reset_index().rename(columns={'index': 'ticker'})

        # Display the first few rows of the DataFrame
        print(df.head())

        # Save the DataFrame to a CSV file
        csv_path = os.path.join(os.getcwd(), 'stock_categories.csv')
        df.to_csv(csv_path, index=False)
        print(f"Data saved to {csv_path}")

        # Prepare data for insertion: one row per (category, ticker) pair, all
        # sharing the same scrape timestamp
        current_timestamp = datetime.datetime.now().isoformat()
        data_to_insert = [
            (current_date, ticker, category, current_timestamp)
            for category, tickers in data.items()
            for ticker in tickers
        ]

        # Insert data into database; the connection context manager commits on
        # success and rolls back if the insert raises
        query = '''
    INSERT INTO stock_categories (date, ticker, category, timestamp)
    VALUES (?, ?, ?, ?)
    '''
        with conn:
            conn.executemany(query, data_to_insert)

        print(f"Data inserted successfully into {db_path}")

        # Check content of database (same connection; the insert is committed)
        query = """
    SELECT DISTINCT ticker
    FROM stock_categories
    WHERE date = ?
      AND category = 'MA'
    ORDER BY ticker;
    """
        results = conn.execute(query, (current_date,)).fetchall()

        # Print the results
        print(f"\nTickers in the 'MA' category for {current_date}:")
        for row in results:
            print(row[0])
    finally:
        # Release the database handle even when scraping or writing fails
        conn.close()

# Entry point: run the scrape when this code is executed directly. Running
# this notebook cell also triggers it, as the captured cell output shows.
if __name__ == "__main__":
    main()

Data for 2024-10-24 already exists. Skipping insertion.


In [3]:
# Check content of database
# Standalone sanity check: list today's 'MA' tickers stored in stock_data.db.
# Connect to the SQLite database
db_path = os.path.join(os.getcwd(), 'stock_data.db')
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Get today's date in ISO format
today = datetime.date.today().isoformat()

# SQL query selecting the distinct tickers stored under category 'MA' for one date
query = """
SELECT DISTINCT ticker
FROM stock_categories
WHERE date = ?
  AND category = 'MA'
ORDER BY ticker;
"""

try:
    # Execute the SQL query
    cursor.execute(query, (today,))

    # Fetch and print the results
    results = cursor.fetchall()
    print(f"\nTickers in the 'MA' category for {today}:")
    for row in results:
        print(row[0])
finally:
    # Close the cursor and connection even if the query fails (for example
    # when the table is missing), so the kernel does not keep the database
    # file open
    cursor.close()
    conn.close()


Tickers in the 'MA' category for 2024-10-24:
AAL
AAPL
AGNC
ALTM
AMZN
BAC
BBD
CLSK
DJT
F
GRAB
INTC
IONQ
KO
LCID
MARA
NIO
NVDA
OKLO
PLTR
PTON
SNAP
SOFI
T
TSLA


##### 2) Market capitalization 

##### 3) Sustainability 

In [None]:
# Build the sustainability page address for a single ticker.
# NOTE(review): `list_item` is not assigned in any cell shown above, so on a
# fresh kernel this line raises NameError. Presumably it should hold one
# ticker symbol (e.g. a loop variable over the scraped tickers) - confirm and
# define it before this cell.
URL = 'https://finance.yahoo.com/quote/' + list_item + '/sustainability/'

# Make the GET request
response = requests.get(URL)

# Parse the JSON response
# NOTE(review): response.json() raises when the body is not valid JSON; this
# address looks like a regular quote page rather than a JSON endpoint - verify
# what it actually returns.
json_data = response.json()

# Pretty print the JSON response
pretty_json = json.dumps(json_data, indent=4)
print(pretty_json)

# Flatten the JSON into a DataFrame (the frame is assigned but not displayed
# by this cell)
df = pd.json_normalize(json_data)