In [None]:
# Install necessary libraries
%pip install pandas numpy yfinance scikit-learn



In [None]:
import pandas as pd
import numpy as np
import yfinance as yf

# --- Part 1: Fetch Financial Data ---
# Let's pick a few tech companies for our example
tickers = ['AAPL', 'MSFT', 'GOOGL']
financial_data = {}

for ticker in tickers:
    # Fetch company stock data
    stock = yf.Ticker(ticker)

    # Get key financial metrics
    financial_data[ticker] = {
        'Market_Cap': stock.info.get('marketCap', 0),
        'PE_Ratio': stock.info.get('trailingPE', 0),
        'Revenue_Growth': stock.info.get('revenueGrowth', 0)
    }

# Convert the financial data into a DataFrame (a table)
financial_df = pd.DataFrame.from_dict(financial_data, orient='index')

# --- Part 2: Create Sample Environmental & Social (E&S) Data ---
# In a real project, you would source this from ESG reports or data providers.
# For now, we will create realistic-looking sample data.
es_data = {
    'AAPL': {'CO2_Emissions': 90, 'Employee_Satisfaction': 4.1, 'Community_Investment': 500},
    'MSFT': {'CO2_Emissions': 80, 'Employee_Satisfaction': 4.4, 'Community_Investment': 700},
    'GOOGL': {'CO2_Emissions': 70, 'Employee_Satisfaction': 4.5, 'Community_Investment': 600}
}
es_df = pd.DataFrame.from_dict(es_data, orient='index')

# --- Part 3: Combine All Data into a Single Table ---
# We will merge our financial and E&S dataframes based on the company ticker.
integrated_df = financial_df.join(es_df)

# Let's see our final integrated dataset!
print("--- Integrated Company Data ---")
print(integrated_df)

--- Integrated Company Data ---
          Market_Cap   PE_Ratio  Revenue_Growth  CO2_Emissions  \
AAPL   3209106030592  33.467290           0.051             90   
MSFT   3808953630720  39.572970           0.133             80   
GOOGL  2345813147648  20.546907           0.120             70   

       Employee_Satisfaction  Community_Investment  
AAPL                     4.1                   500  
MSFT                     4.4                   700  
GOOGL                    4.5                   600  


In [None]:
from sklearn.linear_model import LinearRegression

# --- Part 4: Prepare Data for the AI Model ---
# First, let's drop any rows with missing values to keep it simple
integrated_df.dropna(inplace=True)

# Define our features (X) and our target (y)
features = ['PE_Ratio', 'Revenue_Growth', 'CO2_Emissions', 'Employee_Satisfaction', 'Community_Investment']
target = 'Market_Cap'

X = integrated_df[features]
y = integrated_df[target]

# --- Part 5: Create and Train the Linear Regression Model ---
# Initialize the model
model = LinearRegression()

# Train the model to learn from our data
model.fit(X, y)

# --- Part 6: Interpret the Model's Findings ---
# The model has now learned "weights" or "coefficients" for each feature.
# Let's see what they are.
print("--- Model Interpretation ---")
print("The model has been trained. Here's what it learned:\n")

# Create a DataFrame to neatly display the feature names and their learned weights
coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient (Weight)'])
print(coefficients)

print("\n--- How to Read This ---")
print("A POSITIVE coefficient means the model found that as this feature increases, the Market Cap tends to INCREASE.")
print("A NEGATIVE coefficient means the model found that as this feature increases, the Market Cap tends to DECREASE.")

--- Model Interpretation ---
The model has been trained. Here's what it learned:

                       Coefficient (Weight)
PE_Ratio                       3.883460e+10
Revenue_Growth                -6.639481e+07
CO2_Emissions                  3.618339e+10
Employee_Satisfaction         -6.006407e+08
Community_Investment           3.623778e+09

--- How to Read This ---
A POSITIVE coefficient means the model found that as this feature increases, the Market Cap tends to INCREASE.
A NEGATIVE coefficient means the model found that as this feature increases, the Market Cap tends to DECREASE.


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# --- Part 7: Scale the Features and Re-train the Model ---

# Define our features (X) and our target (y)
features = ['PE_Ratio', 'Revenue_Growth', 'CO2_Emissions', 'Employee_Satisfaction', 'Community_Investment']
target = 'Market_Cap'

X = integrated_df[features]
y = integrated_df[target]

# 1. Initialize the Scaler
scaler = StandardScaler()

# 2. Fit the scaler to the data and transform it
X_scaled = scaler.fit_transform(X)

# 3. Create and Train the Linear Regression Model on the SCALED data
model_scaled = LinearRegression()
model_scaled.fit(X_scaled, y)

# --- Interpret the NEW Model's Findings ---
print("--- Model Interpretation (After Scaling) ---")
print("These new weights are now directly comparable.\n")

# Create a DataFrame to neatly display the feature names and their new weights
coefficients_scaled = pd.DataFrame(model_scaled.coef_, X.columns, columns=['Coefficient (Weight)'])
print(coefficients_scaled)

print("\n--- How to Read This (New) ---")
print("Now, the feature with the HIGHEST absolute value (positive or negative) is what the model considers the MOST IMPORTANT driver of Market Cap.")

--- Model Interpretation (After Scaling) ---
These new weights are now directly comparable.

                       Coefficient (Weight)
PE_Ratio                       3.799265e+11
Revenue_Growth                 7.181499e+10
CO2_Emissions                  1.830651e+11
Employee_Satisfaction         -8.000088e+10
Community_Investment           2.078602e+11

--- How to Read This (New) ---
Now, the feature with the HIGHEST absolute value (positive or negative) is what the model considers the MOST IMPORTANT driver of Market Cap.


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# --- Part 8: Expanding the Dataset ---
# We are adding more companies from diverse sectors to get a better result.
tickers = [
    'AAPL', 'MSFT', 'GOOGL',  # Tech
    'JPM', 'BAC',             # Finance
    'JNJ', 'PFE',             # Healthcare
    'WMT', 'COST',            # Retail
    'XOM', 'CVX',             # Energy
    'CAT', 'NEE', 'TSLA', 'NVDA' # Industrial/Other
]

# --- Get Financial Data ---
financial_data = {}
for ticker in tickers:
    stock = yf.Ticker(ticker)
    # Using .get() with a default value of np.nan is safer for missing data
    financial_data[ticker] = {
        'Market_Cap': stock.info.get('marketCap'),
        'PE_Ratio': stock.info.get('trailingPE'),
        'Revenue_Growth': stock.info.get('revenueGrowth')
    }
financial_df = pd.DataFrame.from_dict(financial_data, orient='index')

# --- Create More Sample E&S Data ---
# In a real project, this would be the most time-consuming part: gathering real ESG data.
# For now, we generate random-but-realistic data.
np.random.seed(42) # This makes the random numbers repeatable
es_data = {
    ticker: {
        'CO2_Emissions': np.random.uniform(20, 500), # Random values between 20-500
        'Employee_Satisfaction': np.random.uniform(3.0, 4.8), # Random values between 3.0-4.8
        'Community_Investment': np.random.uniform(100, 1000) # Random values between 100-1000
    } for ticker in tickers
}
es_df = pd.DataFrame.from_dict(es_data, orient='index')

# --- Combine, Clean, and Prepare ---
integrated_df = financial_df.join(es_df)
integrated_df.dropna(inplace=True) # Drop any companies with missing financial data

# --- Scale Features and Train the Model ---
features = ['PE_Ratio', 'Revenue_Growth', 'CO2_Emissions', 'Employee_Satisfaction', 'Community_Investment']
target = 'Market_Cap'

X = integrated_df[features]
y = integrated_df[target]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model_final = LinearRegression()
model_final.fit(X_scaled, y)

# --- Final, More Reliable Interpretation ---
print("--- Final Model Interpretation (with Expanded Data) ---")
coefficients_final = pd.DataFrame(model_final.coef_, X.columns, columns=['Coefficient (Weight)'])
print(coefficients_final)

--- Final Model Interpretation (with Expanded Data) ---
                       Coefficient (Weight)
PE_Ratio                       2.243139e+11
Revenue_Growth                 9.968123e+11
CO2_Emissions                  9.152440e+10
Employee_Satisfaction          1.594074e+11
Community_Investment           1.868814e+11


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.linear_model import Ridge # Import Ridge Regression
from sklearn.preprocessing import StandardScaler

# --- Using the same expanded dataset from the previous step ---
tickers = [
    'AAPL', 'MSFT', 'GOOGL', 'JPM', 'BAC', 'JNJ', 'PFE',
    'WMT', 'COST', 'XOM', 'CVX', 'CAT', 'NEE', 'TSLA', 'NVDA'
]
financial_data = {}
for ticker in tickers:
    stock = yf.Ticker(ticker)
    financial_data[ticker] = {
        'Market_Cap': stock.info.get('marketCap'),
        'PE_Ratio': stock.info.get('trailingPE'),
        'Revenue_Growth': stock.info.get('revenueGrowth')
    }
financial_df = pd.DataFrame.from_dict(financial_data, orient='index')

np.random.seed(42)
es_data = {
    ticker: {
        'CO2_Emissions': np.random.uniform(20, 500),
        'Employee_Satisfaction': np.random.uniform(3.0, 4.8),
        'Community_Investment': np.random.uniform(100, 1000)
    } for ticker in tickers
}
es_df = pd.DataFrame.from_dict(es_data, orient='index')

integrated_df = financial_df.join(es_df)
integrated_df.dropna(inplace=True)

# --- Prepare data, same as before ---
features = ['PE_Ratio', 'Revenue_Growth', 'CO2_Emissions', 'Employee_Satisfaction', 'Community_Investment']
X = integrated_df[features]
y = integrated_df['Market_Cap']

# --- SOLUTION PART 1: LOG TRANSFORM THE TARGET VARIABLE ---
# This is a crucial step to stabilize the model.
y_log = np.log(y)

# --- Scale features, same as before ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- SOLUTION PART 2: USE A RIDGE REGRESSION MODEL ---
# Ridge is more robust to multicollinearity. alpha is the penalty strength.
model_robust = Ridge(alpha=1.0)
model_robust.fit(X_scaled, y_log) # Train on the scaled X and LOG of y

# --- The Final, Stable, and Interpretable Results ---
print("--- Final, Robust Model Interpretation ---")
coefficients_robust = pd.DataFrame(model_robust.coef_, X.columns, columns=['Coefficient (Weight)'])
print(coefficients_robust)

print("\n--- How to Read This Final Output ---")
print("These are now stable log-scale coefficients.")
print("A coefficient of, say, 0.5 for a feature means that a one-unit increase in that (scaled) feature is associated with a 0.5 increase in the log of the Market Cap, which roughly corresponds to a percentage increase.")

--- Final, Robust Model Interpretation ---
                       Coefficient (Weight)
PE_Ratio                           0.264046
Revenue_Growth                     0.467378
CO2_Emissions                     -0.095961
Employee_Satisfaction              0.164185
Community_Investment               0.041449

--- How to Read This Final Output ---
These are now stable log-scale coefficients.
A coefficient of, say, 0.5 for a feature means that a one-unit increase in that (scaled) feature is associated with a 0.5 increase in the log of the Market Cap, which roughly corresponds to a percentage increase.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

# We use all the same data preparation steps from before

# --- Step 10: Train-Test Split ---
# We split our data: 80% for training, 20% for testing.
# random_state ensures we get the same split every time we run the code.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_log, test_size=0.2, random_state=42)

# --- Train the model ONLY on the training data ---
model_final = Ridge(alpha=1.0)
model_final.fit(X_train, y_train)

# --- Evaluate the model on the UNSEEN test data ---
predictions = model_final.predict(X_test)

# --- Calculate Accuracy Metrics ---
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("--- AI Agent Performance Evaluation ---")
print(f"R-squared (R²): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")

print("\n--- Interpretation of Performance ---")
print(f"Our model can explain approximately {r2:.0%} of the variation in company market caps.")
print(f"On average, the model's prediction of the log-market-cap is off by about {mae:.2f}.")

# To see it in action, let's compare predictions to actual values for the test set
results_df = pd.DataFrame({'Actual_Log_Market_Cap': y_test, 'Predicted_Log_Market_Cap': predictions})
print("\n--- Sample of Predictions vs. Actuals ---")
print(results_df.head())

--- AI Agent Performance Evaluation ---
R-squared (R²): 0.23
Mean Absolute Error (MAE): 0.85

--- Interpretation of Performance ---
Our model can explain approximately 23% of the variation in company market caps.
On average, the model's prediction of the log-market-cap is off by about 0.85.

--- Sample of Predictions vs. Actuals ---
      Actual_Log_Market_Cap  Predicted_Log_Market_Cap
XOM               26.890159                 26.961907
CAT               26.028364                 27.180014
AAPL              28.797083                 27.463024


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
import numpy as np

# --- Step 11: Feature Engineering ---
# Let's create a more insightful feature from P/E Ratio.
# A common principle is that a lower P/E is better (more "value").
# We will calculate a score where a lower P/E gives a higher score.
# We'll use 1 / PE_Ratio, but add a small constant to avoid division by zero.
integrated_df['PE_Value_Score'] = 1 / (integrated_df['PE_Ratio'] + 0.01)

# Now, let's define our NEW feature set
features_new = ['PE_Value_Score', 'Revenue_Growth', 'CO2_Emissions', 'Employee_Satisfaction', 'Community_Investment']
target = 'Market_Cap'

X_new = integrated_df[features_new]
y = integrated_df[target] # y remains the same

# --- Now we repeat the entire pipeline with our new features ---
# 1. Log transform the target
y_log = np.log(y)

# 2. Scale the new features
scaler = StandardScaler()
X_new_scaled = scaler.fit_transform(X_new)

# 3. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_new_scaled, y_log, test_size=0.2, random_state=42)

# 4. Train the Ridge model
model_engineered = Ridge(alpha=1.0)
model_engineered.fit(X_train, y_train)

# 5. Evaluate the NEW model
predictions_engineered = model_engineered.predict(X_test)
r2_engineered = r2_score(y_test, predictions_engineered)
mae_engineered = mean_absolute_error(y_test, predictions_engineered)

print("--- AI Agent Performance (After Feature Engineering) ---")
print(f"New R-squared (R²): {r2_engineered:.2f}")
print(f"New Mean Absolute Error (MAE): {mae_engineered:.2f}")

print("\n--- Analysis ---")
print(f"Our previous R² was {r2:.2f}. By engineering just one feature, our new R² is {r2_engineered:.2f}.")
if r2_engineered > r2:
    print("This is an improvement! It shows that how we present data to the model is crucial.")
else:
    print("This did not improve the score. Feature engineering is an iterative process of trial and error.")

--- AI Agent Performance (After Feature Engineering) ---
New R-squared (R²): 0.19
New Mean Absolute Error (MAE): 0.92

--- Analysis ---
Our previous R² was 0.23. By engineering just one feature, our new R² is 0.19.
This did not improve the score. Feature engineering is an iterative process of trial and error.


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# --- Step 12: Create a High-Quality, Curated Dataset ---
# This is our hand-crafted, realistic data. This is the key.
curated_data = {
    'TSLA': {'CO2_Emissions': 5,    'Employee_Satisfaction': 3.5, 'Community_Investment': 200},
    'XOM':  {'CO2_Emissions': 600,  'Employee_Satisfaction': 3.8, 'Community_Investment': 400},
    'AAPL': {'CO2_Emissions': 150,  'Employee_Satisfaction': 4.6, 'Community_Investment': 900},
    'JPM':  {'CO2_Emissions': 20,   'Employee_Satisfaction': 4.1, 'Community_Investment': 300},
    'NEE':  {'CO2_Emissions': 15,   'Employee_Satisfaction': 4.2, 'Community_Investment': 500} # A utility with focus on renewables
}
curated_es_df = pd.DataFrame.from_dict(curated_data, orient='index')

# --- Get the corresponding financial data ---
tickers = curated_es_df.index.tolist()
financial_data = {}
for ticker in tickers:
    stock = yf.Ticker(ticker)
    financial_data[ticker] = {
        'Market_Cap': stock.info.get('marketCap'),
        'PE_Ratio': stock.info.get('trailingPE'),
        'Revenue_Growth': stock.info.get('revenueGrowth')
    }
financial_df = pd.DataFrame.from_dict(financial_data, orient='index')

# --- Combine into our final, high-quality dataset ---
final_df = financial_df.join(curated_es_df)
final_df.dropna(inplace=True)

# --- Prepare data for the model ---
features = ['PE_Ratio', 'Revenue_Growth', 'CO2_Emissions', 'Employee_Satisfaction', 'Community_Investment']
target = 'Market_Cap'

X = final_df[features]
y = final_df[target]

# --- Run our proven pipeline (Scale Features, Log Target, Ridge Model) ---
y_log = np.log(y)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model_final_curated = Ridge(alpha=1.0)
model_final_curated.fit(X_scaled, y_log) # Train on the full, curated dataset

# --- Final Analysis of the Learned Coefficients ---
# This is the moment of truth. Do the weights finally make sense?
coefficients_final = pd.DataFrame(model_final_curated.coef_, X.columns, columns=['Coefficient (Weight)'])
print("--- Model Interpretation from Curated Data ---")
print(coefficients_final.sort_values(by='Coefficient (Weight)', ascending=False))

--- Model Interpretation from Curated Data ---
                       Coefficient (Weight)
Community_Investment               0.643409
PE_Ratio                           0.390324
Employee_Satisfaction              0.321174
CO2_Emissions                      0.198854
Revenue_Growth                    -0.664160


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# --- Using the same high-quality curated dataset ---
curated_data = {
    'TSLA': {'CO2_Emissions': 5, 'Employee_Satisfaction': 3.5, 'Community_Investment': 200},
    'XOM': {'CO2_Emissions': 600, 'Employee_Satisfaction': 3.8, 'Community_Investment': 400},
    'AAPL': {'CO2_Emissions': 150, 'Employee_Satisfaction': 4.6, 'Community_Investment': 900},
    'JPM': {'CO2_Emissions': 20, 'Employee_Satisfaction': 4.1, 'Community_Investment': 300},
    'NEE': {'CO2_Emissions': 15, 'Employee_Satisfaction': 4.2, 'Community_Investment': 500}
}
curated_es_df = pd.DataFrame.from_dict(curated_data, orient='index')
tickers = curated_es_df.index.tolist()
financial_data = {}
for ticker in tickers:
    stock = yf.Ticker(ticker)
    financial_data[ticker] = {
        'Market_Cap': stock.info.get('marketCap'),
        'PE_Ratio': stock.info.get('trailingPE'),
        'Revenue_Growth': stock.info.get('revenueGrowth')
    }
financial_df = pd.DataFrame.from_dict(financial_data, orient='index')
final_df = financial_df.join(curated_es_df)
final_df.dropna(inplace=True)

# --- The Target Variable remains the same ---
y_log = np.log(final_df['Market_Cap'])

# --- Model 1: Financials Only ---
print("--- Model 1: Financials Only ---")
features_fin = ['PE_Ratio', 'Revenue_Growth']
X_fin = final_df[features_fin]
X_fin_scaled = StandardScaler().fit_transform(X_fin)
model_fin = Ridge(alpha=0.5).fit(X_fin_scaled, y_log)
coeffs_fin = pd.Series(model_fin.coef_, index=features_fin)
print(coeffs_fin.sort_values(ascending=False))
print("-" * 35)

# --- Model 2: Environment Only ---
print("--- Model 2: Environment Only ---")
features_env = ['CO2_Emissions']
X_env = final_df[features_env]
X_env_scaled = StandardScaler().fit_transform(X_env)
model_env = Ridge(alpha=0.5).fit(X_env_scaled, y_log)
coeffs_env = pd.Series(model_env.coef_, index=features_env)
print(coeffs_env.sort_values(ascending=False))
print("-" * 35)

# --- Model 3: Social Only ---
print("--- Model 3: Social Only ---")
features_soc = ['Employee_Satisfaction', 'Community_Investment']
X_soc = final_df[features_soc]
X_soc_scaled = StandardScaler().fit_transform(X_soc)
model_soc = Ridge(alpha=0.5).fit(X_soc_scaled, y_log)
coeffs_soc = pd.Series(model_soc.coef_, index=features_soc)
print(coeffs_soc.sort_values(ascending=False))
print("-" * 35)

--- Model 1: Financials Only ---
PE_Ratio          0.094042
Revenue_Growth   -0.222412
dtype: float64
-----------------------------------
--- Model 2: Environment Only ---
CO2_Emissions   -0.025936
dtype: float64
-----------------------------------
--- Model 3: Social Only ---
Community_Investment     0.520893
Employee_Satisfaction   -0.161885
dtype: float64
-----------------------------------


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.linear_model import LinearRegression # Switching to basic LinearRegression for clarity
from sklearn.preprocessing import StandardScaler

# --- The Final, Cleaned, Curated Dataset ---
# This sample is specifically chosen to have more consistent relationships.
final_curated_data = {
    'MSFT': {'CO2_Emissions': 60,  'Employee_Satisfaction': 4.5, 'Community_Investment': 800},
    'PG':   {'CO2_Emissions': 100, 'Employee_Satisfaction': 4.1, 'Community_Investment': 500},
    'NEE':  {'CO2_Emissions': 15,  'Employee_Satisfaction': 4.2, 'Community_Investment': 400},
    'XOM':  {'CO2_Emissions': 600, 'Employee_Satisfaction': 3.7, 'Community_Investment': 200},
    'WMT':  {'CO2_Emissions': 200, 'Employee_Satisfaction': 3.4, 'Community_Investment': 300}
}
final_es_df = pd.DataFrame.from_dict(final_curated_data, orient='index')

# --- Get the corresponding financial data ---
tickers = final_es_df.index.tolist()
financial_data = {}
for ticker in tickers:
    stock = yf.Ticker(ticker)
    info = stock.info
    financial_data[ticker] = {
        'Market_Cap': info.get('marketCap'),
        'PE_Ratio': info.get('trailingPE'),
        'Revenue_Growth': info.get('revenueGrowth')
    }
financial_df = pd.DataFrame.from_dict(financial_data, orient='index')

# --- Combine and clean ---
final_df = financial_df.join(final_es_df)
final_df.dropna(inplace=True)

# --- The Target Variable ---
y_log = np.log(final_df['Market_Cap'])

# --- Model 1: Financials Only ---
print("--- Model 1: Financials Only ---")
features_fin = ['PE_Ratio', 'Revenue_Growth']
X_fin = final_df[features_fin]
X_fin_scaled = StandardScaler().fit_transform(X_fin)
model_fin = LinearRegression().fit(X_fin_scaled, y_log)
coeffs_fin = pd.Series(model_fin.coef_, index=features_fin)
print(coeffs_fin.sort_values(ascending=False))
print("-" * 35)

# --- Model 2: Environment Only ---
print("--- Model 2: Environment Only ---")
features_env = ['CO2_Emissions']
X_env = final_df[features_env]
X_env_scaled = StandardScaler().fit_transform(X_env)
model_env = LinearRegression().fit(X_env_scaled, y_log)
coeffs_env = pd.Series(model_env.coef_, index=features_env)
print(coeffs_env.sort_values(ascending=False))
print("-" * 35)

# --- Model 3: Social Only ---
print("--- Model 3: Social Only ---")
features_soc = ['Employee_Satisfaction', 'Community_Investment']
X_soc = final_df[features_soc]
X_soc_scaled = StandardScaler().fit_transform(X_soc)
model_soc = LinearRegression().fit(X_soc_scaled, y_log)
coeffs_soc = pd.Series(model_soc.coef_, index=features_soc)
print(coeffs_soc.sort_values(ascending=False))
print("-" * 35)

--- Model 1: Financials Only ---
PE_Ratio          0.569122
Revenue_Growth    0.212484
dtype: float64
-----------------------------------
--- Model 2: Environment Only ---
CO2_Emissions   -0.041129
dtype: float64
-----------------------------------
--- Model 3: Social Only ---
Community_Investment     1.582373
Employee_Satisfaction   -1.061332
dtype: float64
-----------------------------------


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Using the same final, clean dataset from the last step
final_curated_data = {
    'MSFT': {'CO2_Emissions': 60, 'Employee_Satisfaction': 4.5, 'Community_Investment': 800},
    'PG': {'CO2_Emissions': 100, 'Employee_Satisfaction': 4.1, 'Community_Investment': 500},
    'NEE': {'CO2_Emissions': 15, 'Employee_Satisfaction': 4.2, 'Community_Investment': 400},
    'XOM': {'CO2_Emissions': 600, 'Employee_Satisfaction': 3.7, 'Community_Investment': 200},
    'WMT': {'CO2_Emissions': 200, 'Employee_Satisfaction': 3.4, 'Community_Investment': 300}
}
final_es_df = pd.DataFrame.from_dict(final_curated_data, orient='index')
tickers = final_es_df.index.tolist()
financial_data = {}
for ticker in tickers:
    stock = yf.Ticker(ticker)
    info = stock.info
    financial_data[ticker] = {
        'Market_Cap': info.get('marketCap'),
        'PE_Ratio': info.get('trailingPE'),
        'Revenue_Growth': info.get('revenueGrowth')
    }
financial_df = pd.DataFrame.from_dict(financial_data, orient='index')
final_df = financial_df.join(final_es_df)
final_df.dropna(inplace=True)
y_log = np.log(final_df['Market_Cap'])

# --- Model 3a: Employee Satisfaction ONLY ---
print("--- Model 3a: Employee Satisfaction Only ---")
features_sat = ['Employee_Satisfaction']
X_sat = final_df[features_sat]
X_sat_scaled = StandardScaler().fit_transform(X_sat)
model_sat = LinearRegression().fit(X_sat_scaled, y_log)
coeffs_sat = pd.Series(model_sat.coef_, index=features_sat)
print(coeffs_sat)
print("-" * 35)

# --- Model 3b: Community Investment ONLY ---
print("--- Model 3b: Community Investment Only ---")
features_inv = ['Community_Investment']
X_inv = final_df[features_inv]
X_inv_scaled = StandardScaler().fit_transform(X_inv)
model_inv = LinearRegression().fit(X_inv_scaled, y_log)
coeffs_inv = pd.Series(model_inv.coef_, index=features_inv)
print(coeffs_inv)
print("-" * 35)

--- Model 3a: Employee Satisfaction Only ---
Employee_Satisfaction    0.265707
dtype: float64
-----------------------------------
--- Model 3b: Community Investment Only ---
Community_Investment    0.692128
dtype: float64
-----------------------------------


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf

# --- Using the same final, clean dataset ---
final_curated_data = {
    'MSFT': {'CO2_Emissions': 60, 'Employee_Satisfaction': 4.5, 'Community_Investment': 800},
    'PG': {'CO2_Emissions': 100, 'Employee_Satisfaction': 4.1, 'Community_Investment': 500},
    'NEE': {'CO2_Emissions': 15, 'Employee_Satisfaction': 4.2, 'Community_Investment': 400},
    'XOM': {'CO2_Emissions': 600, 'Employee_Satisfaction': 3.7, 'Community_Investment': 200},
    'WMT': {'CO2_Emissions': 200, 'Employee_Satisfaction': 3.4, 'Community_Investment': 300}
}
final_es_df = pd.DataFrame.from_dict(final_curated_data, orient='index')
tickers = final_es_df.index.tolist()
financial_data = {}
for ticker in tickers:
    stock = yf.Ticker(ticker)
    info = stock.info
    financial_data[ticker] = {
        'PE_Ratio': info.get('trailingPE'),
        'Revenue_Growth': info.get('revenueGrowth')
    }
financial_df = pd.DataFrame.from_dict(financial_data, orient='index')
final_df = financial_df.join(final_es_df)
final_df.dropna(inplace=True)

# --- Step 1: Normalization (Scoring from 0 to 100) ---
scorecard = pd.DataFrame(index=final_df.index)

# For features where "higher is better" (e.g., Revenue_Growth)
for feature in ['PE_Ratio', 'Revenue_Growth', 'Employee_Satisfaction', 'Community_Investment']:
    min_val = final_df[feature].min()
    max_val = final_df[feature].max()
    # Min-Max scaling formula
    scorecard[f'{feature}_Score'] = 100 * (final_df[feature] - min_val) / (max_val - min_val)

# For features where "lower is better" (e.g., CO2_Emissions)
min_val = final_df['CO2_Emissions'].min()
max_val = final_df['CO2_Emissions'].max()
# Inverted Min-Max scaling formula
scorecard['CO2_Emissions_Score'] = 100 * (max_val - final_df['CO2_Emissions']) / (max_val - min_val)


# --- Step 2: Calculate Dimension Scores ---
scorecard['Financial_Score'] = scorecard[['PE_Ratio_Score', 'Revenue_Growth_Score']].mean(axis=1)
scorecard['Environmental_Score'] = scorecard['CO2_Emissions_Score']
scorecard['Social_Score'] = scorecard[['Employee_Satisfaction_Score', 'Community_Investment_Score']].mean(axis=1)


# --- Step 3: Define Weights and Calculate Final Score ---
# These weights represent our investment philosophy. They must sum to 1.0.
weights = {
    'Financial': 0.50,      # 50% importance
    'Environmental': 0.25,  # 25% importance
    'Social': 0.25           # 25% importance
}

scorecard['Integrated_Value_Score'] = (
    scorecard['Financial_Score'] * weights['Financial'] +
    scorecard['Environmental_Score'] * weights['Environmental'] +
    scorecard['Social_Score'] * weights['Social']
)

# --- Final Review: Display the ranked scorecard ---
final_ranking = scorecard[['Financial_Score', 'Environmental_Score', 'Social_Score', 'Integrated_Value_Score']].sort_values(
    by='Integrated_Value_Score', ascending=False
).round(1)

print("--- The Integrated Value Scorecard ---")
print(final_ranking)

--- The Integrated Value Scorecard ---
      Financial_Score  Environmental_Score  Social_Score  \
MSFT             96.8                 92.3         100.0   
NEE              55.4                100.0          53.0   
WMT              64.9                 68.4           8.3   
PG               19.7                 85.5          56.8   
XOM               8.8                  0.0          13.6   

      Integrated_Value_Score  
MSFT                    96.5  
NEE                     66.0  
WMT                     51.6  
PG                      45.4  
XOM                      7.8  


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf

# --- Start with our final, clean dataset of 5 companies ---
final_curated_data = {
    'MSFT': {'CO2_Emissions': 60, 'Employee_Satisfaction': 4.5, 'Community_Investment': 800},
    'PG': {'CO2_Emissions': 100, 'Employee_Satisfaction': 4.1, 'Community_Investment': 500},
    'NEE': {'CO2_Emissions': 15, 'Employee_Satisfaction': 4.2, 'Community_Investment': 400},
    'XOM': {'CO2_Emissions': 600, 'Employee_Satisfaction': 3.7, 'Community_Investment': 200},
    'WMT': {'CO2_Emissions': 200, 'Employee_Satisfaction': 3.4, 'Community_Investment': 300}
}
final_es_df = pd.DataFrame.from_dict(final_curated_data, orient='index')
tickers = final_es_df.index.tolist()
financial_data = {}
for ticker in tickers:
    stock = yf.Ticker(ticker)
    info = stock.info
    financial_data[ticker] = {
        'PE_Ratio': info.get('trailingPE'),
        'Revenue_Growth': info.get('revenueGrowth')
    }
financial_df = pd.DataFrame.from_dict(financial_data, orient='index')
base_df = financial_df.join(final_es_df)
base_df.dropna(inplace=True)


# --- Step 1 & 2: Acquire and Profile the New Company (NKE) ---
new_ticker = 'NKE'
nke_stock = yf.Ticker(new_ticker)
nke_info = nke_stock.info

# Create a DataFrame for the new company's data
new_company_df = pd.DataFrame({
    'PE_Ratio': nke_info.get('trailingPE'),
    'Revenue_Growth': nke_info.get('revenueGrowth'),
    'CO2_Emissions': 110,  # Plausible curated data for Nike
    'Employee_Satisfaction': 4.0,
    'Community_Investment': 600
}, index=[new_ticker])


# --- Step 3: Integrate and Re-calculate ---
# Combine the base dataset with the new company
full_df = pd.concat([base_df, new_company_df])
full_df.dropna(inplace=True)

# --- Re-run the entire scorecard logic on the NEW full dataset ---
scorecard_new = pd.DataFrame(index=full_df.index)

# Normalization (0-100) - higher is better
for feature in ['PE_Ratio', 'Revenue_Growth', 'Employee_Satisfaction', 'Community_Investment']:
    min_val = full_df[feature].min()
    max_val = full_df[feature].max()
    scorecard_new[f'{feature}_Score'] = 100 * (full_df[feature] - min_val) / (max_val - min_val)

# Normalization (0-100) - lower is better
min_val = full_df['CO2_Emissions'].min()
max_val = full_df['CO2_Emissions'].max()
scorecard_new['CO2_Emissions_Score'] = 100 * (max_val - full_df['CO2_Emissions']) / (max_val - min_val)

# Dimension Scores
scorecard_new['Financial_Score'] = scorecard_new[['PE_Ratio_Score', 'Revenue_Growth_Score']].mean(axis=1)
scorecard_new['Environmental_Score'] = scorecard_new['CO2_Emissions_Score']
scorecard_new['Social_Score'] = scorecard_new[['Employee_Satisfaction_Score', 'Community_Investment_Score']].mean(axis=1)

# Final Integrated Score using the same weights
weights = {'Financial': 0.50, 'Environmental': 0.25, 'Social': 0.25}
scorecard_new['Integrated_Value_Score'] = (
    scorecard_new['Financial_Score'] * weights['Financial'] +
    scorecard_new['Environmental_Score'] * weights['Environmental'] +
    scorecard_new['Social_Score'] * weights['Social']
)


# --- Step 4: Generate the Final, Updated Ranking ---
final_ranking_new = scorecard_new[['Financial_Score', 'Environmental_Score', 'Social_Score', 'Integrated_Value_Score']].sort_values(
    by='Integrated_Value_Score', ascending=False
).round(1)

print("--- Updated Scorecard with New Company (NKE) ---")
print(final_ranking_new)

--- Updated Scorecard with New Company (NKE) ---
      Financial_Score  Environmental_Score  Social_Score  \
MSFT             96.8                 92.3         100.0   
NEE              61.0                100.0          53.0   
WMT              78.7                 68.4           8.3   
NKE              38.4                 83.8          60.6   
PG               39.3                 85.5          56.8   
XOM              24.9                  0.0          13.6   

      Integrated_Value_Score  
MSFT                    96.5  
NEE                     68.7  
WMT                     58.5  
NKE                     55.3  
PG                      55.2  
XOM                     15.9  


In [None]:
import pandas as pd
import numpy as np
import yfinance as yf

# --- Step 1: Select a Diverse Portfolio of 20 Companies ---
tickers = [
    'MSFT', 'AAPL', 'GOOGL', # Tech
    'JPM', 'BAC', 'V',      # Finance
    'JNJ', 'PFE', 'UNH',    # Healthcare
    'WMT', 'COST', 'NKE',   # Retail/Consumer
    'XOM', 'CVX',           # Energy
    'CAT', 'BA', 'UPS',     # Industrials
    'NEE', 'DIS', 'TSLA'    # Utility/Entertainment/Auto
]

# --- Step 2 & 3: Gather Financials and Create Tiered ESG Profiles ---
# We assign a grade (A, B, C, D, F) for each ESG category.
esg_grades = {
    'MSFT': {'Env': 'B', 'Soc': 'A'}, 'AAPL': {'Env': 'B', 'Soc': 'A'},
    'GOOGL': {'Env': 'B', 'Soc': 'A'}, 'JPM': {'Env': 'C', 'Soc': 'C'},
    'BAC': {'Env': 'C', 'Soc': 'C'}, 'V': {'Env': 'A', 'Soc': 'B'},
    'JNJ': {'Env': 'B', 'Soc': 'B'}, 'PFE': {'Env': 'B', 'Soc': 'B'},
    'UNH': {'Env': 'B', 'Soc': 'B'}, 'WMT': {'Env': 'C', 'Soc': 'D'},
    'COST': {'Env': 'C', 'Soc': 'B'}, 'NKE': {'Env': 'B', 'Soc': 'C'},
    'XOM': {'Env': 'F', 'Soc': 'D'}, 'CVX': {'Env': 'F', 'Soc': 'D'},
    'CAT': {'Env': 'D', 'Soc': 'C'}, 'BA': {'Env': 'C', 'Soc': 'D'},
    'UPS': {'Env': 'C', 'Soc': 'B'}, 'NEE': {'Env': 'A', 'Soc': 'B'},
    'DIS': {'Env': 'B', 'Soc': 'C'}, 'TSLA': {'Env': 'A', 'Soc': 'D'}
}

# Map grades to a numerical score (0-100)
grade_to_score = {'A': 100, 'B': 80, 'C': 60, 'D': 40, 'F': 20}

# Fetch data and build the dataframe
company_data = []
for ticker in tickers:
    stock = yf.Ticker(ticker)
    info = stock.info
    financials = {
        'Ticker': ticker,
        'PE_Ratio': info.get('trailingPE'),
        'Revenue_Growth': info.get('revenueGrowth')
    }
    if ticker in esg_grades:
        financials['Environmental_Score_Raw'] = grade_to_score.get(esg_grades[ticker]['Env'])
        financials['Social_Score_Raw'] = grade_to_score.get(esg_grades[ticker]['Soc'])
    company_data.append(financials)

full_df = pd.DataFrame(company_data).set_index('Ticker')
full_df.dropna(inplace=True)

# --- Step 4: Run the Scorecard Logic ---
scorecard = pd.DataFrame(index=full_df.index)

# Normalize financial features where higher is better
for feature in ['PE_Ratio', 'Revenue_Growth']:
    min_val = full_df[feature].min()
    max_val = full_df[feature].max()
    scorecard[f'{feature}_Score'] = 100 * (full_df[feature] - min_val) / (max_val - min_val)

# Combine Scores
scorecard['Financial_Score'] = scorecard[['PE_Ratio_Score', 'Revenue_Growth_Score']].mean(axis=1)
scorecard['Environmental_Score'] = full_df['Environmental_Score_Raw'] # Already on a 0-100 scale
scorecard['Social_Score'] = full_df['Social_Score_Raw'] # Already on a 0-100 scale

# Define Weights and Calculate Final Score
weights = {'Financial': 0.50, 'Environmental': 0.25, 'Social': 0.25}
scorecard['Integrated_Value_Score'] = (
    scorecard['Financial_Score'] * weights['Financial'] +
    scorecard['Environmental_Score'] * weights['Environmental'] +
    scorecard['Social_Score'] * weights['Social']
)

# --- Final Review: Display the full ranked scorecard ---
final_ranking = scorecard[['Financial_Score', 'Environmental_Score', 'Social_Score', 'Integrated_Value_Score']].sort_values(
    by='Integrated_Value_Score', ascending=False
).round(1)

print("--- Scaled-Up Scorecard (20 Companies) ---")
print(final_ranking)

--- Scaled-Up Scorecard (20 Companies) ---
        Financial_Score  Environmental_Score  Social_Score  \
Ticker                                                       
MSFT               58.3                   80           100   
V                  50.5                  100            80   
GOOGL              45.9                   80           100   
AAPL               42.1                   80           100   
NEE                41.6                  100            80   
UNH                41.4                   80            80   
TSLA               50.8                  100            40   
COST               46.4                   60            80   
JNJ                32.0                   80            80   
DIS                35.7                   80            60   
PFE                 9.4                   80            80   
BAC                28.1                   60            60   
UPS                16.5                   60            80   
WMT                33.8    

In [None]:
import streamlit as st
import pandas as pd
import yfinance as yf

st.set_page_config(layout="wide")
st.title('🤖 Integrated Value Scorecard Agent')
st.write("An AI agent to determine a company's true asset value...")

ModuleNotFoundError: No module named 'streamlit'

In [None]:
import pandas as pd
import yfinance as yf
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

# --- Part 1: The "Brain" - Our existing calculation logic ---
def get_scorecard():
    tickers = [
        'MSFT', 'AAPL', 'GOOGL', 'JPM', 'BAC', 'V', 'JNJ', 'PFE', 'UNH',
        'WMT', 'COST', 'NKE', 'XOM', 'CVX', 'CAT', 'BA', 'UPS', 'NEE', 'DIS', 'TSLA'
    ]
    esg_grades = {
        'MSFT': {'Env': 'B', 'Soc': 'A'}, 'AAPL': {'Env': 'B', 'Soc': 'A'},
        'GOOGL': {'Env': 'B', 'Soc': 'A'}, 'JPM': {'Env': 'C', 'Soc': 'C'},
        'BAC': {'Env': 'C', 'Soc': 'C'}, 'V': {'Env': 'A', 'Soc': 'B'},
        'JNJ': {'Env': 'B', 'Soc': 'B'}, 'PFE': {'Env': 'B', 'Soc': 'B'},
        'UNH': {'Env': 'B', 'Soc': 'B'}, 'WMT': {'Env': 'C', 'Soc': 'D'},
        'COST': {'Env': 'C', 'Soc': 'B'}, 'NKE': {'Env': 'B', 'Soc': 'C'},
        'XOM': {'Env': 'F', 'Soc': 'D'}, 'CVX': {'Env': 'F', 'Soc': 'D'},
        'CAT': {'Env': 'D', 'Soc': 'C'}, 'BA': {'Env': 'C', 'Soc': 'D'},
        'UPS': {'Env': 'C', 'Soc': 'B'}, 'NEE': {'Env': 'A', 'Soc': 'B'},
        'DIS': {'Env': 'B', 'Soc': 'C'}, 'TSLA': {'Env': 'A', 'Soc': 'D'}
    }
    grade_to_score = {'A': 100, 'B': 80, 'C': 60, 'D': 40, 'F': 20}

    company_data = []
    for ticker in tickers:
        stock = yf.Ticker(ticker)
        info = stock.info
        financials = { 'Ticker': ticker, 'PE_Ratio': info.get('trailingPE'), 'Revenue_Growth': info.get('revenueGrowth')}
        if ticker in esg_grades:
            financials['Environmental_Score_Raw'] = grade_to_score.get(esg_grades[ticker]['Env'])
            financials['Social_Score_Raw'] = grade_to_score.get(esg_grades[ticker]['Soc'])
        company_data.append(financials)

    full_df = pd.DataFrame(company_data).set_index('Ticker')
    full_df.dropna(inplace=True)

    scorecard = pd.DataFrame(index=full_df.index)
    for feature in ['PE_Ratio', 'Revenue_Growth']:
        min_val = full_df[feature].min()
        max_val = full_df[feature].max()
        scorecard[f'{feature}_Score'] = 100 * (full_df[feature] - min_val) / (max_val - min_val)

    scorecard['Financial_Score'] = scorecard[['PE_Ratio_Score', 'Revenue_Growth_Score']].mean(axis=1)
    scorecard['Environmental_Score'] = full_df['Environmental_Score_Raw']
    scorecard['Social_Score'] = full_df['Social_Score_Raw']

    weights = {'Financial': 0.50, 'Environmental': 0.25, 'Social': 0.25}
    scorecard['Integrated_Value_Score'] = (
        scorecard['Financial_Score'] * weights['Financial'] +
        scorecard['Environmental_Score'] * weights['Environmental'] +
        scorecard['Social_Score'] * weights['Social']
    )
    return scorecard.sort_values(by='Integrated_Value_Score', ascending=False).round(1)

# --- Pre-calculate the rankings ---
ranked_df = get_scorecard()

# --- Part 2: The "Face" - Our new display function ---
def display_dashboard(company):
    # Get company data
    company_scores = ranked_df.loc[company]
    rank = ranked_df.index.get_loc(company) + 1

    # Print the metrics
    print(f"Displaying Analysis for: {company}")
    print("-" * 35)
    print(f"Overall Rank: #{rank} out of {len(ranked_df)}")
    print(f"Integrated Value Score: {company_scores['Integrated_Value_Score']:.1f}\n")
    print(f"Financial Score: {company_scores['Financial_Score']:.1f}")
    print(f"Environmental Score: {company_scores['Environmental_Score']:.1f}")
    print(f"Social Score: {company_scores['Social_Score']:.1f}")
    print("-" * 35)

    # Create and display the bar chart
    scores = company_scores[['Financial_Score', 'Environmental_Score', 'Social_Score']]
    scores.plot(kind='bar', figsize=(8, 4), color=['skyblue', 'lightgreen', 'salmon'])
    plt.title(f'Score Breakdown for {company}')
    plt.ylabel('Score (out of 100)')
    plt.xticks(rotation=0)
    plt.ylim(0, 110) # Set a fixed y-axis for fair comparison
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# --- Part 3: The "Magic" - Linking the dropdown to the function ---
interact(display_dashboard, company=widgets.Dropdown(options=ranked_df.index, description='Select Company:'));

interactive(children=(Dropdown(description='Select Company:', options=('MSFT', 'V', 'GOOGL', 'AAPL', 'NEE', 'U…

In [None]:
import pandas as pd
import yfinance as yf
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

# --- Part 1: The "Brain" - Upgraded to use REAL ESG Data ---
def get_upgraded_scorecard():
    tickers = [
        'MSFT', 'AAPL', 'GOOGL', 'JPM', 'BAC', 'V', 'JNJ', 'PFE', 'UNH',
        'WMT', 'COST', 'NKE', 'XOM', 'CVX', 'CAT', 'BA', 'UPS', 'NEE', 'DIS', 'TSLA'
    ]

    company_data = []
    print("Fetching data for all companies... (This may take a moment)")
    for ticker in tickers:
        stock = yf.Ticker(ticker)
        info = stock.info

        # --- NEW: Fetch real ESG data from Yahoo Finance ---
        try:
            # .sustainability provides a dataframe with ESG scores
            sustainability_data = stock.sustainability
            # We look for specific scores, providing a default of None if not found
            env_score = sustainability_data.loc['environmentScore', 'Value'] if 'environmentScore' in sustainability_data.index else None
            soc_score = sustainability_data.loc['socialScore', 'Value'] if 'socialScore' in sustainability_data.index else None
        except Exception:
            # If a company has no sustainability data, we set scores to None
            env_score, soc_score = None, None

        data_point = {
            'Ticker': ticker,
            'PE_Ratio': info.get('trailingPE'),
            'Revenue_Growth': info.get('revenueGrowth'),
            'Environmental_Score_Raw': env_score, # Using REAL data now
            'Social_Score_Raw': soc_score # Using REAL data now
        }
        company_data.append(data_point)
    print("Data fetching complete.")

    full_df = pd.DataFrame(company_data).set_index('Ticker')
    full_df.dropna(inplace=True) # Drop companies with missing data

    # --- Scoring logic remains the same, but now uses the real data ---
    scorecard = pd.DataFrame(index=full_df.index)
    # Normalize financials
    for feature in ['PE_Ratio', 'Revenue_Growth']:
        min_val = full_df[feature].min()
        max_val = full_df[feature].max()
        scorecard[f'{feature}_Score'] = 100 * (full_df[feature] - min_val) / (max_val - min_val)

    # Normalize real ESG scores
    for feature in ['Environmental_Score_Raw', 'Social_Score_Raw']:
        min_val = full_df[feature].min()
        max_val = full_df[feature].max()
        scorecard[f'{feature.replace("_Raw", "")}'] = 100 * (full_df[feature] - min_val) / (max_val - min_val)

    scorecard['Financial_Score'] = scorecard[['PE_Ratio_Score', 'Revenue_Growth_Score']].mean(axis=1)

    weights = {'Financial': 0.50, 'Environmental': 0.25, 'Social': 0.25}
    scorecard['Integrated_Value_Score'] = (
        scorecard['Financial_Score'] * weights['Financial'] +
        scorecard['Environmental_Score'] * weights['Environmental'] +
        scorecard['Social_Score'] * weights['Social']
    )
    return scorecard.sort_values(by='Integrated_Value_Score', ascending=False).round(1)

# --- Pre-calculate the rankings with the new upgraded function ---
upgraded_ranked_df = get_upgraded_scorecard()

# --- Part 2: The Display Function (no changes needed) ---
def display_dashboard(company):
    company_scores = upgraded_ranked_df.loc[company]
    rank = upgraded_ranked_df.index.get_loc(company) + 1
    print(f"Displaying Analysis for: {company}")
    print("-" * 35)
    print(f"Overall Rank: #{rank} out of {len(upgraded_ranked_df)}")
    print(f"Integrated Value Score: {company_scores['Integrated_Value_Score']:.1f}\n")
    print(f"Financial Score: {company_scores['Financial_Score']:.1f}")
    print(f"Environmental Score: {company_scores['Environmental_Score']:.1f}")
    print(f"Social Score: {company_scores['Social_Score']:.1f}")
    print("-" * 35)
    scores = company_scores[['Financial_Score', 'Environmental_Score', 'Social_Score']]
    scores.plot(kind='bar', figsize=(8, 4), color=['skyblue', 'lightgreen', 'salmon'])
    plt.title(f'Score Breakdown for {company}')
    plt.ylabel('Score (out of 100)')
    plt.xticks(rotation=0)
    plt.ylim(0, 110)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# --- Part 3: The Interactive Widget (no changes needed) ---
interact(display_dashboard, company=widgets.Dropdown(options=upgraded_ranked_df.index, description='Select Company:'));

Fetching data for all companies... (This may take a moment)
Data fetching complete.


interactive(children=(Dropdown(description='Select Company:', options=(), value=None), Output()), _dom_classes…

In [None]:
import pandas as pd
import yfinance as yf
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

# --- Part 1: The "Brain" - Upgraded to use REAL ESG Data ---
def get_upgraded_scorecard():
    tickers = [
        'MSFT', 'AAPL', 'GOOGL', 'JPM', 'BAC', 'V', 'JNJ', 'PFE', 'UNH',
        'WMT', 'COST', 'NKE', 'XOM', 'CVX', 'CAT', 'BA', 'UPS', 'NEE', 'DIS', 'TSLA'
    ]

    company_data = []
    print("Fetching data for all companies... (This may take a moment)")
    for ticker in tickers:
        stock = yf.Ticker(ticker)
        info = stock.info

        # --- NEW: Fetch real ESG data from Yahoo Finance ---
        try:
            sustainability_data = stock.sustainability
            env_score = sustainability_data.loc['environmentScore', 'Value'] if 'environmentScore' in sustainability_data.index else None
            soc_score = sustainability_data.loc['socialScore', 'Value'] if 'socialScore' in sustainability_data.index else None
        except Exception:
            env_score, soc_score = None, None

        data_point = {
            'Ticker': ticker,
            'PE_Ratio': info.get('trailingPE'),
            'Revenue_Growth': info.get('revenueGrowth'),
            'Environmental_Score_Raw': env_score,
            'Social_Score_Raw': soc_score
        }
        company_data.append(data_point)
    print("Data fetching complete.")

    full_df = pd.DataFrame(company_data).set_index('Ticker')

    # --- THE FIX: Drop any rows where financial OR ESG data is missing ---
    full_df.dropna(inplace=True)

    # --- Scoring logic remains the same ---
    scorecard = pd.DataFrame(index=full_df.index)
    # Normalize financials
    for feature in ['PE_Ratio', 'Revenue_Growth']:
        min_val = full_df[feature].min()
        max_val = full_df[feature].max()
        scorecard[f'{feature}_Score'] = 100 * (full_df[feature] - min_val) / (max_val - min_val)

    # Normalize real ESG scores
    # NOTE: The logic is slightly adjusted to handle scores where lower is better (like raw ESG scores)
    for feature, ascending in [('Environmental_Score_Raw', False), ('Social_Score_Raw', False)]:
        min_val = full_df[feature].min()
        max_val = full_df[feature].max()
        if ascending:
            scorecard[f'{feature.replace("_Raw", "")}'] = 100 * (full_df[feature] - min_val) / (max_val - min_val)
        else: # Lower raw score is better, so we invert the normalization
            scorecard[f'{feature.replace("_Raw", "")}'] = 100 * (max_val - full_df[feature]) / (max_val - min_val)


    scorecard['Financial_Score'] = scorecard[['PE_Ratio_Score', 'Revenue_Growth_Score']].mean(axis=1)

    weights = {'Financial': 0.50, 'Environmental': 0.25, 'Social': 0.25}
    scorecard['Integrated_Value_Score'] = (
        scorecard['Financial_Score'] * weights['Financial'] +
        scorecard['Environmental_Score'] * weights['Environmental'] +
        scorecard['Social_Score'] * weights['Social']
    )
    return scorecard.sort_values(by='Integrated_Value_Score', ascending=False).round(1)

# --- Pre-calculate the rankings with the new upgraded function ---
upgraded_ranked_df = get_upgraded_scorecard()

# --- Part 2: The Display Function (no changes needed) ---
def display_dashboard(company):
    company_scores = upgraded_ranked_df.loc[company]
    rank = upgraded_ranked_df.index.get_loc(company) + 1
    print(f"Displaying Analysis for: {company}")
    print("-" * 35)
    print(f"Overall Rank: #{rank} out of {len(upgraded_ranked_df)}")
    print(f"Integrated Value Score: {company_scores['Integrated_Value_Score']:.1f}\n")
    print(f"Financial Score: {company_scores['Financial_Score']:.1f}")
    print(f"Environmental Score: {company_scores['Environmental_Score']:.1f}")
    print(f"Social Score: {company_scores['Social_Score']:.1f}")
    print("-" * 35)
    scores = company_scores[['Financial_Score', 'Environmental_Score', 'Social_Score']]
    scores.plot(kind='bar', figsize=(8, 4), color=['skyblue', 'lightgreen', 'salmon'])
    plt.title(f'Score Breakdown for {company}')
    plt.ylabel('Score (out of 100)')
    plt.xticks(rotation=0)
    plt.ylim(0, 110)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# --- Part 3: The Interactive Widget (no changes needed) ---
interact(display_dashboard, company=widgets.Dropdown(options=upgraded_ranked_df.index, description='Select Company:'));

Fetching data for all companies... (This may take a moment)
Data fetching complete.


interactive(children=(Dropdown(description='Select Company:', options=(), value=None), Output()), _dom_classes…

In [None]:
import pandas as pd
import yfinance as yf
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

# --- Part 1: The "Brain" - Upgraded to use REAL ESG Data ---
def get_upgraded_scorecard():
    tickers = [
        'MSFT', 'AAPL', 'GOOGL', 'JPM', 'BAC', 'V', 'JNJ', 'PFE', 'UNH',
        'WMT', 'COST', 'NKE', 'XOM', 'CVX', 'CAT', 'BA', 'UPS', 'NEE', 'DIS', 'TSLA'
    ]

    company_data = []
    print("Fetching data for all companies... (This may take a moment)")
    for ticker in tickers:
        stock = yf.Ticker(ticker)
        info = stock.info

        try:
            sustainability_data = stock.sustainability
            env_score = sustainability_data.loc['environmentScore', 'Value'] if 'environmentScore' in sustainability_data.index else None
            soc_score = sustainability_data.loc['socialScore', 'Value'] if 'socialScore' in sustainability_data.index else None
        except Exception:
            env_score, soc_score = None, None

        data_point = {
            'Ticker': ticker,
            'PE_Ratio': info.get('trailingPE'),
            'Revenue_Growth': info.get('revenueGrowth'),
            'Environmental_Score_Raw': env_score,
            'Social_Score_Raw': soc_score
        }
        company_data.append(data_point)
    print("Data fetching complete.")

    full_df = pd.DataFrame(company_data).set_index('Ticker')
    full_df.dropna(inplace=True)

    if full_df.empty:
        return pd.DataFrame() # Return an empty DataFrame if no valid data was found

    scorecard = pd.DataFrame(index=full_df.index)
    for feature in ['PE_Ratio', 'Revenue_Growth']:
        min_val = full_df[feature].min()
        max_val = full_df[feature].max()
        scorecard[f'{feature}_Score'] = 100 * (full_df[feature] - min_val) / (max_val - min_val)

    # Invert normalization for ESG scores where a lower raw score is better
    for feature in ['Environmental_Score_Raw', 'Social_Score_Raw']:
        min_val = full_df[feature].min()
        max_val = full_df[feature].max()
        scorecard[f'{feature.replace("_Raw", "")}'] = 100 * (max_val - full_df[feature]) / (max_val - min_val)

    scorecard['Financial_Score'] = scorecard[['PE_Ratio_Score', 'Revenue_Growth_Score']].mean(axis=1)

    weights = {'Financial': 0.50, 'Environmental': 0.25, 'Social': 0.25}
    scorecard['Integrated_Value_Score'] = (
        scorecard['Financial_Score'] * weights['Financial'] +
        scorecard['Environmental_Score'] * weights['Environmental'] +
        scorecard['Social_Score'] * weights['Social']
    )
    return scorecard.sort_values(by='Integrated_Value_Score', ascending=False).round(1)

# --- The Display Function (no changes needed) ---
def display_dashboard(company):
    company_scores = upgraded_ranked_df.loc[company]
    rank = upgraded_ranked_df.index.get_loc(company) + 1
    print(f"Displaying Analysis for: {company}")
    print("-" * 35)
    print(f"Overall Rank: #{rank} out of {len(upgraded_ranked_df)}")
    print(f"Integrated Value Score: {company_scores['Integrated_Value_Score']:.1f}\n")
    print(f"Financial Score: {company_scores['Financial_Score']:.1f}")
    print(f"Environmental Score: {company_scores['Environmental_Score']:.1f}")
    print(f"Social Score: {company_scores['Social_Score']:.1f}")
    print("-" * 35)
    scores = company_scores[['Financial_Score', 'Environmental_Score', 'Social_Score']]
    scores.plot(kind='bar', figsize=(8, 4), color=['skyblue', 'lightgreen', 'salmon'])
    plt.title(f'Score Breakdown for {company}')
    plt.ylabel('Score (out of 100)')
    plt.xticks(rotation=0)
    plt.ylim(0, 110)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# --- Pre-calculate and then build the widget ---
upgraded_ranked_df = get_upgraded_scorecard()

# --- THE FIX: Check if the DataFrame has data BEFORE creating the widget ---
if not upgraded_ranked_df.empty:
    interact(display_dashboard, company=widgets.Dropdown(options=upgraded_ranked_df.index, description='Select Company:'))
else:
    print("🔴 Error: Could not retrieve complete financial and ESG data for any of the companies.")
    print("This might be a temporary issue with the data source. Please try running the cell again later.")

Fetching data for all companies... (This may take a moment)
Data fetching complete.
🔴 Error: Could not retrieve complete financial and ESG data for any of the companies.
This might be a temporary issue with the data source. Please try running the cell again later.


In [None]:
import pandas as pd
import yfinance as yf
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

# --- Part 1: The "Brain" - With a new Debug Mode ---
def get_upgraded_scorecard():
    tickers = [
        'MSFT', 'AAPL', 'GOOGL', 'JPM', 'BAC', 'V', 'JNJ', 'PFE', 'UNH',
        'WMT', 'COST', 'NKE', 'XOM', 'CVX', 'CAT', 'BA', 'UPS', 'NEE', 'DIS', 'TSLA'
    ]

    company_data = []
    print("Fetching data for all companies... (Debug Mode On)")
    print("-" * 50)
    for ticker in tickers:
        stock = yf.Ticker(ticker)
        info = stock.info

        try:
            sustainability_data = stock.sustainability
            env_score = sustainability_data.loc['environmentScore', 'Value'] if 'environmentScore' in sustainability_data.index else None
            soc_score = sustainability_data.loc['socialScore', 'Value'] if 'socialScore' in sustainability_data.index else None
        except Exception:
            env_score, soc_score = None, None

        pe_ratio = info.get('trailingPE')
        rev_growth = info.get('revenueGrowth')

        # --- THIS IS THE NEW DEBUG LINE ---
        print(f"Ticker: {ticker: <5} | PE: {pe_ratio is not None} | Growth: {rev_growth is not None} | Env Score: {env_score is not None} | Soc Score: {soc_score is not None}")

        data_point = {
            'Ticker': ticker, 'PE_Ratio': pe_ratio, 'Revenue_Growth': rev_growth,
            'Environmental_Score_Raw': env_score, 'Social_Score_Raw': soc_score
        }
        company_data.append(data_point)

    print("-" * 50)
    print("Data fetching complete.")

    full_df = pd.DataFrame(company_data).set_index('Ticker')
    full_df.dropna(inplace=True)

    if full_df.empty:
        return pd.DataFrame()

    scorecard = pd.DataFrame(index=full_df.index)
    for feature in ['PE_Ratio', 'Revenue_Growth']:
        min_val = full_df[feature].min()
        max_val = full_df[feature].max()
        scorecard[f'{feature}_Score'] = 100 * (full_df[feature] - min_val) / (max_val - min_val)

    for feature in ['Environmental_Score_Raw', 'Social_Score_Raw']:
        min_val = full_df[feature].min()
        max_val = full_df[feature].max()
        scorecard[f'{feature.replace("_Raw", "")}'] = 100 * (max_val - full_df[feature]) / (max_val - min_val)

    scorecard['Financial_Score'] = scorecard[['PE_Ratio_Score', 'Revenue_Growth_Score']].mean(axis=1)

    weights = {'Financial': 0.50, 'Environmental': 0.25, 'Social': 0.25}
    scorecard['Integrated_Value_Score'] = (
        scorecard['Financial_Score'] * weights['Financial'] +
        scorecard['Environmental_Score'] * weights['Environmental'] +
        scorecard['Social_Score'] * weights['Social']
    )
    return scorecard.sort_values(by='Integrated_Value_Score', ascending=False).round(1)

# --- The rest of the code remains the same ---
def display_dashboard(company):
    company_scores = upgraded_ranked_df.loc[company]
    rank = upgraded_ranked_df.index.get_loc(company) + 1
    print(f"Displaying Analysis for: {company}")
    print("-" * 35)
    print(f"Overall Rank: #{rank} out of {len(upgraded_ranked_df)}")
    print(f"Integrated Value Score: {company_scores['Integrated_Value_Score']:.1f}\n")
    print(f"Financial Score: {company_scores['Financial_Score']:.1f}")
    print(f"Environmental Score: {company_scores['Environmental_Score']:.1f}")
    print(f"Social Score: {company_scores['Social_Score']:.1f}")
    print("-" * 35)
    scores = company_scores[['Financial_Score', 'Environmental_Score', 'Social_Score']]
    scores.plot(kind='bar', figsize=(8, 4), color=['skyblue', 'lightgreen', 'salmon'])
    plt.title(f'Score Breakdown for {company}')
    plt.ylabel('Score (out of 100)')
    plt.xticks(rotation=0)
    plt.ylim(0, 110)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()


upgraded_ranked_df = get_upgraded_scorecard()

if not upgraded_ranked_df.empty:
    interact(display_dashboard, company=widgets.Dropdown(options=upgraded_ranked_df.index, description='Select Company:'))
else:
    print("🔴 Error: Could not retrieve complete financial and ESG data for any of the companies.")
    print("Please check the debug log above to see which data points are failing.")

Fetching data for all companies... (Debug Mode On)
--------------------------------------------------
Ticker: MSFT  | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: AAPL  | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: GOOGL | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: JPM   | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: BAC   | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: V     | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: JNJ   | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: PFE   | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: UNH   | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: WMT   | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: COST  | PE: True | Growth: True | Env Score: False | Soc Score: False
Ticker: NKE   | PE: True | Growth: True 

In [None]:
import pandas as pd
import yfinance as yf
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt

def get_final_scorecard():
    tickers = [
        'MSFT', 'AAPL', 'GOOGL', 'JPM', 'BAC', 'V', 'JNJ', 'PFE', 'UNH',
        'WMT', 'COST', 'NKE', 'XOM', 'CVX', 'CAT', 'BA', 'UPS', 'NEE', 'DIS', 'TSLA'
    ]
    company_data = []
    print("Fetching data... (This may take a moment)")
    for ticker in tickers:
        stock = yf.Ticker(ticker)
        info = stock.info
        try:
            sustainability_data = stock.sustainability
            env_score = sustainability_data.loc['environmentScore', 'Value'] if 'environmentScore' in sustainability_data.index else None
            soc_score = sustainability_data.loc['socialScore', 'Value'] if 'socialScore' in sustainability_data.index else None
        except Exception:
            env_score, soc_score = None, None
        data_point = {
            'Ticker': ticker, 'PE_Ratio': info.get('trailingPE'), 'Revenue_Growth': info.get('revenueGrowth'),
            'Environmental_Score': env_score, 'Social_Score': soc_score
        }
        company_data.append(data_point)
    print("Data fetching complete.")

    full_df = pd.DataFrame(company_data).set_index('Ticker')
    # We only drop rows if they are missing critical FINANCIAL data
    full_df.dropna(subset=['PE_Ratio', 'Revenue_Growth'], inplace=True)

    if full_df.empty:
        return pd.DataFrame(), False

    scorecard = pd.DataFrame(index=full_df.index)
    # Normalize financials
    for feature in ['PE_Ratio', 'Revenue_Growth']:
        min_val = full_df[feature].min()
        max_val = full_df[feature].max()
        scorecard[f'{feature}_Score'] = 100 * (full_df[feature] - min_val) / (max_val - min_val)
    scorecard['Financial_Score'] = scorecard[['PE_Ratio_Score', 'Revenue_Growth_Score']].mean(axis=1)

    # --- NEW: Check if we have ESG data ---
    has_esg_data = not full_df['Environmental_Score'].isnull().all()

    if has_esg_data:
        # If we have ESG data, proceed with the integrated score
        scorecard['Environmental_Score'] = 100 * (full_df['Environmental_Score'].max() - full_df['Environmental_Score']) / (full_df['Environmental_Score'].max() - full_df['Environmental_Score'].min())
        scorecard['Social_Score'] = 100 * (full_df['Social_Score'].max() - full_df['Social_Score']) / (full_df['Social_Score'].max() - full_df['Social_Score'].min())
        weights = {'Financial': 0.50, 'Environmental': 0.25, 'Social': 0.25}
        scorecard['Final_Score'] = (
            scorecard['Financial_Score'] * weights['Financial'] +
            scorecard['Environmental_Score'] * weights['Environmental'] +
            scorecard['Social_Score'] * weights['Social']
        )
    else:
        # If ESG data failed, use Financial Score as the final score
        scorecard['Final_Score'] = scorecard['Financial_Score']
        # Add empty columns for display consistency
        scorecard['Environmental_Score'] = 'N/A'
        scorecard['Social_Score'] = 'N/A'

    return scorecard.sort_values(by='Final_Score', ascending=False).round(1), has_esg_data

def display_dashboard(company):
    company_scores = ranked_df.loc[company]
    rank = ranked_df.index.get_loc(company) + 1
    print(f"Displaying Analysis for: {company}")
    print("-" * 35)
    print(f"Overall Rank: #{rank} out of {len(ranked_df)}")
    print(f"Final Score: {company_scores['Final_Score']:.1f}\n")
    print(f"Financial Score: {company_scores['Financial_Score']:.1f}")
    print(f"Environmental Score: {company_scores['Environmental_Score']}")
    print(f"Social Score: {company_scores['Social_Score']}")
    print("-" * 35)

    # Only plot the bar chart if we have ESG data
    if has_esg_data:
        scores = company_scores[['Financial_Score', 'Environmental_Score', 'Social_Score']]
        scores.plot(kind='bar', figsize=(8, 4), color=['skyblue', 'lightgreen', 'salmon'])
        plt.title(f'Score Breakdown for {company}')
        plt.ylabel('Score (out of 100)')
        plt.xticks(rotation=0)
        plt.ylim(0, 110)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.show()

# --- Run the final, robust agent ---
ranked_df, has_esg_data = get_final_scorecard()

if not has_esg_data:
    print("⚠️ WARNING: ESG data is currently unavailable. Rankings are based on FINANCIAL DATA ONLY.")

if not ranked_df.empty:
    interact(display_dashboard, company=widgets.Dropdown(options=ranked_df.index, description='Select Company:'))
else:
    print("🔴 Error: Could not retrieve complete financial data for any companies.")

Fetching data... (This may take a moment)
Data fetching complete.


interactive(children=(Dropdown(description='Select Company:', options=('MSFT', 'TSLA', 'V', 'COST', 'GOOGL', '…

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- Step 1: Data Gathering & Feature Engineering ---
def create_predictive_dataset(tickers, start_date="2018-01-01", end_date="2024-01-01"):
    """
    Gathers historical data, calculates scores, and creates the target variable.
    """
    # Get S&P 500 data to use as the market benchmark
    market_data = yf.download('^GSPC', start=start_date, end=end_date, progress=False)
    market_returns = market_data['Adj Close'].pct_change(periods=126).shift(-126) # 6-month forward return (126 trading days)

    all_company_data = []

    # Get the scorecard data (using our previous logic as a base)
    # In a real scenario, this data would be calculated for each historical point in time.
    # Here, we use current scores as a proxy.
    esg_grades = {
        'MSFT': {'Env': 'B', 'Soc': 'A'}, 'AAPL': {'Env': 'B', 'Soc': 'A'},
        'GOOGL': {'Env': 'B', 'Soc': 'A'}, 'JPM': {'Env': 'C', 'Soc': 'C'}, 'BAC': {'Env': 'C', 'Soc': 'C'},
        'V': {'Env': 'A', 'Soc': 'B'}, 'JNJ': {'Env': 'B', 'Soc': 'B'}, 'PFE': {'Env': 'B', 'Soc': 'B'},
        'UNH': {'Env': 'B', 'Soc': 'B'}, 'WMT': {'Env': 'C', 'Soc': 'D'}, 'COST': {'Env': 'C', 'Soc': 'B'},
        'NKE': {'Env': 'B', 'Soc': 'C'}, 'XOM': {'Env': 'F', 'Soc': 'D'}, 'CVX': {'Env': 'F', 'Soc': 'D'},
        'CAT': {'Env': 'D', 'Soc': 'C'}, 'BA': {'Env': 'C', 'Soc': 'D'}, 'UPS': {'Env': 'C', 'Soc': 'B'},
        'NEE': {'Env': 'A', 'Soc': 'B'}, 'DIS': {'Env': 'B', 'Soc': 'C'}, 'TSLA': {'Env': 'A', 'Soc': 'D'}
    }
    grade_to_score = {'A': 100, 'B': 80, 'C': 60, 'D': 40, 'F': 20}

    print("Creating historical dataset...")
    for ticker in tickers:
        # Get historical price data for the stock
        stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
        if stock_data.empty:
            continue

        # Calculate 6-month forward returns for the stock
        stock_returns = stock_data['Adj Close'].pct_change(periods=126).shift(-126)

        # Create the features (using current scores as proxy) and target
        temp_df = pd.DataFrame(index=stock_data.index)
        temp_df['Ticker'] = ticker
        temp_df['Environmental_Score'] = esg_grades.get(ticker, {}).get('Env', 'C') # Default to C
        temp_df['Social_Score'] = esg_grades.get(ticker, {}).get('Soc', 'C') # Default to C
        temp_df['6M_Return'] = stock_returns
        temp_df['Market_Return'] = market_returns

        all_company_data.append(temp_df)

    # Combine all data and clean up
    final_df = pd.concat(all_company_data)
    final_df.dropna(inplace=True)

    # Convert grades to scores
    final_df['Environmental_Score'] = final_df['Environmental_Score'].map(grade_to_score)
    final_df['Social_Score'] = final_df['Social_Score'].map(grade_to_score)

    # Create the target variable: 1 if the stock outperformed the market, 0 otherwise
    final_df['Outperformed'] = (final_df['6M_Return'] > final_df['Market_Return']).astype(int)

    # For simplicity, we'll create a proxy for financial score. A better model would use historical P/E etc.
    final_df['Financial_Score_Proxy'] = final_df['6M_Return'].rolling(window=252, min_periods=1).mean() * 100
    final_df.dropna(inplace=True)

    return final_df

# --- Step 2: Train the Predictive Model ---
tickers = [
    'MSFT', 'AAPL', 'GOOGL', 'JPM', 'BAC', 'V', 'JNJ', 'PFE', 'UNH',
    'WMT', 'COST', 'NKE', 'XOM', 'CVX', 'CAT', 'BA', 'UPS', 'NEE', 'DIS', 'TSLA'
]
dataset = create_predictive_dataset(tickers)

if not dataset.empty:
    # Define our features (X) and target (y)
    features = ['Financial_Score_Proxy', 'Environmental_Score', 'Social_Score']
    X = dataset[features]
    y = dataset['Outperformed']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Initialize and train the Random Forest model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # --- Step 3: Evaluate the Model ---
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print("\n--- Predictive Model Evaluation ---")
    print(f"Model Accuracy: {accuracy:.2%}")
    print("This score represents how often the model correctly predicted if a stock would outperform the market.")

    # --- Step 4: Interpret the Results ---
    print("\n--- Feature Importance ---")
    feature_importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
    print("This shows which factors the model found most important for predicting stock performance:")
    print(feature_importances)

else:
    print("Could not generate a dataset for model training.")

  market_data = yf.download('^GSPC', start=start_date, end=end_date, progress=False)


KeyError: 'Adj Close'

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# --- Step 1: Data Gathering & Feature Engineering ---
def create_predictive_dataset(tickers, start_date="2018-01-01", end_date="2024-01-01"):
    """
    Gathers historical data, calculates scores, and creates the target variable.
    """
    # Get S&P 500 data to use as the market benchmark
    market_data = yf.download('^GSPC', start=start_date, end=date="2024-01-01", progress=False)
    # FIXED: Use 'Close' instead of 'Adj Close'
    market_returns = market_data['Close'].pct_change(periods=126).shift(-126)

    all_company_data = []

    esg_grades = {
        'MSFT': {'Env': 'B', 'Soc': 'A'}, 'AAPL': {'Env': 'B', 'Soc': 'A'},
        'GOOGL': {'Env': 'B', 'Soc': 'A'}, 'JPM': {'Env': 'C', 'Soc': 'C'}, 'BAC': {'Env': 'C', 'Soc': 'C'},
        'V': {'Env': 'A', 'Soc': 'B'}, 'JNJ': {'Env': 'B', 'Soc': 'B'}, 'PFE': {'Env': 'B', 'Soc': 'B'},
        'UNH': {'Env': 'B', 'Soc': 'B'}, 'WMT': {'Env': 'C', 'Soc': 'D'}, 'COST': {'Env': 'C', 'Soc': 'B'},
        'NKE': {'Env': 'B', 'Soc': 'C'}, 'XOM': {'Env': 'F', 'Soc': 'D'}, 'CVX': {'Env': 'F', 'Soc': 'D'},
        'CAT': {'Env': 'D', 'Soc': 'C'}, 'BA': {'Env': 'C', 'Soc': 'D'}, 'UPS': {'Env': 'C', 'Soc': 'B'},
        'NEE': {'Env': 'A', 'Soc': 'B'}, 'DIS': {'Env': 'B', 'Soc': 'C'}, 'TSLA': {'Env': 'A', 'Soc': 'D'}
    }
    grade_to_score = {'A': 100, 'B': 80, 'C': 60, 'D': 40, 'F': 20}

    print("Creating historical dataset...")
    for ticker in tickers:
        stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
        if stock_data.empty:
            continue

        # FIXED: Use 'Close' instead of 'Adj Close'
        stock_returns = stock_data['Close'].pct_change(periods=126).shift(-126)

        temp_df = pd.DataFrame(index=stock_data.index)
        temp_df['Ticker'] = ticker
        temp_df['Environmental_Score'] = esg_grades.get(ticker, {}).get('Env', 'C')
        temp_df['Social_Score'] = esg_grades.get(ticker, {}).get('Soc', 'C')
        temp_df['6M_Return'] = stock_returns
        temp_df['Market_Return'] = market_returns

        all_company_data.append(temp_df)

    final_df = pd.concat(all_company_data)
    final_df.dropna(inplace=True)

    final_df['Environmental_Score'] = final_df['Environmental_Score'].map(grade_to_score)
    final_df['Social_Score'] = final_df['Social_Score'].map(grade_to_score)

    final_df['Outperformed'] = (final_df['6M_Return'] > final_df['Market_Return']).astype(int)

    final_df['Financial_Score_Proxy'] = final_df['6M_Return'].rolling(window=252, min_periods=1).mean() * 100
    final_df.dropna(inplace=True)

    return final_df

# --- Step 2: Train the Predictive Model ---
tickers = [
    'MSFT', 'AAPL', 'GOOGL', 'JPM', 'BAC', 'V', 'JNJ', 'PFE', 'UNH',
    'WMT', 'COST', 'NKE', 'XOM', 'CVX', 'CAT', 'BA', 'UPS', 'NEE', 'DIS', 'TSLA'
]
dataset = create_predictive_dataset(tickers)

if not dataset.empty:
    features = ['Financial_Score_Proxy', 'Environmental_Score', 'Social_Score']
    X = dataset[features]
    y = dataset['Outperformed']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # --- Step 3: Evaluate the Model ---
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print("\n--- Predictive Model Evaluation ---")
    print(f"Model Accuracy: {accuracy:.2%}")

    # --- Step 4: Interpret the Results ---
    print("\n--- Feature Importance ---")
    feature_importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
    print("This shows which factors the model found most important for predicting stock performance:")
    print(feature_importances)

else:
    print("Could not generate a dataset for model training.")

SyntaxError: invalid syntax (ipython-input-1026952506.py, line 14)

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def create_predictive_dataset(tickers, start_date="2018-01-01", end_date="2024-01-01"):
    """
    Gathers historical data, calculates scores, and creates the target variable.
    """
    # Get S&P 500 data to use as the market benchmark
    # FIXED: Corrected the 'end' argument syntax
    market_data = yf.download('^GSPC', start=start_date, end=end_date, progress=False)
    market_returns = market_data['Close'].pct_change(periods=126).shift(-126)

    all_company_data = []

    esg_grades = {
        'MSFT': {'Env': 'B', 'Soc': 'A'}, 'AAPL': {'Env': 'B', 'Soc': 'A'},
        'GOOGL': {'Env': 'B', 'Soc': 'A'}, 'JPM': {'Env': 'C', 'Soc': 'C'}, 'BAC': {'Env': 'C', 'Soc': 'C'},
        'V': {'Env': 'A', 'Soc': 'B'}, 'JNJ': {'Env': 'B', 'Soc': 'B'}, 'PFE': {'Env': 'B', 'Soc': 'B'},
        'UNH': {'Env': 'B', 'Soc': 'B'}, 'WMT': {'Env': 'C', 'Soc': 'D'}, 'COST': {'Env': 'C', 'Soc': 'B'},
        'NKE': {'Env': 'B', 'Soc': 'C'}, 'XOM': {'Env': 'F', 'Soc': 'D'}, 'CVX': {'Env': 'F', 'Soc': 'D'},
        'CAT': {'Env': 'D', 'Soc': 'C'}, 'BA': {'Env': 'C', 'Soc': 'D'}, 'UPS': {'Env': 'C', 'Soc': 'B'},
        'NEE': {'Env': 'A', 'Soc': 'B'}, 'DIS': {'Env': 'B', 'Soc': 'C'}, 'TSLA': {'Env': 'A', 'Soc': 'D'}
    }
    grade_to_score = {'A': 100, 'B': 80, 'C': 60, 'D': 40, 'F': 20}

    print("Creating historical dataset...")
    for ticker in tickers:
        stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
        if stock_data.empty:
            continue

        stock_returns = stock_data['Close'].pct_change(periods=126).shift(-126)

        temp_df = pd.DataFrame(index=stock_data.index)
        temp_df['Ticker'] = ticker
        temp_df['Environmental_Score'] = esg_grades.get(ticker, {}).get('Env', 'C')
        temp_df['Social_Score'] = esg_grades.get(ticker, {}).get('Soc', 'C')
        temp_df['6M_Return'] = stock_returns
        temp_df['Market_Return'] = market_returns

        all_company_data.append(temp_df)

    final_df = pd.concat(all_company_data)
    final_df.dropna(inplace=True)

    final_df['Environmental_Score'] = final_df['Environmental_Score'].map(grade_to_score)
    final_df['Social_Score'] = final_df['Social_Score'].map(grade_to_score)

    final_df['Outperformed'] = (final_df['6M_Return'] > final_df['Market_Return']).astype(int)

    final_df['Financial_Score_Proxy'] = final_df['6M_Return'].rolling(window=252, min_periods=1).mean() * 100
    final_df.dropna(inplace=True)

    return final_df

# --- Run the rest of the script as before ---
tickers = [
    'MSFT', 'AAPL', 'GOOGL', 'JPM', 'BAC', 'V', 'JNJ', 'PFE', 'UNH',
    'WMT', 'COST', 'NKE', 'XOM', 'CVX', 'CAT', 'BA', 'UPS', 'NEE', 'DIS', 'TSLA'
]
dataset = create_predictive_dataset(tickers)

if not dataset.empty:
    features = ['Financial_Score_Proxy', 'Environmental_Score', 'Social_Score']
    X = dataset[features]
    y = dataset['Outperformed']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print("\n--- Predictive Model Evaluation ---")
    print(f"Model Accuracy: {accuracy:.2%}")

    print("\n--- Feature Importance ---")
    feature_importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
    print("This shows which factors the model found most important for predicting stock performance:")
    print(feature_importances)

else:
    print("Could not generate a dataset for model training.")

  market_data = yf.download('^GSPC', start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)


Creating historical dataset...


  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  stock_data = yf.download(ticker, start=start_date, end=end_date, progress=False)
  st


--- Predictive Model Evaluation ---
Model Accuracy: 59.34%

--- Feature Importance ---
This shows which factors the model found most important for predicting stock performance:
Financial_Score_Proxy    0.937654
Social_Score             0.033252
Environmental_Score      0.029094
dtype: float64
