# ESG Scoring Dashboard - Exploratory Analysis

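This notebook provides exploratory data analysis for the ESG scoring system, including data quality assessment, correlation analysis, and preliminary insights. It currently runs on a synthetic 20-company sample generated inside the notebook, so the numerical results are illustrative only.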

## Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Ignore every warning raised from this point on, regardless of category.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings from the libraries used below — consider narrowing it to the
# specific categories/modules that are known to be noisy.
warnings.filterwarnings('ignore')

# Set plotting style: matplotlib's 'default' style sheet for figures and
# seaborn's "husl" palette as the colour cycle for the plots that follow.
plt.style.use('default')
sns.set_palette("husl")

%matplotlib inline

## Data Loading and Initial Exploration

In [None]:
# Load sample data (replace with actual data paths when available)
# For demonstration, we build a small synthetic dataset instead.
#
# NOTE: with the fixed seed below, the generated values depend on the exact
# order of the np.random calls, so keep the draws in this order when editing.

np.random.seed(42)

# Tickers and their sectors, paired by position (companies[k] <-> sectors[k])
companies = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'META', 'NVDA', 'JPM', 'JNJ', 'PG',
             'KO', 'DIS', 'NFLX', 'ADBE', 'CRM', 'INTC', 'CSCO', 'PEP', 'WMT', 'HD']

sectors = ['Technology', 'Technology', 'Technology', 'Consumer Discretionary', 'Automotive',
           'Technology', 'Technology', 'Financial', 'Healthcare', 'Consumer Staples',
           'Consumer Staples', 'Consumer Discretionary', 'Technology', 'Technology', 'Technology',
           'Technology', 'Technology', 'Consumer Staples', 'Consumer Staples', 'Consumer Discretionary']

# zip() stops at the shorter input, so a missing entry in either list would
# silently drop companies; fail loudly instead.
if len(companies) != len(sectors):
    raise ValueError(
        f"companies ({len(companies)}) and sectors ({len(sectors)}) must have the same length"
    )

data = []
for ticker, sector in zip(companies, sectors):
    # Company-level baseline shared by the three component scores below
    base_esg = np.random.normal(60, 15)

    data_point = {
        'ticker': ticker,
        'company_name': f'{ticker} Inc.',
        'sector': sector,
        # Each component = shared baseline + its own noise, clipped to [0, 100]
        'environmental_score': np.clip(base_esg + np.random.normal(0, 8), 0, 100),
        'social_score': np.clip(base_esg + np.random.normal(0, 8), 0, 100),
        'governance_score': np.clip(base_esg + np.random.normal(0, 8), 0, 100),
        # Financial fields are drawn independently of base_esg
        'market_cap': np.random.lognormal(25, 1),
        'revenue': np.random.lognormal(24, 1),
        'price_change_1y': np.random.normal(10, 25),
        'volatility': np.random.uniform(15, 45),
        'pe_ratio': np.random.uniform(8, 40),
        'roe': np.random.uniform(5, 30),
        'debt_to_equity': np.random.uniform(0.1, 2.0),
    }

    # Composite score: unweighted mean of the three component scores
    data_point['esg_score'] = (data_point['environmental_score'] +
                               data_point['social_score'] +
                               data_point['governance_score']) / 3

    data.append(data_point)

df = pd.DataFrame(data)
print(f"Dataset shape: {df.shape}")
df.head()

## Data Quality Assessment

In [None]:
# Basic data info
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nMissing Values:")
# Per-column count of null entries
print(df.isnull().sum())
print("\nBasic Statistics:")
# Summary statistics for the numeric columns
print(df.describe())

## ESG Score Distribution Analysis

In [None]:
# ESG score distributions: one histogram per score on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('ESG Score Distributions', fontsize=16)

# (column, panel title, x-axis label, bar colour), listed in row-major panel
# order: top-left, top-right, bottom-left, bottom-right
histogram_panels = [
    ('esg_score', 'Overall ESG Score', 'ESG Score', 'green'),
    ('environmental_score', 'Environmental Score', 'Environmental Score', 'blue'),
    ('social_score', 'Social Score', 'Social Score', 'orange'),
    ('governance_score', 'Governance Score', 'Governance Score', 'red'),
]

for panel_ax, (column, panel_title, x_label, bar_colour) in zip(axes.flat, histogram_panels):
    panel_ax.hist(df[column], bins=15, alpha=0.7, color=bar_colour)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## Sector Analysis

In [None]:
# ESG scores by sector

# Reshape to long format: one row per (company, ESG component) so seaborn can
# group the boxes by sector and colour them by component
component_columns = ['environmental_score', 'social_score', 'governance_score']
df_melted = df.melt(
    id_vars=['sector'],
    value_vars=component_columns,
    var_name='ESG_Component',
    value_name='Score',
)

# Box plot of the component scores, grouped by sector
sector_fig, sector_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df_melted, x='sector', y='Score', hue='ESG_Component', ax=sector_ax)
sector_ax.set_title('ESG Component Scores by Sector')
sector_ax.set_xlabel('Sector')
sector_ax.set_ylabel('ESG Score')
plt.xticks(rotation=45)  # acts on the current axes, i.e. sector_ax
sector_ax.legend(title='ESG Component')
plt.tight_layout()
plt.show()

# Mean composite ESG score per sector, highest first
sector_avg = (
    df.groupby('sector')['esg_score']
    .mean()
    .sort_values(ascending=False)
)
print("\nAverage ESG Score by Sector:")
print(sector_avg)

## ESG-Financial Performance Correlation Analysis

In [None]:
# Correlation matrix over the ESG scores and the financial metrics
correlation_cols = ['esg_score', 'environmental_score', 'social_score', 'governance_score',
                    'price_change_1y', 'volatility', 'pe_ratio', 'roe', 'debt_to_equity']

# DataFrame.corr() defaults to pairwise Pearson correlation
corr_matrix = df[correlation_cols].corr()

plt.figure(figsize=(12, 10))
# Diverging colour map centred on 0 so positive and negative correlations are
# visually distinguishable
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0, square=True,
            fmt='.2f', cbar_kws={'label': 'Correlation Coefficient'})
plt.title('ESG-Financial Metrics Correlation Matrix')
plt.tight_layout()
plt.show()

# Print strongest correlations with ESG score.
# Rank by magnitude, but display the signed coefficients: printing only the
# absolute values would hide whether a relationship is positive or negative.
# The three component scores appear in this list as well as the financial
# metrics.
signed_esg_corr = corr_matrix['esg_score'].drop('esg_score')
esg_correlations = signed_esg_corr.abs().sort_values(ascending=False)
print("\nStrongest correlations with overall ESG Score:")
print(signed_esg_corr.loc[esg_correlations.index])

## Scatter Plot Analysis

In [None]:
# Scatter plots of ESG vs Financial metrics, each with a fitted straight line
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('ESG Score vs Financial Performance Metrics', fontsize=16)

# (metric column, y-axis label, panel title, marker colour), listed in
# row-major panel order
scatter_panels = [
    ('price_change_1y', '1-Year Price Change (%)', 'ESG Score vs Stock Return', 'green'),
    ('volatility', 'Volatility (%)', 'ESG Score vs Volatility', 'blue'),
    ('roe', 'Return on Equity (%)', 'ESG Score vs ROE', 'orange'),
    ('pe_ratio', 'P/E Ratio', 'ESG Score vs P/E Ratio', 'red'),
]

for panel_ax, (metric_col, y_label, panel_title, marker_colour) in zip(axes.flat, scatter_panels):
    panel_ax.scatter(df['esg_score'], df[metric_col], alpha=0.7, color=marker_colour)
    panel_ax.set_xlabel('ESG Score')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    # Add trend line: degree-1 least-squares fit of the metric on the ESG score
    z = np.polyfit(df['esg_score'], df[metric_col], 1)
    p = np.poly1d(z)
    panel_ax.plot(df['esg_score'], p(df['esg_score']), "r--", alpha=0.8)

plt.tight_layout()
plt.show()

## Statistical Significance Testing

In [None]:
# Pearson correlation test of the composite ESG score against each financial
# metric, reported with a 0.05 significance threshold.
# NOTE(review): five tests are run at 0.05 each with no multiple-comparison
# correction — confirm whether that is acceptable for the intended use.
financial_metrics = ['price_change_1y', 'volatility', 'pe_ratio', 'roe', 'debt_to_equity']

print("Statistical Significance of ESG-Financial Correlations:")
print("=" * 55)

for metric in financial_metrics:
    # pearsonr returns the correlation coefficient and its p-value
    correlation, p_value = stats.pearsonr(df['esg_score'], df[metric])

    if p_value < 0.05:
        significance = "Significant"
    else:
        significance = "Not Significant"

    # The trailing empty entry yields the blank separator line between metrics
    report_lines = [
        f"ESG Score vs {metric}:",
        f"  Correlation: {correlation:.3f}",
        f"  P-value: {p_value:.4f}",
        f"  Significance: {significance}",
        "",
    ]
    print("\n".join(report_lines))

## ESG Component Relationships

In [None]:
# Pairwise correlation among the three ESG component scores
esg_components = ['environmental_score', 'social_score', 'governance_score']
esg_corr = df.loc[:, esg_components].corr()

# Annotated heatmap of the 3x3 component correlation matrix
heatmap_options = dict(
    annot=True,
    cmap='Blues',
    square=True,
    fmt='.3f',
    cbar_kws={'label': 'Correlation Coefficient'},
)
plt.figure(figsize=(8, 6))
sns.heatmap(esg_corr, **heatmap_options)
plt.title('Correlation Between ESG Components')
plt.tight_layout()
plt.show()

# Summary statistics for the components alongside the composite score
print("ESG Component Statistics:")
print(df[[*esg_components, 'esg_score']].describe())

## Top and Bottom Performers Analysis

In [None]:
# Columns shown in the ranking tables and in the financial comparison
score_columns = ['ticker', 'sector', 'esg_score', 'environmental_score', 'social_score', 'governance_score']
financial_columns = ['price_change_1y', 'volatility', 'pe_ratio', 'roe']
divider = "\n" + "="*50 + "\n"

# The five highest- and five lowest-scoring companies by composite ESG score;
# computed once and reused for both the tables and the financial averages
leaders = df.nlargest(5, 'esg_score')
laggards = df.nsmallest(5, 'esg_score')

# Top 5 ESG performers
top_esg = leaders[score_columns]
print("Top 5 ESG Performers:")
print(top_esg)

print(divider)

# Bottom 5 ESG performers
bottom_esg = laggards[score_columns]
print("Bottom 5 ESG Performers:")
print(bottom_esg)

# Compare financial performance: column-wise mean of each group
print(divider)
print("Financial Performance Comparison:")
print("Top ESG Performers - Average Financial Metrics:")
top_financial = leaders[financial_columns].mean()
print(top_financial)

print("\nBottom ESG Performers - Average Financial Metrics:")
bottom_financial = laggards[financial_columns].mean()
print(bottom_financial)

## Key Insights and Conclusions

### Summary of Findings:

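1. **ESG Score Distribution**: The ESG scores are roughly normally distributed, with most companies scoring between 40 and 80. This is expected for the sample data, whose scores are drawn from a normal distribution centred on 60.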

2. **Sector Variations**: Different sectors show varying ESG performance patterns, with some sectors consistently outperforming others.

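3. **Component Correlations**: The three ESG components (Environmental, Social, Governance) show moderate to strong correlations with each other, suggesting that companies that perform well in one area tend to perform well in the others. In the sample data this holds by construction, because all three components are generated from a shared base score plus independent noise.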

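4. **Financial Performance Relationships**:
   - ESG scores show correlations of varying strength and direction with the financial metrics (1-year price change, volatility, P/E ratio, ROE, debt-to-equity).
   - Whether each relationship is statistically significant is reported by the Pearson tests above (threshold p < 0.05); with only 20 companies, these tests have little power.
   - The strength and direction of the correlations can provide insights for investment decisions once real data is used. In the synthetic sample the financial metrics are generated independently of the ESG scores, so any correlation observed here is sampling noise.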

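5. **Top vs Bottom Performers**: Companies with higher ESG scores may show different risk-return profiles compared with lower-scoring companies. The comparison above averages only five companies per group, so differences should be treated as indicative rather than conclusive.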

### Next Steps:

1. **Data Quality**: Improve data collection to include more companies and time series data
2. **Advanced Analytics**: Implement machine learning models for ESG score prediction
3. **Risk Analysis**: Conduct deeper analysis of ESG impact on investment risk
4. **Sector Benchmarking**: Develop sector-specific ESG benchmarks
5. **Dashboard Integration**: Integrate these insights into the interactive dashboard