In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import ipywidgets as widgets
from IPython.display import display

In [2]:
# Load the raw energy consumption table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/World Energy Consumption.csv')

In [3]:
# Restrict the dataset to ASEAN member states and the 2017-2022 window, keeping
# only the columns offered to the correlation explorer further down.
# NOTE(review): every name here must match the spelling used in df['country']
# exactly; names that match nothing are reported below rather than silently dropped.
asean_countries = ['Brunei', 'Cambodia', 'Indonesia', 'Lao People\'s Democratic Republic',
                    'Malaysia', 'Myanmar', 'Philippines', 'Singapore', 'Thailand', 'Vietnam']
years = ['2017', '2018', '2019', '2020', '2021', '2022']

# Columns kept for the analysis
selected_features = ['population', 'gdp', 'biofuel_consumption', 'coal_consumption',
                     'fossil_fuel_consumption', 'gas_consumption', 'hydro_consumption',
                     'oil_consumption', 'wind_consumption', 'other_renewable_consumption',
                     'solar_consumption']

# `years` holds strings, but a numeric 'year' column is parsed by read_csv as
# integers (or floats), and isin() never matches the number 2017 against the
# string '2017' -- the filter would silently return zero rows. Compare
# numerically so the selection works whichever dtype the column ended up with.
year_values = [int(year) for year in years]
country_mask = df['country'].isin(asean_countries)
year_mask = pd.to_numeric(df['year'], errors='coerce').isin(year_values)

# .loc with a single mask + column list avoids chained indexing; .copy() makes
# filtered_df an independent frame rather than a view-like slice of df.
filtered_df = df.loc[country_mask & year_mask, selected_features].copy()

# Make spelling mismatches between the list above and the dataset visible.
missing_countries = sorted(set(asean_countries) - set(df['country']))
if missing_countries:
    print("Countries not found in the dataset:", missing_countries)

In [4]:
# Multi-select list box offering every column of filtered_df;
# 'population' and 'gdp' are pre-selected when the widget first renders.
feature_selector = widgets.SelectMultiple(
    description='Features:',
    options=list(filtered_df.columns),
    value=['population', 'gdp'],
    disabled=False,
)

def update_plot(features):
    """Plot the most positively correlated pair among the selected features.

    Computes the pairwise correlation matrix of the chosen ``filtered_df``
    columns, picks the pair of *distinct* features with the highest
    correlation, fits a one-variable linear regression to that pair and draws
    a scatter plot with the fitted line and its equation.

    Parameters
    ----------
    features : iterable of str
        Column names of ``filtered_df`` to consider (the widget passes a tuple).

    Returns
    -------
    None
        Results are printed / plotted. The function returns early, after
        printing a message, when fewer than two features are given or when no
        usable correlation exists.
    """
    features = list(features)  # Convert tuple to list
    print("Selected Features:", features)  # Check the selected features
    if len(features) < 2:
        print("Select at least 2 features.")
        return

    # Create a new DataFrame with only the selected features
    subset_df = filtered_df[features]
    print("Subset DataFrame Shape:", subset_df.shape)  # Check the shape of the subset DataFrame

    # Calculate the correlation matrix
    correlation_matrix = subset_df.corr()

    # Check if the correlation matrix is empty
    if correlation_matrix.empty:
        print("No correlations found.")
        return

    # Flatten the matrix into one row per (feature, feature) combination.
    pairs = correlation_matrix.stack().reset_index()
    pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']

    # Drop self-pairs by name instead of replacing the value 1 with 0: a value
    # replacement would also wipe out a genuine perfect correlation between two
    # different features, and would miss a diagonal that is not exactly 1.0.
    pairs = pairs[pairs['Feature 1'] != pairs['Feature 2']]
    # Undefined correlations (constant or all-missing columns) cannot be ranked.
    pairs = pairs.dropna(subset=['Correlation'])
    if pairs.empty:
        print("No correlations found.")
        return

    # Find the pair with the highest correlation (first one wins on ties)
    best_pair = pairs.loc[pairs['Correlation'].idxmax()]
    x_feature, y_feature = best_pair['Feature 1'], best_pair['Feature 2']

    # corr() skips missing values pairwise, but LinearRegression.fit rejects
    # NaN, so fit and plot on the rows where both features are present.
    pair_df = subset_df[[x_feature, y_feature]].dropna()

    # Perform linear regression
    x = pair_df[x_feature].values.reshape(-1, 1)
    y = pair_df[y_feature].values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(x, y)
    slope = model.coef_[0][0]
    intercept = model.intercept_[0]

    # Plot scatterplot
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=pair_df, x=x_feature, y=y_feature, ax=ax)
    ax.set_xlabel(x_feature)
    ax.set_ylabel(y_feature)
    ax.set_title(f"{y_feature} vs {x_feature}")

    # Plot regression line
    ax.plot(pair_df[x_feature], slope * pair_df[x_feature] + intercept, color='red')

    # Display equation of the line (axes-relative coordinates, top-left corner)
    ax.text(0.1, 0.9, f"y = {slope:.2f}x + {intercept:.2f}", transform=ax.transAxes)

    plt.show()

# Create a button to update the plot
button = widgets.Button(description="Update Plot")

# Anything printed or plotted inside a widget callback is not reliably shown in
# the cell's output area (JupyterLab routes it to the log console). Capturing
# it in an Output widget makes the messages and the figure appear below the
# controls.
plot_output = widgets.Output()

# Event handler for button click
def on_button_clicked(b):
    """Redraw the plot for the current selection; `b` is the clicked button (unused)."""
    # wait=True defers clearing until new output arrives, so earlier results
    # are replaced rather than stacked and the area does not flicker.
    plot_output.clear_output(wait=True)
    with plot_output:
        update_plot(feature_selector.value)

button.on_click(on_button_clicked)

# Display widgets
display(feature_selector)
display(button)
display(plot_output)

SelectMultiple(description='Features:', index=(0, 1), options=('population', 'gdp', 'biofuel_consumption', 'co…

Button(description='Update Plot', style=ButtonStyle())