### **Imports**

In [10]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import gradio as gr

### **Dataset**

The dataset provides data for two subjects: Maths and Portuguese. We will focus on the Portuguese dataset initially, as it contains more data.

In [11]:
# Read the student records and the table describing each column.
# The generator yields one DataFrame per path, unpacked in the same order as the paths.
portugese_df, description_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/Portuguese.csv', './data/student_data_description.csv')
)

### **Feature Analysis**

#### **Compile Distributions**

In [12]:
# distributions: feature name -> Series of value counts for that column
# tooltip_text:  feature name -> description text taken from description_df
distributions = {}
tooltip_text = {}

# Build the column -> description lookup once instead of filtering
# description_df again for every feature. keep='first' selects the same row
# the previous `.values[0]` lookup did when a column is described more than once.
description_lookup = (
    description_df
    .drop_duplicates(subset='Column', keep='first')
    .set_index('Column')['Description']
)

for feature in portugese_df.columns:
    value_counts = portugese_df[feature].value_counts()
    distributions[feature] = value_counts
    # A feature without a description row gets an empty tooltip
    # rather than aborting the whole cell with an IndexError.
    tooltip_text[feature] = description_lookup.get(feature, '')
        

In [13]:
# Render one bar chart per feature in a 3-column grid and export it as HTML.
# make_subplots and go come from the imports cell at the top of the notebook.

m = 50  # base margin in pixels

# Size the grid from the number of features (ceiling division) so that every
# feature gets a subplot; a fixed 11x3 grid fails once there are more than 33.
n_cols = 3
n_rows = max(1, -(-len(distributions) // n_cols))

fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=list(distributions),
    vertical_spacing=0.025,
)

# Bold figure title
fig.update_layout(title_text='Feature Distributions', title_font=dict(size=24, color='#212121', weight='bold'))

# One bar trace per feature, filling the grid row by row
for i, (feature, counts) in enumerate(distributions.items()):
    sorted_counts = counts.sort_index(ascending=False)
    fig.add_trace(
        # Vertical bars in a single dark grey, each labelled with its count
        go.Bar(
            x=sorted_counts.index,
            y=sorted_counts.values,
            name=feature,
            orientation='v',
            marker=dict(color='#414141'),
            showlegend=False,
            text=sorted_counts.values,
            textposition='outside',
            textfont=dict(size=14, color='black'),
            width=0.75,
        ),
        row=(i // n_cols) + 1,  # 1-based row in the grid
        col=(i % n_cols) + 1,   # 1-based column in the grid
    )

fig.update_layout(
    plot_bgcolor='white',
    showlegend=False,
    height=5000,  # fixed height; the exported page scrolls
    margin=dict(l=m, r=m, t=m*2, b=m, pad=5),
)

# Subplot titles are annotations: restyle them and shift them above each plot
fig.update_annotations(font=dict(size=18, color='#212121', family='Arial', weight='bold'),
                       xanchor='left', yanchor='top', xshift=0, yshift=40)

# Export; plotly.js is referenced from the CDN instead of being embedded
fig.write_html('../docs/distributions.html', include_plotlyjs='cdn')

#### **Correlation Matrix**

In [83]:
# Integer-encode every object-dtype column so the whole frame can be correlated.
# NOTE(review): the encoded values are written back into portugese_df itself, so
# any cell executed after this one sees integer codes instead of the original strings.
le = LabelEncoder()
object_columns = portugese_df.select_dtypes(include=['object']).columns
for column_name in object_columns:
    encoded = le.fit_transform(portugese_df[column_name])
    portugese_df[column_name] = encoded

# Pairwise correlations, with every cell strictly above the diagonal blanked to NaN
# so that only the lower triangle and the diagonal remain.
correlation_matrix = portugese_df.corr()
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
correlation_matrix = correlation_matrix.where(~mask)

In [80]:
# Heatmap of the masked correlation matrix on a fixed [-1, 1] colour range.
heatmap_trace = go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='RdBu',
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=heatmap_trace)

# `m` (base margin) is not set in this cell; it must already exist in the kernel.
fig.update_layout(
    title='Correlation Matrix',
    title_font=dict(size=24, color='#212121', weight='bold'),
    margin=dict(l=m, r=m, t=m*2, b=m, pad=5),
    width=800,
    height=800,
    plot_bgcolor='white',
)

# Both axes share the same fonts; x labels are tilted and the y axis is
# reversed so the first feature sits at the top.
axis_title_font = dict(size=20, color='#212121', weight='bold')
axis_tick_font = dict(size=14, color='#212121')
fig.update_xaxes(title_text=None, title_font=axis_title_font, tickangle=-45, tickfont=axis_tick_font)
fig.update_yaxes(title_text=None, title_font=axis_title_font, autorange='reversed', tickfont=axis_tick_font)

# NOTE(review): a later cell writes to this same path, so on a top-to-bottom
# run this heatmap file is replaced - confirm whether that is intended.
fig.write_html('../docs/correlation_matrix.html', include_plotlyjs='cdn')

In [91]:
# Marker-based view of the correlation matrix: one square per retained cell,
# sized by |correlation| and coloured by the signed value.
# np and go come from the imports cell at the top of the notebook;
# correlation_matrix must already exist in the kernel.
z_values = correlation_matrix.values
x_labels = correlation_matrix.columns
y_labels = correlation_matrix.columns

max_marker_size = 16  # cap applied to the scaled size before the +2 offset

# (row, column) index pairs of every cell that is off the diagonal and not NaN,
# collected row by row.
cells = [
    (row, col)
    for row in range(len(y_labels))
    for col in range(len(x_labels))
    if row != col and not np.isnan(z_values[row, col])
]

x_data = [col for _, col in cells]  # column index -> x position
y_data = [row for row, _ in cells]  # row index -> y position
# |correlation| * 30, capped at max_marker_size, plus 2 so weak correlations stay visible
sizes = [min(abs(z_values[row, col] * 30), max_marker_size) + 2 for row, col in cells]
colors = [z_values[row, col] for row, col in cells]
hover_text = [f'{x_labels[j]} vs {y_labels[i]}: {z_values[i, j]:.2f}' for i, j in cells]

marker_style = dict(
    size=sizes,
    color=colors,
    colorscale='RdBu',
    colorbar=dict(title='Correlation'),
    cmin=-1,  # pin the colour range to the full correlation range
    cmax=1,
    showscale=True,
    line=dict(width=1, color='DarkSlateGrey'),  # thin outline around each marker
    symbol='square'
)

# A single scatter trace holds every marker
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=x_data,
        y=y_data,
        mode='markers',
        marker=marker_style,
        text=hover_text,
        hoverinfo='text',
    )
)

m = 40

# White background with faint gridlines on both axes
grid_style = dict(showgrid=True, gridwidth=1, gridcolor='rgba(0,0,0,0.05)', zeroline=False)
fig.update_layout(
    title='Correlation Matrix',
    title_font=dict(size=24, color='#212121', weight='bold'),
    margin=dict(l=m, r=m, t=m * 2, b=m, pad=0),
    plot_bgcolor='white',
    xaxis=dict(grid_style),
    yaxis=dict(grid_style),
)

# Put the feature names on the integer tick positions; the y axis is reversed
# so the first feature sits at the top.
shared_axis_style = dict(
    title_text=None,
    title_font=dict(size=20, color='#212121', weight='bold'),
    tickfont=dict(size=14, color='#212121'),
)
fig.update_xaxes(
    tickangle=-45,
    tickvals=list(range(len(x_labels))),
    ticktext=x_labels,
    **shared_axis_style,
)
fig.update_yaxes(
    autorange='reversed',
    tickvals=list(range(len(y_labels))),
    ticktext=y_labels,
    **shared_axis_style,
)

# Export; plotly.js is referenced from the CDN instead of being embedded
fig.write_html('../docs/correlation_matrix.html', include_plotlyjs='cdn')