### **Imports**

In [1]:
import os
import sys

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


### **Dataset**

Dataset provides data for two subjects: Maths and Portugese. We will focus on the Portugese dataset initially as it contains more data.

In [2]:
portuguese_df = pd.read_csv('./data/Portuguese.csv')
description_df = pd.read_csv('./data/student_data_description.csv')

In [3]:
# Display entirety, no truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
description_df

Unnamed: 0,Column,Description
0,school,Student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)
1,sex,Student's sex (binary: 'F' - female or 'M' - male)
2,age,Student's age (numeric: from 15 to 22)
3,address,Student's home address type (binary: 'U' - urban or 'R' - rural)
4,famsize,Family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)
5,Pstatus,Parent's cohabitation status (binary: 'T' - living together or 'A' - apart)
6,Medu,"Mother's education (numeric: 0 - none, 1 - primary (4th grade), 2 - 5th to 9th grade, 3 - secondary, 4 - higher)"
7,Fedu,"Father's education (numeric: 0 - none, 1 - primary (4th grade), 2 - 5th to 9th grade, 3 - secondary, 4 - higher)"
8,Mjob,"Mother's job (nominal: 'teacher', 'health' care, 'services' (e.g., administrative/police), 'at_home', 'other')"
9,Fjob,"Father's job (nominal: 'teacher', 'health' care, 'services' (e.g., administrative/police), 'at_home', 'other')"


In [4]:
# Reset
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')
pd.reset_option('display.width')
pd.reset_option('display.max_colwidth')

### **Feature Analysis**

In [5]:
# Define a function for default figure styling
def default_fig(title_text="Default Title", **kwargs):
    fig = go.Figure(**kwargs)
    m = 50

    # Set default title properties (bold, larger font, etc.)
    fig.update_layout(
        title={
            'text': title_text,
            'font': {'size': 20, 'color': 'black', 'weight': 'bold'},
        },
        title_font=dict(family="Arial", size=24, color="black"),
        # Set space between ticks and plot
        margin=dict(l=m, r=m, t=m*2, b=m),
        # Set plot background color
        plot_bgcolor='white',
        # Larger axes ticks
        xaxis=dict(tickfont=dict(size=14)),
        yaxis=dict(tickfont=dict(size=14)),
        # Set gridline properties
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        xaxis_gridcolor='rgba(0, 0, 0, 0.1)',
        yaxis_gridcolor='rgba(0, 0, 0, 0.1)',
        # Thinner bars
        
    )
    return fig

#### **Compile Distributions**

In [29]:
distributions = {}

for feature in portuguese_df.columns:
    value_counts = portuguese_df[feature].value_counts()
    distributions[feature] = value_counts
        

In [136]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

m = 50

# Create a subplot with 11 rows and 3 columns
fig = make_subplots(
    rows=11,
    cols=3,
    subplot_titles=list(distributions.keys()),
    vertical_spacing=0.025,
    # horizontal_spacing=0.05,
)

# Set title, bold
fig.update_layout(title_text='Feature Distributions', title_font=dict(size=24, color='#212121', weight='bold'))

# Plot histogram in each subplot
for i, (feature, counts) in enumerate(distributions.items()):
    sorted_counts = counts.sort_index(ascending=False)
    fig.add_trace(
        # Vertical bar, set color of max value to red
        go.Bar(
            x=sorted_counts.index, 
            y=sorted_counts.values, 
            name=feature, 
            orientation='v', 
            marker=dict(color='#414141'), 
            showlegend=False,
            # Add annotation above all values
            text=sorted_counts.values,
            textposition='outside',
            textfont=dict(size=14, color='black'),
            width=0.75,


        ),
        row=(i // 3) + 1,  # Adjust the row index for 11 rows
        col=(i % 3) + 1,   # Adjust the column index for 3 columns
    )

# Update layout
fig.update_layout(
    plot_bgcolor='white',
    # Remove legend
    showlegend=False,
    height=5000,  # Set a fixed height for scrolling
    margin=dict(l=m, r=m, t=m*2, b=m, pad=5),
)


fig.update_annotations(font=dict(size=18, color='#212121', family='Arial', weight='bold'), 
                      xanchor='left', yanchor='top', xshift=0, yshift=40)

# Save to HTML
fig.write_html('distributions.html', include_plotlyjs='cdn')  # Use CDN for better loading performance

### **Feature Analysis**