# Comparative Study of Random Forest and Gradient-Boosted Trees for Predicting Indonesian Public University Tuition Fees

## Exploratory Data Analysis

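This notebook performs exploratory data analysis (EDA) on the tuition-fee dataset used in our research project: it loads the data, inspects summary statistics and missing values, visualizes the distributions of the UKT target columns and the categorical features, and examines how the two relate.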

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set style for plots
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    # Matplotlib < 3.6 only ships this style sheet under its original name
    # ('seaborn'); an unknown style name is reported as OSError.
    plt.style.use('seaborn')
sns.set_palette("husl")

In [None]:
# Load the data
# NOTE: the path is relative, so it is resolved against the current working
# directory of the running kernel.
data_path = "../Data/data.csv"
if not os.path.exists(data_path):
    # Fail fast: every later cell needs `df`, so carrying on after a missing
    # file would only surface as an unrelated NameError further down.
    raise FileNotFoundError(f"Data file not found at {data_path}")

df = pd.read_csv(data_path)
print(f"Data loaded successfully with shape: {df.shape}")

print("Dataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Summary statistics from describe(), followed by a preview of the leading rows
print("Dataset description:", df.describe(), sep="\n")

print("\nFirst few rows:", df.head(), sep="\n")

In [None]:
# Count the missing entries in each column
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

In [None]:
# Histogram of each target column, one panel per column on a 2x2 grid
target_cols = ['UKT-1', 'UKT-2', 'UKT-3', 'UKT-4']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

# Pair every panel with the column it displays
for ax, col in zip(axes, target_cols):
    ax.hist(df[col], bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Analyze categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"Categorical columns: {list(categorical_cols)}")

# Plot distributions of categorical variables.
# The grid keeps the original 2x3 layout for up to six columns and adds rows
# beyond that, so no categorical column is silently left out of the figure.
n_grid_cols = 3
n_grid_rows = max(2, -(-len(categorical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows,
    n_grid_cols,
    figsize=(18, 5 * n_grid_rows),  # 5 inches per row, i.e. (18, 10) for 2 rows
)
axes = axes.flatten()

# One bar chart of value frequencies per categorical column
for ax, col in zip(axes, categorical_cols):
    value_counts = df[col].value_counts()
    ax.bar(value_counts.index, value_counts.values)
    ax.set_title(f'Distribution of {col}')
    ax.tick_params(axis='x', rotation=45)

# Hide unused subplots
for ax in axes[len(categorical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap
numeric_features = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_features.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,  # centre the colormap at zero correlation
    square=True,
)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Analyze relationship between categorical variables and target
# For each categorical variable, plot boxplots of target variables
# (one grid row per categorical column, one grid column per target column).
if len(categorical_cols) > 0 and len(target_cols) > 0:
    # squeeze=False always yields a 2-D array of axes, so a single row or a
    # single column needs no special-case reshaping before axes[i, j].
    fig, axes = plt.subplots(
        len(categorical_cols),
        len(target_cols),
        figsize=(20, 5 * len(categorical_cols)),
        squeeze=False,
    )

    for i, cat_col in enumerate(categorical_cols):
        for j, target_col in enumerate(target_cols):
            ax = axes[i, j]
            sns.boxplot(data=df, x=cat_col, y=target_col, ax=ax)
            ax.set_title(f'{target_col} by {cat_col}')
            ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots() rejects a grid with zero rows or columns, so there is
    # nothing to draw in this case.
    print("No categorical or target columns available; skipping boxplots.")

In [None]:
# Mean of every target column within each level of each categorical column
for cat_col in categorical_cols:
    print(f"\nSummary of target variables by {cat_col}:")
    grouped = df.groupby(cat_col)
    summary = grouped[target_cols].mean()
    print(summary)

In [None]:
# Line plot of each target column over the first 50 rows of the dataset.
# NOTE(review): the title says "Across Years" and "Universities", but the code
# plots the four UKT columns against the row index; confirm the wording
# matches what the columns and rows actually represent.
n_rows_shown = 50
_, fee_ax = plt.subplots(figsize=(12, 6))

for col in target_cols:
    first_rows = df[col].head(n_rows_shown)
    fee_ax.plot(first_rows, label=col, marker='o')

fee_ax.set_title('Tuition Fees Across Years (First 50 Universities)')
fee_ax.set_xlabel('University Index')
fee_ax.set_ylabel('Tuition Fee (IDR)')
fee_ax.legend()
fee_ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Pre-modeling sanity checks: list the categorical levels, then look for
# non-finite values in the numeric columns
print("Unique values in categorical variables:")
for col in categorical_cols:
    levels = df[col].unique()
    print(f"{col}: {levels}")

has_infinite = np.isinf(df.select_dtypes(include=[np.number])).any().any()
print(f"\nAny infinite values: {has_infinite}")
has_nan = np.isnan(df.select_dtypes(include=[np.number])).any().any()
print(f"Any NaN values: {has_nan}")

In [None]:
# Write the DataFrame to CSV without the index column.
# NOTE(review): none of the cells shown here modify `df`, so this file appears
# to be a copy of the loaded data; confirm whether processing is still to come.
processed_data_path = "../Data/processed_data.csv"
df.to_csv(path_or_buf=processed_data_path, index=False)
print(f"Processed data saved to {processed_data_path}")