In [None]:

# Cell 1: Install FLAML
!pip install flaml

In [None]:
# Cell 2: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from flaml import AutoML
from sklearn.metrics import accuracy_score

# Ignore warnings
warnings.filterwarnings('ignore')

In [None]:
# Cell 3: Load Data
# Read both CSV files the same way; the `id` column becomes the row index.
train, test = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Cell 4: Initial Data Inspection
# Rendered in order: the first rows, the dtype of every column, and the number
# of distinct values per column.
display(train.head(), train.dtypes, train.nunique())

In [None]:
# Cell 5: Separating Features and Removing White Spaces from Feature Names
def _underscored(names):
    """Return ``names`` as a list with every space replaced by an underscore."""
    return [name.replace(' ', '_') for name in names]


# The feature lists go through the same renaming as the DataFrame columns
# below, so that e.g. `train[numerical_features]` keeps working after this cell.
target_feature = ['Target']

categorical_features = _underscored([
    'Marital status', 'Application mode', 'Application order', 'Course',
    'Daytime/evening attendance', 'Previous qualification', 'Nacionality',
    "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation"
])

binary_categorical_features = _underscored([
    'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date',
    'Gender', 'Scholarship holder', 'International'
])

numerical_features = _underscored([
    'Previous qualification (grade)', 'Admission grade', 'Age at enrollment',
    'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate', 'Inflation rate', 'GDP'
])

# Removing white spaces from feature names
train.columns = _underscored(train.columns)
test.columns = _underscored(test.columns)

In [None]:
# Cell 6: Correlation Check
# Every feature column takes part in the correlation matrix. `id` is already
# the index, so only 'Target' has to be set aside and re-added as integers.
# The integer codes are arbitrary labels for a nominal variable, so the
# 'Target' row is a rough indication only, not an ordinal relationship.
target_codes = {'Graduate': 1, 'Dropout': 2}  # any other value is coded as 3
X = train.drop(columns='Target').assign(
    Target=train['Target'].map(target_codes).fillna(3).astype(int)
)
corr = X.corr()

# The matrix is symmetric: hide the upper triangle (diagonal included).
mask = np.triu(np.ones_like(corr, dtype=bool))

# Plotting correlation heatmap on the axes created for it
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr, mask=mask, cmap='Reds', linewidths=0.1, ax=ax)
ax.set_title('CORRELATION MAP', color='blue', fontsize=12)
plt.show()

In [None]:
# Cell 7: Distribution Plots
palette = sns.color_palette("Set2", 3)
target_palette = {'Graduate': palette[0], 'Dropout': palette[1], 'Enrolled': palette[2]}

# One fixed class order for the per-class panels, so each class always gets
# its own colour regardless of the order in which labels occur in the data.
target_order = list(target_palette)
target_colors = [target_palette[name] for name in target_order]

# The style has to be set before the axes are created to apply to this figure.
sns.set_style("whitegrid")
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Bars stay sorted by frequency; colours are looked up by label, not position.
target_counts = train['Target'].value_counts()
target_counts.plot(
    kind='bar', ax=axes[0, 0], title='Distribution of Target Variable',
    color=[target_palette.get(label, 'grey') for label in target_counts.index],
)
axes[0, 0].set_xlabel('Target')
axes[0, 0].set_ylabel('Count')

train['Admission_grade'].plot(kind='hist', bins=20, ax=axes[0, 1], title='Distribution of Admission Grade', color=palette[0])
axes[0, 1].set_xlabel('Admission Grade')
axes[0, 1].set_ylabel('Frequency')

# The helper column lives on a plotting copy only; `train` is left untouched.
grades = train.assign(
    Mean_Semester_Grade=(train['Curricular_units_1st_sem_(grade)'] + train['Curricular_units_2nd_sem_(grade)']) / 2
)
sns.boxplot(x='Target', y='Mean_Semester_Grade', data=grades, order=target_order, palette=target_colors, ax=axes[0, 2])
axes[0, 2].set_title('Mean Semester Grades by Target Variable')
axes[0, 2].set_xlabel('Target')
axes[0, 2].set_ylabel('Mean Semester Grades')

train['Age_at_enrollment'].plot(kind='hist', bins=20, ax=axes[1, 0], title='Age Distribution', color=palette[0])
axes[1, 0].set_xlabel('Age at Enrollment')
axes[1, 0].set_ylabel('Frequency')

sns.boxplot(x='Target', y='Admission_grade', data=train, order=target_order, palette=target_colors, ax=axes[1, 1])
axes[1, 1].set_title('Admission Grade by Target Variable')
axes[1, 1].set_xlabel('Target')
axes[1, 1].set_ylabel('Admission Grade')

# Sort by the code so the tick labels are tied to the values, not to which
# gender happens to be more frequent.
# NOTE(review): assumes Gender is coded 0 = female, 1 = male — confirm against
# the data dictionary.
train['Gender'].value_counts().sort_index().plot(kind='bar', ax=axes[1, 2], title='Distribution of Gender', color=palette[0])
axes[1, 2].set_xlabel('Gender')
axes[1, 2].set_xticklabels(['Female', 'Male'], rotation=0)
axes[1, 2].set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Cell 8: Gender vs. Target Distribution
# `hue_order` pins the class order and the dict palette colours each class by
# name, so the colours cannot drift with the order of labels in the data.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    x='Gender', hue='Target', data=train,
    hue_order=list(target_palette),
    palette=target_palette,
    ax=ax,
)
ax.set_title('Target Distribution by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
ax.legend(title='Target')
# NOTE(review): assumes Gender is coded 0 = female, 1 = male — confirm against
# the data dictionary.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Female', 'Male'])
plt.show()

In [None]:
# Cell 9: Chi-Square Test
# Test of independence between Gender and Target, computed on their
# contingency table; only the statistic and the p-value are reported.
contingency_table = pd.crosstab(index=train['Gender'], columns=train['Target'])
chi2, p, dof, ex = chi2_contingency(observed=contingency_table)
print(f'Chi-Square Test:\nChi2: {chi2}\np-value: {p}')

In [None]:
# Cell 10: Age Group and Scholarship Analysis
def _target_share_by(frame, by):
    """Return the percentage of each Target class within every level of ``by``.

    Rows are the levels of ``by``. Columns are always Graduate, Dropout and
    Enrolled, in that order; a class absent from the data gives a zero column
    instead of a KeyError.
    """
    shares = (
        frame.groupby(by, observed=False)['Target']
        .value_counts(normalize=True)
        .unstack()
        .fillna(0)
        * 100
    )
    return shares.reindex(columns=list(target_palette), fill_value=0)


target_colors = list(target_palette.values())
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# The last bin is open-ended so students older than 50 are counted instead of
# being silently dropped as NaN; `include_lowest` makes '15-20' include age 15.
age_bins = [15, 20, 25, 30, 35, 40, 50, np.inf]
age_labels = ['15-20', '21-25', '26-30', '31-35', '36-40', '41-50', '51+']
age_groups = pd.cut(train['Age_at_enrollment'], bins=age_bins, labels=age_labels, include_lowest=True)

# The grouping column is attached to a temporary copy; `train` is not modified.
age_group_distribution = _target_share_by(train.assign(Age_Group=age_groups), 'Age_Group')
age_group_distribution.plot(kind='bar', ax=axes[0], color=target_colors)
axes[0].set_title('Normalized Target Distribution Across Age Groups')
axes[0].set_xlabel('Age Group')
axes[0].set_ylabel('Percentage')
axes[0].legend(title='Target')

scholarship_distribution = _target_share_by(train, 'Scholarship_holder')
scholarship_distribution.plot(kind='bar', ax=axes[1], color=target_colors)
axes[1].set_title('Normalized Target Distribution by Scholarship Holder Status')
axes[1].set_xlabel('Scholarship Holder')
axes[1].set_ylabel('Percentage')
# groupby sorts its keys, so the bars are in ascending code order.
# NOTE(review): assumes Scholarship_holder is coded 0 = no, 1 = yes — confirm.
axes[1].set_xticklabels(['No', 'Yes'], rotation=0)
axes[1].legend(title='Target')

plt.tight_layout()
plt.show()

In [None]:
# Cell 11: Combine Features
def combine_features(df):
    """Add first + second semester totals to ``df`` and return it.

    Three columns are appended, in this order: ``Total_Credits``,
    ``Total_Evaluations`` and ``Total_Approved``. Each is the sum of the
    matching 1st-semester and 2nd-semester curricular-unit columns.

    The frame is modified in place; the returned object is ``df`` itself.
    """
    semester_pairs = {
        'Total_Credits': (
            'Curricular_units_1st_sem_(credited)',
            'Curricular_units_2nd_sem_(credited)',
        ),
        'Total_Evaluations': (
            'Curricular_units_1st_sem_(evaluations)',
            'Curricular_units_2nd_sem_(evaluations)',
        ),
        'Total_Approved': (
            'Curricular_units_1st_sem_(approved)',
            'Curricular_units_2nd_sem_(approved)',
        ),
    }
    for total_column, (first_sem, second_sem) in semester_pairs.items():
        df[total_column] = df[first_sem] + df[second_sem]
    return df

# Add the semester totals to both frames first ...
train, test = combine_features(train), combine_features(test)

# ... then remove the columns that are not kept for modelling. The names must
# exist in both frames: `drop` raises a KeyError for any unknown column.
less_important_features = [
    'Marital_status',
    'Application_mode',
    'International',
    "Mother's_occupation",
    "Father's_occupation",
    'Educational_special_needs',
    'Displaced',
    'GDP',
    'Inflation_rate',
    'Unemployment_rate',
]

train, test = (
    frame.drop(columns=less_important_features)
    for frame in (train, test)
)

In [None]:
# Cell 12: Train-Test Split
# Hold out 20% of the rows for validation. Stratifying on the target keeps the
# class proportions the same in both partitions, so the validation accuracy is
# not skewed by an unlucky class mix.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns='Target'),
    train['Target'],
    test_size=0.2,
    random_state=42,
    stratify=train['Target'],
)

In [None]:
# Cell 13: AutoML Model Training
# Search for the best learner and configuration within a 30-second budget,
# optimising accuracy. `seed` fixes the search's random state; because the
# search is time-bounded, results can still vary slightly between runs.
autoML = AutoML()
autoML.fit(X_train, y_train, task="classification", metric='accuracy', time_budget=30, seed=42)

print(f'Best AutoML Model: {autoML.best_estimator}\n')
print(f'Best Parameter AutoML Model:\n {autoML.best_config}\n')
# FLAML minimises a loss, which is 1 - accuracy for this metric. X_val / y_val
# are not passed to `fit`, so this is the score on FLAML's own internal
# evaluation data, not on the hold-out split created in the previous cell.
print(f'Best accuracy On Val data: {1 - autoML.best_loss:.4g}\n')
print(f'Best Run Training duration: {autoML.best_config_train_time:.4g} s\n')

In [None]:
# Cell 14: Feature Importance
# Read names and importances through the AutoML object rather than through
# `model.estimator.feature_name_`, which only exists on LightGBM models and
# raises AttributeError when a different learner wins the search.
importances = getattr(autoML, 'feature_importances_', None)
if importances is None:
    print(f'{autoML.best_estimator} does not expose feature importances.')
else:
    feature_names = getattr(autoML, 'feature_names_in_', None)
    if feature_names is None:
        # Fall back to the training columns, which are in the order used for fitting.
        feature_names = X_train.columns

    # Ascending sort puts the most important feature at the top of the chart.
    ranked = pd.Series(np.asarray(importances), index=list(feature_names)).sort_values()

    fig, ax = plt.subplots(figsize=(20, 15), facecolor='yellow')
    ax.barh(ranked.index, ranked.values)
    ax.set_title(f'Feature Importance ({autoML.best_estimator})')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    plt.show()

In [None]:
# Cell 15: Model Evaluation
# Score the fitted AutoML model on the validation split, i.e. rows that were
# not passed to `fit`.
y_pred = autoML.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Best Accuracy on Validation Data: {accuracy:.4g}\n')
