In [50]:
import pandas as pd

# Ignore warnings
# NOTE(review): the 'ignore' action is registered without a category or module
# restriction, so every warning raised after this cell runs is silenced for the
# whole kernel session (including numerical RuntimeWarnings and library
# deprecation notices). Consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [51]:
# Load the data
# Both CSV files are read with the same options: the 'id' column becomes the index.
read_options = {'index_col': 'id'}
train = pd.read_csv('train.csv', **read_options)
test = pd.read_csv('test.csv', **read_options)

# Preview the first rows of the training frame
train.head()

Unnamed: 0_level_0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [52]:
# Initial data inspection: show the dtype pandas assigned to each column of `train`
train.dtypes

Marital status                                      int64
Application mode                                    int64
Application order                                   int64
Course                                              int64
Daytime/evening attendance                          int64
Previous qualification                              int64
Previous qualification (grade)                    float64
Nacionality                                         int64
Mother's qualification                              int64
Father's qualification                              int64
Mother's occupation                                 int64
Father's occupation                                 int64
Admission grade                                   float64
Displaced                                           int64
Educational special needs                           int64
Debtor                                              int64
Tuition fees up to date                             int64
Gender        

In [53]:
# Calculate the number of unique values in each column
# (a quick way to tell low-cardinality / coded columns apart from continuous ones)
train.nunique()

Marital status                                       6
Application mode                                    22
Application order                                    8
Course                                              19
Daytime/evening attendance                           2
Previous qualification                              21
Previous qualification (grade)                     110
Nacionality                                         18
Mother's qualification                              35
Father's qualification                              39
Mother's occupation                                 40
Father's occupation                                 56
Admission grade                                    668
Displaced                                            2
Educational special needs                            2
Debtor                                               2
Tuition fees up to date                              2
Gender                                               2
Scholarshi

In [54]:
# Separating the features into variables
# Each list below names the columns of one feature group; one name per line
# so that additions and removals show up as single-line diffs.

# Column to predict
target_feature = ['Target']

# Integer-coded categorical columns with more than two levels
categorical_features = [
    'Marital status',
    'Application mode',
    'Application order',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
]

# Two-level (0/1) categorical columns
binary_categorical_features = [
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
]

# Numeric columns: grades, age, per-semester curricular counts, macro indicators
numerical_features = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]


In [55]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

# Function to calculate Cramér's V
def cramers_v(x, y):
    """Return Cramér's V, a 0..1 measure of association between two categorical variables.

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))), where chi2 is the Pearson
    chi-square statistic of the contingency table of ``x`` against ``y`` and
    ``n`` is the total number of observations in that table.

    Parameters
    ----------
    x, y : array-like or pandas.Series
        Categorical observations of equal length, as accepted by ``pd.crosstab``.

    Returns
    -------
    float
        Cramér's V (no bias correction is applied). ``nan`` is returned when the
        statistic is undefined, i.e. when the contingency table is empty or one
        of the variables takes a single value.
    """
    contingency = pd.crosstab(x, y)

    # With fewer than two levels on either axis the denominator is zero and the
    # association is undefined; return NaN explicitly instead of relying on a
    # 0/0 floating-point division (or on chi2_contingency raising for an empty table).
    smaller_dim = min(contingency.shape)
    if smaller_dim < 2:
        return float('nan')

    # correction=False: chi2_contingency otherwise applies Yates' continuity
    # correction to 2x2 tables, which shrinks chi2 and makes V fall short of 1.0
    # even for perfectly associated binary variables. Cramér's V is defined on
    # the uncorrected Pearson statistic.
    chi2 = chi2_contingency(contingency, correction=False)[0]
    n = contingency.to_numpy().sum()
    return np.sqrt(chi2 / (n * (smaller_dim - 1)))

# Combine categorical and binary categorical features
categorical_data = train[categorical_features + binary_categorical_features]

# Calculate the correlation matrix for categorical data using Cramér's V.
# Start from the identity matrix: every feature is perfectly associated with itself.
categorical_corr_matrix = pd.DataFrame(np.eye(len(categorical_data.columns)),
                                       index=categorical_data.columns, columns=categorical_data.columns)

# Cramér's V is symmetric, so each unordered pair is evaluated once (the crosstab
# and chi-square test are the expensive part) and the value is mirrored into the
# lower triangle.
for position, col1 in enumerate(categorical_data.columns):
    for col2 in categorical_data.columns[position + 1:]:
        pair_v = cramers_v(categorical_data[col1], categorical_data[col2])
        categorical_corr_matrix.loc[col1, col2] = pair_v
        categorical_corr_matrix.loc[col2, col1] = pair_v

# Calculate the most correlated categorical feature pairs using Cramér's V.
# Only the strict upper triangle (k=1) is kept so each pair appears once and the
# diagonal of self-associations is excluded.
high_corr_pairs_cat = (categorical_corr_matrix.where(np.triu(np.ones(categorical_corr_matrix.shape), k=1).astype(bool))
                                           .stack()
                                           .sort_values(ascending=False))

# Define a threshold for high correlation (e.g., above 0.8)
threshold = 0.8
high_corr_pairs_cat = high_corr_pairs_cat[high_corr_pairs_cat > threshold]

# Display the most correlated feature pairs
high_corr_pairs_cat


Course       Daytime/evening attendance    0.998065
Nacionality  International                 0.996039
dtype: float64

In [56]:
# Display the value counts for the specified categories
# (the two feature pairs flagged as highly associated by Cramér's V)
categories = [
    'Course',
    'Daytime/evening attendance',
    'Nacionality',
    'International',
]

# Map each column name to the frequency table of its values in `train`
value_counts = dict(
    (column, train[column].value_counts()) for column in categories
)

value_counts


{'Course': Course
 9500    12074
 9773     8214
 9238     7935
 9147     7741
 9254     5425
 9085     5373
 9670     4760
 9991     4057
 9003     3733
 9070     3281
 9853     3198
 9119     3004
 171      2859
 8014     2438
 9130     1606
 9556      746
 33         72
 979         1
 39          1
 Name: count, dtype: int64,
 'Daytime/evening attendance': Daytime/evening attendance
 1    70038
 0     6480
 Name: count, dtype: int64,
 'Nacionality': Nacionality
 1      76013
 41       221
 26        67
 6         56
 22        56
 24        15
 11        15
 2         13
 103       12
 105        9
 101        9
 100        9
 21         6
 25         6
 62         6
 17         2
 109        2
 32         1
 Name: count, dtype: int64,
 'International': International
 0    76011
 1      507
 Name: count, dtype: int64}

In [57]:
# Create a new column 'Adjusted International' based on the relationship between 'Nacionality' and 'International':
# rows whose 'Nacionality' code is 1 are forced to 0, every other row keeps its 'International' flag.
# Series.mask does this in one vectorized pass instead of calling a Python lambda once per row.
is_nationality_one = train['Nacionality'] == 1
adjusted_international = train['International'].mask(is_nationality_one, 0)

# Append the new column and drop the original 'Nacionality' and 'International' columns
train = (
    train
    .assign(**{'Adjusted International': adjusted_international})
    .drop(columns=['Nacionality', 'International'])
)

# Verify the changes
train.head()


Unnamed: 0_level_0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,Adjusted International
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,9238,1,1,126.0,1,19,5,...,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate,0
1,1,17,1,9238,1,1,125.0,19,19,9,...,6,9,0,0.0,0,11.1,0.6,2.02,Dropout,0
2,1,17,2,9254,1,1,137.0,3,19,2,...,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout,0
3,1,1,3,9500,1,1,131.0,19,3,3,...,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled,0
4,1,1,2,9500,1,1,132.0,19,37,4,...,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate,0


In [58]:
# Absolute pairwise correlations between the numerical features only
# (the sign is discarded: strong negative correlation is as redundant as strong positive)
correlation_matrix_numerical = train[numerical_features].corr().abs()

# Cut-off above which a pair of features counts as highly correlated
threshold = 0.8

# Boolean mask of the strict upper triangle (k=1): each pair is listed once and
# the diagonal of self-correlations is left out.
upper_triangle = np.triu(np.ones(correlation_matrix_numerical.shape), k=1).astype(bool)

# Flatten the masked matrix into (feature, feature) -> value, strongest first
ranked_pairs = correlation_matrix_numerical.where(upper_triangle).stack().sort_values(ascending=False)
high_corr_pairs_numerical = ranked_pairs[ranked_pairs > threshold]

# Display the most correlated numerical feature pairs
high_corr_pairs_numerical

Curricular units 1st sem (enrolled)  Curricular units 2nd sem (enrolled)    0.956321
Curricular units 1st sem (credited)  Curricular units 2nd sem (credited)    0.933935
Curricular units 1st sem (approved)  Curricular units 2nd sem (approved)    0.922657
Curricular units 1st sem (grade)     Curricular units 2nd sem (grade)       0.889393
Curricular units 2nd sem (approved)  Curricular units 2nd sem (grade)       0.859370
Curricular units 1st sem (approved)  Curricular units 1st sem (grade)       0.839860
                                     Curricular units 2nd sem (grade)       0.830430
dtype: float64

In [59]:
# Collect every column of `train` whose name mentions 'Curricular units'
curricular_columns = list(filter(lambda name: 'Curricular units' in name, train.columns))

# Signed correlation matrix restricted to those columns
curricular_corr_matrix = train.loc[:, curricular_columns].corr()

# Display the correlation matrix
curricular_corr_matrix

Unnamed: 0,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations)
Curricular units 1st sem (credited),1.0,0.549256,0.33208,0.356888,0.080462,0.041958,0.933935,0.421179,0.264235,0.270014,0.083428,0.022417
Curricular units 1st sem (enrolled),0.549256,1.0,0.484462,0.599526,0.388141,0.021245,0.531755,0.956321,0.459693,0.548,0.374975,0.006231
Curricular units 1st sem (evaluations),0.33208,0.484462,1.0,0.418769,0.500855,0.120004,0.317123,0.437354,0.786699,0.354304,0.429139,0.077303
Curricular units 1st sem (approved),0.356888,0.599526,0.418769,1.0,0.83986,-0.055343,0.345784,0.590151,0.48493,0.922657,0.83043,-0.065043
Curricular units 1st sem (grade),0.080462,0.388141,0.500855,0.83986,1.0,-0.035564,0.073241,0.401075,0.553566,0.787559,0.889393,-0.043324
Curricular units 1st sem (without evaluations),0.041958,0.021245,0.120004,-0.055343,-0.035564,1.0,0.042009,0.011253,0.060928,-0.055103,-0.040058,0.446286
Curricular units 2nd sem (credited),0.933935,0.531755,0.317123,0.345784,0.073241,0.042009,1.0,0.42993,0.260298,0.27852,0.07991,0.025959
Curricular units 2nd sem (enrolled),0.421179,0.956321,0.437354,0.590151,0.401075,0.011253,0.42993,1.0,0.44581,0.572588,0.392509,-0.00038
Curricular units 2nd sem (evaluations),0.264235,0.459693,0.786699,0.48493,0.553566,0.060928,0.260298,0.44581,1.0,0.411914,0.517637,0.071972
Curricular units 2nd sem (approved),0.270014,0.548,0.354304,0.922657,0.787559,-0.055103,0.27852,0.572588,0.411914,1.0,0.85937,-0.066165


In [60]:
# Aggregating highly correlated "Curricular units" features by averaging
# Each entry maps the new averaged column to its (1st semester, 2nd semester) source columns.
semester_pairs = {
    'Curricular units (enrolled)': (
        'Curricular units 1st sem (enrolled)', 'Curricular units 2nd sem (enrolled)'),
    'Curricular units (credited)': (
        'Curricular units 1st sem (credited)', 'Curricular units 2nd sem (credited)'),
    'Curricular units (approved)': (
        'Curricular units 1st sem (approved)', 'Curricular units 2nd sem (approved)'),
    'Curricular units (grade)': (
        'Curricular units 1st sem (grade)', 'Curricular units 2nd sem (grade)'),
}

# Add the averaged columns in the order listed above
for averaged_name, (first_sem, second_sem) in semester_pairs.items():
    train[averaged_name] = (train[first_sem] + train[second_sem]) / 2

# Optional: Drop the original columns if no longer needed
# (in place, so re-running this cell on the already-reduced frame raises a KeyError)
semester_columns = [name for pair in semester_pairs.values() for name in pair]
train.drop(columns=semester_columns, inplace=True)

train.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance', 'Previous qualification',
       'Previous qualification (grade)', 'Mother's qualification',
       'Father's qualification', 'Mother's occupation', 'Father's occupation',
       'Admission grade', 'Displaced', 'Educational special needs', 'Debtor',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (without evaluations)', 'Unemployment rate',
       'Inflation rate', 'GDP', 'Target', 'Adjusted International',
       'Curricular units (enrolled)', 'Curricular units (credited)',
       'Curricular units (approved)', 'Curricular units (grade)'],
      dtype='object')