In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

In [2]:
train = pd.read_csv("data/train.csv", index_col="id")
train.head(5)

Unnamed: 0_level_0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


# Pre-process

In [3]:
Categorical = ["Marital status", 
               "Application mode", 
               "Application order", 
               "Course", 
               "Previous qualification", 
               "Nacionality", 
               "Mother's qualification", 
               "Father's qualification", 
               "Mother's occupation", 
               "Father's occupation"]
Boolean = ["Daytime/evening attendance", 
           "Displaced", 
           "Educational special needs", 
           "Debtor", 
           "Tuition fees up to date", 
           "Gender", 
           "Scholarship holder", 
           "International" ]
Continuous = ["Previous qualification (grade)", 
              "Admission grade", 
              "Age at enrollment", 
              "Curricular units 1st sem (credited)", 
              "Curricular units 1st sem (enrolled)", 
              "Curricular units 1st sem (evaluations)", 
              "Curricular units 1st sem (approved)", 
              "Curricular units 1st sem (grade)", 
              "Curricular units 1st sem (without evaluations)",
              "Curricular units 2nd sem (credited)", 
              "Curricular units 2nd sem (enrolled)", 
              "Curricular units 2nd sem (evaluations)", 
              "Curricular units 2nd sem (approved)", 
              "Curricular units 2nd sem (grade)", 
              "Curricular units 2nd sem (without evaluations)",
              "Unemployment rate",
              "Inflation rate",
              "GDP"]

In [4]:
# Separate features and target
X = train[Boolean + Categorical + Continuous]
y = train["Target"]

In [5]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_combined = pd.concat([X_train, X_test]) 

In [6]:
# Preprocessing pipeline for categorical and boolean features
preprocessor = ColumnTransformer(
    transformers=[
        ('boolean', 'passthrough', Boolean),
        ('categorical', OneHotEncoder(), Categorical)
    ],
    remainder='drop'
)

# Linear Model

In [7]:
# Fit preprocessing on combined data
X_combined_transformed = preprocessor.fit_transform(X_combined)

# Split into transformed training and test sets
X_train_transformed = X_combined_transformed[:len(X_train)]
X_test_transformed = X_combined_transformed[len(X_train):]

In [8]:
start_time = time.time()

# Define SVM model
model = SVC(kernel='linear')

# Train the model on the small subset
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

end_time = time.time()

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("time (s):", round(end_time - start_time))

Test Accuracy: 0.8198
time (s): 148


# Polynomial Model

In [9]:
# Fit preprocessing on combined data
X_combined_transformed = preprocessor.fit_transform(X_combined)

# Split into transformed training and test sets
X_train_transformed = X_combined_transformed[:len(X_train)]
X_test_transformed = X_combined_transformed[len(X_train):]

In [10]:
start_time = time.time()

# Define SVM model
model = SVC(kernel='poly', degree=2, C=0.01)

# Train the model on the small subset
model.fit(X_train, y_train)

train_time = time.time()

# Make predictions on the test set
y_pred = model.predict(X_test)

test_time = time.time()

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Train time (s):", round(train_time - start_time))
print("Predict time (s):", round(test_time - train_time))

Test Accuracy: 0.4938
Train time (s): 133
Predict time (s): 18


In [11]:
print(len(X_train), len(X_test))

57388 19130


In [12]:
start_time = time.time()
# Make predictions on the test set
y_pred = model.predict(X_train)

end_time = time.time()

# Evaluate the model
accuracy = accuracy_score(y_train, y_pred)
print(f"Train Accuracy: {accuracy:.4f}")
print("time (s):", round(end_time - start_time))

Train Accuracy: 0.4932
time (s): 54


## 