In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


In [7]:
# Load dataset
df = pd.read_csv("train.csv")  # Replace with your dataset file

# Sort data by year
df = df.sort_values(by=["academicYear", "universityName", "departmentName"])

# Dynamic features (change yearly)
dynamic_features = [
    "baseAdmissionRanking(TYT)", "outOfCityStudentRate", "avgOrderofPreference", 
    "top3AdmittedRatio", "baseScore", "topScore", "totalStudentNumber", "Urap_Rank", 
    "Urap_Score", "avg_monthly_income_group", "Time_for_employment", "employment_rate", 
    "base_salary_by_year", "inflation_by_year", "growth_by_year", "quota", "occupiedSlots", 
    "tuitionFee", "profCount", "assoCount", "docCount", "topRanking", 
    "avgAdmissionRanking(TYT)", "stdDeviationStudents", "revenue", "totalPreference", 
    "top1PreferenceRatio", "avgAdmittedStudentPrefOrder", "top1AdmittedRatio", 
    "top10AdmittedRatio", "admittedTotalPref", "admittedTotalDepartmentPref", 
    "currentStudentCount", "totalForeignStudents", "baseRanking"
]

# Static features (do not change over time)
static_features = [
    "academicYear", "universityName", "faculty", "departmentName", "idOSYM", 
    "scholarshipRate", "universityLocation", "universityRegion", "universityType_devlet", 
    "universityType_vakıf", "programType_DİL", "programType_EA", "programType_SAY", 
    "programType_SÖZ", "language_Almanca", "language_Arapça", "language_Bulgarca", 
    "language_Ermenice", "language_Fransızca", "language_Korece", "language_Lehçe", 
    "language_Rusça", "language_Türkçe", "language_Çince", "language_İngilizce", 
    "language_İspanyolca", "language_İtalyanca", "idOSYM_flag"
]

print("Dynamic features:", len(dynamic_features))
print("Static features:", len(static_features))
print("Total features:", df.count)

Dynamic features: 35
Static features: 28
Total features: <bound method DataFrame.count of        academicYear  universityName   faculty  departmentName    idOSYM  \
11        -1.641363       -1.759866  0.581975       -1.650296 -0.791291   
43        -1.641363       -1.759866  1.856959       -1.479335 -0.791291   
59        -1.641363       -1.759866  1.885610       -1.225651 -0.791291   
18        -1.641363       -1.759866  0.581975       -1.187046 -0.791292   
24        -1.641363       -1.759866  0.581975       -1.082264 -0.791293   
...             ...             ...       ...             ...       ...   
52192      1.362864        1.637196  0.581975       -0.199883 -0.733233   
52199      1.362864        1.637196  0.983093        0.759705 -0.731986   
52203      1.362864        1.637196  0.983093        0.897577 -0.731986   
52220      1.362864        1.637196  1.928587        1.046479 -0.731987   
52210      1.362864        1.637196  1.613422        1.195381 -0.731987   

       sc

In [None]:

# Create lag features (previous year's value)
for feature in dynamic_features:
    df[f"{feature}_prev_year"] = df.groupby(["universityName", "departmentName"])[feature].shift(1)

# Drop rows with NaN values from lagging
df = df.dropna()

In [None]:
# Split into train (2019-2023) and test (2024)
train_df = df[df["academicYear"] < 2024]
test_df = df[df["academicYear"] == 2024]

# Train models to predict dynamic features
predicted_features = {}
for feature in dynamic_features:
    X_train = train_df[[f"{feature}_prev_year"]]
    y_train = train_df[feature]
    X_test = test_df[[f"{feature}_prev_year"]]
    
    model = XGBRegressor(n_estimators=100, learning_rate=0.1)
    model.fit(X_train, y_train)
    
    test_df[f"{feature}_predicted"] = model.predict(X_test)
    predicted_features[feature] = model

# Replace missing future values in test set with predictions
for feature in dynamic_features:
    test_df[feature] = test_df[f"{feature}_predicted"]
