In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from google.colab import drive

In [None]:
# Mount Google Drive
drive.mount('/content/drive')

# Specify the paths to your files
train_path = '/content/drive/MyDrive/DATASET/train.csv'
test_path = '/content/drive/MyDrive/DATASET/test_x.csv'

# Load the datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  train_df = pd.read_csv(train_path)


In [None]:
# Display the first few rows of the dataset
print(train_df.head())

   Basvuru Yili  Degerlendirme Puani Cinsiyet  Dogum Tarihi        Dogum Yeri  \
0          2014                 52.0    Erkek      4/6/1994  Altindag, Ankara   
1          2014                 30.0    Erkek     6/11/1993           Üsküdar   
2          2014                 18.0    Erkek     1/15/1986            Samsun   
3          2014                 40.0    Erkek      6/4/1991        Diyarbakır   
4          2014                 24.0    Erkek  2 Kasim 1992   Ankara/Altındağ   

  Ikametgah Sehri                    Universite Adi Universite Turu  \
0          Ankara           İHSAN DOĞRAMACI BİLKENT            Özel   
1        İstanbul           İHSAN DOĞRAMACI BİLKENT            Özel   
2        İstanbul  ULUSLARARASI KIBRIS ÜNİVERSİTESİ            Özel   
3        İstanbul       İSTANBUL ŞEHİR ÜNİVERSİTESİ            Özel   
4          Ankara          TURGUT ÖZAL ÜNİVERSİTESİ            Özel   

   Burslu ise Burs Yuzdesi Burs Aliyor mu?  ... Spor Dalindaki Rolunuz Nedir?  \
0    

In [None]:
# Display summary statistics
print(train_df.describe(include='all'))

        Basvuru Yili  Degerlendirme Puani Cinsiyet Dogum Tarihi Dogum Yeri  \
count   65125.000000         65124.000000    64956        64948      64334   
unique           NaN                  NaN        4        13498       2230   
top              NaN                  NaN    Kadın  1/1/70 2:00   İstanbul   
freq             NaN                  NaN    32077         1655       8682   
mean     2018.277298            32.086466      NaN          NaN        NaN   
std         2.669979            18.139239      NaN          NaN        NaN   
min      2014.000000             0.000000      NaN          NaN        NaN   
25%      2016.000000            18.000000      NaN          NaN        NaN   
50%      2019.000000            29.000000      NaN          NaN        NaN   
75%      2021.000000            44.000000      NaN          NaN        NaN   
max      2022.000000           102.000000      NaN          NaN        NaN   

       Ikametgah Sehri         Universite Adi Universite Turu  

In [None]:
# Check for missing values
print(train_df.isnull().sum())

Basvuru Yili                                                     0
Degerlendirme Puani                                              1
Cinsiyet                                                       169
Dogum Tarihi                                                   177
Dogum Yeri                                                     791
Ikametgah Sehri                                               2037
Universite Adi                                                 132
Universite Turu                                                255
Burslu ise Burs Yuzdesi                                      42685
Burs Aliyor mu?                                                  0
Bölüm                                                          231
Universite Kacinci Sinif                                       374
Universite Not Ortalamasi                                     2753
Daha Once Baska Bir Universiteden Mezun Olmus                37345
Lise Adi                                                      

In [None]:
# Fill missing values for categorical columns with mode
categorical_cols = train_df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if col in train_df.columns and train_df[col].isnull().sum() > 0:
        mode_value = train_df[col].mode()[0]  # Get the mode
        train_df[col] = train_df[col].fillna(mode_value)

# Fill missing values for numerical columns with median
numerical_cols = train_df.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    if col in train_df.columns and train_df[col].isnull().sum() > 0:
        median_value = train_df[col].median()  # Get the median
        train_df[col] = train_df[col].fillna(median_value)

# Special case for columns with a lot of missing values
# For example, 'Burslu ise Burs Yuzdesi' has a significant number of missing values
# Consider dropping this column if it's not critical or has too many missing values
train_df = train_df.drop(columns=['Burslu ise Burs Yuzdesi', 'Lise Adi Diger', 'Lise Bolum Diger', 'Baska Kurumdan Aldigi Burs Miktari'])

# Recheck for missing values
print(train_df.isnull().sum())


Basvuru Yili                                                 0
Degerlendirme Puani                                          0
Cinsiyet                                                     0
Dogum Tarihi                                                 0
Dogum Yeri                                                   0
Ikametgah Sehri                                              0
Universite Adi                                               0
Universite Turu                                              0
Burs Aliyor mu?                                              0
Bölüm                                                        0
Universite Kacinci Sinif                                     0
Universite Not Ortalamasi                                    0
Daha Once Baska Bir Universiteden Mezun Olmus                0
Lise Adi                                                     0
Lise Sehir                                                   0
Lise Turu                                              

In [None]:
binary_cols = [
    'Burs Aliyor mu?',
    'Baska Bir Kurumdan Burs Aliyor mu?',
    'Anne Calisma Durumu',
    'Baba Calisma Durumu',
    'Girisimcilik Kulupleri Tarzi Bir Kulube Uye misiniz?',
    'Profesyonel Bir Spor Daliyla Mesgul musunuz?',
    'Aktif olarak bir STK üyesi misiniz?',
    'Girisimcilikle Ilgili Deneyiminiz Var Mi?',
    'Ingilizce Biliyor musunuz?'
]

for col in binary_cols:
    if col in train_df.columns:
        # Check unique values
        print(f"Unique values in '{col}':", train_df[col].unique())

        # Fill NaN values with 'Unknown' and map values
        train_df[col] = train_df[col].fillna('Unknown').astype(str).map({'Evet': 1, 'Hayır': 0, 'Unknown': -1})

Unique values in 'Burs Aliyor mu?': ['Evet' 'Hayır' 'hayır' 'evet' 'EVET']
Unique values in 'Baska Bir Kurumdan Burs Aliyor mu?': ['Hayır' 'Evet']
Unique values in 'Anne Calisma Durumu': ['Hayır' 'Evet' 'Emekli']
Unique values in 'Baba Calisma Durumu': ['Emekli' 'Evet' 'Hayır']
Unique values in 'Girisimcilik Kulupleri Tarzi Bir Kulube Uye misiniz?': ['Evet' 'Hayır']
Unique values in 'Profesyonel Bir Spor Daliyla Mesgul musunuz?': ['Evet' 'Hayır']
Unique values in 'Aktif olarak bir STK üyesi misiniz?': ['Hayır' 'Evet']
Unique values in 'Girisimcilikle Ilgili Deneyiminiz Var Mi?': ['Hayır' 'Evet']
Unique values in 'Ingilizce Biliyor musunuz?': ['Evet' 'Hayır']


In [None]:
train_df["Burs Aliyor mu?"].head()

Unnamed: 0,Burs Aliyor mu?
0,1.0
1,0.0
2,0.0
3,1.0
4,1.0


In [None]:
# Define the transliteration function
def transliterate_turkish(text):
    # Mapping for Turkish to English transliteration
    turkish_to_english = str.maketrans("üÜıIİğĞşŞçÇöÖ", "uUiIigGsScCoO")
    # Replace Turkish characters and convert to lowercase
    text = text.translate(turkish_to_english)
    text = text.lower().strip()
    return text

# Convert non-string values to strings before applying the function
train_df = train_df.astype(str)

# Apply the transliteration function to all columns
train_df = train_df.map(transliterate_turkish)

In [None]:
import re

def transliterate_turkish_and_expand(text):
    # Replace common abbreviations with their full form
    text = text.replace('muhendisligi', 'muh')
    text = text.replace('-', ' ')

    # Remove text inside parentheses and the parentheses themselves
    text = re.sub(r'\(.*?\)', '', text)

    return text.strip()

# Convert non-string values to strings and apply the function to the 'Bölüm' column
train_df['Bölüm'] = train_df['Bölüm'].astype(str).apply(transliterate_turkish_and_expand)

In [None]:
train_df["Burs Aliyor mu?"].head()

Unnamed: 0,Burs Aliyor mu?
0,1.0
1,0.0
2,0.0
3,1.0
4,1.0


In [None]:
from sklearn.preprocessing import LabelEncoder

label_cols = [
    'Cinsiyet', 'Dogum Yeri', 'Ikametgah Sehri', 'Universite Adi', 'Universite Turu', 'Bölüm',
    'Daha Once Baska Bir Universiteden Mezun Olmus', 'Lise Adi', 'Lise Sehir', 'Lise Turu',
    'Lise Bolumu', 'Baska Bir Kurumdan Burs Aliyor mu?', 'Burs Aldigi Baska Kurum',
    'Anne Egitim Durumu', 'Baba Egitim Durumu', 'Anne Sektor', 'Baba Sektor',
    'Girisimcilik Kulupleri Tarzi Bir Kulube Uye misiniz?', 'Uye Oldugunuz Kulubun Ismi',
    'Spor Dalindaki Rolunuz Nedir?', "Hangi STK'nin Uyesisiniz?",
    'Girisimcilikle Ilgili Deneyiminizi Aciklayabilir misiniz?', 'Ingilizce Seviyeniz?',
    'Daha Önceden Mezun Olunduysa, Mezun Olunan Üniversite'
]

le = LabelEncoder()
for col in label_cols:
    if col in train_df.columns:
        train_df[col] = train_df[col].astype(str)
        train_df[col] = le.fit_transform(train_df[col])


In [None]:
# Dictionary to map Turkish month names to numeric values
month_mapping = {
    'Ocak': 1, 'Şubat': 2, 'Mart': 3, 'Nisan': 4, 'Mayis': 5, 'Haziran': 6,
    'Temmuz': 7, 'Ağustos': 8, 'Eylül': 9, 'Ekim': 10, 'Kasım': 11, 'Aralık': 12, 'Kasim' : 11
}
# Function to replace Turkish month names with numeric values
def replace_month_names(date_str):
    for month_name, month_number in month_mapping.items():
        if month_name in date_str:
            return date_str.replace(month_name, str(month_number))
    return date_str

# Apply the function to the 'Dogum Tarihi' column
train_df['Dogum Tarihi'] = train_df['Dogum Tarihi'].apply(lambda x: replace_month_names(str(x)) if pd.notna(x) else x)

# Convert 'Dogum Tarihi' to datetime
train_df['Dogum Tarihi'] = pd.to_datetime(train_df['Dogum Tarihi'], errors='coerce')

# Extract year, month, and day as separate features
train_df['Birth_Year'] = train_df['Dogum Tarihi'].dt.year
train_df['Birth_Month'] = train_df['Dogum Tarihi'].dt.month
train_df['Birth_Day'] = train_df['Dogum Tarihi'].dt.day

# Create a placeholder date, e.g., the median year, month, and day
median_year = train_df['Birth_Year'].median()
median_month = train_df['Birth_Month'].median()
median_day = train_df['Birth_Day'].median()

# Fill missing values with placeholders
train_df['Birth_Year'].fillna(median_year, inplace=True)
train_df['Birth_Month'].fillna(median_month, inplace=True)
train_df['Birth_Day'].fillna(median_day, inplace=True)

In [None]:
print(train_df.isnull().sum())

Basvuru Yili                                                     0
Degerlendirme Puani                                              0
Cinsiyet                                                         0
Dogum Tarihi                                                 45938
Dogum Yeri                                                       0
Ikametgah Sehri                                                  0
Universite Adi                                                   0
Universite Turu                                                  0
Burs Aliyor mu?                                                  0
Bölüm                                                            0
Universite Kacinci Sinif                                         0
Universite Not Ortalamasi                                        0
Daha Once Baska Bir Universiteden Mezun Olmus                    0
Lise Adi                                                         0
Lise Sehir                                                    

In [None]:
# Drop the original 'Dogum Tarihi' column
train_df = train_df.drop(columns=['Dogum Tarihi'])

In [None]:
# Convert categorical columns to numeric using LabelEncoder if not done yet
from sklearn.preprocessing import LabelEncoder

categorical_cols = train_df.select_dtypes(include=['object']).columns
le = LabelEncoder()

for col in categorical_cols:
    train_df[col] = train_df[col].astype(str)  # Convert to string first
    train_df[col] = le.fit_transform(train_df[col])

# Check for any non-numeric columns
print(train_df.dtypes)

Basvuru Yili                                                   int64
Degerlendirme Puani                                            int64
Cinsiyet                                                       int64
Dogum Yeri                                                     int64
Ikametgah Sehri                                                int64
Universite Adi                                                 int64
Universite Turu                                                int64
Burs Aliyor mu?                                                int64
Bölüm                                                          int64
Universite Kacinci Sinif                                       int64
Universite Not Ortalamasi                                      int64
Daha Once Baska Bir Universiteden Mezun Olmus                  int64
Lise Adi                                                       int64
Lise Sehir                                                     int64
Lise Turu                         

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost model with hyperparameters
model = xgb.XGBRegressor(
    random_state=42,
    learning_rate=0.01,      # Learning rate
    n_estimators=500,       # Number of trees
    max_depth=5,            # Maximum depth of a tree
    subsample=0.8,          # Subsample ratio of training instances
    colsample_bytree=0.8,   # Subsample ratio of features per tree
    min_child_weight=3,     # Minimum sum of instance weight (hessian) needed in a child
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_val)

# Evaluate performance
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")

Mean Absolute Error: 9.321096585945524
Mean Squared Error: 201.1930468460872


In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validation to get a better estimate of model performance
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')

# Print the average MAE from cross-validation
print(f"Cross-validated MAE: {-cv_scores.mean()}")

Cross-validated MAE: 11.770938747764777
