In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
import xgboost as xgb
from datetime import datetime
import re
import random

In [None]:
# Mount Google Drive so the dataset files under MyDrive are reachable.
drive.mount('/content/drive')

# Load datasets
train_path = '/content/drive/MyDrive/DATASET/train.csv'
test_path = '/content/drive/MyDrive/DATASET/test_x.csv'
# low_memory=False makes pandas read each file in one pass and infer a single
# dtype per column. The captured output shows a warning pointing at the
# read_csv(train_path) call — presumably the mixed-type DtypeWarning that the
# default chunked inference emits.
train_df = pd.read_csv(train_path, low_memory=False)
test_df = pd.read_csv(test_path, low_memory=False)
# Shared encoder instance reused by the city-encoding cell further down.
le = LabelEncoder()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  train_df = pd.read_csv(train_path)


In [None]:
# Per-column count of missing values, smallest first.
print(train_df.isna().sum().sort_values())

Basvuru Yili                                                     0
Degerlendirme Puani                                              1
Cinsiyet                                                       169
Dogum Tarihi                                                   177
Dogum Yeri                                                     791
Ikametgah Sehri                                               2037
Universite Adi                                                 132
Universite Turu                                                255
Burslu ise Burs Yuzdesi                                      42685
Burs Aliyor mu?                                                  0
Bölüm                                                          231
Universite Kacinci Sinif                                       374
Universite Not Ortalamasi                                     2753
Daha Once Baska Bir Universiteden Mezun Olmus                37345
Lise Adi                                                      

In [None]:
# Keep only columns with fewer than `missing_value_threshold` (42,000) missing values.
missing_value_threshold = 42000
train_df = train_df.loc[:, train_df.isnull().sum() < missing_value_threshold]

# Rows without the target ('Degerlendirme Puani') cannot be used for training.
train_df = train_df.dropna(subset=['Degerlendirme Puani'])

# Drop columns that are not used as features. errors='ignore' keeps this cell
# re-runnable: a second execution (or a column already removed by the
# threshold filter above) no longer raises KeyError.
train_df = train_df.drop(
    columns=['Lise Adi', 'Ingilizce Seviyeniz?', 'Anne Sektor', 'Baba Sektor'],
    errors='ignore',
)

print(train_df.shape)

(65124, 33)


In [None]:
# Transliteration function for Turkish characters
def transliterate_turkish(text):
    """Return `text` with Turkish letters replaced by ASCII, lower-cased and trimmed.

    Missing values and non-string inputs are returned unchanged.
    """
    is_missing = pd.isna(text)
    if is_missing or not isinstance(text, str):
        return text

    # Character-by-character substitution table (source letter -> ASCII letter).
    source_letters = "üÜıIİğĞşŞçÇöÖ"
    ascii_letters = "uUiIigGsScCoO"
    table = str.maketrans(dict(zip(source_letters, ascii_letters)))

    ascii_text = text.translate(table)
    return ascii_text.lower().strip()

# Normalise every text (object-dtype) column with transliterate_turkish.
str_cols = train_df.select_dtypes(include='object').columns
for col in str_cols:
    normalised = train_df[col].map(transliterate_turkish)
    train_df[col] = normalised

In [None]:
def _encode_binary(series, mapping):
    """Map a text column to 0/1 and randomly fill the values left missing.

    Values absent from `mapping` (including original NaNs) become NaN after the
    map and are then filled with an independent random 0/1 draw per row. The
    original code drew a single random number and wrote it into every missing
    row of the column.

    Parameters:
        series: pandas Series holding the raw (already transliterated) text.
        mapping: dict from text label to 0/1.

    Returns:
        A new Series with no missing values.
    """
    encoded = series.map(mapping)
    missing = encoded.isna()
    if missing.any():
        encoded = encoded.astype(float)
        encoded.loc[missing] = np.random.choice([0, 1], size=int(missing.sum()))
    return encoded


# Map 'Cinsiyet' and fill missing values
# (assigned back instead of chained fillna(inplace=True), which pandas deprecates)
train_df['Cinsiyet'] = _encode_binary(train_df['Cinsiyet'], {'erkek': 1, 'kadin': 0})

# Map binary columns and fill missing values
binary_cols = [
    'Burs Aliyor mu?', 'Baska Bir Kurumdan Burs Aliyor mu?',
    'Girisimcilik Kulupleri Tarzi Bir Kulube Uye misiniz?',
    'Profesyonel Bir Spor Daliyla Mesgul musunuz?',
    'Aktif olarak bir STK üyesi misiniz?',
    'Girisimcilikle Ilgili Deneyiminiz Var Mi?',
    'Ingilizce Biliyor musunuz?', 'Stk Projesine Katildiniz Mi?',
    'Daha Once Baska Bir Universiteden Mezun Olmus'
]

for col in binary_cols:
    train_df[col] = _encode_binary(train_df[col], {'evet': 1, 'hayir': 0})

# Map and fill missing values for 'Universite Turu'
train_df['Universite Turu'] = _encode_binary(train_df['Universite Turu'], {'devlet': 1, 'ozel': 0})

   Cinsiyet  Burs Aliyor mu?  Baska Bir Kurumdan Burs Aliyor mu?  \
0       1.0                1                                 0.0   
1       1.0                0                                 0.0   
2       1.0                0                                 0.0   
3       1.0                1                                 0.0   
4       1.0                1                                 0.0   

   Girisimcilik Kulupleri Tarzi Bir Kulube Uye misiniz?  \
0                                                1.0      
1                                                0.0      
2                                                1.0      
3                                                1.0      
4                                                0.0      

   Profesyonel Bir Spor Daliyla Mesgul musunuz?  \
0                                           1.0   
1                                           0.0   
2                                           0.0   
3                               

In [None]:
# Define city correction function
def process_city_column(text):
    """Reduce a free-text location to a province name.

    Returns:
        ''        when the input is missing (NaN or the string 'nan'),
        a name    from the province list when one occurs as a substring,
        None      when the text looks like a street address (digits or
                  address words) and no province was found,
        'others'  for anything else.
    """
    if pd.isna(text) or text.lower() == 'nan':  # Keep missing values intact
        return ''

    # Lower-case, trim, and collapse '/', ',' and '-' separators (with any
    # surrounding whitespace) into a single '/'.
    normalized = re.sub(r'\s*[/,-]\s*', '/', text.lower().strip())

    # Exact-match fixes for known typos and district-level entries.
    known_fixes = {
        "sansun": "samsun",
        "eskiseir": "eskisehir",
        "kopnya": "konya",
        "kamankirshir": "kirsehir",
        "gedizkutahya": "kutahya",
        "mudanya bursa": "bursa",
        "yozgat/cayiralan": "yozgat",
        "trabzon/arsin": "trabzon",
        "adiyaman/celikhan": "adiyaman",
        "uskudar":"istanbul"
    }
    normalized = known_fixes.get(normalized, normalized)

    provinces = (
        "adana", "adiyaman", "afyonkarahisar", "agri", "aksaray", "amasya", "ankara", "antalya", "ardahan", "artvin",
        "aydin", "balikesir", "bartin", "batman", "bayburt", "bilecik", "bingol", "bitlis", "bolu", "burdur", "bursa",
        "canakkale", "cankiri", "corum", "denizli", "diyarbakir", "duzce", "edirne", "elazig", "erzincan", "erzurum",
        "eskisehir", "gaziantep", "giresun", "gumushane", "hakkari", "hatay", "igdir", "isparta", "istanbul", "izmir",
        "kahramanmaras", "karabuk", "karaman", "kars", "kastamonu", "kayseri", "kilis", "kirikkale", "kirklareli",
        "kirsehir", "kocaeli", "konya", "kutahya", "malatya", "manisa", "mardin", "mersin", "mugla", "mus", "nevsehir",
        "nigde", "ordu", "osmaniye", "rize", "sakarya", "samsun", "siirt", "sinop", "sivas", "sanliurfa", "sirnak",
        "tekirdag", "tokat", "trabzon", "tunceli", "usak", "van", "yalova", "yozgat", "zonguldak",
    )

    # First province (in list order) that appears anywhere inside the text wins.
    for province in provinces:
        if province in normalized:
            return province

    # No province found: digits or address vocabulary mark the value as unusable.
    looks_like_address = re.search(r'\d|mahalle|sokak|apartman|daire', normalized)
    if looks_like_address:
        return None

    return "others"  # For non-matching cities, assign "others"

# Clean, impute and label-encode the city columns.
#
# Missing entries are filled BEFORE encoding. The original encoded first via
# astype(str), which turned missing values into ordinary string labels, so the
# later pd.isna-based random fill never matched anything.
for col in ['Dogum Yeri', 'Ikametgah Sehri', 'Lise Sehir']:
    cleaned = train_df[col].astype(str).apply(process_city_column)

    # process_city_column returns '' for missing input and None for
    # address-like text; both are treated as missing here.
    missing = cleaned.isna() | (cleaned == '')
    labels = cleaned[~missing].unique()

    # One independent draw per missing row, uniform over the observed labels.
    # Skipped when the column has no usable value at all.
    if missing.any() and len(labels) > 0:
        cleaned.loc[missing] = np.random.choice(labels, size=int(missing.sum()))

    # NOTE(review): `le` is refit for every column, so the fitted classes of the
    # earlier columns are lost; keep one encoder per column if the same encoding
    # must later be applied to other data.
    train_df[col] = le.fit_transform(cleaned.astype(str))

In [None]:
# Inspect the last 50 entries of the 'Lise Sehir' frequency table
# (value_counts sorts by count, most frequent first, so these are the rarest values).
train_df['Lise Sehir'].value_counts().tail(50)

Unnamed: 0_level_0,count
Lise Sehir,Unnamed: 1_level_1
midyat,1
yilmaz,1
izmit/karamursel,1
lefkosa,1
kasyeri,1
canakklale,1
montezuma/new mexico,1
bodrum,1
atyrau,1
denizl,1


In [None]:
import difflib

# Shorten 'muhendisligi' to 'muh', turn hyphens into spaces and trim both ends.
# astype(str) also converts missing values into the literal string 'nan'.
train_df['Bölüm'] = (
    train_df['Bölüm']
    .astype(str)
    .str.replace('muhendisligi', 'muh', regex=False)
    .str.replace('-', ' ', regex=False)
    .str.strip()
)

# Define a function to match similar values using difflib
def get_best_match(value, choices, threshold=0.8):
    """Return the entry of `choices` closest to `value`, or `value` itself.

    Uses difflib.get_close_matches with `threshold` as the similarity cutoff;
    when nothing reaches the cutoff the original value is returned unchanged.
    """
    candidates = difflib.get_close_matches(value, choices, n=1, cutoff=threshold)
    return candidates[0] if candidates else value

# Distinct 'Bölüm' values ordered from most to least frequent.
unique_values = train_df['Bölüm'].value_counts().index.tolist()

# Build a mapping that folds rare spellings into the most frequent similar one.
#
# The original matched every value against the full list of unique values,
# which contains the value itself; the best match was therefore always the
# value itself and the mapping was the identity. Here each value is compared
# only with the canonical spellings accepted so far. Because values are
# visited in descending frequency, a rare variant is folded into the more
# common spelling.
canonical_values = []
similar_value_mapping = {}
for value in unique_values:
    if value == 'nan':
        # Placeholder produced by astype(str) for missing entries: keep it
        # as-is and never offer it as a match target.
        similar_value_mapping[value] = value
        continue
    match = get_best_match(value, canonical_values)
    similar_value_mapping[value] = match
    if match == value:
        # Nothing similar enough was seen before: this spelling becomes canonical.
        canonical_values.append(value)

# Apply the mapping to standardize similar values
train_df['Bölüm'] = train_df['Bölüm'].map(similar_value_mapping)

# Count the occurrences after normalizing similar values
final_value_counts = train_df['Bölüm'].value_counts()

Bölüm
isletme                                                           2913
endustri muh                                                      2838
bilgisayar muh                                                    2249
hukuk                                                             2007
elektrik elektronik muh                                           1797
                                                                  ... 
yonetim bilisim sistemleri yuksek lisans                             1
elek elektronik muh                                                  1
bilgisayar egitim ve ogretim teknolojileri ogretmenligi              1
iktisadi ve idari bilimler fakultesiyonetim bilisim sistemleri       1
nanoteknoloji muh                                                    1
Name: count, Length: 5183, dtype: int64


In [None]:
# Calculate age (in whole years) from 'Dogum Tarihi'; unparseable dates become NaT.
birth_dates = pd.to_datetime(train_df['Dogum Tarihi'], errors='coerce')

# NOTE(review): age is measured against the moment the cell runs, so it changes
# between runs; measuring against 'Basvuru Yili' may be what is wanted — confirm.
current_date = datetime.now()

# True where this year's birthday has not happened yet (always False for NaT).
birthday_pending = (birth_dates.dt.month > current_date.month) | (
    (birth_dates.dt.month == current_date.month) & (birth_dates.dt.day > current_date.day)
)
age = current_date.year - birth_dates.dt.year - birthday_pending.astype(int)

# Missing ages take the median. Assigning the result back replaces the chained
# fillna(inplace=True), which pandas deprecates.
train_df['Age'] = age.fillna(age.median())
train_df = train_df.drop(columns=['Dogum Tarihi'])

In [None]:
# Parent's education and employment mappings
# Education is encoded on an ordinal scale: 0 = none ... 5 = postgraduate.
education_mapping = {
    'yuksek lisans / doktora': 5,
    'yuksek lisans / doktara': 5,
    'yuksek lisans': 5,
    'doktora': 5,
    'ilkokul mezunu': 1,
    'ilkokul': 1,
    'ilk okul': 1,
    'ortaokul mezunu': 2,
    'ortaokul': 2,
    'orta okul': 2,
    'lise mezunu': 3,
    'lise': 3,
    'liseli' : 3,
    'universite mezunu': 4,
    'universite': 4,
    'mezun': 4,
    'egitim yok': 0,
    'egitimi yok': 0
}
calisma_mapping = {'evet': 1, 'hayir': 0, 'emekli': 2}

# Series.map is used instead of Series.replace: replace leaves every value that
# is not a key of the mapping untouched, which produced columns mixing integers
# with leftover strings. With map, unknown labels become NaN and are then
# filled with 0 exactly like originally-missing values.
for col in ['Anne Calisma Durumu', 'Baba Calisma Durumu']:
    train_df[col] = train_df[col].map(calisma_mapping).fillna(0)

for col in ['Anne Egitim Durumu', 'Baba Egitim Durumu']:
    train_df[col] = train_df[col].str.lower().map(education_mapping).fillna(0)

In [None]:
# Standardize 'Universite Kacinci Sinif'
# Text labels that are not plain numbers and the code each one maps to.
class_mapping = {
    'hazirlik': 0,
    'mezun': 7,
    'yuksek lisans': 7,
    'tez': 7,
    '0': -1,
}

def standardize_class(value):
    """Convert a raw class-year entry into an integer code.

    Returns the year itself for integers 1..6, the class_mapping code for known
    text labels (case-insensitive), and -1 for missing, empty, out-of-range or
    unrecognised input.
    """
    if pd.isna(value) or value in ('nan', ''):
        return -1
    try:
        year = int(value)
    except ValueError:
        # Not a number: fall back to the text-label table.
        return class_mapping.get(value.lower(), -1)
    if 1 <= year <= 6:
        return year
    return -1

# Encode every entry, then replace the -1 sentinel with the most common valid code.
class_levels = train_df['Universite Kacinci Sinif'].apply(standardize_class)
valid_levels = class_levels[class_levels != -1]
# Guard: with no valid entry at all, mode() is empty and [0] would raise;
# in that case the sentinel is left in place.
mode_value = valid_levels.mode()[0] if not valid_levels.empty else -1
# Assigned back instead of chained replace(inplace=True), which pandas deprecates.
train_df['Universite Kacinci Sinif'] = class_levels.replace(-1, mode_value)

In [None]:
# Clean 'Kardes Sayisi' column: non-numeric entries are treated as missing.
sibling_counts = pd.to_numeric(train_df['Kardes Sayisi'].astype(str), errors='coerce')
missing_siblings = sibling_counts.isna()

# The three most common VALID values. The original computed these after
# fillna(-1), so the -1 sentinel itself could be selected as a fill value.
top_3_values = (
    sibling_counts[~missing_siblings].astype(int).value_counts().nlargest(3).index.tolist()
)

sibling_counts = sibling_counts.fillna(-1).astype(int)
# One independent draw per missing row (the original wrote a single random
# value into every missing row). Skipped when no valid value exists.
if missing_siblings.any() and top_3_values:
    sibling_counts.loc[missing_siblings] = np.random.choice(
        top_3_values, size=int(missing_siblings.sum())
    )
train_df['Kardes Sayisi'] = sibling_counts

In [None]:
def convert_grade_range(grade):
    """Convert a raw grade entry into a single float.

    Handles plain numbers ("3.5"), ranges ("3.00-2.50" -> mean of the bounds)
    and open-ended ranges ("1.80 ve alti" / "3.50 ve ustu" -> the stated bound).

    Returns:
        The numeric grade, or np.nan when the entry is missing, is one of the
        known "no grade" phrases, or cannot be parsed.
    """
    if pd.isna(grade):
        return np.nan
    grade = str(grade).lower().strip()
    if grade in ['ortalama bulunmuyor', 'not ortalamasi yok', 'hazirligim']:
        return np.nan
    if '-' in grade or 've' in grade:
        pieces = grade.replace('ve alti', '-').replace('ve ustu', '-').split('-')
        try:
            range_values = [float(val) for val in pieces if val.strip()]
        except ValueError:
            # Text that merely contains 've' or '-' (e.g. "4.0 ve uzeri") used to
            # raise here because this conversion sat outside any try block.
            return np.nan
        if not range_values:
            # Nothing numeric left (e.g. "-"): avoid averaging an empty list.
            return np.nan
        return np.mean(range_values)
    try:
        return float(grade)
    except ValueError:
        return np.nan

# Parse every grade entry, then fill each unparseable/missing one with a value
# drawn at random from the grades that were observed.
parsed_grades = train_df['Universite Not Ortalamasi'].map(convert_grade_range)
non_nan_values = parsed_grades.dropna().values
train_df['Universite Not Ortalamasi'] = parsed_grades.map(
    lambda grade: grade if not np.isnan(grade) else np.random.choice(non_nan_values)
)

In [None]:
# Define mappings
# High-school type: 0 for the listed non-private types, 1 for private ('ozel').
lise_turu_mapping = {
    'anadolu lisesi': 0,
    'anadolu': 0,
    'duz lise': 0,
    'devlet': 0,
    'meslek lisesi': 0,
    'meslek': 0,
    'fen lisesi': 0,
    'fen': 0,
    'imam hatip lisesi': 0,
    'ozel': 1,
    'ozel lisesi': 1,
    'ozel lise': 1
}


# Preserve the original missing values
originally_missing_mask = train_df['Lise Turu'].isnull()

# Lower-case and map the values; labels that are not in the mapping become
# NaN and are assigned the catch-all code 2. Built as a standalone Series and
# assigned back once, replacing the chained fillna(inplace=True) that pandas
# deprecates.
lise_turu = train_df['Lise Turu'].str.lower().map(lise_turu_mapping).fillna(2)

# Randomly fill the original NaN values with 0, 1, or 2 (one draw per row)
lise_turu.loc[originally_missing_mask] = np.random.choice(
    [0, 1, 2], size=int(originally_missing_mask.sum())
)
train_df['Lise Turu'] = lise_turu

In [None]:
# Define keyword lists for each Lise Bolumu category
sayisal_keywords = ['sayisal', 'fen', 'matematik', 'mf', 'fen sayisal bilimleri', 'fen matematik', 'matematik fen']
esit_agirlik_keywords = ['esitagirlik', 'esit agirlik', 'tm', 'turkce matematik', 'turkce  matematik', 'ea']
sozel_keywords = ['sozel', 'sosyal', 'ts', 'sosyal bilimler']
dil_keywords = ['dil', 'yabanci dil']

# Function to map Lise Bolumu
def map_lise_bolumu(value):
    """Classify a free-text high-school track into a category code.

    Returns:
        '0' sayisal, '1' esit agirlik, '2' sozel, '3' dil, '4' anything else,
        or np.nan when the input is missing.

    Matching is by substring. The full esit-agirlik phrases are tested before
    the sayisal keywords: 'turkce matematik' contains 'matematik', so testing
    sayisal first (as the original did) labelled every such entry as sayisal.
    The two-letter abbreviations ('tm', 'ea') keep their original position
    after the sayisal check.
    """
    if pd.isna(value):
        return np.nan  # Keep NaN values intact
    value = str(value).lower()
    value = re.sub(r'[-/]', ' ', value)
    # Collapse repeated whitespace so that "turkce - matematik" (three spaces
    # after the substitution above) still matches 'turkce matematik'.
    value = re.sub(r'\s+', ' ', value).strip()

    esit_agirlik_phrases = [keyword for keyword in esit_agirlik_keywords if len(keyword) > 2]

    if any(phrase in value for phrase in esit_agirlik_phrases):
        return '1'
    elif any(keyword in value for keyword in sayisal_keywords):
        return '0'
    elif any(keyword in value for keyword in esit_agirlik_keywords):
        return '1'
    elif any(keyword in value for keyword in sozel_keywords):
        return '2'
    elif any(keyword in value for keyword in dil_keywords):
        return '3'
    else:
        return '4'

# Classify each 'Lise Bolumu' entry and store the codes as floats so that
# missing entries can stay NaN for now.
train_df['Lise Bolumu'] = train_df['Lise Bolumu'].map(map_lise_bolumu).astype(float)

# Distinct category codes that actually occur in the column.
non_missing_values = train_df['Lise Bolumu'].dropna().unique()

# Every remaining NaN gets one of those codes, drawn at random per row.
train_df['Lise Bolumu'] = train_df['Lise Bolumu'].map(
    lambda code: code if not pd.isna(code) else np.random.choice(non_missing_values)
)

In [None]:
# Per-column count of missing values after cleaning, smallest first.
print(train_df.isna().sum().sort_values())

Basvuru Yili                                                0
Ingilizce Biliyor musunuz?                                  0
Girisimcilikle Ilgili Deneyiminiz Var Mi?                   0
Stk Projesine Katildiniz Mi?                                0
Aktif olarak bir STK üyesi misiniz?                         0
Profesyonel Bir Spor Daliyla Mesgul musunuz?                0
Girisimcilik Kulupleri Tarzi Bir Kulube Uye misiniz?        0
Kardes Sayisi                                               0
Baba Calisma Durumu                                         0
Baba Egitim Durumu                                          0
Anne Calisma Durumu                                         0
Anne Egitim Durumu                                          0
id                                                          0
Lise Bolumu                                                 0
Baska Bir Kurumdan Burs Aliyor mu?                          0
Burs Aliyor mu?                                             0
Degerlen

In [None]:
# Show the 50 most frequent 'Bölüm' values (value_counts sorts by count, descending).
train_df['Bölüm'].value_counts().head(50)

Unnamed: 0_level_0,count
Bölüm,Unnamed: 1_level_1
isletme,2913
endustri muh,2838
bilgisayar muh,2249
hukuk,2007
elektrik elektronik muh,1797
makine muh,1557
iktisat,1529
tip,1334
insaat muh,1209
uluslararasi iliskiler,974


In [None]:
# Print the first 10 rows of the cleaned training frame as plain text.
print(train_df.head(10))

   Basvuru Yili  Degerlendirme Puani  Cinsiyet  Dogum Yeri Ikametgah Sehri  \
0          2014                 52.0       1.0      ankara          ankara   
1          2014                 30.0       1.0     uskudar        istanbul   
2          2014                 18.0       1.0      samsun        istanbul   
3          2014                 40.0       1.0  diyarbakir        istanbul   
4          2014                 24.0       1.0      ankara          ankara   
5          2014                 37.0       1.0    istanbul        istanbul   
6          2014                 18.0       1.0    istanbul        istanbul   
7          2014                 24.0       1.0    istanbul        istanbul   
8          2014                 40.0       1.0    erzincan        istanbul   
9          2014                 31.0       1.0       sivas        istanbul   

                     Universite Adi  Universite Turu  Burs Aliyor mu?  \
0           ihsan dogramaci bilkent              0.0                

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Split the data into features and target variable
X = train_df.drop(columns=['Degerlendirme Puani', 'id'])
y = train_df['Degerlendirme Puani']

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Explicit copies, so the column assignments below modify independent frames
# rather than slices of X.
X_train = X_train.copy()
X_val = X_val.copy()

# Separate numeric and categorical features.
# The original listed 'int64' and 'float64' among the categorical dtypes too,
# so every numeric column was converted to strings and one-hot encoded in
# addition to being scaled.
# NOTE(review): integer-coded columns produced by earlier cells (e.g. label-encoded
# cities) now count as numeric only; list them explicitly under the categorical
# features if they should be one-hot encoded.
numeric_features = X_train.select_dtypes(include=['number']).columns
categorical_features = X_train.select_dtypes(exclude=['number']).columns

# Convert categorical values to strings so each column has a single type,
# while keeping missing entries as NaN so the imputer below can see them.
for frame in (X_train, X_val):
    for col in categorical_features:
        frame[col] = frame[col].astype(str).where(frame[col].notna(), np.nan)

# Numeric features: fill missing values with the mean, then standardise
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical features: fill missing values with the most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine the numeric and categorical steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit the preprocessing on the training split only, then apply it to both splits
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

# Ready for model training
print(X_train_processed.shape, X_val_processed.shape)

(52099, 5835) (13025, 5835)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

# Random Forest Model Training
rf_model = RandomForestRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
rf_params = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf_grid = GridSearchCV(
    rf_model,
    rf_params,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_mean_squared_error',
    n_jobs=-1
)
rf_grid.fit(X_train_processed, y_train)

# Best parameters and model
best_rf_model = rf_grid.best_estimator_
print("Best RF Parameters:", rf_grid.best_params_)

# XGBoost Model Training
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter tuning for XGBoost
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0]
}

xgb_grid = GridSearchCV(
    xgb_model,
    xgb_params,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_mean_squared_error',
    n_jobs=-1
)
xgb_grid.fit(X_train_processed, y_train)

# Best parameters and model
best_xgb_model = xgb_grid.best_estimator_
print("Best XGB Parameters:", xgb_grid.best_params_)


def _report_validation_metrics(title, model):
    """Print MSE, MAE and R² of `model` on the validation split.

    Returns:
        (predictions, mse) so the caller can keep the predictions and compare
        models by validation MSE.
    """
    predictions = model.predict(X_val_processed)
    mse = mean_squared_error(y_val, predictions)
    print(f"\n{title} Model Evaluation:")
    print("MSE:", mse)
    print("MAE:", mean_absolute_error(y_val, predictions))
    print("R² Score:", r2_score(y_val, predictions))
    return predictions, mse


# Evaluate both tuned models on validation data
y_pred_rf, rf_val_mse = _report_validation_metrics("Random Forest", best_rf_model)
y_pred_xgb, xgb_val_mse = _report_validation_metrics("XGBoost", best_xgb_model)

# Choose the model with the lower validation MSE (the original always used
# Random Forest regardless of the scores printed above).
if rf_val_mse <= xgb_val_mse:
    best_model_name, best_model = "Random Forest", best_rf_model
else:
    best_model_name, best_model = "XGBoost", best_xgb_model
print("\nSelected model:", best_model_name)

# Load and preprocess test data
test_df_features = test_df.drop(columns=['id'])  # Drop 'id' for predictions

# The preprocessor was fitted on the CLEANED training frame (derived columns
# such as 'Age', encoded categories, dropped columns). test_df must go through
# the same cleaning cells first; fail early with a clear message otherwise.
missing_columns = [col for col in X_train.columns if col not in test_df_features.columns]
if missing_columns:
    raise ValueError(
        "test_df has not been passed through the training-data cleaning steps; "
        f"missing feature columns: {missing_columns}"
    )
test_processed = preprocessor.transform(test_df_features)  # Use the preprocessor pipeline

test_predictions = best_model.predict(test_processed)

# Save the predictions alongside their ids
test_df['Predictions'] = test_predictions
test_df[['id', 'Predictions']].to_csv('test_predictions.csv', index=False)