In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

In [None]:
# Read the raw training and test tables from CSV files located in the
# current working directory.
dftrain = pd.read_csv(filepath_or_buffer="train.csv")
dftest = pd.read_csv(filepath_or_buffer="test.csv")

In [3]:
# Names of the string (object-dtype) columns in each frame; these are the
# columns that get one-hot encoded below.
objeler_train = dftrain.select_dtypes(include=['object']).columns
objeler_test = dftest.select_dtypes(include=['object']).columns

# One-hot encode each frame independently. drop_first=True omits the first
# level of every encoded column; dtype=int yields 0/1 integers.
dftrain_dummies = pd.get_dummies(dftrain, columns=objeler_train, drop_first=True, dtype=int)
dftest_dummies = pd.get_dummies(dftest, columns=objeler_test, drop_first=True, dtype=int)

# Columns present in the encoded training frame but absent from the encoded
# test frame (kept as a variable for inspection).
missing_cols = set(dftrain_dummies.columns) - set(dftest_dummies.columns)

# Align the test frame to the training layout in one pass: columns found only
# in train are created and filled with 0, columns found only in test are
# dropped, and the training column order is adopted. A single reindex avoids
# inserting the missing columns one at a time into a wide frame.
dftest_dummies = dftest_dummies.reindex(columns=dftrain_dummies.columns, fill_value=0)

# Invariant required by the later cells: identical column layout in both frames.
assert list(dftest_dummies.columns) == list(dftrain_dummies.columns)

print("✅ Train ve test kolon sayısı:", dftrain_dummies.shape, dftest_dummies.shape)

✅ Train ve test kolon sayısı: (750000, 172) (250000, 172)


In [4]:
# Separate the encoded training frame into features (dfx) and target (dfy).
dfx = dftrain_dummies.drop(columns=["Listening_Time_minutes"])
dfy = dftrain_dummies.loc[:, "Listening_Time_minutes"]

# Median imputation: the per-column medians are learned from the training
# features only and then applied unchanged to both train and test.
imputer = SimpleImputer(strategy='median')
imputer.fit(dfx)
dftrain_imputed = imputer.transform(dfx)
dftest_imputed = imputer.transform(
    dftest_dummies.drop(columns=["Listening_Time_minutes"])
)


In [5]:
# Min-max scaling: the per-feature minimum and maximum are learned from the
# imputed training matrix and the same transformation is applied to test.
scaler = MinMaxScaler()
scaler.fit(dftrain_imputed)
dftrain_scaled = scaler.transform(dftrain_imputed)
dftest_scaled = scaler.transform(dftest_imputed)


In [23]:

# Cross-validation scheme: 5 shuffled folds with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator: gradient-boosted regressor with squared-error objective.
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyper-parameter grid: 3 * 3 * 3 * 2 * 2 = 108 candidates, i.e. 540 fits
# with the 5-fold scheme above.
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[4, 6, 8],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive search scored by R².
# NOTE(review): the imputer and scaler were fitted on the full training
# matrix before this search, so each validation fold has influenced the
# preprocessing statistics — confirm whether that is acceptable here.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=cv,
    scoring='r2',
    n_jobs=1,  # one fit at a time; -1 would run fits in parallel on all cores
    verbose=2,
)

# Run the search on the scaled training matrix.
grid_search.fit(dftrain_scaled, dfy)

# Report the best parameter combination and its mean cross-validated R².
print("En iyi parametreler:", grid_search.best_params_)
print("En iyi R² skoru:", grid_search.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=50, subsample=0.8; total time=   2.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=50, subsample=0.8; total time=   1.9s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=50, subsample=0.8; total time=   2.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=50, subsample=0.8; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=50, subsample=0.8; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=50, subsample=1.0; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=50, subsample=1.0; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=50, subsample=1.0; total time=   2.2s
[CV] END colsampl

KeyboardInterrupt: 