In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from category_encoders import TargetEncoder
from sklearn.feature_selection import RFE

# Setting related to pandas' FutureWarning: opt in to the
# 'future.no_silent_downcasting' behaviour.
pd.set_option('future.no_silent_downcasting', True)

# 1. Load the files
# The same reader options are shared by all three CSV files.
csv_read_options = {'low_memory': False}
train_df = pd.read_csv(r"C:\Users\Admin\Desktop\train.csv", **csv_read_options)
test_df = pd.read_csv(r"C:\Users\Admin\Desktop\test_x.csv", **csv_read_options)
submission_sample = pd.read_csv(
    r"C:\Users\Admin\Desktop\sample_submission.csv",
    **csv_read_options,
)

# 2. Missing-value analysis and imputation
target_col = 'Degerlendirme Puani'

# Rows without a label are dropped instead of being mean-imputed: an imputed
# target is a fabricated label, not an observation.
train_df = train_df.dropna(subset=[target_col])

# Column lists describe FEATURES only. They are reused further down for
# outlier filtering and for transforming the test set, which is not expected
# to carry the target column, so the target must not appear in either list.
categorical_cols = (
    train_df.select_dtypes(include=['object']).columns.drop([target_col], errors='ignore')
)
numerical_cols = (
    train_df.select_dtypes(exclude=['object']).columns.drop([target_col], errors='ignore')
)

# Fill missing values in numerical columns with the column mean
imputer_num = SimpleImputer(strategy='mean')
train_df[numerical_cols] = imputer_num.fit_transform(train_df[numerical_cols])

# Fill missing values in categorical columns with the most frequent value (mode)
imputer_cat = SimpleImputer(strategy='most_frequent')
train_df[categorical_cols] = imputer_cat.fit_transform(train_df[categorical_cols])
# Apply the fitted (train) modes to the test set as well, so a missing
# category is treated the same way in both frames before encoding.
test_df[categorical_cols] = imputer_cat.transform(test_df[categorical_cols])

# 3. Apply target encoding
# NOTE(review): the encoder is fitted on all training rows before the
# train/validation split performed later in this script, so the validation
# RMSE is optimistic (target leakage). Fitting it on the training fold only
# would require moving the split above this step.
target_encoder = TargetEncoder()
train_df[categorical_cols] = target_encoder.fit_transform(
    train_df[categorical_cols], train_df[target_col]
)

# Apply the same (already fitted) encoding to the test data
test_df[categorical_cols] = target_encoder.transform(test_df[categorical_cols])

# 4. Outlier analysis and filtering (IQR method)
# Columns are processed one after another, so each column's quartiles are
# computed on the rows that survived the previous columns' filters.
for col in numerical_cols:
    q1, q3 = train_df[col].quantile([0.25, 0.75])
    iqr = q3 - q1

    # With a zero IQR (binary flags, near-constant columns) both bounds
    # collapse to the same value and the filter would discard every row that
    # differs from it. Such a column carries no usable spread for this rule,
    # so it is skipped.
    if iqr == 0:
        continue

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Series.between is inclusive on both ends, matching >= / <=.
    train_df = train_df[train_df[col].between(lower_bound, upper_bound)]

# 5. Separate the target from the features
y = train_df['Degerlendirme Puani']
X = train_df.drop(columns=['Degerlendirme Puani'])

# Hold out 20% of the rows as a validation set
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Feature selection with Recursive Feature Elimination (RFE):
# a random forest ranks the features and one feature is removed per step
# until 10 remain.
rfe_estimator = RandomForestRegressor(random_state=42)
rfe_selector = RFE(estimator=rfe_estimator, n_features_to_select=10, step=1)
rfe_selector.fit(X_train, y_train)

# Keep only the selected features in both splits
selected_features = X_train.columns[rfe_selector.support_]
X_train_rfe = X_train[selected_features]
X_valid_rfe = X_valid[selected_features]

# 7. Hyperparameter optimisation for RandomForest
param_grid_rf = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, 40, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    # 'auto' was removed from RandomForestRegressor in scikit-learn 1.3 and
    # makes every fit using it fail. For regressors it meant "use all
    # features", which 1.0 expresses on both old and new versions.
    'max_features': [1.0, 'sqrt', 'log2']
}

grid_search_rf = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid_rf,
                           cv=5,
                           n_jobs=-1,
                           verbose=2)
grid_search_rf.fit(X_train_rfe, y_train)

# Best RandomForest parameters
best_params_rf = grid_search_rf.best_params_
print("En iyi RandomForest parametreleri:", best_params_rf)

# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the full training split with the same random_state, so
# reuse that estimator instead of training an identical forest again.
best_model_rf = grid_search_rf.best_estimator_

# Predict on the validation set
y_pred_rf = best_model_rf.predict(X_valid_rfe)
rmse_rf = np.sqrt(mean_squared_error(y_valid, y_pred_rf))
print("\nRandomForest Modeli Validation RMSE:", rmse_rf)

# 8. Hyperparameter optimisation for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0]
}

grid_search_xgb = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
                               param_grid=param_grid_xgb,
                               cv=5,
                               n_jobs=-1,
                               verbose=2)
grid_search_xgb.fit(X_train_rfe, y_train)

# Best XGBoost parameters
best_params_xgb = grid_search_xgb.best_params_
print("En iyi XGBoost parametreleri:", best_params_xgb)

# Use the estimator GridSearchCV refitted on the full training split.
# Rebuilding the model from best_params_ alone would drop the objective and
# random_state given to the searched estimator, so the final model would not
# be the configuration that was actually tuned (the seed matters whenever
# subsample < 1).
xgb_best_model = grid_search_xgb.best_estimator_

# Predict on the validation set
y_pred_xgb = xgb_best_model.predict(X_valid_rfe)
rmse_xgb = np.sqrt(mean_squared_error(y_valid, y_pred_xgb))
print("\nXGBoost Modeli Validation RMSE:", rmse_xgb)

# 9. XGBoost with Stratified K-Fold CV
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# StratifiedKFold stratifies on class labels and rejects a continuous
# regression target. To keep the folds balanced over the score range, the
# target is cut into quantile bins and the folds are stratified on the bin
# index; the model itself is still trained on the real y_train values.
# duplicates='drop' merges bins when many rows share the same score.
y_train_bins = pd.qcut(y_train, q=10, labels=False, duplicates='drop')

# GridSearchCV accepts a precomputed list of (train_idx, test_idx) splits.
stratified_splits = list(strat_kfold.split(X_train_rfe, y_train_bins))

grid_search_xgb_cv = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
                                  param_grid=param_grid_xgb,
                                  cv=stratified_splits,
                                  n_jobs=-1,
                                  verbose=2)
grid_search_xgb_cv.fit(X_train_rfe, y_train)

# Predict with the best (refitted) model
y_pred_xgb_cv = grid_search_xgb_cv.predict(X_valid_rfe)
rmse_xgb_cv = np.sqrt(mean_squared_error(y_valid, y_pred_xgb_cv))
print("\nStratified K-Fold ile XGBoost Modeli Validation RMSE:", rmse_xgb_cv)

# 10. Ensemble model: plain average of the RandomForest and XGBoost predictions
ensemble_predictions = (best_model_rf.predict(X_valid_rfe) + xgb_best_model.predict(X_valid_rfe)) / 2
rmse_ensemble = np.sqrt(mean_squared_error(y_valid, ensemble_predictions))
print("\nEnsemble Modeli Validation RMSE:", rmse_ensemble)

# 11. Predict on the test data
# Keep the ids exactly as read from the file: the numeric imputer returns
# floats, which would otherwise turn the submission ids into e.g. 1.0, 2.0.
test_ids = test_df['id'].copy()

# imputer_num must receive exactly the columns it was fitted on. A fitted
# column that is absent from the test file (e.g. the target, if it was part
# of numerical_cols) is supplied as all-NaN so that transform() accepts the
# frame; only columns that really exist in the test file are written back.
test_numeric = test_df.reindex(columns=numerical_cols)
test_numeric_imputed = pd.DataFrame(
    imputer_num.transform(test_numeric),
    columns=test_numeric.columns,
    index=test_df.index,
)
numeric_in_test = [c for c in numerical_cols if c in test_df.columns]
test_df[numeric_in_test] = test_numeric_imputed[numeric_in_test]

# Both models were trained on the RFE-selected columns only, so the test
# matrix has to contain exactly those columns, in the same order.
X_test_rfe = test_df[X_train_rfe.columns]
test_predictions_ensemble = (best_model_rf.predict(X_test_rfe) + xgb_best_model.predict(X_test_rfe)) / 2

# Submission file
submission = pd.DataFrame({
    'id': test_ids,
    'Degerlendirme Puani': test_predictions_ensemble
})

submission.to_csv(r"C:\Users\Admin\Desktop\submission_ensemble.csv", index=False)
print("Ensemble submission dosyası oluşturuldu.")

# 12. Exploratory Data Analysis (EDA)

# Distribution of the target values
fig_target, ax_target = plt.subplots(figsize=(10, 6))
sns.histplot(y, kde=True, ax=ax_target)
ax_target.set_title('Degerlendirme Puani Dağılımı')
ax_target.set_xlabel('Degerlendirme Puani')
ax_target.set_ylabel('Frekans')
plt.show()

# Pairwise correlation between all columns of the training frame
corr = train_df.corr()
fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=False, cmap='coolwarm', ax=ax_corr)
ax_corr.set_title('Özellikler Arası Korelasyon Matrisi')
plt.show()

# Distribution of the ensemble's validation errors (actual - predicted)
residuals_ensemble = y_valid - ensemble_predictions
fig_resid, ax_resid = plt.subplots(figsize=(10, 6))
sns.histplot(residuals_ensemble, kde=True, ax=ax_resid)
ax_resid.set_title('Ensemble Modeli Hata Dağılımı')
ax_resid.set_xlabel('Hata Değeri')
ax_resid.set_ylabel('Frekans')
plt.show()
