In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import xgboost as xgb

# Veri Yükleme ve Hazırlık
df = pd.read_csv("element_verileri.csv")

# Elementler ve hedefler
elements = ['He', 'Ne', 'Cl', 'Mg', 'Ti', 'Fe', 'Ag', 'Ni', 'Si', 'Cu', 'Mn', 'Pt', 'U', 'Al', 'Ar', 'N', 'Zn', 'P', 'H', 'Ca', 'C', 'Cr', 'S', 'Li', 'Na', 'V']
targets = ['life_score', 'science_score', 'mining_score', 'success_score']

# Özellikler ve hedefler
X = df[elements].copy()
y = df[targets]

# Yeni Özellikler Eklemek (Özellik mühendisliği)
X['average_density'] = X.mean(axis=1)  # Elementlerin yoğunluklarının ortalamasını yeni bir özellik olarak ekliyoruz.
X['sum_density'] = X.sum(axis=1)  # Elementlerin toplam yoğunluğunu yeni bir özellik olarak ekliyoruz.

# Özelliklerin normalizasyonu
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Model Eğitimi ve Parametre Optimizasyonu (GridSearchCV ile)
models = {}
for target in targets:
    y_target = df[target]
    
    # Eğitim ve test verilerini ayır
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_target, test_size=0.2, random_state=42)
    
    # Random Forest için parametreler
    rf_param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 20],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True],  # Overfitting riskini azaltmak için bootstrap kullanıyoruz.
    }
    
    # XGBoost için parametreler
    xgb_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.7, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'alpha': [0.1, 0.5],  # L2 düzenlileştirme ile modelin karmaşıklığını sınırlıyoruz.
        'lambda': [0.1, 0.5],  # L1 düzenlileştirme ile modelin karmaşıklığını sınırlıyoruz.
    }
    
    # GridSearchCV - RandomForest
    rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    rf_grid_search.fit(X_train, y_train)
    
    # GridSearchCV - XGBoost
    xgb_grid_search = GridSearchCV(xgb.XGBRegressor(random_state=42, early_stopping_rounds=10), xgb_param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    xgb_grid_search.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    
    # Model seçiminde en iyi sonucu veren model
    if rf_grid_search.best_score_ > xgb_grid_search.best_score_:
        model = rf_grid_search.best_estimator_
    else:
        model = xgb_grid_search.best_estimator_
    
    # Modeli kaydet
    models[target] = model
    
    # Modeli kaydetmek için pickle
    with open(f'{target}_model.pkl', 'wb') as f:
        pickle.dump(model, f)
    
    # Modeli test et
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Target: {target}")
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")

# Normalizer'ı kaydet
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)





In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import pickle

# Veri Yükleme ve Hazırlık
df = pd.read_csv("element_verileri.csv")

# Elementler ve hedefler
elements = ['He', 'Ne', 'Cl', 'Mg', 'Ti', 'Fe', 'Ag', 'Ni', 'Si', 'Cu', 'Mn', 'Pt', 'U', 'Al', 'Ar', 'N', 'Zn', 'P', 'H', 'Ca', 'C', 'Cr', 'S', 'Li', 'Na', 'V']
targets = ['life_score', 'science_score', 'mining_score', 'success_score']

# Özellikler ve hedefler
X = df[elements]
y = df[targets]

# Özelliklerin normalizasyonu
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Model Eğitimi
models = {}
for target in targets:
    y_target = df[target]
    
    # Eğitim ve test verilerini ayır
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_target, test_size=0.2, random_state=42)
    
    # Random Forest modelini oluştur
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Modeli kaydet
    models[target] = model
    
    # Modeli kaydetmek için pickle
    with open(f'{target}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

# Normalizer'ı kaydet
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import xgboost as xgb

# Veri Yükleme
df = pd.read_csv("element_verileri.csv")

# Elementler ve hedefler
elements = ['He', 'Ne', 'Cl', 'Mg', 'Ti', 'Fe', 'Ag', 'Ni', 'Si', 'Cu', 'Mn', 'Pt', 'U', 'Al', 'Ar', 'N', 'Zn', 'P', 'H', 'Ca', 'C', 'Cr', 'S', 'Li', 'Na', 'V']
targets = ['life_score', 'science_score', 'mining_score', 'success_score']

# Özellikler ve hedefler
X = df[elements].copy()
y = df[targets]

# Özellik mühendisliği (Feature Engineering)
X['average_density'] = X.mean(axis=1)
X['sum_density'] = X.sum(axis=1)
X['H_to_C_ratio'] = X['H'] / (X['C'] + 1e-6)
X['life_related_sum'] = X[['H', 'C', 'N', 'P', 'S']].sum(axis=1)
X['metal_sum'] = X[['Fe', 'Ni', 'Cu', 'Mn', 'Zn', 'Ag', 'Pt', 'Ti']].sum(axis=1)

# Özelliklerin normalizasyonu
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Repeated K-Fold Cross Validation
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

# Model Eğitimi ve Parametre Optimizasyonu
models = {}
for target in targets:
    y_target = df[target]
    
    # Eğitim ve test verilerini ayır
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_target, test_size=0.2, random_state=42)
    
    # İsteğe bağlı: Data Augmentation (hafif veri gürültüsü ekleme)
    def augment(X, y, noise_level=0.01):
        noise = np.random.normal(0, noise_level, X.shape)
        X_aug = X + noise
        y_aug = y
        return np.vstack((X, X_aug)), np.hstack((y, y_aug))
    
    X_train, y_train = augment(X_train, y_train)

    # Random Forest için genişletilmiş parametreler
    rf_param_grid = {
        'n_estimators': [100, 200],  # Daha az ağaç
        'max_depth': [10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1],
        'bootstrap': [True],
    }
    
    # XGBoost için genişletilmiş parametreler
    xgb_param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5],
        'learning_rate': [0.01, 0.05],
        'subsample': [0.7, 1.0],
        'colsample_bytree': [0.7, 1.0],
        'alpha': [0.1, 0.5],
        'lambda': [0.1, 0.5],
    }
    
    # GridSearchCV - Random Forest
    rf_grid_search = GridSearchCV(
        RandomForestRegressor(random_state=42),
        rf_param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1  # Paralel işlem
    )
    rf_grid_search.fit(X_train, y_train)
    
    # GridSearchCV - XGBoost
    xgb_grid_search = GridSearchCV(
        xgb.XGBRegressor(
            random_state=42,
            early_stopping_rounds=10,  # Erken durdurma
            eval_metric='rmse',
            verbosity=0
        ),
        xgb_param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1  # Paralel işlem
    )
    xgb_grid_search.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    
    # Model seçiminde en iyi sonucu veren model
    if rf_grid_search.best_score_ > xgb_grid_search.best_score_:
        model = rf_grid_search.best_estimator_
    else:
        model = xgb_grid_search.best_estimator_
    
    # Modeli kaydet
    models[target] = model
    
    # Modeli kaydetmek için pickle
    with open(f'{target}_model.pkl', 'wb') as f:
        pickle.dump(model, f)
    
    # Modeli test et
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Target: {target}")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R^2 Score: {r2:.4f}")

# Scaler'ı kaydet
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


Target: life_score
Mean Squared Error: 0.0003
R^2 Score: 0.9209
Target: science_score
Mean Squared Error: 0.0001
R^2 Score: 0.9691
Target: mining_score
Mean Squared Error: 0.0002
R^2 Score: 0.9229
Target: success_score
Mean Squared Error: 0.0001
R^2 Score: 0.9609
