In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# NOTE(review): the paths below are absolute and user-specific, so this cell only runs on
# the original author's machine; consider a configurable data directory.
#data_00="/home/mustapha/Downloads/Predictive-Modeling-of-Ionic-Conductivity-in-Garnet-Type-Solid-Electrolytes-main/Data/data_65.csv"
file_path = "/home/mustapha/Downloads/data_augmented.xlsx"
# Second copy of the workbook; only referenced by the commented-out read below.
file_path1 = "/home/mustapha/Downloads/Predictive-Modeling-of-Ionic-Conductivity-in-Garnet-Type-Solid-Electrolytes-main/Data/data_augmented.xlsx"



#df0 = pd.read_excel(file_path1)
# Load the workbook into the working frame used by the cells that follow.
df = pd.read_excel(file_path)
# Positional split at row 176: df1 holds the first 176 rows, df2 the remainder.
# NOTE(review): neither slice is used again in the visible cells — confirm they are still needed.
df1 = df.iloc[:176]
df2 = df.iloc[176:]

In [None]:
def transform_to_material_formula(row):
    """Render one composition row as a formula string, e.g. ``Li6.40Al0.20La3.00Zr2.00O12``.

    Species are written in the fixed order Li, Li-site dopant, La, La-site dopant,
    Zr, Zr-site dopant; each one appears only when its stoichiometry is strictly
    positive and is formatted with two decimals. ``O12`` is always appended.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the six stoichiometry
            columns and the ``li_dopant`` / ``la_dopant`` / ``zr_dopant`` symbols.

    Returns:
        The assembled formula string.
    """
    # Read every field first so a missing column fails before any formatting.
    li_amount = row['Li stoichiometry']
    la_amount = row['La stoichiometry']
    zr_amount = row['Zr stoichiometry']
    li_dopant_amount = row['Li site dopant stoichiometry']
    la_dopant_amount = row['La site dopant stoichiometry']
    zr_dopant_amount = row['Zr site dopant stoichiometry']
    li_dopant = row['li_dopant']
    la_dopant = row['la_dopant']
    zr_dopant = row['zr_dopant']

    # (symbol, amount) pairs in output order: each host element is followed by
    # the dopant sitting on its site.
    species = (
        ('Li', li_amount),
        (li_dopant, li_dopant_amount),
        ('La', la_amount),
        (la_dopant, la_dopant_amount),
        ('Zr', zr_amount),
        (zr_dopant, zr_dopant_amount),
    )

    # `amount > 0` is False for zero, negative and NaN amounts, so those are skipped.
    fragments = [f'{symbol}{amount:.2f}' for symbol, amount in species if amount > 0]
    fragments.append('O12')
    return ''.join(fragments)



# Add a 'MaterialFormula' column by formatting every row of df (axis=1 passes rows, not columns).
df['MaterialFormula'] = df.apply(transform_to_material_formula, axis=1)


In [None]:
#def interpolate_relative_density(df):
#    # Sort by 'MaterialFormula' and 'Ionic conductivity' to ensure correct interpolation
#    df = df.sort_values(by=['MaterialFormula', 'Ionic conductivity']).reset_index(drop=True)

#    # Interpolate 'Relative density' within each 'MaterialFormula' group
#    interpolated = df.groupby('MaterialFormula')['Relative density'].apply(lambda group: group.interpolate(method='linear'))

#    # Ensure the index matches the original DataFrame's index
#    df['Relative density'] = interpolated.reset_index(level=0, drop=True)

#   # Fill remaining missing values with forward fill and backward fill
#    df['Relative density'] = df['Relative density'].ffill().bfill()

#   return df



#df = interpolate_relative_density(df)
#df.info()


In [None]:
import pandas as pd
from sklearn.impute import KNNImputer

def knn_impute_relative_density(df, n_neighbors=5):
    """Fill missing 'Relative density' values using 'log_cond' as the neighbour feature.

    A KNN imputer is fitted on the two columns ['log_cond', 'Relative density'] and
    its output for 'Relative density' replaces that column. Any value still missing
    afterwards is forward- then backward-filled.

    KNN is skipped (only the forward/backward fill is applied) when either column
    is entirely NaN:
      * 'log_cond' all NaN  -> there is nothing to measure neighbours with;
      * 'Relative density' all NaN -> KNNImputer discards all-NaN columns, so the
        last output column would be 'log_cond' and would silently overwrite
        'Relative density'.

    Args:
        df: DataFrame with 'log_cond' and 'Relative density' columns. It is
            modified in place.
        n_neighbors: Number of neighbours passed to KNNImputer.

    Returns:
        The same DataFrame object, with 'Relative density' filled where possible.
    """
    density = df['Relative density']

    # Guard both degenerate cases before touching the imputer (see docstring).
    if df['log_cond'].isna().all() or density.isna().all():
        df['Relative density'] = density.ffill().bfill()
        return df

    # Impute on the two relevant columns only; column order fixes which output
    # column is 'Relative density'.
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_values = imputer.fit_transform(df[['log_cond', 'Relative density']])
    df['Relative density'] = imputed_values[:, -1]

    # Safety net for anything the imputer left missing.
    df['Relative density'] = df['Relative density'].ffill().bfill()

    return df

# Fill the gaps in 'Relative density' on the working frame, then check the non-null counts.
df = knn_impute_relative_density(df)
df.info()


In [None]:
# Formulas earmarked for exclusion. The filter on the next line is commented out, so this
# list is currently defined but not applied to df.
materials_to_remove = ['Li6.40Al0.20La3.00Zr2.00O12', 'Li6.25Al0.25La3.00Zr2.00O12']
#df = df[~df['MaterialFormula'].isin(materials_to_remove)]
df.info()

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import linregress
import matplotlib.pyplot as plt
import seaborn as sns

def check_monotonicity_with_deviation_removal(df, max_allowed_deviations, deviation_threshold):
    """Keep only materials whose log_cond rises with relative density, minus outliers.

    For every 'MaterialFormula' group (rows sorted by 'Relative density'):
      * any NaN in 'log_cond', or any infinite density/conductivity -> group skipped;
      * fewer than two distinct densities -> group kept unchanged;
      * otherwise a straight line is fitted to log_cond vs. density:
          - slope <= 0 -> the material is dropped and plotted for inspection;
          - rows whose residual exceeds ``deviation_threshold`` in absolute value
            are removed if there are at most ``max_allowed_deviations`` of them,
            otherwise the whole material is dropped.

    Args:
        df: DataFrame with 'MaterialFormula', 'Relative density' and 'log_cond'.
        max_allowed_deviations: Largest number of outlier rows tolerated per material.
        deviation_threshold: Absolute residual above which a row counts as an outlier.

    Returns:
        DataFrame with the surviving rows (original index labels preserved). If no
        material survives, an empty frame with the columns of ``df`` is returned
        instead of letting ``pd.concat`` raise on an empty list.
    """
    kept_groups = []
    non_increasing_materials = []

    for material, group in df.groupby('MaterialFormula'):
        group_sorted = group.sort_values('Relative density')
        densities = group_sorted['Relative density'].values
        conductivities = group_sorted['log_cond'].values

        # NOTE(review): only conductivities are screened for NaN; NaN densities are
        # not rejected here — confirm densities are always filled upstream.
        if np.any(np.isnan(conductivities)):
            print(f"Skipping material {material} due to NaN values.")
            continue

        if np.any(np.isinf(densities)) or np.any(np.isinf(conductivities)):
            print(f"Skipping material {material} due to infinite values.")
            continue

        # A regression needs at least two distinct x values; keep such groups as-is.
        if len(np.unique(densities)) < 2:
            print(f"Including material {material} with a single unique 'Relative density' value.")
            kept_groups.append(group_sorted)
            continue

        # Linear fit captures the average trend of log_cond with density.
        slope, intercept, _, _, _ = linregress(densities, conductivities)

        if slope <= 0:
            # Never appended to kept_groups, so no later filtering is needed.
            non_increasing_materials.append(material)
            continue

        deviations = conductivities - (intercept + slope * densities)
        significant_deviations = np.abs(deviations) > deviation_threshold

        # Too many outliers: discard the whole material.
        if significant_deviations.sum() > max_allowed_deviations:
            continue

        kept_groups.append(group_sorted[~significant_deviations])

    # Only touch the plotting libraries when there is something to show.
    if non_increasing_materials:
        _plot_non_increasing_materials(df, non_increasing_materials)

    if not kept_groups:
        return df.iloc[0:0].copy()

    return pd.concat(kept_groups)


def _plot_non_increasing_materials(df, materials):
    """Draw one log_cond-vs-density line plot per material with a non-positive slope."""
    sns.set(style="whitegrid")
    for material in materials:
        group = df[df['MaterialFormula'] == material]
        plt.figure(figsize=(6, 4))
        sns.lineplot(
            x='Relative density',
            y='log_cond',
            data=group,
            marker='o',
            label=material,
        )
        plt.title(f'Non-Increasing Trend for {material}')
        plt.xlabel('Relative density')
        plt.ylabel('log_cond')
        plt.legend()
        plt.show()




In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_top_materials(df, num_materials):
    """Plot log_cond against relative density for the most frequent materials.

    Args:
        df: DataFrame with 'MaterialFormula', 'Relative density' and 'log_cond'.
        num_materials: How many of the most frequently occurring formulas to plot;
            one figure is shown per formula.
    """
    # Formulas ranked by number of rows, truncated to the requested count.
    most_frequent = df['MaterialFormula'].value_counts().nlargest(num_materials).index

    sns.set(style="whitegrid")

    for formula in most_frequent:
        rows = df[df['MaterialFormula'] == formula]
        plt.figure(figsize=(6, 4))
        sns.lineplot(data=rows, x='Relative density', y='log_cond', marker='o', label=formula)
        plt.title(f'Line Plot for {formula}')
        plt.xlabel('Relative density')
        plt.ylabel('log_cond')
        plt.legend()
        plt.show()


# Plot the 7 most frequent materials on df as it stands before the monotonicity filter runs.
plot_top_materials(df, 7)

In [None]:

# Overwrite df with the filtered rows: up to 10 outliers per material are removed, an
# outlier being a row more than 0.2 away from the fitted line.
df = check_monotonicity_with_deviation_removal(df, max_allowed_deviations=10, deviation_threshold=.2)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_electric_neutrality(df, tolerance):
    """Flag rows whose total cation charge deviates from 24 and plot the deviation.

    Two columns are written onto ``df`` itself: 'Total Charge' (Li x 1 + La x 3 +
    Zr x 4 plus, for each site, dopant stoichiometry x dopant ion charge) and
    'Charge Difference' ('Total Charge' - 24). The difference is scatter-plotted
    against the row index together with the +/- ``tolerance`` band.

    Args:
        df: DataFrame containing the nine stoichiometry / ion-charge columns.
        tolerance: Largest absolute charge difference that is still accepted.

    Returns:
        (df_filtered, non_neutrality_indexes): ``df`` without the rows whose
        absolute charge difference exceeds ``tolerance``, and the index labels of
        those rows.

    Raises:
        ValueError: If a required column is absent; the first missing one is named.
    """
    required_columns = [
        'Li stoichiometry', 'La stoichiometry', 'Zr stoichiometry',
        'Li site dopant stoichiometry', 'Li site dopant ion charge',
        'La site dopant stoichiometry', 'La site dopant ion charge',
        'Zr site dopant stoichiometry', 'Zr site dopant ion charge'
    ]

    # Report the first absent column, in the order listed above.
    first_missing = next((name for name in required_columns if name not in df.columns), None)
    if first_missing is not None:
        raise ValueError(f"Column '{first_missing}' is missing from the DataFrame")

    # Host cations first (Li +1, La +3, Zr +4), then each site's dopant contribution,
    # accumulated left to right.
    charge = df['Li stoichiometry'] * 1
    charge = charge + df['La stoichiometry'] * 3
    charge = charge + df['Zr stoichiometry'] * 4
    for site in ('Li', 'La', 'Zr'):
        charge = charge + df[f'{site} site dopant stoichiometry'] * df[f'{site} site dopant ion charge']
    df['Total Charge'] = charge

    # 24 is the reference total that the charge is compared against.
    df['Charge Difference'] = df['Total Charge'] - 24

    # Rows with a NaN difference compare False and are therefore not flagged.
    outside_tolerance = df['Charge Difference'].abs() > tolerance
    non_neutrality_indexes = df[outside_tolerance].index
    df_filtered = df.drop(non_neutrality_indexes)

    plt.figure(figsize=(10, 6))
    plt.scatter(df.index, df['Charge Difference'], alpha=0.7, edgecolors='w', s=100)
    plt.axhline(y=0, color='r', linestyle='--', label='Neutrality Line (Charge = 24)')
    plt.axhline(y=tolerance, color='g', linestyle='--', label=f'Neutrality Tolerance ±{tolerance}')
    plt.axhline(y=-tolerance, color='g', linestyle='--')
    plt.xlabel('Index')
    plt.ylabel('Charge Difference (Total Charge - 24)')
    plt.title('Electric Neutrality of the Material')
    plt.legend()
    plt.grid(True)
    plt.show()

    return df_filtered, non_neutrality_indexes

In [None]:
# Same plots again; df now holds the rows returned by the monotonicity filter.
plot_top_materials(df, 7)

In [None]:
# Row count and non-null counts after the monotonicity filter.
df.info()

In [None]:
# Keep only rows whose total charge is within 0.2 of 24; df is replaced by the filtered
# frame, which also carries the added 'Total Charge' and 'Charge Difference' columns.
df, non_neutrality_indexes = plot_electric_neutrality(df, 0.2)

# Inspect the filtered DataFrame and the indices of non-neutral rows
df.info()

In [None]:
# Columns removed before modelling, grouped as in the original sequence of drops:
# the log target, bibliographic/quality metadata, the formula string, dopant
# symbols, and the two charge-check helper columns.
# 'Ionic conductivity' is deliberately left in: it is split off as Y further down.
columns_to_drop = [
    'log_cond',
    'Publication year', 'Quality of ionic conductivity', 'source', 'Doping strategy',
    'MaterialFormula',
    'li_dopant', 'la_dopant', 'zr_dopant',
    'Total Charge', 'Charge Difference',
]
X = df.copy().drop(columns_to_drop, axis=1)

In [None]:
# Remaining columns and their non-null counts before imputation.
X.info()

In [None]:
from sklearn.impute import KNNImputer
# Impute every remaining gap with a 3-neighbour KNN imputer and rebuild a DataFrame
# with the original column names (the new frame gets a fresh 0..n-1 index).
# NOTE(review): 'Ionic conductivity' is still part of X here, so the target takes part in
# the neighbour distances (and any missing target value is itself imputed); the imputer
# is also fitted before the train/test split. Confirm this leakage is acceptable.
knn_imputer = KNNImputer(n_neighbors=3)
X_knn = pd.DataFrame(data=knn_imputer.fit_transform(X), columns=X.columns)

# Split the imputed table into target and features.
Y = X_knn['Ionic conductivity']
X_knn = X_knn.drop(columns='Ionic conductivity')

In [None]:
# Feature names available to the models.
X_knn.columns


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import pandas as pd

# Seed shared by the data split and by the scikit-learn ensemble models, so the
# comparison table is reproducible across runs.
RANDOM_STATE = 42

# Candidate regressors, all with default hyperparameters. CatBoost is silenced
# because it otherwise prints one line per boosting iteration.
models = [
    HistGradientBoostingRegressor(random_state=RANDOM_STATE),
    LinearRegression(),
    Ridge(),
    RandomForestRegressor(random_state=RANDOM_STATE),
    GradientBoostingRegressor(random_state=RANDOM_STATE),
    KNeighborsRegressor(),
    XGBRegressor(),
    LGBMRegressor(),
    CatBoostRegressor(verbose=0)
]

# Set the test split size
test_size = 0.20

# Column groups for preprocessing, chosen by dtype.
numeric_features = X_knn.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_knn.select_dtypes(include=['object', 'category']).columns

# Numeric columns: mean-impute, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill with the literal 'missing', then one-hot encode;
# categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# The split does not depend on the model, so it is done once, outside the loop.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_knn, Y, test_size=test_size, random_state=RANDOM_STATE
)

# One record per model: name, test R2 and test MSE.
results = []

for model in models:
    # Preprocessing is refitted on the training data together with each model.
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])

    pipeline.fit(X_train, Y_train)
    Y_test_pred = pipeline.predict(X_test)

    mse = mean_squared_error(Y_test, Y_test_pred)
    r2 = r2_score(Y_test, Y_test_pred)

    results.append({
        "Model": type(model).__name__,
        "R-squared (R2)": r2,
        "Mean Squared Error (MSE)": mse
    })

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(results)




In [None]:
# Structure of the filtered source frame (still includes the columns dropped from X).
df.info()

In [None]:
# Size and columns of the held-out split used in the comparison above.
X_test.info()

In [None]:
# All column names of the filtered source frame.
df.columns

In [None]:
# Rank the models from best to worst test R2 and renumber the rows from 0.
results_df = (
    results_df
    .sort_values(by='R-squared (R2)', ascending=False)
    .reset_index(drop=True)
)
results_df

## CatBoostRegressor

In [None]:
import itertools
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
import pandas as pd

# Base feature sets: exactly one of these lists is used per candidate.
GROUP1 = [
    #['Li stoichiometry', 'La stoichiometry', 'Zr stoichiometry'],
    #['Li site dopant stoichiometry', 'La site dopant stoichiometry', 'Zr site dopant stoichiometry']
    ['Li stoichiometry', 'La stoichiometry', 'Zr stoichiometry','Relative density'],
    ['Li site dopant stoichiometry', 'La site dopant stoichiometry', 'Zr site dopant stoichiometry','Relative density']
]

# Dopant descriptor triplets (one column per site); any non-empty subset of these
# lists is added on top of the base set.
GROUP2 = [
    ['Li site dopant ionic radius', 'La site dopant ionic radius', 'Zr site dopant ionic radius'],
    ['Li site dopant atomic radius', 'La site dopant atomic radius', 'Zr site dopant atomic radius pm'],
    ['Li site dopant electron affinity', 'La site dopant electron affinity', 'Zr site dopant electron affinity'],
    ['Li site dopant e_ionisation', 'La site dopant e_ionisation', 'Zr site dopant e_ionisation'],
    ['Li site dopant atomic number', 'La site dopant atomic number', 'Zr site dopant atomic number'],
    ['Li site dopant molar mass', 'La site dopant molar mass', 'Zr site dopant molar mass'],
    ['Li site dopant electroneg.', 'La site dopant electroneg.', 'Zr site dopant electroneg.'],
    ['Li site dopant ion charge', 'La site dopant ion charge', 'Zr site dopant ion charge'],
    ['Li site dopant crystal rad.', 'La site dopant crystal rad.', 'Zr site dopant crystal rad.']
]

# One record per candidate feature set.
results = []

# The row split is the same for every candidate, so it is computed once on the full
# table; each candidate only selects its columns from the two halves.
X_train_full, X_test_full, Y_train, Y_test = train_test_split(X_knn, Y, test_size=0.2, random_state=42)

# Exactly one list from GROUP1 ...
for group1_columns in GROUP1:

    # ... combined with every non-empty subset of the lists in GROUP2.
    for r in range(1, len(GROUP2) + 1):
        for group2_combination in itertools.combinations(GROUP2, r=r):

            selected_features = list(group1_columns) + \
                                [cols for sublist in group2_combination for cols in sublist]

            X_train = X_train_full[selected_features]
            X_test = X_test_full[selected_features]

            # verbose=0: this loop fits about a thousand models, and CatBoost would
            # otherwise print a line per boosting iteration for each of them.
            model = CatBoostRegressor(verbose=0)
            model.fit(X_train, Y_train)
            Y_test_pred = model.predict(X_test)

            r2 = r2_score(Y_test, Y_test_pred)

            results.append({
                'Selected_features': selected_features,
                'R-squared (R2)': r2
            })

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(results)

# NOTE(review): the winner is picked by its R2 on the test split itself, so the score
# printed below is optimistic — consider selecting on a validation split or with CV.
best_result = results_df.sort_values(by='R-squared (R2)', ascending=False).iloc[0]

# Extract the best features used
best_features = best_result['Selected_features']

# Feature table restricted to the winning columns; the target is unchanged.
best_X = X_knn[best_features]
best_Y = Y

# Print the best R² score and the corresponding features
print(f"Best R² score: {best_result['R-squared (R2)']}")
print("Features used:")
print(best_features)

# Display the new DataFrame
best_X.head()


In [None]:
# Columns of the winning feature set.
best_X.columns

In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the features, then fit a default CatBoost model (verbose=0 silences its
# per-iteration training log).
scaler = StandardScaler()
model_pipeline = Pipeline([
    ('scaler', scaler),
    ('model', CatBoostRegressor(verbose=0)),
])

# 80/20 split on the selected feature set.
X_train, X_test, Y_train, Y_test = train_test_split(best_X, Y, test_size=0.2, random_state=42)

model_pipeline.fit(X_train, Y_train)

# Predictions for both partitions; only the test ones are scored here.
Y_train_pred = model_pipeline.predict(X_train)
Y_test_pred = model_pipeline.predict(X_test)

best_r2 = r2_score(Y_test, Y_test_pred)
print(f"Best R² score on new model: {best_r2}")
#Best R² score on new model: 0.8374072940008164

In [None]:
import optuna
from sklearn.metrics import mean_squared_error, r2_score
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

# Define the test split (20%)
test_split = 0.2
print(f"\nRunning for test split: {int(test_split * 100)}%")

# Split the data for the current test split
X_train, X_test, Y_train, Y_test = train_test_split(best_X, Y, test_size=test_split, random_state=42)

def objective(trial):
    """Train CatBoost with the trial's hyperparameters and return the negated R2.

    The model is fitted on X_train/Y_train and scored on X_test/Y_test. The value
    is negated because the study below is created with direction='minimize'.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform, and
    # plain suggest_float replaces the deprecated suggest_uniform; ranges are unchanged.
    params = {
        'iterations': trial.suggest_int('iterations', 600, 1100),
        'depth': trial.suggest_int('depth', 8, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.04, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 1e-3, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 2),
        'border_count': trial.suggest_int('border_count', 32, 400)
    }

    cat_model = CatBoostRegressor(**params, random_state=42, verbose=0)
    cat_model.fit(X_train, Y_train)

    # NOTE(review): trials are scored on the same split that is used for the final
    # evaluation below, so the reported R2 is optimistic — consider a validation
    # split or cross-validation on the training data.
    y_pred = cat_model.predict(X_test)
    r2 = r2_score(Y_test, y_pred)

    return -r2

# Seeded sampler so repeated runs explore the same sequence of trials.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Best hyperparameters; the sign flip undoes the negation done in the objective.
best_params = study.best_params
best_r2 = -study.best_value

# Train the final model with the best parameters
final_model = CatBoostRegressor(**best_params, random_state=42, verbose=0)
final_model.fit(X_train, Y_train)

# Predict and evaluate the final model
final_pred = final_model.predict(X_test)
final_mse = mean_squared_error(Y_test, final_pred)
final_r2 = r2_score(Y_test, final_pred)

# Print the results
print("\nResults for 20% Test Split:")
print(f"Best Parameters: {best_params}")
print(f"Best R2 Score: {best_r2}")
print(f"Final MSE: {final_mse}")
print(f"Final R2 Score: {final_r2}")


In [None]:
# Pair every column of best_X with the importance reported by the tuned model,
# largest first.
feature_importance = final_model.get_feature_importance()
feature_names = best_X.columns
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Horizontal bars; the y-axis is inverted so the most important feature is on top.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance for Final CatBoost Model')
ax.invert_yaxis()
ax.grid(False)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Predictions of the tuned model on both partitions.
train_pred = final_model.predict(X_train)
final_pred = final_model.predict(X_test)

# End points of the diagonal: smallest and largest actual value over both partitions.
axis_low = min(Y_train.min(), Y_test.min())
axis_high = max(Y_train.max(), Y_test.max())

plt.figure(figsize=(8, 8))
plt.scatter(Y_train, train_pred, color='blue', edgecolors='k', alpha=0.6, label='Training Data')
plt.scatter(Y_test, final_pred, color='red', edgecolors='k', alpha=0.6, label='Test Data')

# Points on this diagonal are predicted exactly.
plt.plot([axis_low, axis_high], [axis_low, axis_high],
         color='red', linestyle='--', linewidth=2, label='Perfect Prediction Line')

plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values - CatBoost Model')
plt.legend()

# Caption in the top-left corner of the axes.
plt.text(0.05, 0.95, 'Performance after feature selection and fine-tuning',
         horizontalalignment='left', verticalalignment='top',
         transform=plt.gca().transAxes, fontsize=12, bbox=dict(facecolor='white', alpha=0.8))
plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))

# The figure is shown without a grid.
plt.grid(False)
plt.show()


In [None]:
# Copy of the identifying/target columns taken from the filtered source frame.
# NOTE(review): df keeps its own row labels while X_knn/Y were rebuilt with a fresh
# 0..n-1 index, so labels in Y_test presumably do not line up with df_output's index —
# verify before joining predictions back onto this frame.
output_columns = ['Ionic conductivity', 'Relative density', 'source', 'MaterialFormula']
df_output = df[output_columns].copy()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Relative error of every test prediction.
# NOTE(review): a zero in Y_test makes this ratio inf/nan — confirm the target is never zero.
relative_differences = np.abs(final_pred - Y_test.values) / np.abs(Y_test.values)

# Show at most ten bars; fewer when the test set is smaller, so the bar positions
# always match the number of values plotted.
n_best = min(10, len(relative_differences))
best_indices = np.argsort(relative_differences)[:n_best]

# Predictions, actual values and errors of the selected samples.
best_preds = final_pred[best_indices]
best_actuals = Y_test.values[best_indices]
best_diff = relative_differences[best_indices]

bar_positions = np.arange(n_best)

plt.figure(figsize=(10, 6))

# Side-by-side bars: prediction on the left, actual value on the right.
plt.bar(bar_positions - 0.2, best_preds, width=0.4, color='blue', alpha=0.7, label='Prediction')
plt.bar(bar_positions + 0.2, best_actuals, width=0.4, color='orange', alpha=0.7, label='Actual')

# Tick labels are positions within the test split (argsort output), not df row labels.
plt.xticks(bar_positions, best_indices, rotation=45)
plt.xlabel('Index')
plt.ylabel('Ionic Conductivity')
plt.title(f'Top {n_best} Predictions vs Actual Values with Smallest Relative Differences')
plt.legend()

# Write the percentage error just above each prediction bar.
for i, (pred, diff) in enumerate(zip(best_preds, best_diff)):
    plt.text(i - 0.2, pred + 0.02 * max(best_preds), f'{diff:.2%}', ha='center', va='bottom', fontsize=10, color='black')

plt.tight_layout()
plt.grid(False)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Training-set predictions of the tuned model; final_pred (test predictions) is reused
# from the earlier cell.
train_pred = final_model.predict(X_train)

# End points of the diagonal: smallest and largest actual value over both partitions.
diag_low = min(Y_train.min(), Y_test.min())
diag_high = max(Y_train.max(), Y_test.max())

plt.figure(figsize=(10, 6))
plt.scatter(Y_train, train_pred, color='blue', edgecolors='k', alpha=0.6, label='Training Data')
plt.scatter(Y_test, final_pred, color='red', edgecolors='k', alpha=0.6, label='Test Data')

# Points on this diagonal are predicted exactly.
plt.plot([diag_low, diag_high], [diag_low, diag_high],
         color='red', linestyle='--', linewidth=2, label='Perfect Prediction Line')
plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))

plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Values - CatBoost Model')
plt.legend()

# Unlike the earlier version of this figure, the grid is left on.
plt.grid(True)


## Second model

In [None]:
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# plots

In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Baseline: standardise, then fit a default CatBoost model (verbose=0 silences its
# training log).
scaler = StandardScaler()
model_pipeline = Pipeline([
    ('scaler', scaler),
    ('model', CatBoostRegressor(verbose=0)),
])

# 80/20 split on the full imputed feature table X_knn (not the selected subset).
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X_knn, Y, test_size=0.2, random_state=42)

model_pipeline.fit(X_train1, Y_train1)

# Predictions for both partitions, used by the scatter plot in the next cell.
Y_train_pred1 = model_pipeline.predict(X_train1)
Y_test_pred1 = model_pipeline.predict(X_test1)

# Test-set R2 of this baseline.
best_r2 = r2_score(Y_test1, Y_test_pred1)
print(f"Best R² score on new model: {best_r2}")


In [None]:
# Range of the target over the whole data set, used for the diagonal.
target_low, target_high = min(Y), max(Y)

plt.figure(figsize=(8, 8))

# Train points in blue, test points in red.
plt.scatter(Y_train1, Y_train_pred1, color='blue', alpha=0.5, label='Train Predictions (CatBoostRegressor)')
plt.scatter(Y_test1, Y_test_pred1, color='red', alpha=0.5, label='Test Predictions (CatBoostRegressor)')

# Dashed diagonal: predicted equals actual.
plt.plot([target_low, target_high], [target_low, target_high], 'k--', lw=2)

plt.title('Actual vs. Predicted Values for CatBoostRegressor')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')

# Scientific notation on the x-axis, no grid.
plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
plt.grid(False)

plt.legend()
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Baseline: standardise, then fit a LightGBM regressor created with verbose=0.
scaler = StandardScaler()
model_pipeline = Pipeline([
    ('scaler', scaler),
    ('model', LGBMRegressor(verbose=0)),
])

# 80/20 split on the full imputed feature table X_knn (not the selected subset).
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X_knn, Y, test_size=0.2, random_state=42)

model_pipeline.fit(X_train2, Y_train2)

# Predictions for both partitions, used by the scatter plot in the next cell.
Y_train_pred2 = model_pipeline.predict(X_train2)
Y_test_pred2 = model_pipeline.predict(X_test2)

# Test-set R2 of this baseline.
best_r2 = r2_score(Y_test2, Y_test_pred2)
print(f"Best R² score on new model: {best_r2}")


In [None]:
# Range of the target over the whole data set, used for the diagonal.
target_min, target_max = min(Y), max(Y)

plt.figure(figsize=(8, 8))

# Train points in blue, test points in red.
plt.scatter(Y_train2, Y_train_pred2, color='blue', alpha=0.5, label='Train Predictions (LGBMRegressor)')
plt.scatter(Y_test2, Y_test_pred2, color='red', alpha=0.5, label='Test Predictions (LGBMRegressor)')

# Dashed diagonal: predicted equals actual.
plt.plot([target_min, target_max], [target_min, target_max], 'k--', lw=2)

plt.title('Actual vs. Predicted Values for LGBMRegressor')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')

# Scientific notation on the x-axis, no grid.
plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
plt.grid(False)

plt.legend()
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise, then fit a LightGBM regressor created with verbose=0.
# NOTE(review): this cell repeats the earlier LightGBM baseline and overwrites
# X_train1 / X_test1 / Y_train1 / Y_test1 and Y_train_pred / Y_test_pred — confirm it
# is still needed.
scaler = StandardScaler()

model_pipeline = Pipeline(steps=[
    ('scaler', scaler),
    ('model', LGBMRegressor(verbose=0))
])

# 80/20 split on the full imputed feature table X_knn.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X_knn, Y, test_size=0.2, random_state=42)

# Train the model
model_pipeline.fit(X_train1, Y_train1)

# Predictions for both partitions.
Y_train_pred = model_pipeline.predict(X_train1)
Y_test_pred = model_pipeline.predict(X_test1)

# Score against this cell's own held-out target (Y_test1) rather than the Y_test
# variable left over from an earlier cell.
best_r2 = r2_score(Y_test1, Y_test_pred)
print(f"Best R² score on new model: {best_r2}")
