## Preparing dataset

In [13]:
import pandas as pd
import glob
import numpy as np

required_columns = ['radius', 'vpv', 'vsv', 'vph', 'vsh', 'rho']

bm_files = glob.glob("Data_module-7/*.bm")  
all_data = []

for file in bm_files:
    try:
       
        with open(file, 'r') as f:
            preview = [next(f) for _ in range(5)]  
            print(f"Preview of {file}:\n" + "".join(preview))
        
        
        df = pd.read_csv(file, sep='\s+', engine='python', comment='#', skip_blank_lines=True)
        
        if not all(col in df.columns for col in required_columns):
            print(f"Skipping {file} due to missing columns: {df.columns}")
            continue
        
        df = df[required_columns]
        
        filtered_df = df[(df['vsv'] == 0) & (df['vsh'] == 0)]
        max_radius = filtered_df['radius'].max() if not filtered_df.empty else None
        
        if max_radius is not None:
            df['radius'] = max_radius
        
        df['file'] = file
        
        all_data.append(df)
    
    except Exception as e:
        print(f"Error processing {file}: {e}")

final_df = pd.concat(all_data, ignore_index=True)
final_df = final_df.drop(columns=['file'])


# Calculating P_v and S_v columns
final_df['P_v'] = np.sqrt(df['vpv']**2 + df['vph']**2)
final_df['S_v'] = np.sqrt(df['vsv']**2 + df['vsh']**2)

final_df.to_csv('final_data.csv', index=False)



Preview of Data_module-7/EH45ThotCrust2.bm:
radius rho vpv vsv qka qmu vph vsh eta
        0.  6620.24  5608.43     0.00  57822.0    143.0  5608.43     0.00  1.00000
    20169.  6620.16  5608.35     0.00  57822.0    143.0  5608.35     0.00  1.00000
    40337.  6619.90  5608.11     0.00  57822.0    143.0  5608.11     0.00  1.00000
    60506.  6619.48  5607.71     0.00  57822.0    143.0  5607.71     0.00  1.00000

Preview of Data_module-7/LFAK.bm:
radius rho vpv vsv qka qmu vsh vph eta
       0.0  6743.10  5566.71     0.00 10000.00     0.00     0.00  5566.71  1.00 
  249356.0  6743.10  5566.71     0.00 10000.00     0.00     0.00  5566.71  1.00 
  498711.9  6705.17  5531.13     0.00 10000.00     0.00     0.00  5531.13  1.00 
  748067.9  6641.57  5471.41     0.00 10000.00     0.00     0.00  5471.41  1.00 

Preview of Data_module-7/EH45TcoldCrust1r.bm:
radius rho vpv vsv qka qmu vph vsh eta
        0.  6776.10  5635.43     0.00  57822.0    143.0  5635.43     0.00  1.00000
    19303.  6776.0

  df = pd.read_csv(file, sep='\s+', engine='python', comment='#', skip_blank_lines=True)


## Predicting Core radius using Random Forest Classifier

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler


file_path = "final_data.csv"  
df = pd.read_csv(file_path)

X = df.drop(columns=['radius'])  
y = df['radius']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error'
)
grid_search.fit(X_train, y_train)

# Best Random Forest model
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Evaluate the model
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_percentage_error = (rf_mae / y_test.mean()) * 100

print("Random Forest Model Performance:")
print(f"Mean Percentage Error: {rf_percentage_error:.2f}%")
print(f"Best Hyperparameters: {grid_search.best_params_}")

cv_scores = cross_val_score(best_rf, X_scaled, y, cv=5, scoring='neg_mean_absolute_error')
mean_cv_error = -cv_scores.mean()
mean_cv_percentage_error = (mean_cv_error / y.mean()) * 100
print(f"Cross-Validated Mean Percentage Error: {mean_cv_percentage_error:.2f}%")

Random Forest Model Performance:
Mean Percentage Error: 1.12%
Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Cross-Validated Mean Percentage Error: 1.44%


## Predicting Core Radius using XGB Regressor

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor


file_path = "final_data.csv"  
df = pd.read_csv(file_path)

X = df.drop(columns=['radius'])  
y = df['radius']  

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0]
}

grid_search = GridSearchCV(
    XGBRegressor(random_state=42, objective='reg:squarederror'),
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error'
)
grid_search.fit(X_train, y_train)

# Best XGBoost model
best_xgb = grid_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# Evaluate the model
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)
xgb_percentage_error = (xgb_mae / y_test.mean()) * 100

print("XGBoost Model Performance:")
print(f"Mean Percentage Error: {xgb_percentage_error:.2f}%")
print(f"Best Hyperparameters: {grid_search.best_params_}")


cv_scores = cross_val_score(best_xgb, X_scaled, y, cv=5, scoring='neg_mean_absolute_error')
mean_cv_error = -cv_scores.mean()
mean_cv_percentage_error = (mean_cv_error / y.mean()) * 100
print(f"Cross-Validated Mean Percentage Error: {mean_cv_percentage_error:.2f}%")


XGBoost Model Performance:
Mean Percentage Error: 1.09%
Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 200, 'subsample': 1.0}
Cross-Validated Mean Percentage Error: 1.32%


As we can see that XGB Regressor performed better, therefore, we choose XGB Regressor for the final predictions.