# Regression

In this notebook, we will predict the bandgap of materials. The dataset that we will use is built in the `dataset_preparation.ipynb` file. We will test many possible algorithms and to assess which one gives the smallest mean squared error. The workflow is essentially the same for all algorithms: we perform a train test split; then perform a grid search evaluated against a 5-fold split of the training set as our validation set to find the best set of hyperparameters; finally, we evaluate the error on the test data.

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import multiprocessing
import xgboost as xgb #For parallel gradient boosting

In [2]:
#Dataset loading
df = pd.read_csv('gap_prediction.csv')

#Turning space group into a categorical variable
df["Space Group"] = df["Space Group"].astype('category')

#Building a dict that maps the space groups in unique integers
mapping_dict = dict(zip(df['Space Group'], df['Space Group'].cat.codes))

#Transforms the categorical space group to numbers
df['Space Group'] = df['Space Group'].map(mapping_dict)

#Target
y = df['gap']
df.drop(['gap','Material','Unnamed: 0'], axis='columns', inplace=True)
X = df.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# Models

## Linear Regression (ElasticNet)

In [135]:
# Parameter Tuning with Cross-Validation
# Define the hyperparameters to tune and their possible values
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1],  # Regularization parameter
    'l1_ratio': [.1, .5, .7, .9, .95, .99, 1]  # Mixing parameter (0: L2, 1: L1, [0,1]: ElasticNet)
}

# Create an Elastic Net regressor
en_regressor = ElasticNet(max_iter=10000)

# Use GridSearchCV to find the best combination of hyperparameters
grid_search = GridSearchCV(en_regressor, param_grid, cv=5, scoring='neg_mean_squared_error',n_jobs=-1)
scaler = StandardScaler().fit(X_train)
grid_search.fit(scaler.transform(X_train), y_train)
print(grid_search.best_params_)
best_params = grid_search.best_params_

{'alpha': 0.01, 'l1_ratio': 0.1}


In [136]:
# Train the Elastic Net regressor with the best hyperparameters
best_en_regressor = ElasticNet(**best_params,max_iter=10000)
best_en_regressor.fit(scaler.transform(X_train), y_train)

# Evaluate the model on the test set
y_pred = best_en_regressor.predict(scaler.transform(X_test))
mse_en = mean_squared_error(y_test, y_pred)
print("Test Accuracy:", mse_en)

# Perform Cross-Validation with the best hyperparameters
cv_scores_en = cross_val_score(best_en_regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error',n_jobs=-1)
print("Cross-Validation Error:", -cv_scores_en)
print("Mean CV Error:", -np.mean(cv_scores_en))

Test Accuracy: 0.839231301570435
Cross-Validation Error: -0.839231301570435
Mean CV Error: -0.839231301570435


## Decision Tree

In [137]:
# Parameter Tuning with Cross-Validation
# Define the hyperparameters to tune and their possible values
param_grid = {
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum samples required to be at a leaf node
}

# Create a Decision Tree regressor
dt_regressor = DecisionTreeRegressor()

# Use GridSearchCV to find the best combination of hyperparameters
grid_search = GridSearchCV(dt_regressor, param_grid, cv=5, scoring='neg_mean_squared_error',n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
best_params = grid_search.best_params_

{'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}


In [138]:
# Train the Decision Tree regressor with the best hyperparameters
best_dt_regressor = DecisionTreeRegressor(**best_params)
best_dt_regressor.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = best_dt_regressor.predict(X_test)
mse_dt = mean_squared_error(y_test, y_pred)
print("Test Accuracy:", mse_dt)

# Perform Cross-Validation with the best hyperparameters
cv_scores_dt = cross_val_score(best_dt_regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error',n_jobs=-1)
print("Cross-Validation Error:", -cv_scores_dt)
print("Mean CV Error:", -np.mean(cv_scores_dt))

Test Accuracy: 0.7028940925914161
Cross-Validation Error: [0.54439814 0.69227875 0.53731888 0.65323159 0.73357715]
Mean CV Error: 0.6321609005151523


## Random Forest

In [139]:
# Parameter Tuning with Cross-Validation
# Define the hyperparameters to tune and their possible values
param_grid = {
    #'n_estimators': [100, 200, 300],      # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],     # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],    # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]       # Minimum samples required to be at a leaf node
}

# Create a Random Forest regressor
rf_regressor = RandomForestRegressor(n_jobs=-1)

# Use GridSearchCV to find the best combination of hyperparameters
grid_search = GridSearchCV(rf_regressor, param_grid, cv=5, scoring='neg_mean_squared_error',n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
best_params = grid_search.best_params_

{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [140]:
# Train the Random Forest regressor with the best hyperparameters
best_rf_regressor = RandomForestRegressor(n_jobs=-1, **best_params)
best_rf_regressor.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = best_rf_regressor.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred)
print("Test Error:", mse_rf)

# Perform Cross-Validation with the best hyperparameters
cv_scores_rf = cross_val_score(best_rf_regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error',n_jobs=-1)
print("Cross-Validation Error:", -cv_scores_rf)
print("Mean CV Error:", -np.mean(cv_scores_rf))

Test Error: 0.4365270436868386
Cross-Validation Error: [0.31522441 0.46697272 0.3815803  0.52346953 0.46723009]
Mean CV Error: 0.4308954106278805


## Gradient Boosting

In [141]:
# Parameter Tuning with Cross-Validation
# Define the hyperparameters to tune and their possible values
param_grid = {
    'n_estimators': [50, 100, 200],      # Number of boosting stages to be used
    'learning_rate': [0.1, 0.2, 0.3, 0.4],  # Step size shrinks the contribution of each tree
    'max_depth': [5, 6, 7, 8]              # Maximum depth of each tree
}

# Create a Gradient Boosting Regressor
xgb_model = xgb.XGBRegressor(
    n_jobs=multiprocessing.cpu_count() // 2, tree_method="hist"
)


grid_search = GridSearchCV(xgb_model,param_grid,cv=5,scoring='neg_mean_squared_error',n_jobs=2)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
best_params = grid_search.best_params_

  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sp

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}


In [142]:
# Train the Gradient Boosting regressor with the best hyperparameters
best_gb_regressor = xgb.XGBRegressor(
    n_jobs=multiprocessing.cpu_count() // 2, tree_method="hist", **best_params)
best_gb_regressor.fit(X_train, y_train,verbose=3)

# Evaluate the model on the test set
y_pred = best_gb_regressor.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred)
print("Test Accuracy:", mse_gb)

# Perform Cross-Validation with the best hyperparameters
cv_scores_gb = cross_val_score(best_gb_regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print("Cross-Validation Error:", -cv_scores_gb)
print("Mean CV Error:", -np.mean(cv_scores_gb))

Test Accuracy: 0.3517656511213685


  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


Cross-Validation Error: [0.24505851 0.36793458 0.27596039 0.39405086 0.32384137]
Mean CV Error: 0.32136914110180126


# Summary

In [143]:
df = pd.DataFrame(columns=['Algorithm', 'Test MSE', 'Mean CV MSE'])
df.loc[len(df)] = ['Linear Regression (ElasticNet)', mse_en, -np.mean(cv_scores_en)]
df.loc[len(df)] = ['Decision Tree', mse_dt, -np.mean(cv_scores_dt)]
df.loc[len(df)] = ['Random Forrest', mse_rf, -np.mean(cv_scores_rf)]
df.loc[len(df)] = ['Gradient Boosting', mse_gb, -np.mean(cv_scores_gb)]
df.sort_values(by='Mean CV MSE')

Unnamed: 0,Algorithm,Test MSE,Mean CV MSE
3,Gradient Boosting,0.351766,0.321369
2,Random Forrest,0.436527,0.430895
1,Decision Tree,0.702894,0.632161
0,Linear Regression (ElasticNet),0.839231,0.728426


# Prediction of novel Materials

In [144]:
random_df = pd.read_csv('gap_prediction_random.csv')

random_df["Space Group"] = random_df["Space Group"].astype('category')
random_df['Space Group'] = random_df['Space Group'].map(mapping_dict)

random_df.drop(['Material','Unnamed: 0'], axis='columns', inplace=True)
X_random = random_df.to_numpy()

In [145]:
random_df

Unnamed: 0,Space Group,Z_mean,Electronegativity_mean,IonizationPotential_mean,ElectronAffinity_mean,HOMO_mean,LUMO_mean,r_s_orbital_mean,r_p_orbital_mean,r_d_orbital_mean,...,r_p_orbital_wstd,r_d_orbital_wstd,r_atomic_nonbonded_wstd,r_valence_lastorbital_wstd,r_covalent_wstd,Valence_wstd,PeriodicColumn_wstd,PeriodicColumn_upto18_wstd,NumberUnfilledOrbitals_wstd,Polarizability_wstd
0,49,49.000000,2.006667,-8.735433,-0.850100,-5.112633,0.641800,1.369533,1.569300,0.632500,...,0.154737,0.027169,0.031489,0.222409,0.023514,74.755102,5.176871,40.278912,8.795918,2975.246600
1,22,32.200000,1.952000,-8.739540,-1.546640,-5.187260,2.533320,1.312900,1.437760,1.109640,...,0.502055,0.300624,0.201184,0.487616,0.265811,30.530178,4.376331,48.115976,7.178698,21427.942230
2,53,45.800000,2.050000,-9.481840,-1.348640,-5.268600,2.436940,1.092980,1.254620,0.985500,...,0.148839,0.131669,0.039076,0.017869,0.094091,34.206612,1.056198,32.692562,7.879339,1397.835646
3,9,37.000000,2.170000,-9.311760,-1.353260,-5.128540,2.605620,1.201700,1.345560,1.168560,...,0.492235,0.282555,0.215189,0.397098,0.263287,49.306944,3.473611,40.306944,2.973611,13678.420200
4,4,51.000000,1.983333,-9.915933,-1.112683,-5.659450,2.001400,0.947167,1.117283,0.412250,...,0.323487,0.080155,0.385099,0.076545,0.204352,107.221453,4.889273,39.918108,7.129181,1049.137070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,23,58.400000,2.062000,-8.896920,-2.084540,-5.337920,2.088920,0.706300,0.949740,0.599540,...,0.299298,0.241416,0.252880,0.113023,0.314051,84.351111,10.200000,31.000000,0.404444,13023.502100
4996,43,42.333333,2.406667,-10.429333,-1.133700,-5.322667,0.189900,1.117633,1.190233,0.859500,...,0.231781,0.066061,0.132147,0.010811,0.131073,55.510204,2.748299,12.136054,1.115646,670.306667
4997,22,48.200000,2.258000,-10.796280,-1.654140,-6.576260,2.628560,0.973760,1.216480,0.636740,...,0.109686,0.202766,0.312757,0.023854,0.033006,72.023140,3.824793,14.965289,3.024793,275.185530
4998,9,48.333333,2.453333,-10.190533,-2.347633,-5.936633,3.235000,1.037167,1.174733,0.841433,...,0.108092,0.135153,0.294300,0.061724,0.039123,113.666667,0.666667,16.395833,1.583333,444.081540


In [146]:
y_pred = best_gb_regressor.predict(X_random)