In [45]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Load the band-gap dataset, discarding the unnamed column carried in the CSV
# (presumably a saved row index -- TODO confirm).
df = pd.read_csv('gap_prediction.csv').drop(columns='Unnamed: 0')

# Replace the two categorical columns with their integer category codes so the
# tree-based models below receive purely numeric features.
df["Material"] = df["Material"].astype('category').cat.codes
df["Space Group"] = df["Space Group"].astype('category').cat.codes

# Binary target: 1 when the gap is exactly zero, 0 otherwise.
gaps = df['gap']
ismetal = gaps.eq(0).astype(int).tolist()

# Remove the raw gap from the feature table; the target is derived from it.
df = df.drop(columns='gap')

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    df, ismetal, test_size=0.2, random_state=42
)

# Random Forest

In [None]:
# Step 2: Cross-Validation and Parameter Tuning

# Base estimator; the seed is fixed so repeated runs build the same forest.
rf_classifier = RandomForestClassifier(random_state=42)

# Candidate settings explored by the search. Only the split/leaf size
# parameters vary (3 x 3 = 9 combinations); the others are pinned to one value.
param_grid = {
    'n_estimators': [100],            # trees per forest
    'max_depth': [None],              # no explicit cap on tree depth
    'min_samples_split': [2, 5, 10],  # samples needed before a node may split
    'min_samples_leaf': [1, 2, 4],    # samples that must remain in each leaf
}

# Exhaustive search over the grid, scored by accuracy with 5-fold CV.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [None]:
# Step 3: Fit a Random Forest on the full training split using the winning
# hyperparameter combination reported by the grid search.
best_params = grid_search.best_params_
best_rf_classifier = RandomForestClassifier(random_state=42, **best_params).fit(X_train, y_train)

# Step 4: Score the fitted model on the held-out test split.
y_pred = best_rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# Step 5: 5-fold cross-validation on the training split with the same settings.
cv_scores = cross_val_score(
    estimator=best_rf_classifier,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Gradient Boosting

In [42]:
# Step 2: Cross-Validation and Parameter Tuning

# Base estimator; the seed is fixed so repeated runs are reproducible.
gb_classifier = GradientBoostingClassifier(random_state=42)

# Candidate settings explored by the search: 3 learning rates x 3 depths
# (9 combinations) with the number of boosting stages held at 100.
param_grid = {
    'n_estimators': [100],              # boosting stages
    'learning_rate': [0.01, 0.1, 0.2],  # shrinkage applied to each tree's contribution
    'max_depth': [3, 4, 5],             # depth limit of the individual trees
}

# Exhaustive search over the grid, scored by accuracy with 5-fold CV.
grid_search = GridSearchCV(
    estimator=gb_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [43]:
# Step 3: Fit a Gradient Boosting model on the full training split using the
# winning hyperparameter combination reported by the grid search.
best_params = grid_search.best_params_
best_gb_classifier = GradientBoostingClassifier(random_state=42, **best_params).fit(X_train, y_train)

# Step 4: Score the fitted model on the held-out test split.
y_pred = best_gb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# Step 5: 5-fold cross-validation on the training split with the same settings.
cv_scores = cross_val_score(
    estimator=best_gb_classifier,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Test Accuracy: 0.8653250773993808
Cross-Validation Scores: [0.87814313 0.87015504 0.88372093 0.84689922 0.86046512]
Mean CV Accuracy: 0.8678766887079604


In [46]:
# F1 score of the tuned Gradient Boosting model on the held-out test split.
# Predictions are recomputed from `best_gb_classifier` instead of being read
# from the shared `y_pred` variable, which both the Random Forest and Gradient
# Boosting evaluation cells overwrite -- so the reported value no longer
# depends on which of those cells happened to run last.
f1_score(y_test, best_gb_classifier.predict(X_test))

0.892459826946848