# b) Finding the best equation

In [1]:
import numpy as np
import pandas as pd

# Load the dataset
dataset = pd.read_csv("hardness_dataset.csv")

# Check the first few rows to understand the structure
print(dataset.head())

     material  Formula CrystalSystem   bulk  shear   young  poisson  Hexp
0       mp-66  Diamond         cubic  435.3  520.5  1116.5     0.07  96.0
1    mp-30148     BC2N  orthorhombic  361.0  422.7   912.1     0.08  76.0
2   mp-629458     BC2N  orthorhombic  361.6  409.0   891.1     0.09  76.0
3  mp-1018649    c-BC5      trigonal  405.8  378.2   865.6     0.14  71.0
4     mp-1639       BN         cubic  408.0  374.5   860.2     0.15  63.0


In [2]:
# Define the equations for predicting hardness
def H1(G):
    return 0.1475 * G

def H2(Y):
    return 0.0607 * Y

def H3(G):
    return 0.1769 * G - 2.899

def H4(Y):
    return 0.0635 * Y

def H5(B, v):
    return (1 - 2 * v) * B / (6 * (1 + v))

def H6(G, B):
    k = G / B
    return 2*(((k**2) *G)**0.585 ) -3

In [3]:
# Calculate predicted hardness for each equation
dataset['H1_pred'] = H1(dataset['shear'])
dataset['H2_pred'] = H2(dataset['young'])
dataset['H3_pred'] = H3(dataset['shear'])
dataset['H4_pred'] = H4(dataset['young'])
dataset['H5_pred'] = H5(dataset['bulk'], dataset['poisson'])
dataset['H6_pred'] = H6(dataset['shear'], dataset['bulk'])

# Calculate the absolute error for each equation
dataset['H1_error'] = abs(dataset['Hexp'] - dataset['H1_pred'])
dataset['H2_error'] = abs(dataset['Hexp'] - dataset['H2_pred'])
dataset['H3_error'] = abs(dataset['Hexp'] - dataset['H3_pred'])
dataset['H4_error'] = abs(dataset['Hexp'] - dataset['H4_pred'])
dataset['H5_error'] = abs(dataset['Hexp'] - dataset['H5_pred'])
dataset['H6_error'] = abs(dataset['Hexp'] - dataset['H6_pred'])

# For each data point, find the equation with the minimum error
dataset['best_equation'] = dataset[['H1_error', 'H2_error', 'H3_error', 'H4_error', 'H5_error', 'H6_error']].idxmin(axis=1)

# Calculate the average error for each equation
average_errors = dataset[['H1_error', 'H2_error', 'H3_error', 'H4_error', 'H5_error', 'H6_error']].mean()

# Print the average error for each equation
print("Average error for each equation:")
print(average_errors)

# Find the best equation on average over the entire dataset
best_equation_overall = average_errors.idxmin()
best_equation_avg_error = average_errors.min()

print(f"Best equation on average: {best_equation_overall} with an average error of {best_equation_avg_error:.4f}")

# Optionally, save the dataset with the best equation for each data point
#dataset.to_csv("hardness_with_best_equation.csv", index=False)

# Optionally, display the first few rows of the dataset with the best equations
#print(dataset[['Hexp', 'H1_pred', 'H2_pred', 'H3_pred', 'H4_pred', 'H5_pred', 'H6_pred', 'best_equation']].head())

Average error for each equation:
H1_error    3.326114
H2_error    3.655455
H3_error    3.525926
H4_error    3.861047
H5_error    4.048435
H6_error    4.081855
dtype: float64
Best equation on average: H1_error with an average error of 3.3261


# c) Classification using ANN

In [4]:
#preprocessing

#working on a copied dataset
df = dataset.copy()

from sklearn.preprocessing import LabelEncoder

def label_encoder(label):
    le = LabelEncoder()
    df[label] = le.fit_transform(df[label])

label_list = ["best_equation"]
for l in label_list:
    label_encoder(l)

# setup xdata, which is input and y data which is target
xInput = df[['bulk', 'shear', 'young', 'poisson']]
yTarget = df["best_equation"]

# split data for training and testing
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(xInput,yTarget, test_size=0.2, random_state=0)

#scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [6]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#setting up the ANN
# first layer is number of features
# last layer is output layer. Nodes=1 for binary classification
model = Sequential()
model.add(Dense(units=x_train.shape[1],activation='sigmoid'))
model.add(Dense(units=6,activation='relu'))
model.add(Dense(units=6,activation='softmax'))

model.compile(loss='mse',optimizer='adam',metrics=["accuracy"])

model.fit(x=x_train, y=y_train,epochs=40,validation_data=(x_test, y_test), batch_size = 10, verbose=1 )

#making predictions
y_pred_proba = model.predict(x_test)
y_pred = np.argmax(y_pred_proba, axis=-1)

from sklearn.metrics import log_loss
print ('logloss',log_loss(y_test,y_pred_proba))

from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
cm = confusion_matrix(y_test, y_pred)
print ("\nConfusion Matrix:\n",cm)
print('\n')
print ("accuracy: {0:5.2f} %".format(accuracy_score(y_test, y_pred)*100))

Epoch 1/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 136ms/step - accuracy: 0.1572 - loss: 9.2166 - val_accuracy: 0.2069 - val_loss: 10.2123
Epoch 2/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1379 - loss: 8.7753 - val_accuracy: 0.2069 - val_loss: 10.2121
Epoch 3/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1751 - loss: 10.0193 - val_accuracy: 0.2414 - val_loss: 10.2120
Epoch 4/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1682 - loss: 9.9911 - val_accuracy: 0.2759 - val_loss: 10.2119
Epoch 5/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1848 - loss: 10.4354 - val_accuracy: 0.2414 - val_loss: 10.2118
Epoch 6/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2237 - loss: 10.0177 - val_accuracy: 0.2414 - val_loss: 10.2118
Epoch 7/40
[1m12/12[0m [32

In [7]:
#hyperparameter tuning
from tensorflow.keras.optimizers import Adam, RMSprop

def build_model(hp):

    model = Sequential()

    # Input + first hidden layer
    model.add(Dense(
        units=hp.Int('units_input', min_value=4, max_value=64, step=4),
        activation='relu',
        input_shape=(x_train.shape[1],)
    ))

    # Tune number of additional hidden layers (1 to 3)
    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(Dense(
            units=hp.Int(f'units_{i}', min_value=4, max_value=64, step=4),
            activation='relu'
        ))
        #uncomment if you want a droput layer
        # if hp.Boolean(f'dropout_{i}'):
        #     model.add(Dropout(rate=hp.Float(f'dropout_rate_{i}', 0.1, 0.5, step=0.1)))

    # Output layer
    model.add(Dense(units=6,activation='softmax'))

    # Optimizer tuning
    optimizer_choice = hp.Choice('optimizer', ['adam', 'rmsprop'])
    if optimizer_choice == 'adam':
        optimizer = Adam(learning_rate=hp.Float('adam_lr', 1e-4, 1e-2, sampling='log'))
    else:
        optimizer = RMSprop(learning_rate=hp.Float('rms_lr', 1e-4, 1e-2, sampling='log'))

    model.compile(optimizer=optimizer, loss='mse', metrics=["accuracy"])
    return model


In [8]:
!pip install -q -U keras-tuner

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
#from kerastuner.tuners import RandomSearch
import keras_tuner as kt

tuner = kt.RandomSearch(build_model,
                        objective='val_accuracy',       # minimize validation MAE
                        max_trials=10,           # number of random configurations to test
                        directory='tuner_directory',
                        project_name='ann_random_search'
)

tuner.search(x_train, y_train, epochs=10, validation_data=(x_test, y_test), batch_size=5, verbose=2)


Trial 10 Complete [00h 00m 05s]
val_accuracy: 0.27586206793785095

Best val_accuracy So Far: 0.3448275923728943
Total elapsed time: 00h 00m 55s


In [13]:
best_hp = tuner.get_best_hyperparameters(1)[0]

print("Best hyperparameters found:")
for k, v in best_hp.values.items():
    print(f"{k}: {v}")

#extract best model
best_model = tuner.get_best_models(1)[0]

#refit
best_model.fit(x=x_train, y=y_train,epochs=40,validation_data=(x_test, y_test), batch_size = 10, verbose=1 )

y_pred_proba = best_model.predict(x_test)
y_pred = np.argmax(y_pred_proba, axis=-1)

print("\nMetrics for Test data after hyper paramter tuning")
print ('logloss',log_loss(y_test,y_pred_proba))
cm = confusion_matrix(y_test, y_pred)
print ("\nConfusion Matrix:\n",cm)
print('\n')
print ("accuracy: {0:5.2f} %".format(accuracy_score(y_test, y_pred)*100))

Best hyperparameters found:
units_input: 36
num_layers: 1
units_0: 16
optimizer: adam
adam_lr: 0.00326110302723218


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  saveable.load_own_variables(weights_store.get(inner_path))


Epoch 1/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 117ms/step - accuracy: 0.1299 - loss: 10.9598 - val_accuracy: 0.2759 - val_loss: 10.2117
Epoch 2/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1248 - loss: 10.5494 - val_accuracy: 0.0690 - val_loss: 10.2117
Epoch 3/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0932 - loss: 10.1897 - val_accuracy: 0.2069 - val_loss: 10.2117
Epoch 4/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.2892 - loss: 10.4462 - val_accuracy: 0.1724 - val_loss: 10.2117
Epoch 5/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1644 - loss: 10.0971 - val_accuracy: 0.3103 - val_loss: 10.2117
Epoch 6/40
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1400 - loss: 10.6173 - val_accuracy: 0.2069 - val_loss: 10.2117
Epoch 7/40
[1m12/12[0m 



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step

Metrics for Test data after hyper paramter tuning
logloss 1.7917594455917456

Confusion Matrix:
 [[5 0 0 0 0 0]
 [2 0 0 0 0 0]
 [3 0 0 0 0 0]
 [7 0 0 0 0 0]
 [6 0 0 0 0 0]
 [6 0 0 0 0 0]]


accuracy: 17.24 %


Make sure to summerize the results and highlight the best model and metrics.