In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
# Load the training CSV into a pandas DataFrame.
# The CSV for a different cluster size can be loaded here instead.
df = pd.read_csv('/kaggle/input/new-compressor-data/combined_file_ap_500.csv')

# The weights obtained from the 500-cluster-size file are later used to test
# the model on other datasets.

# First few rows of the DataFrame
print(df.head())

# Basic data exploration.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())



In [None]:
# Report the number of missing values per column before removing them
print(df.isnull().sum())

# Drop every row that contains at least one missing value
df = df.dropna()

# Separate the features from the target variable 'Multiplier'.
# 'Area' and 'Power' are excluded from the feature matrix as well.
X = df.drop(columns=['Multiplier', 'Area', 'Power']).values
y = df['Multiplier'].values

print("Unique multipliers:", df['Multiplier'].value_counts())

# Encode the categorical target 'Multiplier' as integer class labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split into training and testing sets BEFORE scaling, so that the scaler's
# mean/variance are estimated from the training rows only. Fitting the scaler
# on the full dataset would leak test-set statistics into the training features.
# The same random_state yields the same row partition as splitting scaled data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Fit the scaler on the training features, then apply it to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training statistics; kept so that any
# code referring to X_normalized keeps working.
X_normalized = scaler.transform(X)



In [None]:
# Width of the softmax output layer: one unit per distinct encoded class
num_classes = len(set(y_encoded))

# Fully connected classifier, assembled layer by layer
model = Sequential()

# First layer: 128 ReLU units, sized to the number of feature columns
n_features = X_train.shape[1]
model.add(Dense(128, input_shape=(n_features,), activation='relu'))

# Hidden stack: GELU layers of decreasing width
for hidden_units in (128, 64, 64, 32):
    model.add(Dense(hidden_units, activation='gelu'))

# Output layer: class probabilities
model.add(Dense(num_classes, activation='softmax'))

# Integer class labels are used directly, hence the sparse cross-entropy loss
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once validation accuracy has not improved for 70 epochs and roll the
# model back to the best weights seen
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=70,
    restore_best_weights=True,
    verbose=1,
)

# Train with 20% of the training data held out for validation
history = model.fit(
    X_train,
    y_train,
    epochs=1500,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    callbacks=[early_stopping],
)

# Persist the trained weights so they can be reloaded for later evaluations
model.save_weights('model.weights.h5')
print("Model weights saved successfully.")

# Custom evaluation function
def custom_evaluate(model, X_test, y_test, label_encoder, ssim_diff=0.01, cost_diff=1):
    """Compute a tolerance-based ("custom") accuracy for a classifier.

    The prediction for sample ``i`` counts as correct when at least one sample
    ``j`` of the same set (``j == i`` included) satisfies all of:

    * ``abs(X_test[j][0] - X_test[i][0]) <= ssim_diff``
    * ``abs(X_test[j][1] - X_test[i][1]) <= cost_diff``
    * the true label of ``j`` equals the predicted label of ``i``

    The tolerances are applied to the feature values exactly as passed in.
    NOTE(review): the callers in this notebook pass standardized features, so
    the tolerances are in scaled units, not raw units - confirm this is intended.
    Judging by the tolerance names, columns 0 and 1 are presumably an SSIM and
    a cost feature - verify against the CSV column order.

    Args:
        model: Object whose ``predict(X_test)`` returns an array of shape
            (n_samples, n_classes) of class scores.
        X_test: 2-D feature array with at least two columns.
        y_test: 1-D array of integer-encoded true labels.
        label_encoder: Unused; kept so existing call sites keep working.
        ssim_diff: Maximum absolute difference allowed in feature column 0.
        cost_diff: Maximum absolute difference allowed in feature column 1.

    Returns:
        float: Fraction of predictions counted as correct, or 0.0 when there
        are no samples.
    """
    X_test = np.asarray(X_test)
    y_test = np.asarray(y_test)

    # Predicted class = index of the highest score for each sample
    y_pred_probs = model.predict(X_test)
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Guard against dividing by zero on an empty evaluation set
    if len(y_pred) == 0:
        accuracy = 0.0
        print("Final Test Accuracy:", accuracy)
        return accuracy

    first_feature = X_test[:, 0]
    second_feature = X_test[:, 1]

    correct = 0
    for i in range(len(y_test)):
        # All samples whose first two features lie within tolerance of sample i
        neighbours = (
            (np.abs(first_feature - first_feature[i]) <= ssim_diff)
            & (np.abs(second_feature - second_feature[i]) <= cost_diff)
        )
        # Correct if any such neighbour truly has the label predicted for i
        if np.any(y_test[neighbours] == y_pred[i]):
            correct += 1

    accuracy = correct / len(y_pred)

    print("Final Test Accuracy:", accuracy)
    return accuracy


# Standard (exact-match) loss and accuracy on the held-out test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
# f-string so the measured accuracy is interpolated; without the f prefix the
# placeholder text "{accuracy:.4f}" would be printed literally.
print(f"Exact Test Accuracy: {accuracy:.4f}")


# Tolerance-based accuracy on the same test set
custom_accuracy = custom_evaluate(model, X_test, y_test, label_encoder)

# Plot the training and validation curves recorded during fitting: one figure
# for accuracy, one for loss.
for metric, title, ylabel in (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()



In [None]:


# Cluster sizes of the additional datasets the trained model is evaluated on
sizes = [2000, 3000, 4000]

# Class order the network was trained with: output unit k corresponds to
# train_classes[k]. The shared label_encoder is deliberately NOT re-fitted:
# re-fitting on a label set containing new values re-sorts the classes and
# shifts the indices, so encoded labels would no longer match the network's
# output units.
train_classes = label_encoder.classes_.copy()

# Evaluate the model on the dataset of each cluster size
for size in sizes:
    # Load the dataset corresponding to the current cluster size
    csv_file = f'/kaggle/input/mnist-data/combined_file_ap_{size}.csv'
    new_df = pd.read_csv(csv_file)

    # Same preprocessing as for the training dataset: drop incomplete rows
    new_df = new_df.dropna()

    # Separate the features from the target variable 'Multiplier'
    X_new = new_df.drop(columns=['Multiplier', 'Area', 'Power']).values
    y_new = new_df['Multiplier'].values

    # Labels that never occurred in training are appended AFTER the training
    # classes. They receive indices >= len(train_classes), which the model can
    # never predict, so such rows count as misses while the indices of the
    # training classes stay unchanged.
    unseen_labels = np.setdiff1d(y_new, train_classes)
    if len(unseen_labels) > 0:
        print(f"Warning: {len(unseen_labels)} label(s) in size {size} were not seen during training: {list(unseen_labels)}")
    eval_classes = np.concatenate((train_classes, unseen_labels))

    # Encode the new labels using the training-compatible class order
    class_to_index = {label: index for index, label in enumerate(eval_classes)}
    y_new_encoded = np.array([class_to_index[label] for label in y_new], dtype=int)

    # Encoder object for custom_evaluate that can decode every index used
    # above; its classes_ are assigned directly to keep the training order.
    eval_encoder = LabelEncoder()
    eval_encoder.classes_ = eval_classes

    # Scale the features with the scaler fitted on the training data
    X_new_normalized = scaler.transform(X_new)

    # Load the saved model weights
    model.load_weights('model.weights.h5')
    print(f"Model weights loaded successfully for size {size}.")

    # Tolerance-based accuracy on the new data
    custom_accuracy_new = custom_evaluate(model, X_new_normalized, y_new_encoded, eval_encoder)
    print(f"Custom accuracy on the new test dataset (size {size}): {custom_accuracy_new}")

    # Class scores for the new data
    y_new_pred_encoded = model.predict(X_new_normalized)

    # Predicted class = index of the highest score
    y_new_pred = np.argmax(y_new_pred_encoded, axis=1)

    # Standard exact-match accuracy
    normal_accuracy_new = accuracy_score(y_new_encoded, y_new_pred)
    print(f"Normal accuracy on the new dataset (size {size}): {normal_accuracy_new}")


