In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [11]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [12]:

# Read the pre-processed dataset from disk; `df` is the frame every later cell works on.
df = pd.read_csv(filepath_or_buffer='pre_processed_data.csv')

In [13]:

# Preprocess the data
# Keep only the rows whose `admit` value is non-zero.
df = df[df['admit'] != 0]
# Features are every remaining column except the ones dropped here;
# the target is the university name.
X = df.drop(columns=['userName', 'major', "univName", "admit", 'greV', 'greQ'])
y = df["univName"]
print("Number of unique universities:", df['univName'].nunique())

# Split the features (X) and target labels (y) into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Treat infinite values as missing *before* computing any mean, so the
# imputation value itself is finite. Non-inplace assignment avoids mutating
# a slice of X (chained-assignment pitfalls).
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Impute missing values with the TRAINING column means. The same statistics
# are applied to the test split so that (a) the test set contains no NaN/inf
# when it reaches the scaler and the model, and (b) no information from the
# test set leaks into preprocessing.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Standardize the features: fit on train only, reuse the fitted scaler on test
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Number of unique universities: 54


In [14]:

# Encode categorical labels
# Fit the encoder on the full label series `y` rather than on `y_train` only:
# the split is not stratified, so a university can end up solely in the test
# split, and LabelEncoder.transform raises ValueError on labels it has not seen.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert labels to one-hot encoding (required by categorical_crossentropy)
num_classes = len(label_encoder.classes_)
y_train_one_hot = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_one_hot = to_categorical(y_test_encoded, num_classes=num_classes)

# Build the ANN model: a funnel of fully connected ReLU layers, each followed
# by dropout, ending in a softmax over the university classes.
# NOTE(review): the 8-unit layer right before a num_classes-wide softmax is a
# tight bottleneck — confirm this is intentional.
model = Sequential([
    Dense(256, input_shape=(X_train_scaled.shape[1],), activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.5),  # Dropout with 50% probability
    Dense(128, activation='relu'),
    Dropout(0.4),  # Dropout with 40% probability
    Dense(64, activation='relu'),
    Dropout(0.3),  # Dropout with 30% probability
    Dense(32, activation='relu'),
    Dropout(0.3),  # Dropout with 30% probability
    Dense(16, activation='relu'),
    Dropout(0.2),  # Dropout with 20% probability
    Dense(8, activation='relu'),
    Dropout(0.2),  # Dropout with 20% probability
    Dense(num_classes, activation='softmax')
])
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop once the validation loss has not improved for a while and roll back to
# the best weights seen; 500 is kept as the upper bound on epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)

# Train the model (20% of the training data is held out for validation)
history = model.fit(
    X_train_scaled,
    y_train_one_hot,
    epochs=500,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test_one_hot)
print(f'Test Accuracy: {test_accuracy}')

# Save the trained model
model.save('ann_model.h5')


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78