In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
train_data = pd.read_excel('01 Train Data.xlsx')
test_data = pd.read_excel('02 Test Data.xlsx')

In [3]:
train_data = train_data.dropna(axis=1, how='all')
test_data = test_data.dropna(axis=1, how='all')

# Remove duplicate rows 
train_data.drop_duplicates(subset=['First Name', 'Email ID'], keep='first', inplace=True)

In [4]:
train_data = train_data.dropna(subset=['Placement Status'])
train_data['Placement Status'] = train_data['Placement Status'].map({'Placed': 1, 'Not placed': 0})

selected_features = ['CGPA', 'Speaking Skills', 'ML Knowledge']

# Perform one-hot encoding for the 'College Name' column
train_data = pd.get_dummies(train_data, columns=['College Name'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['College Name'], drop_first=True)

train_data.dropna(subset=selected_features, inplace=True)

X_train = train_data[selected_features]
y_train = train_data['Placement Status']


## Random Forest (RF)

In [5]:
#Random Forest Classifier 
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Feature selection and preprocessing for the test data
X_test = test_data[selected_features]

# Make predictions on the test data
y_pred = model.predict(X_test)

result_df = test_data[['First Name', 'Email ID']]
result_df['Placement Prediction'] = y_pred

result_df.to_excel('Test_Data_with_Predictions_rf.xlsx', index=False)

# Display the accuracy of the model on the training data
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.2f}')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['Placement Prediction'] = y_pred


Training Accuracy: 0.75


 ## Logistic Regression Model

In [6]:
from sklearn.linear_model import LogisticRegression
# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

X_test = test_data[selected_features]
y_pred = model.predict(X_test)

y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.2f}')


Training Accuracy: 0.67


## K-Nearest Neighbors (KNN)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Train a k-NN classifier
k = 3  
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)

y_pred = knn_classifier.predict(X_test)

y_train_pred = knn_classifier.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.2f}')

Training Accuracy: 0.70


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## Gradient Boosting Model

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
# Create a Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.2f}')

Training Accuracy: 0.71


## XGBoost Model

In [9]:
import xgboost as xgb

# Create an XGBoost Classifier
model = xgb.XGBClassifier(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.2f}')

Training Accuracy: 0.74


## Neural Network Model

In [10]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Standardize the features if necessary
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Build and compile the neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using the training and validation data
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Make predictions on the test data
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Display the accuracy of the model
y_train_pred = (model.predict(X_train) > 0.5).astype(int)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.2f}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training Accuracy: 0.66


## Comparing models to find Best Model

In [11]:
from sklearn.metrics import accuracy_score

# Define a dictionary of models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=3),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'Neural Network': model
}

# Calculate and print training accuracies
model_accuracies = {}
for model_name, model in models.items():
    if model_name == 'Neural Network':
        y_train_pred = (model.predict(X_train) > 0.5).astype(int)
    else:
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    model_accuracies[model_name] = train_accuracy
    print(f'{model_name} Training Accuracy: {train_accuracy:.2f}')

# Find the model with the highest accuracy
best_model_name = max(model_accuracies, key=model_accuracies.get)
best_model_accuracy = model_accuracies[best_model_name]

# Print the selected model
print(f'\nSelected Model: {best_model_name} (Training Accuracy: {best_model_accuracy:.2f})')


Random Forest Training Accuracy: 0.75
Logistic Regression Training Accuracy: 0.66
K-Nearest Neighbors Training Accuracy: 0.69
Gradient Boosting Training Accuracy: 0.71


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


XGBoost Training Accuracy: 0.74
Neural Network Training Accuracy: 0.66

Selected Model: Random Forest (Training Accuracy: 0.75)


In [12]:
from sklearn.metrics import classification_report

# Define a dictionary of models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=3),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'Neural Network': model
}

# Loop through models and evaluate each one
for model_name, model in models.items():
    if model_name == 'Neural Network':
        y_train_pred = (model.predict(X_train) > 0.5).astype(int)
    else:
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
    
    # Calculate classification report
    report = classification_report(y_train, y_train_pred)
    
    # Print the classification report
    print(f'{model_name} Classification Report:\n{report}\n')


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.92      0.83       415
           1       0.72      0.42      0.53       213

    accuracy                           0.75       628
   macro avg       0.74      0.67      0.68       628
weighted avg       0.74      0.75      0.73       628


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.66      1.00      0.80       415
           1       0.00      0.00      0.00       213

    accuracy                           0.66       628
   macro avg       0.33      0.50      0.40       628
weighted avg       0.44      0.66      0.53       628


K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.82      0.78       415
           1       0.55      0.43      0.49       213

    accuracy                           0.69       628
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.98      0.82       415
           1       0.82      0.19      0.31       213

    accuracy                           0.71       628
   macro avg       0.76      0.59      0.56       628
weighted avg       0.74      0.71      0.65       628


XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.89      0.82       415
           1       0.68      0.43      0.53       213

    accuracy                           0.74       628
   macro avg       0.72      0.66      0.67       628
weighted avg       0.73      0.74      0.72       628


Neural Network Classification Report:
              precision    recall  f1-score   support

           0       0.66      1.00      0.80       415
           1       0.00      0.00      0.00       213

    accuracy                           0.66       628
   macro avg    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
