In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load dataset
df = pd.read_csv("OriginalDataset.csv")

In [3]:
# Identify numerical columns (excluding 'Career')
numerical_cols = df.columns[1:-1]  # Exclude 'Id' and 'Career'

In [4]:
# Initialize Min-Max Scaler
scaler = MinMaxScaler()

In [5]:

# Apply Min-Max Scaling
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [6]:
# Save the normalized dataset
df.to_csv("normalized_dataset.csv", index=False)

In [7]:
# Display first few rows
print(df.head())

   Id   O_score   C_score   E_score   A_score   N_score  Numerical Aptitude  \
0   1  0.410029  0.867572  0.086822  0.345336  0.256214            0.966463   
1   2  0.901180  0.358234  0.258915  0.527005  0.256214            0.339939   
2   3  0.508850  0.546689  1.000000  0.708674  0.405354            0.509146   
3   4  0.951327  0.886248  0.207752  0.399345  0.319312            0.762195   
4   5  0.557522  0.679117  0.431008  0.963993  0.256214            0.373476   

   Spatial Aptitude  Perceptual Aptitude  Abstract Reasoning  \
0          0.322206             0.595577            0.770932   
1          0.886792             0.770932            0.526066   
2          0.274311             0.543444            0.578199   
3          0.435414             0.543444            1.000000   
4          0.242380             0.508689            0.437599   

   Verbal Reasoning              Career  
0          0.453311          Accountant  
1          0.434635    Graphic Designer  
2          0.8

In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 1: Load dataset from a CSV file
data = pd.read_csv('normalized_dataset.csv')

# Step 2: Split the data into features (X) and target label (y)
X = data.iloc[:, :-1].values  # All rows, all columns except the last
y = data.iloc[:, -1].values   # All rows, last column (career labels)

# Step 3: Normalize the features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Step 4: Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Step 5: Initialize Leave-One-Out Cross-Validation (LOOCV)
loocv = LeaveOneOut()

accuracies = []

# Step 6: Train and test the model using LOOCV
for train_index, test_index in loocv.split(X_normalized):
    X_train, X_test = X_normalized[train_index], X_normalized[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Step 7: Train the KNN model on the training set
    knn.fit(X_train, y_train)
    
    # Step 8: Predict the target value for the test set
    y_pred = knn.predict(X_test)
    
    # Step 9: Calculate accuracy for this fold
    accuracies.append(accuracy_score(y_test, y_pred))

# Step 10: Calculate the average accuracy across all folds
average_accuracy = np.mean(accuracies)

# Output the result
print(f"Average Accuracy: {average_accuracy * 100:.2f}%")


Average Accuracy: 6.67%


In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 1: Load dataset from a CSV file
data = pd.read_csv('normalized_dataset.csv')

# Step 2: Split the data into features (X) and target label (y)
X = data.iloc[:, :-1].values  # All rows, all columns except the last
y = data.iloc[:, -1].values   # All rows, last column (career labels)

# Step 3: Normalize the features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Step 4: Initialize K-Fold cross-validation
kf = KFold(n_splits=5)

# Step 5: Define a parameter grid for tuning KNN
param_grid = {'n_neighbors': [3, 5, 7, 9, 11], 'metric': ['euclidean', 'manhattan']}

# Step 6: Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=kf, scoring='accuracy')
grid_search.fit(X_normalized, y)

# Step 7: Get the best parameters from GridSearchCV
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Step 8: Train the model with the best parameters
best_knn = grid_search.best_estimator_

# Step 9: Evaluate the model using K-Fold cross-validation
accuracies = []
for train_index, test_index in kf.split(X_normalized):
    X_train, X_test = X_normalized[train_index], X_normalized[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    best_knn.fit(X_train, y_train)
    y_pred = best_knn.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))

# Step 10: Calculate average accuracy
average_accuracy = np.mean(accuracies)
print(f"Average Accuracy: {average_accuracy * 100:.2f}%")


Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3}
Average Accuracy: 27.14%


In [67]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Step 1: Load your dataset from a CSV file
data = pd.read_csv('normalized_dataset.csv')

# Step 2: Split the data into features (X) and target label (y)
X = data.iloc[:, :-1].values  # All rows, all columns except the last
y = data.iloc[:, -1].values   # All rows, last column (career labels)

# Step 3: Normalize the features
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Step 4: Apply PCA to reduce the dimensions
pca = PCA(n_components=10) 
X_pca = pca.fit_transform(X_normalized)

# Step 5: Initialize KNN classifier
knn = KNeighborsClassifier(n_neighbors=2)

# Step 6: Train the KNN model with PCA features
knn.fit(X_pca, y)

# Step 7: Predict the career labels
y_pred = knn.predict(X_pca)

# Step 8: Calculate accuracy
accuracy = accuracy_score(y, y_pred)
print(f"Accuracy with PCA: {accuracy * 100:.2f}%")


Accuracy with PCA: 62.86%
