Step 1 : Load and Inspect the Dataset

In [5]:
import pandas as pd

# Load the dataset
df = pd.read_csv('student_data.csv')

# Inspect the dataset
print(df.head())        # View the first 5 rows
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()               # Summary of columns and data types
print(df.describe())    # Statistics for numerical columns

# Check for missing values (count of nulls per column)
print(df.isnull().sum())

   Student ID  Operating System  DSA  Frontend  Backend  Machine Learning  \
0           1                88   78        68       70                86   
1           2                65   68        78       92                84   
2           3                73   69        82       74                67   
3           4                85   95        58       52                89   
4           5                64   55        70       50                84   

   Data Analytics  Attendance (Operating System)  Attendance (DSA)  \
0              86                             75                97   
1              64                             71                83   
2              76                             96                91   
3              51                             79                91   
4              58                             74                87   

   Attendance (Frontend)  Attendance (Backend)  Attendance (Machine Learning)  \
0                     86           

Step 2 : Preprocess the Data

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Select numerical features: the six subject columns and their matching
# "Attendance (...)" columns.
numerical_features = df[['Operating System', 'DSA', 'Frontend', 'Backend', 
                         'Machine Learning', 'Data Analytics',
                         'Attendance (Operating System)', 'Attendance (DSA)', 
                         'Attendance (Frontend)', 'Attendance (Backend)', 
                         'Attendance (Machine Learning)', 'Attendance (Data Analytics)']]

# Scale numerical features (fit_transform returns a plain array, so the
# row labels of df are lost at this point and restored below).
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numerical_features)

# Encode project domains: stack the four project columns into one long
# Series, one-hot encode the values, then sum per original row so each
# student gets one count column per distinct project value.
project_columns = ['Project 1', 'Project 2', 'Project 3', 'Project 4']
projects_encoded = (
    pd.get_dummies(df[project_columns].stack())
    .groupby(level=0)
    .sum()
    # stack() may drop missing entries, so a student with no project values
    # could be absent from the grouped result; reindexing restores that row
    # as all zeros instead of letting concat introduce NaN.
    .reindex(df.index, fill_value=0)
)

# Combine scaled numerical features and encoded project data.
# The scaled frame is built on df.index so both parts align row-for-row
# even when df does not carry a default 0..n-1 index.
scaled_frame = pd.DataFrame(
    scaled_features,
    columns=numerical_features.columns,
    index=df.index,
)
final_data = pd.concat([scaled_frame, projects_encoded], axis=1)

Step 3 : Apply K-Means Clustering

In [7]:
from sklearn.cluster import KMeans

# Apply K-Means
# n_init is pinned explicitly: its default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones), so leaving it
# implicit makes cluster assignments depend on the installed version and
# triggers a FutureWarning on the releases in between.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
# fit_predict returns one cluster label per row of final_data; the labels
# are stored on df as a new 'Cluster' column.
df['Cluster'] = kmeans.fit_predict(final_data)

# Inspect cluster assignments (number of rows per cluster label)
print(df[['Cluster']].value_counts())


Cluster
1          340
2          333
0          327
Name: count, dtype: int64
