In [None]:
import pandas as pd

In [None]:
# Load the raw student grading records from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Students_Grading_Dataset.csv")

In [None]:
# Count the null entries in every column and print the per-column totals.
null_counts = df.isnull().sum()
print("Missing values:\n", null_counts)

Missing values:
 Student_ID                       0
First_Name                       0
Last_Name                        0
Email                            0
Gender                           0
Age                              0
Department                       0
Attendance (%)                 516
Midterm_Score                    0
Final_Score                      0
Assignments_Avg                517
Quizzes_Avg                      0
Participation_Score              0
Projects_Score                   0
Total_Score                      0
Grade                            0
Study_Hours_per_Week             0
Extracurricular_Activities       0
Internet_Access_at_Home          0
Parent_Education_Level        1794
Family_Income_Level              0
Stress_Level (1-10)              0
Sleep_Hours_per_Night            0
dtype: int64


In [None]:
# Remove rows that repeat an earlier row, modifying `df` in place.
# No `subset` is passed, so rows are compared across all columns.
df.drop_duplicates(inplace=True)


In [None]:
# Fill the gaps in these two columns with each column's own mean.
for mean_filled_col in ("Attendance (%)", "Assignments_Avg"):
    column_mean = df[mean_filled_col].mean()
    df.loc[:, mean_filled_col] = df[mean_filled_col].fillna(column_mean)


In [None]:
# Fill missing parent education levels with the most frequent level.
# mode() returns a Series; the entry labelled 0 is the value used as the fill.
most_common_education = df["Parent_Education_Level"].mode()[0]
df.loc[:, "Parent_Education_Level"] = df["Parent_Education_Level"].fillna(most_common_education)


In [None]:
# Standardize the text in these columns: lowercase first, then strip surrounding whitespace.
for text_col in ("Extracurricular_Activities", "Internet_Access_at_Home"):
    df[text_col] = df[text_col].str.lower().str.strip()




In [None]:
# Re-count the nulls per column to confirm the imputation steps left no gaps.
null_counts_after = df.isnull().sum()
print(" Missing values after cleaning:\n", null_counts_after)


 Missing values after cleaning:
 Student_ID                    0
First_Name                    0
Last_Name                     0
Email                         0
Gender                        0
Age                           0
Department                    0
Attendance (%)                0
Midterm_Score                 0
Final_Score                   0
Assignments_Avg               0
Quizzes_Avg                   0
Participation_Score           0
Projects_Score                0
Total_Score                   0
Grade                         0
Study_Hours_per_Week          0
Extracurricular_Activities    0
Internet_Access_at_Home       0
Parent_Education_Level        0
Family_Income_Level           0
Stress_Level (1-10)           0
Sleep_Hours_per_Night         0
dtype: int64


In [None]:
# Data reduction via attribute subset selection.

# Features kept for the reduced dataset. Student_ID, First_Name, Last_Name,
# Email and Total_Score are the columns of `df` that are left out.
columns_to_keep = [
    'Gender', 'Age', 'Department', 'Attendance (%)', 'Midterm_Score',
    'Final_Score', 'Assignments_Avg', 'Quizzes_Avg', 'Participation_Score',
    'Projects_Score', 'Study_Hours_per_Week', 'Extracurricular_Activities',
    'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level',
    'Stress_Level (1-10)', 'Sleep_Hours_per_Night', 'Grade'
]

# Take an explicit copy: later cells assign columns on df_reduced, and doing
# that on a plain column selection of `df` raises SettingWithCopyWarning.
df_reduced = df[columns_to_keep].copy()

# Report the shape and preview the first rows.
print("✅ Reduced dataset shape:", df_reduced.shape)
df_reduced.head()


✅ Reduced dataset shape: (5000, 18)


Unnamed: 0,Gender,Age,Department,Attendance (%),Midterm_Score,Final_Score,Assignments_Avg,Quizzes_Avg,Participation_Score,Projects_Score,Study_Hours_per_Week,Extracurricular_Activities,Internet_Access_at_Home,Parent_Education_Level,Family_Income_Level,Stress_Level (1-10),Sleep_Hours_per_Night,Grade
0,Female,22,Engineering,52.29,55.03,57.82,84.22,74.06,3.99,85.9,6.2,no,yes,High School,Medium,5,4.7,F
1,Male,18,Engineering,97.27,97.23,45.8,74.798673,94.24,8.32,55.65,19.0,no,yes,PhD,Medium,4,9.0,A
2,Male,24,Business,57.19,67.05,93.68,67.7,85.7,5.05,73.79,20.7,no,yes,Master's,Low,6,6.2,D
3,Female,24,Mathematics,95.15,47.79,80.63,66.06,93.51,6.54,92.12,24.8,yes,yes,High School,High,3,6.7,A
4,Female,23,CS,54.18,46.59,78.89,96.85,83.7,5.97,68.42,15.4,yes,yes,High School,High,2,7.1,F


In [None]:
# Columns treated as numeric features; the normalization loop further down
# iterates over this list. The name was used without ever being defined, so a
# fresh kernel failed here with NameError. The members and their order are
# taken from this cell's saved output.
numeric_cols = [
    'Attendance (%)', 'Midterm_Score', 'Final_Score', 'Assignments_Avg',
    'Quizzes_Avg', 'Participation_Score', 'Projects_Score',
    'Study_Hours_per_Week', 'Sleep_Hours_per_Night', 'Stress_Level (1-10)'
]

# Inspect the dtypes of the numeric feature columns.
df_reduced[numeric_cols].dtypes


Unnamed: 0,0
Attendance (%),float64
Midterm_Score,float64
Final_Score,float64
Assignments_Avg,float64
Quizzes_Avg,float64
Participation_Score,float64
Projects_Score,float64
Study_Hours_per_Week,float64
Sleep_Hours_per_Night,float64
Stress_Level (1-10),float64


In [None]:
# Encode Family_Income_Level as an ordinal number (low < medium < high).
income_map = {
    'low': 0,
    'medium': 1,
    'high': 2
}

# The labels in the data are capitalized ('Low', 'Medium', 'High') while the
# map keys are lowercase, so a direct .map() turns every value into NaN.
# Trim and lowercase the labels before looking them up.
income_labels = df_reduced['Family_Income_Level'].str.strip().str.lower()

# Fail loudly rather than silently writing NaN for a label the map does not know.
unmapped_labels = set(income_labels.dropna()) - set(income_map)
if unmapped_labels:
    raise ValueError(
        "Unmapped Family_Income_Level values: " + ", ".join(sorted(unmapped_labels))
    )

# assign() returns a new DataFrame (column position is kept), so nothing is
# written into a slice of another frame and no SettingWithCopyWarning is raised.
df_reduced = df_reduced.assign(Family_Income_Level=income_labels.map(income_map))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['Family_Income_Level'] = df_reduced['Family_Income_Level'].map(income_map)


In [None]:
# Manual min-max normalization: x' = (x - min) / (max - min), which rescales
# every column listed in numeric_cols to the range [0, 1].

# Work on an independent copy so the column assignments below never write into
# a slice of another DataFrame (source of SettingWithCopyWarning).
df_reduced = df_reduced.copy()

for col in numeric_cols:
    min_val = df_reduced[col].min()
    value_range = df_reduced[col].max() - min_val
    if value_range == 0:
        # Constant column: the formula would compute 0/0 and fill the column
        # with NaN. Every value equals the minimum, so map the column to 0.0.
        df_reduced[col] = 0.0
    else:
        df_reduced[col] = (df_reduced[col] - min_val) / value_range

# Show the normalized results.
df_reduced.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced[col] = (df_reduced[col] - min_val) / (max_val - min_val)


Unnamed: 0,Gender,Age,Department,Attendance (%),Midterm_Score,Final_Score,Assignments_Avg,Quizzes_Avg,Participation_Score,Projects_Score,Study_Hours_per_Week,Extracurricular_Activities,Internet_Access_at_Home,Parent_Education_Level,Family_Income_Level,Stress_Level (1-10),Sleep_Hours_per_Night,Grade
0,Female,22,Engineering,0.045609,0.250584,0.297099,0.684674,0.481274,0.399,0.717944,0.048,no,yes,High School,,0.444444,0.14,F
1,Male,18,Engineering,0.945389,0.954151,0.096699,0.496172,0.88544,0.832,0.112823,0.56,no,yes,PhD,,0.333333,1.0,A
2,Male,24,Business,0.143629,0.450984,0.894965,0.354142,0.7144,0.505,0.475695,0.628,no,yes,Master's,,0.555556,0.44,D
3,Female,24,Mathematics,0.902981,0.129877,0.677392,0.321329,0.870819,0.654,0.842368,0.792,yes,yes,High School,,0.222222,0.54,A
4,Female,23,CS,0.083417,0.10987,0.648383,0.937375,0.674344,0.597,0.368274,0.416,yes,yes,High School,,0.111111,0.62,F


In [None]:
# Discretize 'Stress_Level (1-10)' into 3 categories

def discretize_stress(value):
    """Bucket a stress score into 'Low', 'Medium' or 'High'.

    The cut-offs are expressed on a 0-1 scale: this notebook applies the
    function to the stress column after it has been min-max normalized.

    Args:
        value: Numeric stress score.

    Returns:
        'Low' if value <= 0.33, 'Medium' if value <= 0.66, 'High' otherwise.
        NaN is returned when the score is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # score would fall through both checks and be labelled 'High'.
    if pd.isna(value):
        return float('nan')
    if value <= 0.33:
        return 'Low'
    if value <= 0.66:
        return 'Medium'
    return 'High'

# Label every row's stress score and store the labels in a new column.
stress_categories = df_reduced['Stress_Level (1-10)'].apply(discretize_stress)
df_reduced.loc[:, 'Stress_Level_Category'] = stress_categories

# Preview the score next to its assigned category.
preview_columns = ['Stress_Level (1-10)', 'Stress_Level_Category']
df_reduced[preview_columns].head()


Unnamed: 0,Stress_Level (1-10),Stress_Level_Category
0,0.444444,Medium
1,0.333333,Medium
2,0.555556,Medium
3,0.222222,Low
4,0.111111,Low


In [None]:
# Lift pandas' display truncation limits for both rows and columns.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Show the first 55 rows with every column visible.
df_reduced.head(55)

Unnamed: 0,Gender,Age,Department,Attendance (%),Midterm_Score,Final_Score,Assignments_Avg,Quizzes_Avg,Participation_Score,Projects_Score,Study_Hours_per_Week,Extracurricular_Activities,Internet_Access_at_Home,Parent_Education_Level,Family_Income_Level,Stress_Level (1-10),Sleep_Hours_per_Night,Grade,Stress_Level_Category
0,Female,22,Engineering,0.045609,0.250584,0.297099,0.684674,0.481274,0.399,0.717944,0.048,no,yes,High School,,0.444444,0.14,F,Medium
1,Male,18,Engineering,0.945389,0.954151,0.096699,0.496172,0.88544,0.832,0.112823,0.56,no,yes,PhD,,0.333333,1.0,A,Medium
2,Male,24,Business,0.143629,0.450984,0.894965,0.354142,0.7144,0.505,0.475695,0.628,no,yes,Master's,,0.555556,0.44,D,Medium
3,Female,24,Mathematics,0.902981,0.129877,0.677392,0.321329,0.870819,0.654,0.842368,0.792,yes,yes,High School,,0.222222,0.54,A,Low
4,Female,23,CS,0.083417,0.10987,0.648383,0.937375,0.674344,0.597,0.368274,0.416,yes,yes,High School,,0.111111,0.62,F,Low
5,Male,21,Engineering,0.50853,0.647716,0.058853,0.428171,0.043461,0.638,0.345669,0.14,yes,yes,PhD,,0.0,0.2,B,Low
6,Male,24,Business,0.15183,0.437813,0.818106,0.690676,0.968756,0.23,0.872975,0.652,no,yes,PhD,,0.444444,0.48,F,Medium
7,Male,19,Engineering,0.038008,0.094532,0.566189,0.602641,0.918686,0.373,0.864773,0.892,yes,no,PhD,,0.333333,0.06,F,Medium
8,Female,21,CS,0.719344,0.74058,0.848116,0.141056,0.126177,0.051,0.880176,0.12,no,no,Bachelor's,,0.888889,0.96,A,High
9,Female,22,Engineering,0.280056,0.7996,0.974825,0.939976,0.112157,0.588,0.571914,0.184,no,yes,PhD,,1.0,0.48,A,High


In [None]:
# Write the cleaned and processed dataset to a new CSV file, leaving out the row index.
output_csv_path = "preprocessed_students_data.csv"
df_reduced.to_csv(output_csv_path, index=False)
