In [5]:
# ```python?code_reference&code_event_index=6
import pandas as pd
import numpy as np


In [6]:
filepath = r'D:\Download\student_performance_updated_1000.csv' 
df = pd.read_csv(filepath)
df

Unnamed: 0,StudentID,Name,Gender,AttendanceRate,StudyHoursPerWeek,PreviousGrade,ExtracurricularActivities,ParentalSupport,FinalGrade,Study Hours,persentage,Online Classes Taken
0,1.0,John,Male,85.0,15.0,78.0,1.0,High,80.0,4.8,59.0,False
1,2.0,Sarah,Female,90.0,20.0,85.0,2.0,Medium,87.0,2.2,70.0,True
2,3.0,Alex,Male,78.0,10.0,65.0,0.0,Low,68.0,4.6,92.0,False
3,4.0,Michael,Male,92.0,25.0,90.0,3.0,High,92.0,2.9,96.0,False
4,5.0,Emma,Female,,18.0,82.0,2.0,Medium,85.0,4.1,97.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
995,,Kenneth Murray,Male,85.0,20.0,,1.0,High,72.0,0.8,80.0,True
996,4497.0,Amy Stout,Female,91.0,,86.0,0.0,High,90.0,3.9,80.0,True
997,1886.0,,Male,85.0,8.0,82.0,2.0,Low,68.0,0.4,54.0,False
998,7636.0,Joseph Sherman,Male,88.0,17.0,60.0,2.0,High,85.0,0.9,53.0,True


In [61]:
print("--- Initial Data Info (Before Wrangling) ---")
# Use pandas .info() to see initial dtypes and non-null counts
df.info()
print("-" * 50)


--- Initial Data Info (Before Wrangling) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   StudentID                  960 non-null    float64
 1   Name                       966 non-null    object 
 2   Gender                     952 non-null    object 
 3   AttendanceRate             960 non-null    float64
 4   StudyHoursPerWeek          950 non-null    float64
 5   PreviousGrade              967 non-null    float64
 6   ExtracurricularActivities  957 non-null    float64
 7   ParentalSupport            978 non-null    object 
 8   FinalGrade                 960 non-null    float64
 9   Study Hours                976 non-null    float64
 10  percentage                 959 non-null    float64
 11  Online Classes Taken       975 non-null    object 
dtypes: float64(8), object(4)
memory usage: 93.9+ KB
------------

In [7]:
# === 2. Check for duplicates ===
duplicates = df.duplicated().sum()
print(f"Duplicates found: {duplicates}")

Duplicates found: 0


In [8]:
for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        df[col].fillna(df[col].mean(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
  df[col].fillna(df[col].mode()[0], inplace=True)


In [9]:
grades = df["FinalGrade"]        # Series|
print("First 5 Final Grades:\n", grades.head())

First 5 Final Grades:
 0    80.0
1    87.0
2    68.0
3    92.0
4    85.0
Name: FinalGrade, dtype: float64


In [10]:
info = df[["Name", "Gender", "FinalGrade"]]   # DataFrame
print("\nFirst 5 Students Info:\n", info.head())


First 5 Students Info:
       Name  Gender  FinalGrade
0     John    Male        80.0
1    Sarah  Female        87.0
2     Alex    Male        68.0
3  Michael    Male        92.0
4     Emma  Female        85.0


In [11]:
grades = df["FinalGrade"]        # Series
print("First 5 Final Grades:\n", grades.head())

info = df[["Name", "Gender", "FinalGrade"]]   # DataFrame
print("\nFirst 5 Students Info:\n", info.head())

First 5 Final Grades:
 0    80.0
1    87.0
2    68.0
3    92.0
4    85.0
Name: FinalGrade, dtype: float64

First 5 Students Info:
       Name  Gender  FinalGrade
0     John    Male        80.0
1    Sarah  Female        87.0
2     Alex    Male        68.0
3  Michael    Male        92.0
4     Emma  Female        85.0


In [15]:
df["Performance"] = np.where(df["FinalGrade"] >= 85, "High",
                    np.where(df["FinalGrade"] >= 70, "Medium", "Low"))

In [17]:
low_attendance = df[df["AttendanceRate"] < 80]
print("\nLow Attendance Students:\n", low_attendance[["Name", "AttendanceRate"]].head())


Low Attendance Students:
                Name  AttendanceRate
2              Alex            78.0
6            Daniel            70.0
14   James Williams            70.0
19  Brittany Sutton            78.0
20   Annette Medina            78.0


In [21]:
extra = pd.DataFrame({
    "Gender": ["Male", "Female"],
    "AvgHours": [df[df["Gender"] == "Male"]["StudyHoursPerWeek"].mean(),
                 df[df["Gender"] == "Female"]["StudyHoursPerWeek"].mean()]})

In [22]:
merged = pd.merge(df, extra, on="Gender", how="left")
print("\nMerged Data (showing average study hours by gender):\n", merged[["Name", "Gender", "AvgHours"]].head())


Merged Data (showing average study hours by gender):
       Name  Gender   AvgHours
0     John    Male  17.611115
1    Sarah  Female  17.654156
2     Alex    Male  17.611115
3  Michael    Male  17.611115
4     Emma  Female  17.654156


In [23]:
avg_grade_gender = df.groupby("Gender")["FinalGrade"].mean()
print("\nAverage Final Grade by Gender:\n", avg_grade_gender)


Average Final Grade by Gender:
 Gender
Female    80.460454
Male      79.676765
Name: FinalGrade, dtype: float64


In [24]:
part1 = df.iloc[:500]
part2 = df.iloc[500:]
joined = pd.concat([part1, part2])
print("\nJoined Data Shape:", joined.shape)


Joined Data Shape: (1000, 13)


In [25]:
print("\nCorrelation with FinalGrade:\n", df.corr(numeric_only=True)["FinalGrade"].round(2))


Correlation with FinalGrade:
 StudentID                    0.08
AttendanceRate              -0.01
StudyHoursPerWeek            0.03
PreviousGrade                0.00
ExtracurricularActivities   -0.03
FinalGrade                   1.00
Study Hours                  0.03
persentage                   0.04
Online Classes Taken         0.00
Name: FinalGrade, dtype: float64


In [26]:
print("\nTop 5 Students:\n", df.nlargest(5, "FinalGrade")[["Name", "FinalGrade", "Performance"]])
print("\nLow Performers:\n", df[df["Performance"] == "Low"][["Name", "FinalGrade"]].head())


Top 5 Students:
               Name  FinalGrade Performance
3          Michael        92.0        High
17     James Smith        92.0        High
26     Jeremy Hall        92.0        High
38  Jerry Browning        92.0        High
45     Andrea Frey        92.0        High

Low Performers:
                  Name  FinalGrade
2                Alex        68.0
6              Daniel        62.0
12     Katherine Gray        62.0
15  Derrick Alexander        62.0
16     Courtney Clark        68.0
