In [1]:
# ===============================
# STEP 0: Import Required Libraries
# ===============================
import pandas as pd
import numpy as np

# ===============================
# STEP 1: Load the Dataset
# ===============================
# Read the CSV into `df`, the DataFrame every later step works on.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running on another computer.
dataset_path = r"C:\Users\padma\Downloads\StudentsPerformance.csv"
df = pd.read_csv(dataset_path)

# Preview the top of the table (five rows)
head_rows = df.head(5)
print("First 5 records:")
display(head_rows)

# Preview the bottom of the table (five rows)
tail_rows = df.tail(5)
print()
print("Last 5 records:")
display(tail_rows)

# ===============================
# STEP 2: Identify Feature Types Manually
# ===============================
# These groupings are declared by hand rather than inferred from the data.
# The binary and ordinal lists name columns that also appear in the
# categorical list.
numerical_features = ['math score', 'reading score', 'writing score']
categorical_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
binary_features = ['gender', 'lunch', 'test preparation course']
ordinal_features = ['parental level of education']

# Report each grouping, preceded by one blank separator line.
print()
print(f"Numerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")
print(f"Binary Features: {binary_features}")
print(f"Ordinal Features: {ordinal_features}")

# ===============================
# STEP 3: Dataset Information & Statistics
# ===============================
# Structural overview: info() prints its own report (columns, non-null
# counts, dtypes) rather than returning it.
print()
print("Dataset Info:")
df.info()

# Descriptive statistics (count, mean, std, min, quartiles, max).
print()
print("Statistical Summary:")
summary_stats = df.describe()
display(summary_stats)

# ===============================
# STEP 4: Check Unique Values in Categorical Columns
# ===============================
# List the distinct values of every hand-declared categorical column,
# each under its own heading and separated by a blank line.
for col in categorical_features:
    print()
    print(f"Unique values in '{col}':")
    print(df[col].unique())

# ===============================
# STEP 5: Identify Target Variable & Input Features
# ===============================
# The column to predict (an example choice)
target_variable = 'math score'

# Every other column of the DataFrame is treated as an input feature.
input_features = df.columns.drop(target_variable)

print()
print(f"Target Variable: {target_variable}")
print(f"Input Features: {input_features.tolist()}")

# ===============================
# STEP 6: Dataset Size & ML Suitability
# ===============================
# Row and column counts of the loaded table.
rows = len(df.index)
columns = len(df.columns)
print(f"\nDataset Size: {rows} rows and {columns} columns")

# Rule of thumb used here: fewer than 500 rows counts as "small".
size_verdict = (
    "⚠ Dataset size may be small for complex ML models."
    if rows < 500
    else "✔ Dataset size is suitable for Machine Learning models."
)
print(size_verdict)

# ===============================
# STEP 7: Data Quality Observations
# ===============================
# Count missing entries per column.
missing_per_column = df.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)

# Frequency of each gender value, used below as the balance check.
gender_counts = df['gender'].value_counts()
print("\nClass Distribution (Gender):")
print(gender_counts)

# Derive the data-dependent observations from the values computed above
# instead of asserting them unconditionally, so the summary stays truthful
# if the CSV changes.
total_missing = int(missing_per_column.sum())
if total_missing == 0:
    missing_note = "- The dataset has no missing values."
else:
    missing_note = (
        f"- The dataset has {total_missing} missing values "
        "that must be handled before modelling."
    )

# Heuristic: call the data balanced when the rarest gender class has at
# least 80% as many rows as the most common one. An empty column yields NaN
# here, which compares False and is therefore reported as not balanced.
BALANCE_RATIO = 0.8
is_balanced = bool(gender_counts.min() >= BALANCE_RATIO * gender_counts.max())
if is_balanced:
    balance_note = "- Dataset is balanced and suitable for ML tasks."
else:
    balance_note = (
        "- Gender classes are imbalanced; consider stratified splitting "
        "or resampling."
    )

observations = [
    missing_note,
    "- Numerical features are exam scores.",
    "- Categorical features describe student background.",
    balance_note,
    "- Can be used for regression or classification problems.",
]

print("\nGeneral Observations:")
# Leading and trailing blank lines match the original block layout.
print("\n" + "\n".join(observations) + "\n")


First 5 records:


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75



Last 5 records:


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86



Numerical Features: ['math score', 'reading score', 'writing score']
Categorical Features: ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
Binary Features: ['gender', 'lunch', 'test preparation course']
Ordinal Features: ['parental level of education']

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0



Unique values in 'gender':
['female' 'male']

Unique values in 'race/ethnicity':
['group B' 'group C' 'group A' 'group D' 'group E']

Unique values in 'parental level of education':
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']

Unique values in 'lunch':
['standard' 'free/reduced']

Unique values in 'test preparation course':
['none' 'completed']

Target Variable: math score
Input Features: ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'reading score', 'writing score']

Dataset Size: 1000 rows and 8 columns
✔ Dataset size is suitable for Machine Learning models.

Missing Values:
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

Class Distribution (G