In [5]:
import numpy as np
import pandas as pd

In [6]:
# Seed NumPy's global generator so the synthetic dataset is identical on every run.
np.random.seed(42)

n_students = 500
subjects = ['Math', 'Science', 'English']

# Column-oriented raw data, one entry per student.
# Draw order matters for reproducibility: subjects first, then study hours.
data = dict(
    Student_ID=range(1, n_students + 1),
    Subject=np.random.choice(subjects, n_students),
    # randint's upper bound is exclusive, so hours fall in 2..15.
    Study_Hours=np.random.randint(2, 16, n_students),
)

In [7]:
# Materialise the column dict as a DataFrame (one row per student).
df = pd.DataFrame(data=data)

In [8]:
# Inspect the freshly built frame (Student_ID, Subject, Study_Hours).
df

Unnamed: 0,Student_ID,Subject,Study_Hours
0,1,English,11
1,2,Math,11
2,3,English,4
3,4,English,11
4,5,Math,2
...,...,...,...
495,496,Science,4
496,497,Science,8
497,498,Math,4
498,499,Science,8


In [9]:
# Logic: exam score = per-subject base + per-subject gain per study hour + Gaussian noise.
# The lookup tables live at cell level so they are built once, not once per row.
SUBJECT_BASE_SCORE = {'Math': 40, 'Science': 50, 'English': 65}
SUBJECT_HOURS_MULTIPLIER = {'Math': 4.2, 'Science': 3.8, 'English': 3.2}


def generate_score(row, noise_sd=5, rng=None):
    """Simulate one student's exam score from their subject and study hours.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['Subject']`` and ``row['Study_Hours']``
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)`` or a dict).
    noise_sd : float, default 5
        Standard deviation of the zero-mean Gaussian noise added to the score.
    rng : object exposing ``normal(loc, scale)``, optional
        Noise source, e.g. ``np.random.default_rng(seed)``. When omitted the
        draw comes from NumPy's global ``np.random`` state, so a prior
        ``np.random.seed(...)`` call still controls the result.

    Returns
    -------
    int
        The score truncated toward zero by ``int()`` (not rounded) and then
        clamped to the range 0..100.

    Raises
    ------
    KeyError
        If ``row['Subject']`` is not one of the subjects in the lookup tables.
    """
    # Draw the noise first so the random stream is consumed exactly once per call.
    noise_source = np.random if rng is None else rng
    noise = noise_source.normal(0, noise_sd)

    subject = row['Subject']
    score = SUBJECT_BASE_SCORE[subject] + (SUBJECT_HOURS_MULTIPLIER[subject] * row['Study_Hours']) + noise

    # Truncate to an integer, then clamp into the valid 0..100 range.
    return min(100, max(0, int(score)))

In [10]:
# axis=1 hands each row to generate_score; the result is one score per student.
exam_scores = df.apply(generate_score, axis=1)
df['Exam_Score'] = exam_scores

In [11]:
# Inspect the frame again, now including the Exam_Score column.
df

Unnamed: 0,Student_ID,Subject,Study_Hours,Exam_Score
0,1,English,11,98
1,2,Math,11,90
2,3,English,4,73
3,4,English,11,99
4,5,Math,2,44
...,...,...,...,...
495,496,Science,4,61
496,497,Science,8,78
497,498,Math,4,62
498,499,Science,8,83


In [12]:
# Persist the dataset to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='student_performance_data.csv', index=False)

In [13]:
# 2. DATA ANALYSIS (Using Pandas)
# Per-subject averages of study hours and exam score, rounded to 2 decimals.
summary_df = (
    df.groupby('Subject')[['Study_Hours', 'Exam_Score']]
    .mean()
    .reset_index()
    .round(2)
)

In [14]:
# Emit the heading and the summary table on separate lines.
print("Analysis Results:", summary_df, sep="\n")

Analysis Results:
   Subject  Study_Hours  Exam_Score
0  English         8.68       89.45
1     Math         8.40       75.20
2  Science         8.21       80.43
