#  Data Aggregation & Grouping  
Performing grouping, merging, and summarization with Pandas.


In [1]:
import pandas as pd

## Load the Dataset  
Load the Students Performance dataset from GitHub and display basic information and sample rows.


In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/os7i/Data-Analysis-Preprocessing-/refs/heads/main/StudentsPerformance.csv')

print(" Data loaded successfully!")
df.info()
df.head()


 Data loaded successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


##  Merge Example Using Sample Data  
Split the dataset into training and testing parts, create two small DataFrames, and merge them based on the 'gender' column.


In [None]:
train = df.sample(frac=0.8, random_state=42)
test = df.drop(train.index)

df1 = pd.DataFrame({'gender': ['male', 'female'], 'math score': [75, 88]})
df2 = pd.DataFrame({'gender': ['male', 'female'], 'reading score': [70, 90]})

df_merged = pd.merge(df1, df2, on='gender')
print('Merged Data:')
print(df_merged)


##  Create Students and Subjects DataFrames


In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/os7i/Data-Analysis-Preprocessing-/refs/heads/main/StudentsPerformance.csv')
student_name = df['gender'].head(4)
math_score = df['math score'].head(4)
reading_score = df['reading score'].head(4)

students = pd.DataFrame({
    'StudentID': [1, 2, 3, 4],
    'StudentGender': student_name,
    'MathScore': math_score,
    'ReadingScore': reading_score
})

subjects = pd.DataFrame({
    'SubjectID': [101, 102, 103, 104],
    'SubjectName': ['Math', 'Reading', 'Writing', 'Science']
})

print("Students Dataset:")
print(students)
print("\nSubjects Dataset:")
print(subjects)


Students Dataset:
   StudentID StudentGender  MathScore  ReadingScore
0          1        female         72            72
1          2        female         69            90
2          3        female         90            95
3          4          male         47            57

Subjects Dataset:
   SubjectID SubjectName
0        101        Math
1        102     Reading
2        103     Writing
3        104     Science


##  Merge Operations (Inner & Left Join)


In [4]:
subjects = pd.DataFrame({
    'SubjectID': [1, 2, 3, 4],
    'SubjectName': ['Math', 'Reading', 'Writing', 'Science']
})

merged_data = pd.merge(students, subjects, left_on='StudentID', right_on='SubjectID', how='inner')
print("\nInner Join Result:")
print(merged_data)

left_join = pd.merge(subjects, students, left_on='SubjectID', right_on='StudentID', how='left')
print("\nLeft Join Result:")
print(left_join)



Inner Join Result:
   StudentID StudentGender  MathScore  ReadingScore  SubjectID SubjectName
0          1        female         72            72          1        Math
1          2        female         69            90          2     Reading
2          3        female         90            95          3     Writing
3          4          male         47            57          4     Science

Left Join Result:
   SubjectID SubjectName  StudentID StudentGender  MathScore  ReadingScore
0          1        Math          1        female         72            72
1          2     Reading          2        female         69            90
2          3     Writing          3        female         90            95
3          4     Science          4          male         47            57


##  Aggregations: Average, Count, and Range


In [5]:
avg_math_score = students.groupby('StudentGender')['MathScore'].mean().reset_index()
print("\nAverage Math Score by Gender:")
print(avg_math_score)

student_count = students.groupby(['StudentGender']).size().reset_index(name='StudentCount')
print("\nStudent Count by Gender:")
print(student_count)

def score_range(x):
    return x.max() - x.min()

score_range_data = students.groupby('StudentGender')['MathScore'].agg(score_range).reset_index()
score_range_data.rename(columns={'MathScore': 'ScoreRange'}, inplace=True)
print("\nScore Range by Gender:")
print(score_range_data)



Average Math Score by Gender:
  StudentGender  MathScore
0        female       77.0
1          male       47.0

Student Count by Gender:
  StudentGender  StudentCount
0        female             3
1          male             1

Score Range by Gender:
  StudentGender  ScoreRange
0        female          21
1          male           0


##  Save Final Results


In [6]:
final_data = pd.merge(subjects, avg_math_score, left_on='SubjectID', right_index=True, how='left')
final_data.rename(columns={'MathScore': 'AverageMathScore'}, inplace=True)
print("\nSubjects with Average Math Scores:")
print(final_data)


Subjects with Average Math Scores:
   SubjectID SubjectName StudentGender  AverageMathScore
0          1        Math          male              47.0
1          2     Reading           NaN               NaN
2          3     Writing           NaN               NaN
3          4     Science           NaN               NaN


##  Save the Final Grouped Data


In [8]:
# Save the grouped and summarized dataset
final_data.to_csv('Students_Grouped_Data.csv', index=False)
print("File saved successfully as 'Students_Grouped_Data.csv'")


File saved successfully as 'Students_Grouped_Data.csv'
