# Assignment 2 B - Student Dataset Analysis

## Complete Solution with Data Analysis and Visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for plots.
# 'seaborn-v0_8' is the style-sheet name used by matplotlib >= 3.6; older
# releases ship the same sheet under the name 'seaborn'. Pick whichever one
# this installation provides instead of failing on older versions.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
# Keep the palette call after the style sheet: both set the colour cycle, and
# the later call wins.
sns.set_palette("husl")

## 1. Load and Explore the Dataset

In [None]:
# Read the student records from the CSV file in the working directory
df = pd.read_csv("student_dataset.csv")

# Preview: heading followed by the first ten records
print("First 10 rows of the dataset:", df.head(10), sep="\n")

# Structural overview: dimensions and column names ...
print(
    "\nDataset Information:",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

# ... followed by the dtype of every column
print("\nData Types:", df.dtypes, sep="\n")

## 2. Data Quality Checks: Missing Values and Duplicates

In [None]:
# Per-column count of missing entries (kept in missing_values)
missing_values = df.isna().sum()
print("Missing values in each column:", missing_values, sep="\n")

# How many rows are flagged as duplicates by DataFrame.duplicated()
print(f"\nNumber of duplicate rows: {df.duplicated().sum()}")

# Summary statistics as produced by DataFrame.describe()
print("\nBasic Statistics:", df.describe(), sep="\n")

## 3. Data Analysis

In [None]:
# Number of rows for each distinct value in the Gender column
gender_counts = df['Gender'].value_counts()
print("Gender Distribution:", gender_counts, sep="\n")

# describe() summary of the Age column
age_stats = df['Age'].describe()
print("\nAge Statistics:", age_stats, sep="\n")

# describe() summary of the Marks column
marks_stats = df['Marks'].describe()
print("\nMarks Statistics:", marks_stats, sep="\n")

# Correlation coefficient between Age and Marks (Series.corr defaults)
correlation = df['Age'].corr(other=df['Marks'])
print(f"\nCorrelation between Age and Marks: {correlation:.3f}")

## 4. Data Visualization

In [None]:
# Create a 2x3 grid of panels. The figure-level title is applied near the end
# of this cell (see the note on the boxplot below).
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# 1. Gender Distribution Pie Chart
gender_counts.plot.pie(autopct='%1.1f%%', ax=axes[0, 0])
axes[0, 0].set_title('Gender Distribution')
axes[0, 0].set_ylabel('')

# 2. Age Distribution Histogram
df['Age'].plot.hist(bins=15, alpha=0.7, ax=axes[0, 1])
axes[0, 1].set_title('Age Distribution')
axes[0, 1].set_xlabel('Age')
axes[0, 1].set_ylabel('Frequency')

# 3. Marks Distribution Histogram
df['Marks'].plot.hist(bins=20, alpha=0.7, ax=axes[0, 2])
axes[0, 2].set_title('Marks Distribution')
axes[0, 2].set_xlabel('Marks')
axes[0, 2].set_ylabel('Frequency')

# 4. Boxplot of Marks by Gender
# NOTE: DataFrame.boxplot(by=...) sets its own figure-level title
# ("Boxplot grouped by Gender"). Setting the overall title before this call
# would be overwritten, so fig.suptitle() is called after all panels are drawn.
df.boxplot(column='Marks', by='Gender', ax=axes[1, 0])
axes[1, 0].set_title('Marks by Gender')
axes[1, 0].set_xlabel('Gender')
axes[1, 0].set_ylabel('Marks')

# 5. Scatter Plot: Age vs Marks
df.plot.scatter(x='Age', y='Marks', alpha=0.6, ax=axes[1, 1])
axes[1, 1].set_title('Age vs Marks')
axes[1, 1].set_xlabel('Age')
axes[1, 1].set_ylabel('Marks')

# 6. Marks Distribution by Age Group
# pd.cut uses right-closed intervals by default: (17, 20], (20, 22], (22, 24].
# Ages outside that range get a missing AgeGroup and are left out of the
# grouped statistics.
age_bins = [17, 20, 22, 24]
age_labels = ['18-20', '21-22', '23-24']
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)
# observed=False keeps every labelled age group on the x-axis even when it has
# no students, and states explicitly what was previously an implicit default
# for categorical group keys.
df.groupby('AgeGroup', observed=False)['Marks'].mean().plot.bar(ax=axes[1, 2])
axes[1, 2].set_title('Average Marks by Age Group')
axes[1, 2].set_xlabel('Age Group')
axes[1, 2].set_ylabel('Average Marks')
axes[1, 2].tick_params(axis='x', rotation=45)

# Overall title, applied last so the boxplot cannot replace it
fig.suptitle('Student Dataset Analysis', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Advanced Analysis

In [None]:
# Performance categories based on marks
def categorize_performance(marks):
    """Map a numeric mark to a performance label.

    Thresholds are inclusive lower bounds:
    90 -> 'Excellent', 75 -> 'Good', 60 -> 'Average', 40 -> 'Below Average';
    anything lower is 'Poor'.

    Args:
        marks: A single numeric mark, or a missing value (None / NaN / pd.NA).

    Returns:
        The label as a string, or None when the mark is missing.
    """
    # A NaN fails every ">=" comparison and would otherwise fall through to
    # 'Poor', turning "no data" into a bad grade; None would raise TypeError.
    if pd.isna(marks):
        return None
    if marks >= 90:
        return 'Excellent'
    elif marks >= 75:
        return 'Good'
    elif marks >= 60:
        return 'Average'
    elif marks >= 40:
        return 'Below Average'
    else:
        return 'Poor'

# Label every student by passing each mark through categorize_performance
df['Performance'] = df['Marks'].map(categorize_performance)

# Number of students per performance label
performance_counts = df['Performance'].value_counts()
print("Performance Distribution:", performance_counts, sep="\n")

# Gender x Performance contingency table
performance_by_gender = pd.crosstab(index=df['Gender'], columns=df['Performance'])
print("\nPerformance by Gender:", performance_by_gender, sep="\n")

# AgeGroup x Performance contingency table
performance_by_age = pd.crosstab(index=df['AgeGroup'], columns=df['Performance'])
print("\nPerformance by Age Group:", performance_by_age, sep="\n")

In [None]:
# Two side-by-side panels summarising the performance labels
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Left panel: share of students in each performance category
performance_counts.plot(kind='pie', autopct='%1.1f%%', ax=axes[0])
axes[0].set(title='Performance Distribution', ylabel='')

# Right panel: per-gender counts, one stacked segment per category
performance_by_gender.plot(kind='bar', stacked=True, ax=axes[1])
axes[1].set(title='Performance by Gender', xlabel='Gender', ylabel='Count')
axes[1].legend(title='Performance')

plt.tight_layout()
plt.show()

## 6. Statistical Tests and Insights

In [None]:
from scipy import stats

# T-test for marks difference between genders.
# Missing marks are dropped first: a NaN in either sample would otherwise make
# both the statistic and the p-value NaN.
male_marks = df[df['Gender'] == 'Male']['Marks'].dropna()
female_marks = df[df['Gender'] == 'Female']['Marks'].dropna()

t_stat, p_value = stats.ttest_ind(male_marks, female_marks)
print("T-test for Marks by Gender:")
print(f"T-statistic: {t_stat:.3f}")
print(f"P-value: {p_value:.3f}")

# "NaN < 0.05" is False, so without the first branch an uncomputable test
# would be reported as "no statistically significant difference".
if np.isnan(p_value):
    print("The t-test could not be computed from the available data.")
elif p_value < 0.05:
    print("There is a statistically significant difference in marks between genders.")
else:
    print("There is no statistically significant difference in marks between genders.")

# ANOVA test for marks across age groups
age_groups = [df[df['AgeGroup'] == group]['Marks'].dropna() for group in age_labels]
# An age group without any students has no sample to compare, so only
# populated groups take part, and at least two of them are required.
populated_age_groups = [marks for marks in age_groups if not marks.empty]
if len(populated_age_groups) >= 2:
    f_stat, p_value_anova = stats.f_oneway(*populated_age_groups)
else:
    f_stat, p_value_anova = np.nan, np.nan
print("\nANOVA test for Marks across Age Groups:")
print(f"F-statistic: {f_stat:.3f}")
print(f"P-value: {p_value_anova:.3f}")

if np.isnan(p_value_anova):
    print("The ANOVA could not be computed from the available data.")
elif p_value_anova < 0.05:
    print("There is a statistically significant difference in marks across age groups.")
else:
    print("There is no statistically significant difference in marks across age groups.")

## 7. Key Insights and Summary

In [None]:
# Printed summary of the figures computed in the earlier cells
print("KEY INSIGHTS:")
print("=" * 50)

# Overall statistics
print(f"1. Total Students: {len(df)}")
# .get(..., 0) reports zero instead of raising KeyError when a label does not
# occur in the Gender column.
male_count = gender_counts.get('Male', 0)
female_count = gender_counts.get('Female', 0)
print(f"2. Gender Distribution: {male_count} Male, {female_count} Female")
print(f"3. Average Age: {df['Age'].mean():.1f} years")
print(f"4. Average Marks: {df['Marks'].mean():.1f}")
print(f"5. Marks Range: {df['Marks'].min()} - {df['Marks'].max()}")

# Performance insights (percentages are relative to all rows in df)
print("\n6. Performance Categories:")
for category, count in performance_counts.items():
    percentage = (count / len(df)) * 100
    print(f"   - {category}: {count} students ({percentage:.1f}%)")

# Gender performance comparison
male_avg = male_marks.mean()
female_avg = female_marks.mean()
print("\n7. Average Marks by Gender:")
print(f"   - Male: {male_avg:.1f}")
print(f"   - Female: {female_avg:.1f}")
print(f"   - Difference: {abs(male_avg - female_avg):.1f}")

# Age group performance
print("\n8. Average Marks by Age Group:")
for age_group in age_labels:
    avg_marks = df[df['AgeGroup'] == age_group]['Marks'].mean()
    print(f"   - {age_group}: {avg_marks:.1f}")

# Correlation insight
print(f"\n9. Correlation between Age and Marks: {correlation:.3f}")
# A NaN coefficient fails both threshold comparisons and would otherwise be
# described as "very weak or no correlation".
if pd.isna(correlation):
    print("   - Correlation could not be computed")
elif abs(correlation) > 0.3:
    print("   - Moderate correlation observed")
elif abs(correlation) > 0.1:
    print("   - Weak correlation observed")
else:
    print("   - Very weak or no correlation observed")

# Statistical significance (0.05 threshold, matching the tests above)
print("\n10. Statistical Tests:")
print(f"    - Gender difference p-value: {p_value:.3f} {'(Significant)' if p_value < 0.05 else '(Not Significant)'}")
print(f"    - Age group difference p-value: {p_value_anova:.3f} {'(Significant)' if p_value_anova < 0.05 else '(Not Significant)'}")

## 8. Recommendations

In [None]:
# Banner: heading plus a 50-character rule
print("RECOMMENDATIONS:", "=" * 50, sep="\n")

# Fixed list of recommendations, printed one per line
print(
    "1. Focus on students with marks below 60 (Below Average and Poor categories)",
    "2. Consider gender-specific support programs if significant differences exist",
    "3. Monitor age group performance trends for targeted interventions",
    "4. Implement regular assessments to track student progress",
    "5. Provide additional support for students in the 23-24 age group if needed",
    "6. Consider peer mentoring programs between high and low performers",
    "7. Regular feedback sessions to understand challenges faced by students",
    "8. Implement data-driven decision making for academic improvements",
    sep="\n",
)

## 9. Export Analyzed Data (Optional)

In [None]:
# Write the DataFrame (with the derived AgeGroup and Performance columns added
# earlier) to a new CSV file, leaving out the index column
df.to_csv(path_or_buf='student_dataset_analyzed.csv', index=False)
print("Analyzed dataset exported as 'student_dataset_analyzed.csv'")