<a href="https://colab.research.google.com/github/memrranmian/my-first-repo1/blob/main/MSDSF25A007_Assignment5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import numpy as np

# STEP 0: Create our student scores data
print("STEP 0: Creating student scores data")
print("We're making scores for 1200 students in 5 subjects")
print("Scores will be between -1 and 102 (so we have some outliers)")

# Create random scores between -1 and 102 in integer form
student_scores = np.random.uniform(-2, 103, size=(1200, 5)).astype(int)

print(f"Created array with shape: {student_scores.shape}")
print("This means 1200 rows (students) and 5 columns (subjects)")
print()

# STEP 1: Replace outliers with NaN
print("STEP 1: Finding and replacing outliers")
print("Outliers are scores > 100 or < 0 (impossible test scores)")

# Make a copy to keep original data safe
clean_scores = student_scores.copy().astype(int)

# Find positions where scores are too high or too low
too_high = clean_scores > 100
too_low. = clean_scores < 0

print(f"Found {np.sum(too_high)} scores that are too high (>100)")
print(f"Found {np.sum(too_low)} scores that are too low (<0)")

# Replace outlier scores with NaN (which means 'Not a Number' - missing value)
clean_scores[too_high] = np.nan.astype(int)
clean_scores[too_low] = np.nan.astype(int)

print("Replaced outliers with NaN (missing values)")
print()


SyntaxError: cannot assign to function call here. Maybe you meant '==' instead of '='? (ipython-input-664270302.py, line 23)

In [33]:
# Q1: Weighted Imputation + Outlier Removal
# A dataset contains student test scores for 5 subjects stored in a NumPy array of shape (1200, 5).
# Some values are missing (NaN), and some values are outliers (> 100 or < 0).
# Task:
# 1. Replace outliers with NaN.
# 2. For each subject (column), compute a weighted mean, where weights are proportional
# to:
# w = e^(score / max_score)
# 3. Replace NaN values with this weighted mean (vectorized solution only).
# 4. Return indices of students whose total score is in the top 1%.



import numpy as np


# STEP 0: Create the student scores data as requested
print("STEP 0: Creating student scores data")
print("We need 1200 students with 5 subjects each")
print("Scores will be random integers between -1 and 102")

# Create random scores between -1 and 102 (so we have some outliers)
student_scores = np.random.randint(-1, 103, size=(1200, 5))

print(f"Created array with shape: {student_scores.shape}")
print(f"First few rows look like:\n{student_scores[:3]}")
print()

# STEP 1: Replace outliers with NaN
print("STEP 1: Finding and replacing outliers")
print("Outliers are scores > 100 or < 0")

# Make a copy to keep original data safe
clean_scores = student_scores.astype(float).copy()  # Convert to float to allow NaN

print("Finding scores that are too high (>100) or too low (<0)")
too_high = clean_scores > 100
too_low = clean_scores < 0

print(f"Found {np.sum(too_high)} scores that are too high")
print(f"Found {np.sum(too_low)} scores that are too low")

# Replace outliers with NaN (which means "Not a Number" - missing value)
clean_scores[too_high] = np.nan
clean_scores[too_low] = np.nan

print("Replaced outliers with NaN (missing values)")
print(f"Now we have {np.sum(np.isnan(clean_scores))} missing values in total")
print()

# STEP 2: Calculate weighted mean for each subject
print("STEP 2: Calculating weighted mean for each subject")
print("Weighted mean gives more importance to higher scores")
print("We use the formula: weight = e^(score / max_score)")

# We'll calculate for each subject (column) one by one
weighted_means = []  # Empty list to store our results

# Loop through each subject (columns 0 to 4)
for subject_index in range(5):
    print(f"\n--- Working on Subject {subject_index + 1} ---")

    # Get all scores for this subject
    subject_scores = clean_scores[:, subject_index]

    # Find valid scores (not missing)
    valid_scores = subject_scores[~np.isnan(subject_scores)]

    # Find the maximum score in this subject
    max_score = np.max(valid_scores)
    print(f"Maximum score in this subject: {max_score}")

    # Calculate weights using the formula: w = e^(score / max_score)
    # e^ means exponential - it makes numbers grow faster
    weights = np.exp(valid_scores / max_score)
    print(f"Calculated weights for {len(weights)} valid scores")

    # Calculate weighted mean: (sum of weight×score) ÷ (sum of weights)
    numerator = np.sum(weights * valid_scores)  # Sum of (weight × score)
    denominator = np.sum(weights)               # Sum of weights
    weighted_mean = numerator / denominator

    print(f"Weighted mean for subject {subject_index + 1}: {weighted_mean:.2f}")
    weighted_means.append(weighted_mean)

print(f"\nAll weighted means: {[f'{wm:.2f}' for wm in weighted_means]}")
print()

# STEP 3: Vectorized solution to replace NaN values with weighted means
print("STEP 3: Vectorized replacement of NaN values with weighted means")

# Create final scores array
final_scores = clean_scores.copy()

# Vectorized approach: Replace NaN values with weighted means for each column
for col in range(5):
    # Get the current column
    current_col = clean_scores[:, col]

    # Create a mask of NaN positions in this column
    nan_mask = np.isnan(current_col)

    # Vectorized replacement: Where mask is True, use weighted mean; where False, keep original value
    final_scores[:, col] = np.where(nan_mask, weighted_means[col], current_col)

print("Completed vectorized NaN replacement")
print()

# STEP 4: Find top 1% students
print("STEP 4: Finding top 1% students by total score")

# Calculate total score for each student (sum across all subjects)
total_scores = np.sum(final_scores, axis=1)
print(f"Calculated total scores for all {len(total_scores)} students")
print(f"Highest total score: {np.max(total_scores):.2f}")
print(f"Lowest total score: {np.min(total_scores):.2f}")

# Calculate how many students are in top 1%
total_students = len(total_scores)
top_1_percent_count = max(1, total_students // 100)  # At least 1 student
print(f"Top 1% means {top_1_percent_count} students out of {total_students}")

# Find the threshold for top 1%
# We sort scores and take the top ones
sorted_indices = np.argsort(total_scores)[::-1]  # Sort from highest to lowest
top_student_indices = sorted_indices[:top_1_percent_count]

print(f"\nRESULT: Top {top_1_percent_count} student indices: {top_student_indices}")
print("These are the positions of the best students in our list!")

# Optional: Show their total scores
print(f"Their total scores are: {total_scores[top_student_indices]}")

STEP 0: Creating student scores data
We need 1200 students with 5 subjects each
Scores will be random integers between -1 and 102
Created array with shape: (1200, 5)
First few rows look like:
[[ 8 40 17 60 30]
 [37 55 45  3 98]
 [48 31 47 82 48]]

STEP 1: Finding and replacing outliers
Outliers are scores > 100 or < 0
Finding scores that are too high (>100) or too low (<0)
Found 108 scores that are too high
Found 56 scores that are too low
Replaced outliers with NaN (missing values)
Now we have 164 missing values in total

STEP 2: Calculating weighted mean for each subject
Weighted mean gives more importance to higher scores
We use the formula: weight = e^(score / max_score)

--- Working on Subject 1 ---
Maximum score in this subject: 100.0
Calculated weights for 1161 valid scores
Weighted mean for subject 1: 58.00

--- Working on Subject 2 ---
Maximum score in this subject: 100.0
Calculated weights for 1168 valid scores
Weighted mean for subject 2: 58.00

--- Working on Subject 3 ---


In [34]:
# Q2: Conditional Imputation Based on Student Group
# A dataset (900, 5) contains:
# • Columns: Math, Physics, English, CS, Chemistry
# • NaN values exist randomly.
# Students are divided into 3 groups:
# • Group A: rows 0–299
# • Group B: rows 300–599
# • Group C: rows 600–899
# Task:
# Impute values using different strategies per group:
# • Group A: Replace NaNs with row mean.
# • Group B: Replace NaNs with column median.
# • Group C: Replace NaNs with global median of all non-NaN values.
# Return:
# • Mean score of each group
# • Group with highest average Math performance
# Must be fully vectorized — no loops.


import numpy as np

# STEP 1: Create sample student data with missing values
print("STEP 1: Creating student data")
print("We have 900 students with 5 subjects: Math, Physics, English, CS, Chemistry")

# Create random scores between 50-100 with some missing values
student_scores = np.random.randint(50, 101, size=(900, 5)).astype(float)

# Add some missing values (NaN) randomly
mask = np.random.random(size=(900, 5)) < 0.1  # 10% chance of missing value
student_scores[mask] = np.nan

print(f"Created array shape: {student_scores.shape}")
print(f"Number of missing values: {np.sum(np.isnan(student_scores))}")
print()

# STEP 2: Define the student groups
print("STEP 2: Defining student groups")
print("Group A: Students 0 to 299 (first 300 students)")
print("Group B: Students 300 to 599 (next 300 students)")
print("Group C: Students 600 to 899 (last 300 students)")

# Create group masks (these are like highlighters for each group)
group_a_mask = np.zeros(900, dtype=bool)  # Create empty mask for Group A
group_a_mask[0:300] = True                # Highlight students 0-299

group_b_mask = np.zeros(900, dtype=bool)  # Create empty mask for Group B
group_b_mask[300:600] = True              # Highlight students 300-599

group_c_mask = np.zeros(900, dtype=bool)  # Create empty mask for Group C
group_c_mask[600:900] = True              # Highlight students 600-899

print(f"Group A has {np.sum(group_a_mask)} students")
print(f"Group B has {np.sum(group_b_mask)} students")
print(f"Group C has {np.sum(group_c_mask)} students")
print()

# STEP 3: Group A - Replace NaNs with row mean
print("STEP 3: Group A - Replace missing values with each student's average")

# Make a copy to work with
filled_scores = student_scores.copy()

# For Group A students only
group_a_scores = filled_scores[group_a_mask]

# Calculate each student's average (ignoring missing values)
row_means = np.nanmean(group_a_scores, axis=1, keepdims=True)

# Find where values are missing in Group A
nan_mask_a = np.isnan(group_a_scores)

# Replace missing values with row means
# np.where works like: if condition True use value1, else use value2
group_a_scores_filled = np.where(nan_mask_a, row_means, group_a_scores)

# Put the filled scores back
filled_scores[group_a_mask] = group_a_scores_filled

print("Group A: Missing values replaced with each student's average")
print()

# STEP 4: Group B - Replace NaNs with column median
print("STEP 4: Group B - Replace missing values with subject medians")

# For Group B students only
group_b_scores = filled_scores[group_b_mask]

# Calculate median for each subject (ignoring missing values)
column_medians = np.nanmedian(group_b_scores, axis=0, keepdims=True)

# Find where values are missing in Group B
nan_mask_b = np.isnan(group_b_scores)

# Replace missing values with column medians
group_b_scores_filled = np.where(nan_mask_b, column_medians, group_b_scores)

# Put the filled scores back
filled_scores[group_b_mask] = group_b_scores_filled

print("Group B: Missing values replaced with subject medians")
print()

# STEP 5: Group C - Replace NaNs with global median
print("STEP 5: Group C - Replace missing values with global median")

# For Group C students only
group_c_scores = filled_scores[group_c_mask]

# Calculate global median of all non-missing values
global_median = np.nanmedian(filled_scores)

# Find where values are missing in Group C
nan_mask_c = np.isnan(group_c_scores)

# Replace missing values with global median
group_c_scores_filled = np.where(nan_mask_c, global_median, group_c_scores)

# Put the filled scores back
filled_scores[group_c_mask] = group_c_scores_filled

print(f"Global median (all subjects): {global_median:.2f}")
print("Group C: Missing values replaced with global median")
print()

# STEP 6: Calculate mean scores for each group
print("STEP 6: Calculate average scores for each group")

# Calculate mean for Group A (all subjects)
mean_group_a = np.mean(filled_scores[group_a_mask])
print(f"Group A average score: {mean_group_a:.2f}")

# Calculate mean for Group B (all subjects)
mean_group_b = np.mean(filled_scores[group_b_mask])
print(f"Group B average score: {mean_group_b:.2f}")

# Calculate mean for Group C (all subjects)
mean_group_c = np.mean(filled_scores[group_c_mask])
print(f"Group C average score: {mean_group_c:.2f}")
print()

# STEP 7: Find group with highest Math performance
print("STEP 7: Find which group has best Math scores")

# Get Math scores (first column) for each group
math_scores_a = filled_scores[group_a_mask, 0]  # Math is column 0
math_scores_b = filled_scores[group_b_mask, 0]
math_scores_c = filled_scores[group_c_mask, 0]

# Calculate Math averages
math_mean_a = np.mean(math_scores_a)
math_mean_b = np.mean(math_scores_b)
math_mean_c = np.mean(math_scores_c)

print(f"Group A Math average: {math_mean_a:.2f}")
print(f"Group B Math average: {math_mean_b:.2f}")
print(f"Group C Math average: {math_mean_c:.2f}")

# Find which group has highest Math average
math_means = [math_mean_a, math_mean_b, math_mean_c]
best_group_index = np.argmax(math_means)
best_group_name = ["A", "B", "C"][best_group_index]

print(f"\nRESULT: Group {best_group_name} has the highest Math performance!")

STEP 1: Creating student data
We have 900 students with 5 subjects: Math, Physics, English, CS, Chemistry
Created array shape: (900, 5)
Number of missing values: 495

STEP 2: Defining student groups
Group A: Students 0 to 299 (first 300 students)
Group B: Students 300 to 599 (next 300 students)
Group C: Students 600 to 899 (last 300 students)
Group A has 300 students
Group B has 300 students
Group C has 300 students

STEP 3: Group A - Replace missing values with each student's average
Group A: Missing values replaced with each student's average

STEP 4: Group B - Replace missing values with subject medians
Group B: Missing values replaced with subject medians

STEP 5: Group C - Replace missing values with global median
Global median (all subjects): 75.00
Group C: Missing values replaced with global median

STEP 6: Calculate average scores for each group
Group A average score: 74.27
Group B average score: 75.37
Group C average score: 75.15

STEP 7: Find which group has best Math scores


In [37]:
# Q3: Multi-Level Integrity Check + Replacement
# You have a (1500, 5) score matrix with:
# • Some scores missing (NaN)
# • Some scores duplicated (e.g., same value repeated in entire row)
# Task:
# 1. Detect rows where any 3+ columns have the same score → treat the entire row as
# invalid.
# 2. Replace such rows using the mean vector of all valid rows.
# 3. After cleaning, compute percentile rankings (0–100) for each student based on total
# scores.
# Output an array of percentile ranks of shape (1500,).



import numpy as np

# STEP 1: Create sample student data
print("STEP 1: Creating student score data")
print("1500 students, 5 subjects with some missing and duplicated values")

# Create random scores between 60-100
student_scores = np.random.randint(60, 101, size=(1500, 5)).astype(float)

# Add some missing values (NaN)
missing_mask = np.random.random(size=(1500, 5)) < 0.05  # 5% missing
student_scores[missing_mask] = np.nan

# Add some duplicated rows (where 3+ subjects have same score)
for i in range(50):  # Make 50 students have suspicious scores
    student_idx = np.random.randint(0, 1500)
    duplicate_value = np.random.randint(60, 101)
    # Make 3 or more subjects have the same score
    subjects_to_duplicate = np.random.choice(5, size=3, replace=False)
    student_scores[student_idx, subjects_to_duplicate] = duplicate_value

print(f"Created scores for {student_scores.shape[0]} students, {student_scores.shape[1]} subjects")
print(f"Missing values: {np.sum(np.isnan(student_scores))}")
print()

# STEP 2: Detect rows with 3+ same scores
print("STEP 2: Finding suspicious rows with 3+ same scores")
print("This detects possible cheating or data errors")

# Make a copy to clean missing values temporarily for detection
temp_scores = student_scores.copy()
temp_scores[np.isnan(temp_scores)] = -999  # Replace NaN with special value

# Check each row for duplicates
suspicious_rows = []  # List to store bad row indices

for i in range(len(temp_scores)):
    row = temp_scores[i]  # Get one student's scores
    unique_scores, counts = np.unique(row, return_counts=True)  # Count each score

    # Check if any score appears 3 or more times (and ignore our special -999 value)
    valid_counts = counts[unique_scores != -999]
    if len(valid_counts) > 0 and np.max(valid_counts) >= 3:
        suspicious_rows.append(i)

print(f"Found {len(suspicious_rows)} suspicious rows with 3+ same scores")
print(f"Suspicious row indices: {suspicious_rows[:10]}...")  # Show first 10
print()

# STEP 3: Replace suspicious rows with mean of valid rows
print("STEP 3: Replacing suspicious rows with average of good rows")

# Find valid rows (not suspicious)
valid_rows_mask = np.ones(1500, dtype=bool)  # Start with all True
valid_rows_mask[suspicious_rows] = False     # Mark suspicious rows as False

# Calculate mean of each subject from valid rows (ignoring NaN)
valid_scores = student_scores[valid_rows_mask]
subject_means = np.nanmean(valid_scores, axis=0)

print(f"Subject means from good rows: {subject_means}")

# Replace each suspicious row with the subject means
cleaned_scores = student_scores.copy()
for row_idx in suspicious_rows:
    cleaned_scores[row_idx] = subject_means  # Replace entire row with means

print(f"Replaced {len(suspicious_rows)} suspicious rows")
print()

# STEP 4: Calculate percentile rankings
print("STEP 4: Calculating percentile rankings (0-100)")
print("Percentile shows how each student compares to others")

# Calculate total score for each student (sum of all subjects)
total_scores = np.nansum(cleaned_scores, axis=1)

print(f"Total scores range: {np.min(total_scores):.1f} to {np.max(total_scores):.1f}")

# Calculate percentile for each student
percentile_ranks = np.zeros(1500)  # Create empty array for results

for i in range(1500):
    # Percentile = (number of students with lower scores / total students) × 100
    student_score = total_scores[i]
    students_with_lower_score = np.sum(total_scores < student_score)
    percentile = (students_with_lower_score / 1500) * 100
    percentile_ranks[i] = percentile

print(f"Percentile ranks calculated: {percentile_ranks.shape}")
print(f"Percentiles range: {np.min(percentile_ranks):.1f} to {np.max(percentile_ranks):.1f}")
print()

# STEP 5: Show results
print("STEP 5: Final Results")
print(f"Original suspicious rows: {len(suspicious_rows)}")
print(f"Percentile array shape: {percentile_ranks.shape}")
print(f"First 10 percentile values: {percentile_ranks[:10]}")
print("\nExample: A percentile of 85 means the student scored better than 85% of students")
print("Example: A percentile of 25 means the student scored better than only 25% of students")

STEP 1: Creating student score data
1500 students, 5 subjects with some missing and duplicated values
Created scores for 1500 students, 5 subjects
Missing values: 382

STEP 2: Finding suspicious rows with 3+ same scores
This detects possible cheating or data errors
Found 57 suspicious rows with 3+ same scores
Suspicious row indices: [8, 31, 66, 127, 196, 234, 309, 356, 367, 406]...

STEP 3: Replacing suspicious rows with average of good rows
Subject means from good rows: [80.62417097 80.21945867 79.34956395 80.51169591 79.49671293]
Replaced 57 suspicious rows

STEP 4: Calculating percentile rankings (0-100)
Percentile shows how each student compares to others
Total scores range: 160.0 to 475.0
Percentile ranks calculated: (1500,)
Percentiles range: 0.0 to 99.9

STEP 5: Final Results
Original suspicious rows: 57
Percentile array shape: (1500,)
First 10 percentile values: [24.53333333  5.86666667 47.66666667 91.53333333 83.4        62.93333333
 84.66666667 21.66666667 59.13333333 26.4   

In [38]:
# Q4 — Finding Students With Suspicious Patterns
# You have a dataset (1100, 5) containing:
# • Exam 1 scores (columns 0–2)
# • Practical scores (columns 3–4)
# Task:
# • Replace NaNs using the column-wise mean.
# • Compute the coefficient of variation (std/mean) for each student across all 5 subjects.
# • Mark students as suspicious if:
# (Mean Exam Score > 85) AND (Std of Practicals > 20)
# 1. Return percentage of suspicious students.
# Must use NumPy boolean indexing only.


import numpy as np

# STEP 1: Create sample student data
print("STEP 1: Creating student data")
print("1100 students: 3 Exam scores + 2 Practical scores")

# Create random scores
student_scores = np.random.randint(40, 101, size=(1100, 5)).astype(float)

# Add some missing values (NaN)
missing_mask = np.random.random(size=(1100, 5)) < 0.08  # 8% missing
student_scores[missing_mask] = np.nan

print(f"Created {student_scores.shape[0]} students, {student_scores.shape[1]} scores")
print(f"Missing values: {np.sum(np.isnan(student_scores))}")
print()

# STEP 2: Replace NaN with column means
print("STEP 2: Replacing missing values with column averages")

# Make a copy to work with
filled_scores = student_scores.copy()

# Calculate mean for each column (subject)
column_means = np.nanmean(filled_scores, axis=0)

print(f"Column means: {column_means}")

# Find where values are missing
missing_positions = np.isnan(filled_scores)

# Replace missing values with column means
filled_scores[missing_positions] = np.take(column_means, np.where(missing_positions)[1])

print("All missing values replaced with column averages")
print()

# STEP 3: Calculate coefficient of variation for each student
print("STEP 3: Calculating variation coefficient for each student")
print("Variation coefficient = Standard Deviation / Mean")
print("Shows how consistent a student's scores are")

# Calculate mean for each student (across all 5 subjects)
student_means = np.mean(filled_scores, axis=1)

# Calculate standard deviation for each student (across all 5 subjects)
student_stds = np.std(filled_scores, axis=1)

# Calculate coefficient of variation for each student
coefficient_of_variation = student_stds / student_means

print(f"Calculated variation coefficients for all 1100 students")
print(f"Example coefficients: {coefficient_of_variation[:5]}")
print()

# STEP 4: Identify suspicious students
print("STEP 4: Finding suspicious students")
print("Suspicious if: (Exam Mean > 85) AND (Practical Std > 20)")

# Separate Exam scores (columns 0-2) and Practical scores (columns 3-4)
exam_scores = filled_scores[:, 0:3]   # First 3 columns
practical_scores = filled_scores[:, 3:5]  # Last 2 columns

# Calculate mean of Exam scores for each student
exam_means = np.mean(exam_scores, axis=1)

# Calculate standard deviation of Practical scores for each student
practical_stds = np.std(practical_scores, axis=1)

print(f"Exam means range: {np.min(exam_means):.1f} to {np.max(exam_means):.1f}")
print(f"Practical stds range: {np.min(practical_stds):.1f} to {np.max(practical_stds):.1f}")

# Create boolean conditions for suspicious students
condition1 = exam_means > 85   # Students with high exam averages
condition2 = practical_stds > 20  # Students with inconsistent practicals

print(f"Students with Exam Mean > 85: {np.sum(condition1)}")
print(f"Students with Practical Std > 20: {np.sum(condition2)}")

# Find suspicious students using boolean indexing
suspicious_students = condition1 & condition2  # Both conditions must be True

print(f"Suspicious students found: {np.sum(suspicious_students)}")
print()

# STEP 5: Calculate percentage of suspicious students
print("STEP 5: Calculating percentage")

total_students = len(filled_scores)
suspicious_count = np.sum(suspicious_students)
percentage_suspicious = (suspicious_count / total_students) * 100

print(f"Total students: {total_students}")
print(f"Suspicious students: {suspicious_count}")
print(f"Percentage suspicious: {percentage_suspicious:.2f}%")

print(f"\nFINAL RESULT: {percentage_suspicious:.2f}% of students are suspicious")

STEP 1: Creating student data
1100 students: 3 Exam scores + 2 Practical scores
Created 1100 students, 5 scores
Missing values: 434

STEP 2: Replacing missing values with column averages
Column means: [69.57199211 69.6023622  69.41090555 69.94257426 70.73773774]
All missing values replaced with column averages

STEP 3: Calculating variation coefficient for each student
Variation coefficient = Standard Deviation / Mean
Shows how consistent a student's scores are
Calculated variation coefficients for all 1100 students
Example coefficients: [0.20700783 0.27385016 0.26126713 0.19614948 0.24888761]

STEP 4: Finding suspicious students
Suspicious if: (Exam Mean > 85) AND (Practical Std > 20)
Exam means range: 42.3 to 98.7
Practical stds range: 0.0 to 30.0
Students with Exam Mean > 85: 64
Students with Practical Std > 20: 102
Suspicious students found: 5

STEP 5: Calculating percentage
Total students: 1100
Suspicious students: 5
Percentage suspicious: 0.45%

FINAL RESULT: 0.45% of students ar

In [39]:
# Q5: Detecting Students With Unusual Performance Patterns
# You have a dataset of shape (1100, 5) representing students' scores:
# • Columns 0–1: Midterm scores
# • Columns 2–3: Final exam scores
# • Column 4: Project score
# Some values are missing and need to be cleaned.
# Task
# 1. Replace NaNs using column-wise median
# Use NumPy operations (no loops, no pandas) to fill missing values:
# NaN → column_median
# 2. Compute "Performance Stability Score" for each student
# For each row:
# stability_score = std(row) / max(row)
# (Use NumPy std and max across axis=1)
# 3. Mark students as flagged if they show unusual performance
# A student is flagged if:
# (Average Final Exam Score > 90) AND (Project Score < 50) AND
# (stability_score > 0.25)
# Where:
# • Average Final Exam Score = mean(columns 2 and 3)
# • Project Score = column 4
# Use boolean masking and vector operations only.
# 4. Return the percentage of flagged students
# Compute:
# percentage = (flagged_count / 1100) * 100
# Return the final numeric value.
# Constraints
# • Use NumPy-only operations
# • Use boolean indexing
# • No explicit Python loops
# • No pandas or sklearn




import numpy as np

# STEP 1: Create sample student data
print("STEP 1: Creating student score data")
print("1100 students with:")
print("- Columns 0-1: Midterm exams")
print("- Columns 2-3: Final exams")
print("- Column 4: Project score")

# Create random scores
student_scores = np.random.randint(30, 101, size=(1100, 5)).astype(float)

# Add some missing values (NaN)
missing_mask = np.random.random(size=(1100, 5)) < 0.07  # 7% missing
student_scores[missing_mask] = np.nan

print(f"Created {student_scores.shape[0]} students, {student_scores.shape[1]} scores")
print(f"Missing values: {np.sum(np.isnan(student_scores))}")
print()

# STEP 2: Replace NaN with column medians
print("STEP 2: Replacing missing values with column medians")

# Make a copy to work with
filled_scores = student_scores.copy()

# Calculate median for each column (subject)
# Median = middle value when sorted
column_medians = np.nanmedian(filled_scores, axis=0)

print(f"Column medians: {column_medians}")

# Find where values are missing
missing_positions = np.isnan(filled_scores)

# Replace missing values with column medians
# This is vectorized - no loops!
filled_scores[missing_positions] = np.take(column_medians, np.where(missing_positions)[1])

print("All missing values replaced with column medians")
print()

# STEP 3: Calculate Performance Stability Score
print("STEP 3: Calculating Performance Stability Score")
print("Stability Score = Standard Deviation / Maximum Score")
print("Lower score = more consistent performance")

# Calculate standard deviation for each student (across all 5 scores)
student_std = np.std(filled_scores, axis=1)

# Calculate maximum score for each student (highest of their 5 scores)
student_max = np.max(filled_scores, axis=1)

# Calculate stability score: std / max
stability_scores = student_std / student_max

print(f"Calculated stability scores for all 1100 students")
print(f"Stability scores range: {np.min(stability_scores):.3f} to {np.max(stability_scores):.3f}")
print()

# STEP 4: Identify flagged students using boolean conditions
print("STEP 4: Finding flagged students with unusual patterns")
print("Flagged if ALL conditions are True:")
print("1. Final Exam Average > 90")
print("2. Project Score < 50")
print("3. Stability Score > 0.25")

# Get Final Exam scores (columns 2-3)
final_exam_scores = filled_scores[:, 2:4]

# Calculate average of Final Exam scores for each student
final_exam_means = np.mean(final_exam_scores, axis=1)

# Get Project scores (column 4)
project_scores = filled_scores[:, 4]

print(f"Final exam averages range: {np.min(final_exam_means):.1f} to {np.max(final_exam_means):.1f}")
print(f"Project scores range: {np.min(project_scores):.1f} to {np.max(project_scores):.1f}")
print(f"Stability scores range: {np.min(stability_scores):.3f} to {np.max(stability_scores):.3f}")

# Create boolean conditions for flagged students
condition1 = final_exam_means > 90    # High final exam performance
condition2 = project_scores < 50      # Low project score
condition3 = stability_scores > 0.25  # High variability

print(f"Students with Final Exam > 90: {np.sum(condition1)}")
print(f"Students with Project < 50: {np.sum(condition2)}")
print(f"Students with Stability > 0.25: {np.sum(condition3)}")

# Combine all conditions using AND (&)
# All three must be True for a student to be flagged
flagged_students = condition1 & condition2 & condition3

print(f"Flagged students found: {np.sum(flagged_students)}")
print()

# STEP 5: Calculate percentage of flagged students
print("STEP 5: Calculating percentage of flagged students")

total_students = len(filled_scores)
flagged_count = np.sum(flagged_students)
percentage_flagged = (flagged_count / total_students) * 100

print(f"Total students: {total_students}")
print(f"Flagged students: {flagged_count}")
print(f"Percentage flagged: {percentage_flagged:.2f}%")

print(f"\nFINAL RESULT: {percentage_flagged:.2f}%")

STEP 1: Creating student score data
1100 students with:
- Columns 0-1: Midterm exams
- Columns 2-3: Final exams
- Column 4: Project score
Created 1100 students, 5 scores
Missing values: 366

STEP 2: Replacing missing values with column medians
Column medians: [66. 65. 65. 66. 66.]
All missing values replaced with column medians

STEP 3: Calculating Performance Stability Score
Stability Score = Standard Deviation / Maximum Score
Lower score = more consistent performance
Calculated stability scores for all 1100 students
Stability scores range: 0.024 to 0.312

STEP 4: Finding flagged students with unusual patterns
Flagged if ALL conditions are True:
1. Final Exam Average > 90
2. Project Score < 50
3. Stability Score > 0.25
Final exam averages range: 32.0 to 99.0
Project scores range: 30.0 to 100.0
Stability scores range: 0.024 to 0.312
Students with Final Exam > 90: 43
Students with Project < 50: 255
Students with Stability > 0.25: 70
Flagged students found: 4

STEP 5: Calculating percent