In [3]:
import pandas as pd
from faker import Faker
import random
import numpy as np

# Initialize Faker
fake = Faker()

# Function to generate a single student's data
def generate_student_data(student_id):
    total_marks = random.randint(300, 500)
    student_data = {
        'id': student_id,
        'StudentName': fake.name() if random.random() > 0.05 else None,  # 5% chance of missing name
        'City': fake.city() if random.random() > 0.1 else None,  # 10% chance of missing city
        'TotalMarks': total_marks,
        'MathsMarks': random.randint(50, 100) if random.random() > 0.05 else None,  # 5% chance of missing maths marks
        'ScienceMarks': random.randint(50, 100) if random.random() > 0.05 else None,  # 5% chance of missing science marks
        'AI_Marks': random.randint(50, 100) if random.random() > 0.05 else None,  # 5% chance of missing AI marks
        'ML_Marks': random.randint(50, 100) if random.random() > 0.05 else None,  # 5% chance of missing ML marks
        'DL_Marks': random.randint(50, 100) if random.random() > 0.05 else None,  # 5% chance of missing DL marks
        'Percentage': None  # We'll calculate this later
    }
    return student_data

# Generate data for 200 students
students_data = [generate_student_data(student_id) for student_id in range(1, 201)]

# Create a DataFrame
df = pd.DataFrame(students_data)

# Calculate the Percentage column
def calculate_percentage(row):
    marks = [row['MathsMarks'], row['ScienceMarks'], row['AI_Marks'], row['ML_Marks'], row['DL_Marks']]
    valid_marks = [mark for mark in marks if mark is not None]
    if valid_marks:
        total = sum(valid_marks)
        return round((total / (len(valid_marks) * 100)) * 100, 2)
    else:
        return None

df['Percentage'] = df.apply(calculate_percentage, axis=1)

# Introduce some missing values in Percentage
for _ in range(10):  # Add 10 missing values
    df.loc[random.randint(0, 199), 'Percentage'] = None

# Save to CSV file
df.to_csv('students_data_with_missing_values.csv', index=False)

# Display the first few rows
print(df.head())


   id       StudentName              City  TotalMarks  MathsMarks  \
0   1      Bobby Jacobs  East Johnborough         396        84.0   
1   2      Jon Gonzalez        Ernestside         332         NaN   
2   3  Mr. Daniel Hicks       Powellmouth         392        81.0   
3   4        Derek Kent          Hesstown         448        74.0   
4   5      Megan Mclean         New Cathy         396        74.0   

   ScienceMarks  AI_Marks  ML_Marks  DL_Marks  Percentage  
0          98.0      60.0      64.0      93.0        79.8  
1           NaN      97.0      52.0       NaN         NaN  
2          89.0      66.0      56.0      57.0         NaN  
3          73.0      52.0      65.0      70.0        66.8  
4          78.0      66.0      62.0      55.0        67.0  


In [4]:
df.head()

Unnamed: 0,id,StudentName,City,TotalMarks,MathsMarks,ScienceMarks,AI_Marks,ML_Marks,DL_Marks,Percentage
0,1,Bobby Jacobs,East Johnborough,396,84.0,98.0,60.0,64.0,93.0,79.8
1,2,Jon Gonzalez,Ernestside,332,,,97.0,52.0,,
2,3,Mr. Daniel Hicks,Powellmouth,392,81.0,89.0,66.0,56.0,57.0,
3,4,Derek Kent,Hesstown,448,74.0,73.0,52.0,65.0,70.0,66.8
4,5,Megan Mclean,New Cathy,396,74.0,78.0,66.0,62.0,55.0,67.0


In [6]:
df.isnull().sum()

id               0
StudentName     13
City            19
TotalMarks       0
MathsMarks      12
ScienceMarks    10
AI_Marks        13
ML_Marks        12
DL_Marks        12
Percentage      62
dtype: int64

In [11]:
df["StudentName"].isna().value_counts()

StudentName
False    187
True      13
Name: count, dtype: int64

In [13]:
print(df["StudentName"].to_string())

0              Bobby Jacobs
1              Jon Gonzalez
2          Mr. Daniel Hicks
3                Derek Kent
4              Megan Mclean
5             Samuel Powell
6             Timothy Brown
7         Danielle Robinson
8          Stephanie Wilson
9            Caitlyn Brewer
10         Stephanie Dorsey
11          Michelle Burton
12              Gary Miller
13                Eric Mack
14              Paula Jones
15       Francisco Espinoza
16         Jennifer Mcclure
17           Selena Perkins
18                     None
19             Colin Duncan
20          Courtney Powell
21          Christian Banks
22            Joseph Walker
23            Kimberly Cruz
24                     None
25        Melinda Rodriguez
26             David Nelson
27              David Walsh
28         Kimberly Johnson
29            Michael Davis
30             Joshua Scott
31             Robert Moore
32      Christopher Hoffman
33          Michael Sweeney
34            Kyle Benjamin
35           Austin 

In [14]:
df["StudentName"]=df["StudentName"].replace(np.nan,"Not Registered")

In [18]:
print(df["StudentName"].to_string())

0              Bobby Jacobs
1              Jon Gonzalez
2          Mr. Daniel Hicks
3                Derek Kent
4              Megan Mclean
5             Samuel Powell
6             Timothy Brown
7         Danielle Robinson
8          Stephanie Wilson
9            Caitlyn Brewer
10         Stephanie Dorsey
11          Michelle Burton
12              Gary Miller
13                Eric Mack
14              Paula Jones
15       Francisco Espinoza
16         Jennifer Mcclure
17           Selena Perkins
18           Not Registered
19             Colin Duncan
20          Courtney Powell
21          Christian Banks
22            Joseph Walker
23            Kimberly Cruz
24           Not Registered
25        Melinda Rodriguez
26             David Nelson
27              David Walsh
28         Kimberly Johnson
29            Michael Davis
30             Joshua Scott
31             Robert Moore
32      Christopher Hoffman
33          Michael Sweeney
34            Kyle Benjamin
35           Austin 

In [19]:
df.columns

Index(['id', 'StudentName', 'City', 'TotalMarks', 'MathsMarks', 'ScienceMarks',
       'AI_Marks', 'ML_Marks', 'DL_Marks', 'Percentage'],
      dtype='object')

In [20]:
df.isna().sum()

id               0
StudentName      0
City            19
TotalMarks       0
MathsMarks      12
ScienceMarks    10
AI_Marks        13
ML_Marks        12
DL_Marks        12
Percentage      62
dtype: int64

In [21]:
print(df["City"].to_string())

0          East Johnborough
1                Ernestside
2               Powellmouth
3                  Hesstown
4                 New Cathy
5                      None
6                      None
7               East Cassie
8               Josephburgh
9                 Hayshaven
10           North Benjamin
11                 Langfurt
12           West Chrisview
13               Davidville
14              Michaelfort
15              North Susan
16                Calebberg
17              Tristanbury
18              Kristinland
19             Lake Tiffany
20              Jessicabury
21                 Johnside
22                     None
23         New Jonathanstad
24      North Daniellehaven
25          Port Cherylberg
26               Bradleyton
27           Matthewchester
28             North Thomas
29         East Timothybury
30         West Theresatown
31                     None
32              Angelamouth
33               New Ashley
34             West Michael
35               Wal