In [9]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
#create a sample data frame with duplications
data={
    "name":["john","jane","john","bob","jane"],
    "age":[25,30,25,35,30],
    "city":["new york","los angeles","new york","chicago","los angeles"]
}

df=pd.DataFrame(data)
print("Original dataframe: ")
print(df)

#identify duplicate rows
duplicates=df[df.duplicated()]
print("\nDuplicate rows: ")
print(duplicates)

#remove duplicates, keeping the first occurrence
df_cleaned=df.drop_duplicates()
print("\nDataFrame after removing duplicates: ")
print(df_cleaned)

Original dataframe: 
   name  age         city
0  john   25     new york
1  jane   30  los angeles
2  john   25     new york
3   bob   35      chicago
4  jane   30  los angeles

Duplicate rows: 
   name  age         city
2  john   25     new york
4  jane   30  los angeles

DataFrame after removing duplicates: 
   name  age         city
0  john   25     new york
1  jane   30  los angeles
3   bob   35      chicago


In [7]:
import pandas as pd

# Sample DataFrame with missing values for Helmet Detection use case
data = {
    "Age": [25, None, 30, 35, None, 4.5],
    "Height": [5.5, 6.1, 5.9, None, 5.8, 5.2],
    "HelmetStatus": [1, None, 1, 0, 1, 1],
    "Gender": ['M', 'F', 'M', None, 'F', 'M'],
    "Experience": [2, 4, 6, 1, None, None]
}

df = pd.DataFrame(data)

print("Original DataFrame with Missing Values:")
print(df)

df_filled_mean = df.copy()
df_filled_median = df.copy()
df_filled_mode = df.copy()

df_filled_mean['Age'].fillna(df['Age'].mean(), inplace=True)
df_filled_mean['Height'].fillna(df['Height'].mean(), inplace=True)
df_filled_mean['Experience'].fillna(df['Experience'].mean(), inplace=True)

df_filled_median['Age'].fillna(df['Age'].median(), inplace=True)
df_filled_median['Height'].fillna(df['Height'].median(), inplace=True)
df_filled_median['Experience'].fillna(df['Experience'].median(), inplace=True)

df_filled_mode['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
df_filled_mode['HelmetStatus'].fillna(df['HelmetStatus'].mode()[0], inplace=True)

print("\nDataFrame after Mean Imputation:")
print(df_filled_mean)

print("\nDataFrame after Median Imputation:")
print(df_filled_median)

print("\nDataFrame after Mode Imputation:")
print(df_filled_mode)

Original DataFrame with Missing Values:
    Age  Height  HelmetStatus Gender  Experience
0  25.0     5.5           1.0      M         2.0
1   NaN     6.1           NaN      F         4.0
2  30.0     5.9           1.0      M         6.0
3  35.0     NaN           0.0   None         1.0
4   NaN     5.8           1.0      F         NaN
5   4.5     5.2           1.0      M         NaN

DataFrame after Mean Imputation:
      Age  Height  HelmetStatus Gender  Experience
0  25.000     5.5           1.0      M        2.00
1  23.625     6.1           NaN      F        4.00
2  30.000     5.9           1.0      M        6.00
3  35.000     5.7           0.0   None        1.00
4  23.625     5.8           1.0      F        3.25
5   4.500     5.2           1.0      M        3.25

DataFrame after Median Imputation:
    Age  Height  HelmetStatus Gender  Experience
0  25.0     5.5           1.0      M         2.0
1  27.5     6.1           NaN      F         4.0
2  30.0     5.9           1.0      M       

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled_mean['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled_mean['Height'].fillna(df['Height'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obj