In [2]:
import pandas as pd

# Step 1: Load Data
# Small in-memory sample; the None entries are the missing values handled below.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [25, 30, None, 35, 40],
    'Gender': ['Female', 'Male', 'Male', 'Male', None],
}
df = pd.DataFrame(data)

# Step 2: Handling Missing Values
# Show a boolean mask marking every missing cell.
print("Missing Values in the DataFrame:")
print(df.isna())

# Fill both columns in one pass: 'Age' gets the column mean, 'Gender' a
# placeholder label. 'Name' is not filled here.
mean_age = df['Age'].mean()
df = df.assign(
    Age=df['Age'].fillna(mean_age),
    Gender=df['Gender'].fillna('Unknown'),
)

print("\nData after Handling Missing Values:")
print(df)

# Step 3: Handling Duplicates
# Boolean Series: True for every row that repeats an earlier row in full.
dup_mask = df.duplicated()
print("\nDuplicate Rows:")
print(dup_mask)

# Keep only the rows that were not flagged as duplicates.
df_cleaned = df.loc[~dup_mask]
print("\nData after Removing Duplicates:")
print(df_cleaned)

# Step 4: Combined Practice on a New Dataset
# New sample data: includes a repeated 'John' row and one row of missing values.
new_data = {
    'Name': ['John', 'Jane', 'Alice', 'Bob', 'John', None],
    'Age': [28, 32, 25, 29, 28, None],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Male', None]
}

df_new = pd.DataFrame(new_data)
print("\nOriginal DataFrame (New Data):")
print(df_new)

# Fill missing values
# Take the median over one row per 'Name' so the repeated 'John' row is not
# counted twice; with the duplicate included the fill value would be 28.0
# instead of 28.5.
age_median = df_new.drop_duplicates(subset=['Name'])['Age'].median()
df_new['Age'] = df_new['Age'].fillna(age_median)  # Filling 'Age' with the median value
df_new['Gender'] = df_new['Gender'].fillna('Unknown')  # Filling 'Gender' with 'Unknown'

print("\nAfter Filling Missing Values (New Data):")
print(df_new)

# Remove duplicates
# Rows are compared on 'Name' only; the first occurrence of each name is kept.
df_new_cleaned = df_new.drop_duplicates(subset=['Name'])
print("\nAfter Removing Duplicates (New Data):")
print(df_new_cleaned)

Missing Values in the DataFrame:
    Name    Age  Gender
0  False  False   False
1  False  False   False
2  False   True   False
3  False  False   False
4   True  False    True

Data after Handling Missing Values:
      Name   Age   Gender
0    Alice  25.0   Female
1      Bob  30.0     Male
2  Charlie  32.5     Male
3    David  35.0     Male
4     None  40.0  Unknown

Duplicate Rows:
0    False
1    False
2    False
3    False
4    False
dtype: bool

Data after Removing Duplicates:
      Name   Age   Gender
0    Alice  25.0   Female
1      Bob  30.0     Male
2  Charlie  32.5     Male
3    David  35.0     Male
4     None  40.0  Unknown

Original DataFrame (New Data):
    Name   Age  Gender
0   John  28.0    Male
1   Jane  32.0  Female
2  Alice  25.0  Female
3    Bob  29.0    Male
4   John  28.0    Male
5   None   NaN    None

After Filling Missing Values (New Data):
    Name   Age   Gender
0   John  28.0     Male
1   Jane  32.0   Female
2  Alice  25.0   Female
3    Bob  29.0     Male
4 

In [3]:
# Part 2: Apply Standardization & Formatting Rules
#
# Step-by-step guidelines:
# 1. Standardize text data
#     1. Convert all names to lowercase.
# 2. Format numerical data
#     1. Round the Age column to the nearest integer.
# 3. Combined practice on another dataset
#     1. Create new sample data.
#     2. Standardize product names.
#     3. Format prices to two decimal places.
        
import pandas as pd

# Step 1: Standardize Text Data
# Sample data with inconsistent capitalisation in 'Name'.
data = {
    'Name': ['Alice', 'Bob', 'CHARLIE', 'David', 'ALICE'],
    'Age': [25, 30, 22, 35, 30],
}
df = pd.DataFrame(data)

# Lowercase every name so differently-cased spellings compare equal.
lowered_names = df['Name'].str.lower()
df = df.assign(Name=lowered_names)

print("After Standardizing Names to Lowercase:")
print(df)

# Step 2: Format Numerical Data
# Round 'Age' to the nearest integer (the sample ages are already whole
# numbers, so the values are unchanged here).
rounded_ages = df['Age'].round()
df = df.assign(Age=rounded_ages)

print("\nAfter Rounding 'Age' Column to Nearest Integer:")
print(df)

# Step 3: Combined Practice on Another Dataset
# New sample dataset with product names and prices
new_data = {
    'Product Name': ['apple', 'Banana', 'Orange', 'Grape', 'Pineapple'],
    'Price': [1.236, 0.998, 1.541, 2.345, 3.11]
}

df_new = pd.DataFrame(new_data)
print("\nOriginal New Dataset:")
print(df_new)

# Standardize product names to title case
df_new['Product Name'] = df_new['Product Name'].str.title()

print("\nAfter Standardizing Product Names to Title Case:")
print(df_new)

# Format prices to two decimal places
# Use the vectorized Series.round instead of a per-element Python lambda.
df_new['Price'] = df_new['Price'].round(2)

print("\nAfter Formatting Prices to Two Decimal Places:")
print(df_new)
        
        
        

After Standardizing Names to Lowercase:
      Name  Age
0    alice   25
1      bob   30
2  charlie   22
3    david   35
4    alice   30

After Rounding 'Age' Column to Nearest Integer:
      Name  Age
0    alice   25
1      bob   30
2  charlie   22
3    david   35
4    alice   30

Original New Dataset:
  Product Name  Price
0        apple  1.236
1       Banana  0.998
2       Orange  1.541
3        Grape  2.345
4    Pineapple  3.110

After Standardizing Product Names to Title Case:
  Product Name  Price
0        Apple  1.236
1       Banana  0.998
2       Orange  1.541
3        Grape  2.345
4    Pineapple  3.110

After Formatting Prices to Two Decimal Places:
  Product Name  Price
0        Apple   1.24
1       Banana   1.00
2       Orange   1.54
3        Grape   2.35
4    Pineapple   3.11
