In [1]:
import pandas as pd
import numpy as np
import random

# Seed both RNGs so the synthetic dataset is identical on every
# Restart & Run All.
np.random.seed(42)
random.seed(42)

# Generate a synthetic 100-row employee dataset.
# NOTE: the dict is evaluated top to bottom, so the order of the np.random
# calls below determines the generated values -- do not reorder them.
data = {
    'id': np.arange(1, 101),
    'name': [f'Name_{i}' for i in range(1, 101)],
    'age': np.random.randint(18, 66, size=100),            # ages 18..65 inclusive
    'salary': np.random.randint(30000, 120001, size=100),  # 30,000..120,000 inclusive
    'department': np.random.choice(['HR', 'IT', 'Sales', 'Marketing'], size=100)
}

# Cast the numeric columns to float *before* injecting NaN: integer columns
# cannot hold NaN, and relying on pandas to silently upcast on assignment is
# deprecated (it warns on pandas 2.x and is slated to become an error).
df = pd.DataFrame(data).astype({'age': 'float64', 'salary': 'float64'})

# Introduce missing values in 'age' and 'salary' columns.
# .loc slicing is label-based and end-inclusive, so each slice blanks 6 rows.
df.loc[5:10, 'age'] = np.nan
df.loc[20:25, 'salary'] = np.nan

# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Fill missing values with the mean of the column.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that form is chained assignment, which warns on pandas 2.x
# and does not modify `df` at all once Copy-on-Write is active.
df['age'] = df['age'].fillna(df['age'].mean())
df['salary'] = df['salary'].fillna(df['salary'].mean())

# Create a new column 'seniority_level' based on 'age'.
# pd.cut bins are right-inclusive by default: (17, 30], (30, 45], (45, 65].
df['seniority_level'] = pd.cut(
    df['age'],
    bins=[17, 30, 45, 65],
    labels=['Junior', 'Mid-level', 'Senior'],
)

# Create a new column 'annual_bonus' as 10% of 'salary'
df['annual_bonus'] = df['salary'] * 0.10

# Filter data to include only employees with a salary greater than $50,000
high_salary_df = df[df['salary'] > 50000]
print("Filtered DataFrame:\n", high_salary_df.head())

# Group data by department and calculate the average salary
grouped_df = df.groupby('department')['salary'].mean()
print("Grouped DataFrame:\n", grouped_df)

# Create a small DataFrame with department and head count
extra_data = pd.DataFrame({
    'department': ['HR', 'IT', 'Sales', 'Marketing'],
    'head_count': [25, 30, 20, 25]
})

# Merge the DataFrames on the 'department' column.
# validate='many_to_one' makes pandas raise if a department ever appears twice
# in the lookup table, instead of silently duplicating employee rows.
merged_df = pd.merge(df, extra_data, on='department', validate='many_to_one')
print("Merged DataFrame:\n", merged_df.head())


Missing values:
 id            0
name          0
age           6
salary        6
department    0
dtype: int64
Filtered DataFrame:
    id    name       age    salary department seniority_level  annual_bonus
0   1  Name_1  56.00000   97563.0         IT          Senior        9756.3
2   3  Name_3  32.00000   78190.0  Marketing       Mid-level        7819.0
4   5  Name_5  25.00000  117538.0      Sales          Junior       11753.8
5   6  Name_6  41.12766   69504.0         HR       Mid-level        6950.4
6   7  Name_7  41.12766   63159.0  Marketing       Mid-level        6315.9
Grouped DataFrame:
 department
HR           73883.681358
IT           64859.200000
Marketing    73038.384641
Sales        84587.453113
Name: salary, dtype: float64
Merged DataFrame:
    id     name       age   salary department seniority_level  annual_bonus  \
0   1   Name_1  56.00000  97563.0         IT          Senior        9756.3   
1   2   Name_2  46.00000  32695.0         IT          Senior        3269.5   
2 