# Task 1

In [28]:
import numpy as np
import pandas as pd
from faker import Faker
import random

# Shared Faker generator; both random sources are seeded so the sample is reproducible.
fake = Faker()

random.seed(42)
Faker.seed(42)

# Small pools of fake values so that the same combinations repeat many times.
cities = [fake.city() for _ in range(3)]
jobs = [fake.job() for _ in range(5)]
companies = [fake.company() for _ in range(4)]

# Column name -> pool of allowed values. The dict order fixes both the column
# order of the frame and the order in which the random stream is consumed.
column_pools = {
    'city': cities,
    'job': jobs,
    'company': companies,
    'department': ['Sales', 'HR', 'IT', 'Finance', 'Marketing'],
}

df = pd.DataFrame({
    column: [random.choice(pool) for _ in range(10000)]
    for column, pool in column_pools.items()
})

# Number of occurrences of every distinct (city, job, company, department) row.
combo_counts = df.value_counts(sort=False)

# Keep only the combinations that occur at least 10 times.
is_popular = combo_counts >= 10
popular = combo_counts[is_popular].reset_index(name='count')

print(popular)

                 city                      job      company department  count
0           East Jill  Chief Financial Officer  Hoffman Ltd    Finance     31
1           East Jill  Chief Financial Officer  Hoffman Ltd         HR     26
2           East Jill  Chief Financial Officer  Hoffman Ltd         IT     34
3           East Jill  Chief Financial Officer  Hoffman Ltd  Marketing     33
4           East Jill  Chief Financial Officer  Hoffman Ltd      Sales     40
..                ...                      ...          ...        ...    ...
295  North Judithbury             Town planner    Wolfe LLC    Finance     30
296  North Judithbury             Town planner    Wolfe LLC         HR     34
297  North Judithbury             Town planner    Wolfe LLC         IT     28
298  North Judithbury             Town planner    Wolfe LLC  Marketing     33
299  North Judithbury             Town planner    Wolfe LLC      Sales     38

[300 rows x 5 columns]


# Task 2

In [None]:
def maybe_missing(value, prob=0.5):
    """Return `value`, or `np.nan` with probability `prob`.

    Parameters
    ----------
    value : any
        The value passed through when it is not replaced.
    prob : float, default 0.5
        Probability of replacing `value` with `np.nan`.

    Returns
    -------
    `value` unchanged, or `np.nan`.
    """
    # random.random() is uniform on [0.0, 1.0), so `draw < prob` holds with
    # probability exactly `prob`: prob=0 never drops a value (even when the
    # draw is exactly 0.0) and prob=1 always drops it.
    return np.nan if random.random() < prob else value

n = 1000

# Column name -> zero-argument factory producing one raw value for that column.
# The dict order fixes the column order and the order of random draws.
field_makers = {
    'name': lambda: fake.name(),
    'email': lambda: fake.email(),
    'city': lambda: fake.city(),
    'age': lambda: random.randint(18, 65),
    'department': lambda: random.choice(['IT', 'Sales', 'HR', 'Finance']),
}

# Every generated value is passed through maybe_missing with its default probability.
df = pd.DataFrame({
    column: [maybe_missing(make_value()) for _ in range(n)]
    for column, make_value in field_makers.items()
})

# NaN count per column, largest first; reset_index puts the column names
# into a column called 'index'.
nan_totals = df.isna().sum(axis=0)
missing_values = (
    nan_totals
    .reset_index(name='missing_count')
    .sort_values(by='missing_count', ascending=False)
)

separator = '==' * 20

print(missing_values)
print(separator)

# The three columns with the most missing values.
top_3_missing = missing_values.head(3)
print(top_3_missing)

# Drop every row that has a NaN in any of those three columns.
index = top_3_missing['index'].tolist()
removed_nans = df.dropna(subset=index)
print(separator)
print(removed_nans)




        index  missing_count
1       email            524
4  department            520
2        city            511
3         age            507
0        name            504
        index  missing_count
1       email            524
4  department            520
2        city            511
                name                         email               city   age  \
57       David Chang            sreyes@example.com     East Elizabeth   NaN   
63               NaN     allenkristine@example.com           Laneland  62.0   
81               NaN  jonathanferguson@example.com           Johntown   NaN   
82   Charles Stevens     kristinmedina@example.com       New Lorraine   NaN   
87               NaN     huertamichael@example.com  South Michaeltown   NaN   
..               ...                           ...                ...   ...   
966              NaN           james88@example.org        West Sandra   NaN   
976      John Ibarra           zgeorge@example.com        East Alexis   NaN   

# Task 3

In [None]:
n = 100_000  # number of rows generated for each column of the DataFrame built below

def maybe_missing(value, prob=0.1):
    """Return `value`, or `np.nan` with probability `prob`.

    Parameters
    ----------
    value : any
        The value passed through when it is not replaced.
    prob : float, default 0.1
        Probability of replacing `value` with `np.nan`.

    Returns
    -------
    `value` unchanged, or `np.nan`.
    """
    # random.random() is uniform on [0.0, 1.0), so `draw < prob` holds with
    # probability exactly `prob`: prob=0 never drops a value (even when the
    # draw is exactly 0.0) and prob=1 always drops it.
    return np.nan if random.random() < prob else value

# Small fixed vocabularies so that categorical values repeat realistically.
departments = ['IT', 'Sales', 'HR', 'Finance', 'Marketing']
countries = ['USA', 'Germany', 'France', 'Japan', 'UK', 'Canada']


def _column(make_value, prob):
    """Build one column of `n` values, each passed through maybe_missing with `prob`."""
    return [maybe_missing(make_value(), prob) for _ in range(n)]


# The dict entries are evaluated top to bottom, which fixes both the column
# order and the order in which random values are drawn.
df = pd.DataFrame({
    'id': range(1, n + 1),                                               # sequential integer id, never missing
    'name': _column(lambda: fake.name(), 0.05),                          # string
    'email': _column(lambda: fake.email(), 0.05),                        # string
    'country': _column(lambda: random.choice(countries), 0.1),           # categorical-like
    'department': _column(lambda: random.choice(departments), 0.1),      # categorical-like
    'age': _column(lambda: random.randint(18, 65), 0.05),                # integer draws, NaN where missing
    'salary': _column(lambda: round(random.uniform(2000, 15000), 2), 0.05),  # float rounded to 2 decimals
    'is_remote': [random.choice([True, False]) for _ in range(n)],       # boolean, never missing
    'join_date': _column(
        lambda: fake.date_between(start_date='-5y', end_date='today'), 0.05
    ),                                                                   # date within the last five years
})

print(df.head())



In [52]:
def _most_frequent(col):
    """Return the most frequent non-null value of `col`, or NaN if it has none.

    `value_counts()` excludes NaN by default, so a column holding only missing
    values yields an empty count Series; calling `idxmax()` on that would raise
    `ValueError`, hence the explicit guard.
    """
    counts = col.value_counts()
    return counts.idxmax() if not counts.empty else np.nan


# One entry per column: the value that occurs most often in that column.
most_frequent_values_per_column = df.apply(_most_frequent)

print(most_frequent_values_per_column)

id                             1
name               Michael Smith
email         zsmith@example.net
country                      USA
department               Finance
age                         65.0
salary                   9371.56
is_remote                  False
join_date             2021-08-08
dtype: object


# Task 4

In [58]:
n = 10_000  # number of rows generated for each column of the DataFrame built below

def maybe_missing(value, prob=0.05):
    """Return `value`, or `np.nan` with probability `prob`.

    Parameters
    ----------
    value : any
        The value passed through when it is not replaced.
    prob : float, default 0.05
        Probability of replacing `value` with `np.nan`.

    Returns
    -------
    `value` unchanged, or `np.nan`.
    """
    # random.random() is uniform on [0.0, 1.0), so `draw < prob` holds with
    # probability exactly `prob`: prob=0 never drops a value (even when the
    # draw is exactly 0.0) and prob=1 always drops it.
    return np.nan if random.random() < prob else value

# Generate the three columns one after another (this order also fixes the
# order in which random values are drawn).
employee_ids = [random.randint(1, 50) for _ in range(n)]  # ids drawn from 1..50
work_dates = [
    maybe_missing(fake.date_between(start_date='-2y', end_date='today'))
    for _ in range(n)
]
hours_worked = [
    maybe_missing(round(random.uniform(6, 10), 1))
    for _ in range(n)
]

# Assemble the frame and order it by employee, then by date.
df_hours = pd.DataFrame({
    'Employee ID': employee_ids,
    'Date': work_dates,
    'Hours Worked': hours_worked,
}).sort_values(by=['Employee ID', 'Date'])

print(df_hours.head(100))


      Employee ID        Date  Hours Worked
4192            1  2023-11-03           6.7
8230            1  2023-11-28           9.0
2483            1  2023-12-06           NaN
7005            1  2023-12-09           6.8
1255            1  2023-12-12           NaN
...           ...         ...           ...
4872            1  2024-11-18           8.5
40              1  2024-11-22           9.4
9561            1  2024-11-25           9.0
8840            1  2024-11-27           7.8
910             1  2024-12-01           8.4

[100 rows x 3 columns]
