In [15]:
import pandas as pd
from faker import Faker
import numpy as np
import random

# Shared Faker instance; generate_person draws first and last names from it.
fake = Faker()

def generate_person(id, full_names):
    """Build one synthetic person record containing deliberately messy data.

    Args:
        id: Identifier stored unchanged under the ``"id"`` key.
        full_names: Set of (first name, last name) tuples already handed
            out. The pair picked here is added to it in place, so sharing
            one set across calls keeps full names unique.

    Returns:
        A dict with the keys ``id``, ``age``, ``name``, ``last_name``,
        ``sex``, ``married``, ``income``, ``childrens``, ``weight`` and
        ``height``. ``income``, ``childrens``, ``weight`` and ``height``
        can be ``None``.
    """
    # Keep drawing (first, last) pairs until one has not been used yet.
    candidate = (fake.first_name(), fake.last_name())
    while candidate in full_names:
        candidate = (fake.first_name(), fake.last_name())
    full_names.add(candidate)
    first, surname = candidate

    age = random.randint(18, 80)
    is_male = random.random() < 0.6  # deliberate 60/40 sex imbalance
    sex = "Male" if is_male else "Female"
    married = random.choice(["Yes", "No"])

    # Income: normally distributed around a mean that grows with age.
    # In 5% of records a second draw may keep it, null it, or make it negative.
    income = np.random.normal(25000 + age * 250, 10000)
    if random.random() < 0.05:
        income = random.choice([income, None, -abs(income)])

    # Children: Poisson count whose rate rises with age; none at age 20 or below.
    if age > 20:
        children = np.random.poisson(age / 25)
    else:
        children = 0
    # 10% of records get 5-10 extra children as an injected outlier.
    if random.random() < 0.1:
        children += random.randint(5, 10)

    # Height and weight: normal around sex-specific means.
    mean_height, mean_weight = (175, 80) if is_male else (160, 65)
    height = np.random.normal(mean_height, 10)
    weight = np.random.normal(mean_weight, 15)
    # 5% of records have the weight shifted by 20 in either direction.
    if random.random() < 0.05:
        weight += random.choice([-20, 20])

    # 5% of records lose height, weight, or both.
    if random.random() < 0.05:
        missing_options = [(None, weight), (height, None), (None, None)]
        height, weight = random.choice(missing_options)

    record = {
        "id": id,
        "age": age,
        "name": first,
        "last_name": surname,
        "sex": sex,
        "married": married,
        "income": income,
        # Counts above 10 are treated as extreme outliers and stored as null.
        "childrens": None if children > 10 else children,
        "weight": weight,
        "height": height,
    }
    return record

def generate_dataset(num_samples=1000, output_path="salary-dataset.csv"):
    """Generate synthetic person records and save them as a CSV file.

    Calls ``generate_person`` once for every id in ``1..num_samples``,
    passing the same set of used (first name, last name) pairs to each call,
    then writes the rows to ``output_path`` without the DataFrame index and
    prints a confirmation message.

    Args:
        num_samples: Number of records to generate; ids run from 1 to this
            value inclusive.
        output_path: Destination of the CSV file. Defaults to
            ``"salary-dataset.csv"``, so calls without this argument write
            to the same file as before.
    """
    full_names = set()
    data = [
        generate_person(person_id, full_names)
        for person_id in range(1, num_samples + 1)
    ]

    pd.DataFrame(data).to_csv(output_path, index=False)
    print(f"Dataset with {num_samples} samples saved as '{output_path}'.")

# Build the dataset with the default arguments; this writes the CSV file and prints a confirmation.
generate_dataset()


Dataset with 1000 samples saved as 'salary-dataset.csv'.


In [None]:
# from itertools import product
# import pandas as pd
# import numpy as np
# import random

# # Expanded lists of names for greater variety
# male_names = ['Liam', 'Noah', 'William', 'James', 'Oliver', 'Ethan', 'Jacob', 'Michael', 'Daniel', 'Henry']
# female_names = ['Emma', 'Olivia', 'Ava', 'Isabella', 'Sophia', 'Mia', 'Charlotte', 'Amelia', 'Evelyn', 'Abigail']
# last_names = ['Smith', 'Johnson', 'Williams', 'Brown', 'Jones', 'Garcia', 'Miller', 'Davis', 'Rodriguez', 'Martinez']

# # Pick a random (first name, last name) combination for the given sex that has not been used yet
# def generate_unique_name(last_used_names, sex):
#     if sex == 'male':
#         name_pool = set(product(male_names, last_names)) - last_used_names
#     else:
#         name_pool = set(product(female_names, last_names)) - last_used_names
    
#     if not name_pool:  # In case we run out of unique names
#         return None, None, last_used_names

#     name, last_name = random.choice(list(name_pool))
#     last_used_names.add((name, last_name))
#     return name, last_name, last_used_names
# def create_imbalanced_dataset_v2(n_samples=500, outlier_ratio=0.15, male_ratio=0.6):
#     data = []
#     used_ids = set()
#     last_used_names = set()
#     num_outliers = int(n_samples * outlier_ratio)
    
#     for _ in range(n_samples):
#         id = random.randint(100000000, 999999999)
#         while id in used_ids:
#             id = random.randint(100000000, 999999999)
#         used_ids.add(id)

#         age = random.randint(18, 90)
        
#         # Ensure gender imbalance
#         if len(data) < (n_samples * male_ratio):
#             sex = 'male'
#         else:
#             sex = 'female'
        
#         name, last_name, last_used_names = generate_unique_name(last_used_names, sex)

#         status = random.choice(['single', 'married', 'divorced', 'widowed'])
#         income = random.randint(0, 10000)
#         children = random.randint(0, 5)
#         weight = random.randint(50, 120)
#         height = random.randint(150, 200)
        
#         # Introduce bad data for a subset
#         if _ < num_outliers:
#             if random.choice([True, False]):
#                 income = random.choice([income, -income])  # Negative income
#             if random.choice([True, False]):
#                 weight = random.choice([None, weight])  # Null weight
#             if random.choice([True, False]):
#                 height = random.choice([None, height])  # Null height
#             if random.choice([True, False]):
#                 # Generate various unrealistic weight/height ratios
#                 weight = random.randint(10, 30)  # Unusually low weight
#                 height = random.randint(110, 145)  # Unusually low height for adults
            
#         data.append([id, age, name, last_name, sex, status, income, children, weight, height])
        
#     columns = ['id', 'age', 'name', 'last_name', 'sex', 'status', 'income', 'children', 'weight', 'height']
#     df = pd.DataFrame(data, columns=columns)
    
#     # Shuffle the dataset to mix good and bad data evenly
#     df = df.sample(frac=1).reset_index(drop=True)
    
#     return df
