In [1]:
import faker
import pandas as pd
import faker_commerce
import random
from datetime import date, timedelta
fake = faker.Faker()
fake.add_provider(faker_commerce.Provider)

pd.set_option('display.max_colwidth', None)

## Create fake companies

In [None]:
## Utilities

In [2]:
def get_unique_fakes(provider, num_records):
    # To ensure we get enough records we will initially generate 2X the number needed 
    records = [provider() for x in range(num_records * 2)]
    unique_records = list(set(records))
    if len(unique_records) < num_records:
        raise Exception('Not enough unique records. Try generating fewer records.')
    return unique_records[0:num_records]


## Create fake companies

In [3]:
num_companies = 10_000
company_name_provider = lambda : fake.company() + ' ' + fake.company_suffix()  # Adding suffix give more uniqe names
company_names = get_unique_fakes(company_name_provider, num_companies)
company_slogans = get_unique_fakes(fake.catch_phrase, num_companies)
company_purposes = get_unique_fakes(fake.bs, num_companies)

companies = zip(company_names, company_slogans, company_purposes)
company_records = [{'name': company[0], 'slogan': company[1], 'purpose': company[2]} for company in companies]
companies_df = pd.DataFrame(company_records)
companies_df.index.rename('id', inplace=True)
companies_df.head()
companies_df.to_csv('../seeds/sources/fake_companies.csv')


## Create dates 

In [5]:
start_date = date(2023, 1, 1)
end_date = date(2029, 12, 31)
num_added_days = (end_date - start_date).days + 1
dates = [start_date + timedelta(days=day) for day in range(num_added_days)]
dates_df = pd.DataFrame({'date': dates})
dates_df.head()
dates_df.to_csv('../seeds/sources/fake_dates.csv', index=False)

## Create number range

In [11]:
numbers = []
for number in range(1, 101):
    numbers.extend([number] * number)
numbers_df = pd.DataFrame({'number': numbers})
numbers_df.head(10)

numbers_df.to_csv('../seeds/sources/fake_numbers.csv', index=False)

## Create fake products

In [6]:
def generate_product():
    return {
        'category': fake.ecommerce_category(),
        'name': fake.ecommerce_name(),
        'price': random.randrange(3, 2500) + random.randrange(0, 99) / 100
    }
            
products = [generate_product() for i in range(10_000)]
products_df = pd.DataFrame(products)
products_df.index.rename('id', inplace=True)
products_df
products_df.to_csv('../seeds/sources/fake_products.csv', index=True)


## Create fake people

In [18]:
def generate_people_info():
    country_code = random.choice(['+1', '+44', '+91', '+81', '+86'])  # Add more country codes as needed
    phone_number = fake.numerify('##########')  # Assuming 10-digit phone numbers
    gender = random.choices(['M', 'F', 'X'], weights=(49,49,1))
    if gender == 'M':
        first_name = fake.first_name_male()
    elif gender == 'F':
        first_name = fake.first_name_female()
    else:
        first_name = fake.first_name_nonbinary()

    last_name = fake.last_name()

    return {
        'first_name' : first_name,
        'last_name' : last_name,
        'gender' : gender,
        'phone_number': f'{country_code} - {phone_number}',
        'address': {
            'street_address': fake.street_address(),
            'city': fake.city(),
            'state': fake.state(),
            'zipcode': fake.zipcode(),
        },
        'birthdate': fake.date_of_birth(minimum_age=18, maximum_age=80).strftime('%Y-%m-%d'),
        'blood_type': random.choice(['A+', 'B+', 'AB+', 'O+', 'A-', 'B-', 'AB-', 'O-']),  # Less common column
        'favorite_color': fake.color_name(),  # Less common column
        'credit_score': random.randint(300, 850),  # Less common column
    }

p_info = [generate_people_info() for _ in range(10_000)]
p_info_df = pd.DataFrame(p_info)
p_info_df.index.rename('id', inplace=True)

# Display the DataFrame
print(p_info_df.head())
p_info_df.to_csv('../seeds/sources/fake_personal_info.csv')


     first_name  last_name gender      phone_number  \
id                                                    
0       Jeffery  Hernandez    [M]  +81 - 9519266153   
1     Christine     Carter    [M]   +1 - 3122887570   
2          Tina     Archer    [M]  +86 - 9703656592   
3        Robert      Baker    [F]  +44 - 8193626775   
4   Christopher       Reid    [M]   +1 - 3998496705   

                                                                                                             address  \
id                                                                                                                     
0            {'street_address': '95970 Shelton Park', 'city': 'West Caleb', 'state': 'Nebraska', 'zipcode': '27948'}   
1            {'street_address': '43716 Brian Glen', 'city': 'Juanborough', 'state': 'Tennessee', 'zipcode': '46164'}   
2                 {'street_address': '030 Perez Burg', 'city': 'Scottmouth', 'state': 'Wyoming', 'zipcode': '18661'}   
3   {'street_a