# Setting up

In [3]:
# Importing packages
import pandas as pd
import numpy as np
import random
from faker import Faker
import geonamescache
import random

# Generating Countries

In [4]:
# Initialize GeonamesCache
gc = geonamescache.GeonamesCache()

# Define the countries with their ISO country codes
countries_iso = {'US': 'United States', 'GB': 'United Kingdom', 'AU': 'Australia', 
                 'NZ': 'New Zealand', 'CA': 'Canada'}

In [5]:
# Function to get cities from GeonamesCache by country ISO code
def get_cities_by_country(country_iso):
    # Get all cities
    cities = gc.get_cities()
    # Filter cities by country ISO code
    country_cities = {geoid: city for geoid, city in cities.items() if city['countrycode'] == country_iso}
    return list(country_cities.values())

# Create a list of all cities for the specified countries
all_cities = []
for iso, country_name in countries_iso.items():
    # Get the ISO code for the country from geonamescache
    country_iso = gc.get_countries_by_names()[country_name]['iso']
    all_cities.extend(get_cities_by_country(country_iso))


In [17]:
# Simulate the city and country for each individual
gender_list = ["male", "female"]
individuals = [{'country': random.choice(list(countries_iso.values())),
                'city': random.choice(all_cities)['name'],
                'gender': random.choice(gender_list)} for _ in range(500)]

# Check the first few entries
individuals[:5]

# Create a DataFrame with the simulated data
individuals_df = pd.DataFrame(individuals)

individuals_df.head(10)

Unnamed: 0,country,city,gender
0,United States,Tujunga,male
1,Australia,Blue Springs,female
2,Australia,Bridgwater,male
3,New Zealand,Harrogate,female
4,United Kingdom,Campsie,male
5,Canada,Johns Creek,female
6,Canada,Weymouth,female
7,United States,Oshawa,male
8,Australia,Estero,male
9,United States,Yellowknife,female


# Generating names and gender

In [18]:
def choose_faker(country, gender):
    if country == "United States":
        faker = Faker('en_US')
    elif country == "United Kingdom":
        faker = Faker('en_GB')
    elif country == "Australia":
        faker = Faker('en_AU')
    elif country == "New Zealand":
        faker = Faker('en_NZ')
    elif country == "Canada":
        faker = Faker('en_CA')
    else:
        print("some problems here lol")
        return None

    if gender == "male":
        return faker.name_male()
    elif gender == "female":
        return faker.name_female()
    else:
        return faker.name()

In [19]:
name_list =[]
for index in range(len(individuals_df)):
    row = individuals_df.iloc[index]
    country_name = row["country"]
    gender= row["gender"]
    name_list.append(choose_faker(country_name, gender))

In [20]:
individuals_df["name"] = name_list
individuals_df.head()

Unnamed: 0,country,city,gender,name
0,United States,Tujunga,male,Ernest Andrade
1,Australia,Blue Springs,female,Elizabeth Payne
2,Australia,Bridgwater,male,Justin Powell
3,New Zealand,Harrogate,female,Jason Martin
4,United Kingdom,Campsie,male,Raymond Campbell


# Generating Heights/BMI based on name/gender

In [21]:
mean_heights = {
    'United States' : {'male': 177, 'female': 163},
    'United Kingdom' : {'male': 178, 'female': 164},
    'Canada' : {'male': 178, 'female': 165},
    'Australia' : {'male': 179, 'female': 165},
    'New Zealand' : {'male': 178, 'female': 165},
}

st_dev_height = 6

In [25]:
# Function to generate height based on country and gender
def generate_height(country, gender):
    mean_height = mean_heights[country][gender]
    # Sample from a normal distribution around the mean height
    return np.random.normal(mean_height, st_dev_height)

In [34]:
# Generate heights for each individual in the DataFrame
individuals_df['height'] = individuals_df.apply(lambda x: generate_height(x['country'], x['gender']), axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height
0,United States,Tujunga,male,Ernest Andrade,180.358919
1,Australia,Blue Springs,female,Elizabeth Payne,164.491152
2,Australia,Bridgwater,male,Justin Powell,181.675956
3,New Zealand,Harrogate,female,Jason Martin,177.288837
4,United Kingdom,Campsie,male,Raymond Campbell,172.734311
5,Canada,Johns Creek,female,Cynthia Haynes,168.607771
6,Canada,Weymouth,female,Caitlin Simpson,162.282533
7,United States,Oshawa,male,Lawrence Raymond,174.023614
8,Australia,Estero,male,Christian Vasquez,178.076057
9,United States,Yellowknife,female,Christine Richardson,164.082652


In [39]:
# Function to generate BMI based on height and gender
def generate_bmi(height, gender):
    if gender == 'male':
        bmi_range = (18.5, 25)  # Typical healthy BMI range for males
    else:
        bmi_range = (18, 24)  # Typical healthy BMI range for females

        # Randomly choose a BMI value within the range
    bmi = np.random.uniform(*bmi_range)
    return round(bmi, 1)  # Return the BMI value rounded to one decimal place

In [40]:
# Generate heights for each individual in the DataFrame
individuals_df['bmi'] = individuals_df.apply(lambda x: generate_bmi(x['country'], x['gender']), axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height,bmi
0,United States,Tujunga,male,Ernest Andrade,180.358919,21.6
1,Australia,Blue Springs,female,Elizabeth Payne,164.491152,18.3
2,Australia,Bridgwater,male,Justin Powell,181.675956,24.0
3,New Zealand,Harrogate,female,Jason Martin,177.288837,20.9
4,United Kingdom,Campsie,male,Raymond Campbell,172.734311,23.9
5,Canada,Johns Creek,female,Cynthia Haynes,168.607771,23.6
6,Canada,Weymouth,female,Caitlin Simpson,162.282533,20.2
7,United States,Oshawa,male,Lawrence Raymond,174.023614,22.6
8,Australia,Estero,male,Christian Vasquez,178.076057,20.3
9,United States,Yellowknife,female,Christine Richardson,164.082652,21.0


# Building the table

# FINAL DATASET

In [None]:
Data = {
    'Name': ,
    'Sample_ID': ,
    'Age': ,
    'Gender': ,
    'BMI': ,
    'Height': ,
    'Country': ,
    'City': ,
    'Education level': ,

}