# Setting up

In [1]:
# Importing packages
import pandas as pd
import numpy as np
import random
from faker import Faker
import geonamescache
import random
import pandas as pd


# Generating Countries

In [2]:
# Initialize GeonamesCache
gc = geonamescache.GeonamesCache()

# Define the countries with their ISO country codes
countries_iso = {'US': 'United States', 'GB': 'United Kingdom', 'AU': 'Australia', 
                 'NZ': 'New Zealand', 'CA': 'Canada'}

In [3]:
# Function to get cities from GeonamesCache by country ISO code
def get_cities_by_country(country_iso):
    # Get all cities
    cities = gc.get_cities()
    # Filter cities by country ISO code
    country_cities = {geoid: city for geoid, city in cities.items() if city['countrycode'] == country_iso}
    return list(country_cities.values())

# Create a list of all cities for the specified countries
all_cities = []
for iso, country_name in countries_iso.items():
    # Get the ISO code for the country from geonamescache
    country_iso = gc.get_countries_by_names()[country_name]['iso']
    all_cities.extend(get_cities_by_country(country_iso))


In [5]:
# Simulate the city and country for each individual
gender_list = ["male", "female"]
individuals = [{'country': random.choice(list(countries_iso.values())),
                'city': random.choice(all_cities)['name'],
                'gender': random.choice(gender_list)} for _ in range(500)]

# Check the first few entries
individuals[:5]

# Create a DataFrame with the simulated data
individuals_df = pd.DataFrame(individuals)

individuals_df.head(10)

Unnamed: 0,country,city,gender
0,New Zealand,Holiday,female
1,United Kingdom,Walnut Creek,male
2,Australia,Mount Juliet,female
3,Canada,Palmers Green,female
4,Canada,Northport,female
5,United Kingdom,Hazelwood,male
6,New Zealand,South San Francisco,male
7,United Kingdom,Cordova,male
8,Canada,Ascot Vale,male
9,New Zealand,Oregon,male


# Generating names and gender

In [6]:
def choose_faker(country, gender):
    if country == "United States":
        faker = Faker('en_US')
    elif country == "United Kingdom":
        faker = Faker('en_GB')
    elif country == "Australia":
        faker = Faker('en_AU')
    elif country == "New Zealand":
        faker = Faker('en_NZ')
    elif country == "Canada":
        faker = Faker('en_CA')
    else:
        print("some problems here lol")
        return None

    if gender == "male":
        return faker.name_male()
    elif gender == "female":
        return faker.name_female()
    else:
        return faker.name()

In [7]:
name_list =[]
for index in range(len(individuals_df)):
    row = individuals_df.iloc[index]
    country_name = row["country"]
    gender= row["gender"]
    name_list.append(choose_faker(country_name, gender))

In [8]:
individuals_df["name"] = name_list
individuals_df.head()

Unnamed: 0,country,city,gender,name
0,New Zealand,Holiday,female,Ariana Drake
1,United Kingdom,Walnut Creek,male,Steven Hyde
2,Australia,Mount Juliet,female,Erica Powell
3,Canada,Palmers Green,female,Lori Montoya
4,Canada,Northport,female,Erin Fisher


# Generating Heights/BMI based on name/gender

In [9]:
mean_heights = {
    'United States' : {'male': 177, 'female': 163},
    'United Kingdom' : {'male': 178, 'female': 164},
    'Canada' : {'male': 178, 'female': 165},
    'Australia' : {'male': 179, 'female': 165},
    'New Zealand' : {'male': 178, 'female': 165},
}

st_dev_height = 6

In [10]:
# Function to generate height based on country and gender
def generate_height(country, gender):
    mean_height = mean_heights[country][gender]
    # Sample from a normal distribution around the mean height
    return np.random.normal(mean_height, st_dev_height)

In [11]:
# Generate heights for each individual in the DataFrame
individuals_df['height'] = individuals_df.apply(lambda x: generate_height(x['country'], x['gender']), axis=1)
individuals_df.head(20)

Unnamed: 0,country,city,gender,name,height
0,New Zealand,Holiday,female,Ariana Drake,161.681522
1,United Kingdom,Walnut Creek,male,Steven Hyde,176.096194
2,Australia,Mount Juliet,female,Erica Powell,165.535431
3,Canada,Palmers Green,female,Lori Montoya,171.469767
4,Canada,Northport,female,Erin Fisher,167.983263
5,United Kingdom,Hazelwood,male,Samuel Evans-Griffiths,185.569894
6,New Zealand,South San Francisco,male,Rochelle Ellery,174.564497
7,United Kingdom,Cordova,male,Terence Schofield,173.959975
8,Canada,Ascot Vale,male,Seth Harris Jr.,177.608977
9,New Zealand,Oregon,male,Melanie Davidson,180.64121


In [12]:
mean_bmis = {
    'United States' : {'male': 28.8, 'female': 28.8},
    'United Kingdom' : {'male': 27.5, 'female': 27.1},
    'Canada' : {'male': 27.6, 'female': 26.8},
    'Australia' : {'male': 26.5, 'female': 24.4},
    'New Zealand' : {'male': 28, 'female': 27.8},
}

st_dev_bmi = 2

In [13]:
# Function to generate height based on country and gender
def generate_bmi(country, gender):
    mean_bmi = mean_bmis[country][gender]
    # Sample from a normal distribution around the mean height
    return np.random.normal(mean_bmi, st_dev_bmi)

In [14]:
# Generate heights for each individual in the DataFrame
individuals_df['bmi'] = individuals_df.apply(lambda x: generate_bmi(x['country'], x['gender']), axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height,bmi
0,New Zealand,Holiday,female,Ariana Drake,161.681522,24.372221
1,United Kingdom,Walnut Creek,male,Steven Hyde,176.096194,28.222273
2,Australia,Mount Juliet,female,Erica Powell,165.535431,22.043593
3,Canada,Palmers Green,female,Lori Montoya,171.469767,29.059892
4,Canada,Northport,female,Erin Fisher,167.983263,26.132155
5,United Kingdom,Hazelwood,male,Samuel Evans-Griffiths,185.569894,29.68346
6,New Zealand,South San Francisco,male,Rochelle Ellery,174.564497,29.20468
7,United Kingdom,Cordova,male,Terence Schofield,173.959975,28.770115
8,Canada,Ascot Vale,male,Seth Harris Jr.,177.608977,26.438349
9,New Zealand,Oregon,male,Melanie Davidson,180.64121,28.658646


# SNP

In [16]:
# Set a random seed for reproducibility
np.random.seed(0)

snp1 = np.random.choice([2,2,2,2,2,2,2,1,1,0], size=500)
snp2 = np.random.choice([2,2,2,2,2,2,1,1,0,0], size=500)
snp3 = np.random.randint(0, 3, size=500)
snp4 = np.random.randint(0, 3, size=500)
snp5 = np.random.choice([2,1,1,1,1,0,0,0,0,0], size=500)


df_snp = pd.DataFrame({
    'SNP1': snp1,
    'SNP2': snp2,
    'SNP3': snp3,
    'SNP4': snp4,
    'SNP5': snp5
})

df_snp.head()

Unnamed: 0,SNP1,SNP2,SNP3,SNP4,SNP5
0,2,0,1,2,0
1,2,1,0,2,1
2,2,2,1,1,1
3,2,2,2,0,1
4,1,1,0,2,0


In [17]:
num_individuals = 500
np.random.seed(500)

# Random ages between 20 and 90
ages = np.random.randint(20, 90, num_individuals)

# Education levels
education_levels = ['primary', 'high school', 'bachelor', 'master', 'phd']

# Adjusting the distribution
education_probabilities = [0.1, 0.45, 0.25, 0.17, 0.03]

# Randomly assigning education levels
education = np.random.choice(education_levels, num_individuals, p=education_probabilities)

# Creating the DataFrame
data = pd.DataFrame({
    'Age': ages,
    'Education Level': education
})

data.head()

Unnamed: 0,Age,Education Level
0,75,high school
1,85,bachelor
2,37,high school
3,81,bachelor
4,51,high school


In [18]:
individuals_df = pd.concat([individuals_df, df_snp], axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height,bmi,SNP1,SNP2,SNP3,SNP4,SNP5
0,New Zealand,Holiday,female,Ariana Drake,161.681522,24.372221,2,0,1,2,0
1,United Kingdom,Walnut Creek,male,Steven Hyde,176.096194,28.222273,2,1,0,2,1
2,Australia,Mount Juliet,female,Erica Powell,165.535431,22.043593,2,2,1,1,1
3,Canada,Palmers Green,female,Lori Montoya,171.469767,29.059892,2,2,2,0,1
4,Canada,Northport,female,Erin Fisher,167.983263,26.132155,1,1,0,2,0
5,United Kingdom,Hazelwood,male,Samuel Evans-Griffiths,185.569894,29.68346,0,2,2,1,0
6,New Zealand,South San Francisco,male,Rochelle Ellery,174.564497,29.20468,2,0,1,2,1
7,United Kingdom,Cordova,male,Terence Schofield,173.959975,28.770115,2,2,0,1,2
8,Canada,Ascot Vale,male,Seth Harris Jr.,177.608977,26.438349,2,2,2,2,0
9,New Zealand,Oregon,male,Melanie Davidson,180.64121,28.658646,2,2,0,2,1


# Pseudoanonymising names into sample_ID

In [19]:
def generate_sample_id(index):
    return f"sid_{str(index + 1).zfill(4)}"

individuals_df['sample_ID'] = individuals_df.index.to_series().apply(generate_sample_id)

individuals_df.head()

Unnamed: 0,country,city,gender,name,height,bmi,SNP1,SNP2,SNP3,SNP4,SNP5,sample_ID
0,New Zealand,Holiday,female,Ariana Drake,161.681522,24.372221,2,0,1,2,0,sid_0001
1,United Kingdom,Walnut Creek,male,Steven Hyde,176.096194,28.222273,2,1,0,2,1,sid_0002
2,Australia,Mount Juliet,female,Erica Powell,165.535431,22.043593,2,2,1,1,1,sid_0003
3,Canada,Palmers Green,female,Lori Montoya,171.469767,29.059892,2,2,2,0,1,sid_0004
4,Canada,Northport,female,Erin Fisher,167.983263,26.132155,1,1,0,2,0,sid_0005


# Education and Age

In [21]:
num_individuals = 500
np.random.seed(500)

# Random ages between 25 and 90
ages = np.random.randint(25, 90, num_individuals)

# Education levels
education_levels = ['primary', 'high school', 'bachelor', 'master', 'phd']

# Adjusting the distribution
education_probabilities = [0.1, 0.45, 0.25, 0.17, 0.03]

# Randomly assigning education levels
education = np.random.choice(education_levels, num_individuals, p=education_probabilities)

# Creating the DataFrame
data = pd.DataFrame({
    'Age': ages,
    'Education Level': education
})

data.head()

Unnamed: 0,Age,Education Level
0,80,high school
1,42,primary
2,86,bachelor
3,56,bachelor
4,42,high school


In [22]:
individuals_df = pd.concat([individuals_df, data], axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height,bmi,SNP1,SNP2,SNP3,SNP4,SNP5,sample_ID,Age,Education Level
0,New Zealand,Holiday,female,Ariana Drake,161.681522,24.372221,2,0,1,2,0,sid_0001,80,high school
1,United Kingdom,Walnut Creek,male,Steven Hyde,176.096194,28.222273,2,1,0,2,1,sid_0002,42,primary
2,Australia,Mount Juliet,female,Erica Powell,165.535431,22.043593,2,2,1,1,1,sid_0003,86,bachelor
3,Canada,Palmers Green,female,Lori Montoya,171.469767,29.059892,2,2,2,0,1,sid_0004,56,bachelor
4,Canada,Northport,female,Erin Fisher,167.983263,26.132155,1,1,0,2,0,sid_0005,42,high school
5,United Kingdom,Hazelwood,male,Samuel Evans-Griffiths,185.569894,29.68346,0,2,2,1,0,sid_0006,42,primary
6,New Zealand,South San Francisco,male,Rochelle Ellery,174.564497,29.20468,2,0,1,2,1,sid_0007,66,phd
7,United Kingdom,Cordova,male,Terence Schofield,173.959975,28.770115,2,2,0,1,2,sid_0008,59,bachelor
8,Canada,Ascot Vale,male,Seth Harris Jr.,177.608977,26.438349,2,2,2,2,0,sid_0009,67,phd
9,New Zealand,Oregon,male,Melanie Davidson,180.64121,28.658646,2,2,0,2,1,sid_0010,43,bachelor


# Gene expression

In [23]:
def generate_positive_normal(mean, std):
    while True:
        num = np.random.normal(mean, std)
        if num > 0:
            return num

def generate_bimodal_distribution(mean1, std1, mean2, std2):
    value = np.random.choice([np.random.normal(mean1, std1), np.random.normal(mean2, std2)])
    return max(value, 0)

def generate_bimodal_gender(gender, mean1, std1, mean2, std2):
    if gender == "male":
        value = np.random.normal(mean1, std1)
    elif gender == "female":
        value = np.random.normal(mean2, std2)
    return max(value, 0)

def generate_bimodal_bmi(bmi, mean1, std1, mean2, std2):
    if bmi > 25:
        value = np.random.normal(mean1,std1)
    elif bmi < 25:
        value = np.random.normal(mean2,std2)
    return max(value, 0)

In [24]:
individuals_df['gene 1'] = individuals_df.apply(lambda x: generate_positive_normal(4, 1), axis=1)
individuals_df['gene 2'] = individuals_df.apply(lambda x: generate_positive_normal(7, 5), axis=1)
individuals_df['gene 3'] = individuals_df.apply(lambda x: generate_positive_normal(9, 3), axis=1)
individuals_df['gene 4'] = individuals_df.apply(lambda x: generate_bimodal_distribution(3, 2, 10, 2), axis=1)
individuals_df['gene 5'] = individuals_df.apply(lambda x: generate_bimodal_distribution(5, 1, 1, 1), axis=1)
individuals_df['gene 6'] = individuals_df.apply(lambda x: generate_bimodal_distribution(12, 5, 3, 3), axis=1)
individuals_df['gene 7'] = individuals_df.apply(lambda x: generate_bimodal_distribution(8, 3, 2, 1), axis=1)
individuals_df['gene 8'] = individuals_df.apply(lambda x: generate_bimodal_gender(x["gender"], 2, 2, 7, 2), axis=1)
individuals_df['gene 9'] = individuals_df.apply(lambda x: generate_bimodal_bmi(x["bmi"], 9, 1, 3, 1), axis=1)
individuals_df['gene 10'] = individuals_df.apply(lambda x: generate_bimodal_bmi(x["bmi"], 22, 2, 23, 2), axis=1)
individuals_df.head()

Unnamed: 0,country,city,gender,name,height,bmi,SNP1,SNP2,SNP3,SNP4,...,gene 1,gene 2,gene 3,gene 4,gene 5,gene 6,gene 7,gene 8,gene 9,gene 10
0,New Zealand,Holiday,female,Ariana Drake,161.681522,24.372221,2,0,1,2,...,4.00483,4.867038,8.304498,10.520967,0.0,18.278833,1.581357,7.988996,3.983963,20.97542
1,United Kingdom,Walnut Creek,male,Steven Hyde,176.096194,28.222273,2,1,0,2,...,4.03132,9.323568,11.628594,1.433626,0.857954,3.905349,9.230056,4.842127,8.41999,23.421522
2,Australia,Mount Juliet,female,Erica Powell,165.535431,22.043593,2,2,1,1,...,4.187672,9.59354,13.232783,0.425663,4.720001,0.0,14.855773,5.54275,3.283091,22.281057
3,Canada,Palmers Green,female,Lori Montoya,171.469767,29.059892,2,2,2,0,...,5.383087,1.334459,14.171103,5.720492,5.08609,12.333924,4.475427,9.911531,10.288585,21.872279
4,Canada,Northport,female,Erin Fisher,167.983263,26.132155,1,1,0,2,...,4.073621,4.112309,8.228493,11.34688,4.203422,14.0824,0.759675,5.38853,10.403495,23.421872


# Case Control

In [25]:
individuals_df["case"] = individuals_df.apply(lambda x: 1 if x['SNP1'] + x['SNP2'] + x['SNP5'] < 5 else 0, axis=1)

# Building the table

In [26]:
# Reordering the columns
individuals_df = individuals_df.reindex(columns=['name', 'sample_ID','Age','gender', 'bmi', 
                                                 'height', 'country','city','Education Level',
                                                 'SNP1', 'SNP2', 'SNP3', 'SNP4', 'SNP5',
                                                 'gene 1', 'gene 2', 'gene 3', 'gene 4', 'gene 5',
                                                 'gene 6', 'gene 7', 'gene 8', 'gene 9', 'gene 10', 'case'])
individuals_df.head()

Unnamed: 0,name,sample_ID,Age,gender,bmi,height,country,city,Education Level,SNP1,...,gene 2,gene 3,gene 4,gene 5,gene 6,gene 7,gene 8,gene 9,gene 10,case
0,Ariana Drake,sid_0001,80,female,24.372221,161.681522,New Zealand,Holiday,high school,2,...,4.867038,8.304498,10.520967,0.0,18.278833,1.581357,7.988996,3.983963,20.97542,1
1,Steven Hyde,sid_0002,42,male,28.222273,176.096194,United Kingdom,Walnut Creek,primary,2,...,9.323568,11.628594,1.433626,0.857954,3.905349,9.230056,4.842127,8.41999,23.421522,1
2,Erica Powell,sid_0003,86,female,22.043593,165.535431,Australia,Mount Juliet,bachelor,2,...,9.59354,13.232783,0.425663,4.720001,0.0,14.855773,5.54275,3.283091,22.281057,0
3,Lori Montoya,sid_0004,56,female,29.059892,171.469767,Canada,Palmers Green,bachelor,2,...,1.334459,14.171103,5.720492,5.08609,12.333924,4.475427,9.911531,10.288585,21.872279,0
4,Erin Fisher,sid_0005,42,female,26.132155,167.983263,Canada,Northport,high school,1,...,4.112309,8.228493,11.34688,4.203422,14.0824,0.759675,5.38853,10.403495,23.421872,1


In [27]:
individuals_df.to_csv("our_fake_data.csv", index=False)

# FINAL DATASET