# Setting up

In [129]:
# Importing packages
import pandas as pd
import numpy as np
import random
from faker import Faker
import geonamescache
import random
import pandas as pd


# Generating Countries

In [130]:
# Initialize GeonamesCache
gc = geonamescache.GeonamesCache()

# Define the countries with their ISO country codes
countries_iso = {'US': 'United States', 'GB': 'United Kingdom', 'AU': 'Australia', 
                 'NZ': 'New Zealand', 'CA': 'Canada'}

In [131]:
# Function to get cities from GeonamesCache by country ISO code
def get_cities_by_country(country_iso):
    # Get all cities
    cities = gc.get_cities()
    # Filter cities by country ISO code
    country_cities = {geoid: city for geoid, city in cities.items() if city['countrycode'] == country_iso}
    return list(country_cities.values())

# Create a list of all cities for the specified countries
all_cities = []
for iso, country_name in countries_iso.items():
    # Get the ISO code for the country from geonamescache
    country_iso = gc.get_countries_by_names()[country_name]['iso']
    all_cities.extend(get_cities_by_country(country_iso))


In [132]:
# Simulate the city and country for each individual
gender_list = ["male", "female"]
individuals = [{'country': random.choice(list(countries_iso.values())),
                'city': random.choice(all_cities)['name'],
                'gender': random.choice(gender_list)} for _ in range(500)]

# Check the first few entries
individuals[:5]

# Create a DataFrame with the simulated data
individuals_df = pd.DataFrame(individuals)

individuals_df.head(10)

Unnamed: 0,country,city,gender
0,Canada,Lynnwood,female
1,Canada,Anderson,female
2,United States,Fountain Valley,male
3,United States,Willetton,female
4,Australia,Bainbridge Island,male
5,United Kingdom,Schertz,female
6,Australia,Huntington Park,male
7,New Zealand,Launceston,female
8,United Kingdom,Whitney,female
9,New Zealand,North Hollywood,female


# Generating names and gender

In [133]:
def choose_faker(country, gender):
    if country == "United States":
        faker = Faker('en_US')
    elif country == "United Kingdom":
        faker = Faker('en_GB')
    elif country == "Australia":
        faker = Faker('en_AU')
    elif country == "New Zealand":
        faker = Faker('en_NZ')
    elif country == "Canada":
        faker = Faker('en_CA')
    else:
        print("some problems here lol")
        return None

    if gender == "male":
        return faker.name_male()
    elif gender == "female":
        return faker.name_female()
    else:
        return faker.name()

In [134]:
name_list =[]
for index in range(len(individuals_df)):
    row = individuals_df.iloc[index]
    country_name = row["country"]
    gender= row["gender"]
    name_list.append(choose_faker(country_name, gender))

In [135]:
individuals_df["name"] = name_list
individuals_df.head()

Unnamed: 0,country,city,gender,name
0,Canada,Lynnwood,female,Sandra Williams
1,Canada,Anderson,female,Maria Lin
2,United States,Fountain Valley,male,Mark Compton
3,United States,Willetton,female,Sally Moss
4,Australia,Bainbridge Island,male,John Moon


# Generating Heights/BMI based on name/gender

In [136]:
mean_heights = {
    'United States' : {'male': 177, 'female': 163},
    'United Kingdom' : {'male': 178, 'female': 164},
    'Canada' : {'male': 178, 'female': 165},
    'Australia' : {'male': 179, 'female': 165},
    'New Zealand' : {'male': 178, 'female': 165},
}

st_dev_height = 6

In [137]:
# Function to generate height based on country and gender
def generate_height(country, gender):
    mean_height = mean_heights[country][gender]
    # Sample from a normal distribution around the mean height
    return np.random.normal(mean_height, st_dev_height)

In [138]:
# Generate heights for each individual in the DataFrame
individuals_df['height'] = individuals_df.apply(lambda x: generate_height(x['country'], x['gender']), axis=1)
individuals_df.head(20)

Unnamed: 0,country,city,gender,name,height
0,Canada,Lynnwood,female,Sandra Williams,161.082389
1,Canada,Anderson,female,Maria Lin,178.389131
2,United States,Fountain Valley,male,Mark Compton,184.598896
3,United States,Willetton,female,Sally Moss,155.127998
4,Australia,Bainbridge Island,male,John Moon,181.656045
5,United Kingdom,Schertz,female,Mrs Dawn Murphy,162.918433
6,Australia,Huntington Park,male,Richard Sims,179.598034
7,New Zealand,Launceston,female,Alice Freeth,162.894464
8,United Kingdom,Whitney,female,Hazel Owen,169.394882
9,New Zealand,North Hollywood,female,Nevaeh Baker,159.6411


In [139]:
mean_bmis = {
    'United States' : {'male': 28.8, 'female': 28.8},
    'United Kingdom' : {'male': 27.5, 'female': 27.1},
    'Canada' : {'male': 27.6, 'female': 26.8},
    'Australia' : {'male': 26.5, 'female': 24.4},
    'New Zealand' : {'male': 28, 'female': 27.8},
}

st_dev_bmi = 2

In [140]:
# Function to generate height based on country and gender
def generate_bmi(country, gender):
    mean_bmi = mean_bmis[country][gender]
    # Sample from a normal distribution around the mean height
    return np.random.normal(mean_bmi, st_dev_bmi)

In [141]:
# Generate heights for each individual in the DataFrame
individuals_df['bmi'] = individuals_df.apply(lambda x: generate_bmi(x['country'], x['gender']), axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height,bmi
0,Canada,Lynnwood,female,Sandra Williams,161.082389,25.412903
1,Canada,Anderson,female,Maria Lin,178.389131,27.386392
2,United States,Fountain Valley,male,Mark Compton,184.598896,25.41868
3,United States,Willetton,female,Sally Moss,155.127998,27.849098
4,Australia,Bainbridge Island,male,John Moon,181.656045,26.286107
5,United Kingdom,Schertz,female,Mrs Dawn Murphy,162.918433,24.094084
6,Australia,Huntington Park,male,Richard Sims,179.598034,30.551854
7,New Zealand,Launceston,female,Alice Freeth,162.894464,27.134741
8,United Kingdom,Whitney,female,Hazel Owen,169.394882,24.570602
9,New Zealand,North Hollywood,female,Nevaeh Baker,159.6411,30.659407


# SNP

In [142]:
# Set a random seed for reproducibility
np.random.seed(0)

snp1 = np.random.choice([2,2,2,2,2,2,2,1,1,0], size=500)
snp2 = np.random.choice([2,2,2,2,2,2,1,1,0,0], size=500)
snp3 = np.random.randint(0, 3, size=500)
snp4 = np.random.randint(0, 3, size=500)
snp5 = np.random.choice([2,2,2,2,1,1,1,0,0,0], size=500)


df_snp = pd.DataFrame({
    'SNP1': snp1,
    'SNP2': snp2,
    'SNP3': snp3,
    'SNP4': snp4,
    'SNP5': snp5
})

df_snp.head()

Unnamed: 0,SNP1,SNP2,SNP3,SNP4,SNP5
0,2,0,1,2,1
1,2,1,0,2,1
2,2,2,1,1,2
3,2,2,2,0,2
4,1,1,0,2,0


In [143]:
num_individuals = 500
np.random.seed(500)

# Random ages between 20 and 90
ages = np.random.randint(20, 90, num_individuals)

# Education levels
education_levels = ['primary', 'high school', 'bachelor', 'master', 'phd']

# Adjusting the distribution
education_probabilities = [0.1, 0.45, 0.25, 0.17, 0.03]

# Randomly assigning education levels
education = np.random.choice(education_levels, num_individuals, p=education_probabilities)

# Creating the DataFrame
data = pd.DataFrame({
    'Age': ages,
    'Education Level': education
})

data.head()

Unnamed: 0,Age,Education Level
0,75,high school
1,85,bachelor
2,37,high school
3,81,bachelor
4,51,high school


In [144]:
individuals_df = pd.concat([individuals_df, df_snp], axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height,bmi,SNP1,SNP2,SNP3,SNP4,SNP5
0,Canada,Lynnwood,female,Sandra Williams,161.082389,25.412903,2,0,1,2,1
1,Canada,Anderson,female,Maria Lin,178.389131,27.386392,2,1,0,2,1
2,United States,Fountain Valley,male,Mark Compton,184.598896,25.41868,2,2,1,1,2
3,United States,Willetton,female,Sally Moss,155.127998,27.849098,2,2,2,0,2
4,Australia,Bainbridge Island,male,John Moon,181.656045,26.286107,1,1,0,2,0
5,United Kingdom,Schertz,female,Mrs Dawn Murphy,162.918433,24.094084,0,2,2,1,0
6,Australia,Huntington Park,male,Richard Sims,179.598034,30.551854,2,0,1,2,2
7,New Zealand,Launceston,female,Alice Freeth,162.894464,27.134741,2,2,0,1,2
8,United Kingdom,Whitney,female,Hazel Owen,169.394882,24.570602,2,2,2,2,0
9,New Zealand,North Hollywood,female,Nevaeh Baker,159.6411,30.659407,2,2,0,2,2


# Pseudoanonymising names into sample_ID

In [145]:
def generate_sample_id(index):
    return f"sid_{str(index + 1).zfill(4)}"

individuals_df['sample_ID'] = individuals_df.index.to_series().apply(generate_sample_id)

individuals_df.head()

Unnamed: 0,country,city,gender,name,height,bmi,SNP1,SNP2,SNP3,SNP4,SNP5,sample_ID
0,Canada,Lynnwood,female,Sandra Williams,161.082389,25.412903,2,0,1,2,1,sid_0001
1,Canada,Anderson,female,Maria Lin,178.389131,27.386392,2,1,0,2,1,sid_0002
2,United States,Fountain Valley,male,Mark Compton,184.598896,25.41868,2,2,1,1,2,sid_0003
3,United States,Willetton,female,Sally Moss,155.127998,27.849098,2,2,2,0,2,sid_0004
4,Australia,Bainbridge Island,male,John Moon,181.656045,26.286107,1,1,0,2,0,sid_0005


# Education and Age

In [146]:
num_individuals = 500
np.random.seed(500)

# Random ages between 20 and 90
ages = np.random.randint(20, 90, num_individuals)

# Education levels
education_levels = ['primary', 'high school', 'bachelor', 'master', 'phd']

# Adjusting the distribution
education_probabilities = [0.1, 0.45, 0.25, 0.17, 0.03]

# Randomly assigning education levels
education = np.random.choice(education_levels, num_individuals, p=education_probabilities)

# Creating the DataFrame
data = pd.DataFrame({
    'Age': ages,
    'Education Level': education
})

data.head()

Unnamed: 0,Age,Education Level
0,75,high school
1,85,bachelor
2,37,high school
3,81,bachelor
4,51,high school


In [147]:
individuals_df = pd.concat([individuals_df, data], axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height,bmi,SNP1,SNP2,SNP3,SNP4,SNP5,sample_ID,Age,Education Level
0,Canada,Lynnwood,female,Sandra Williams,161.082389,25.412903,2,0,1,2,1,sid_0001,75,high school
1,Canada,Anderson,female,Maria Lin,178.389131,27.386392,2,1,0,2,1,sid_0002,85,bachelor
2,United States,Fountain Valley,male,Mark Compton,184.598896,25.41868,2,2,1,1,2,sid_0003,37,high school
3,United States,Willetton,female,Sally Moss,155.127998,27.849098,2,2,2,0,2,sid_0004,81,bachelor
4,Australia,Bainbridge Island,male,John Moon,181.656045,26.286107,1,1,0,2,0,sid_0005,51,high school
5,United Kingdom,Schertz,female,Mrs Dawn Murphy,162.918433,24.094084,0,2,2,1,0,sid_0006,37,high school
6,Australia,Huntington Park,male,Richard Sims,179.598034,30.551854,2,0,1,2,2,sid_0007,37,bachelor
7,New Zealand,Launceston,female,Alice Freeth,162.894464,27.134741,2,2,0,1,2,sid_0008,61,bachelor
8,United Kingdom,Whitney,female,Hazel Owen,169.394882,24.570602,2,2,2,2,0,sid_0009,54,master
9,New Zealand,North Hollywood,female,Nevaeh Baker,159.6411,30.659407,2,2,0,2,2,sid_0010,62,high school


# Gene expression

In [148]:
def generate_positive_normal(mean, std):
    while True:
        num = np.random.normal(mean, std)
        if num > 0:
            return num

def generate_bimodal_distribution(mean1, std1, mean2, std2):
    value = np.random.choice([np.random.normal(mean1, std1), np.random.normal(mean2, std2)])
    return max(value, 0)

def generate_bimodal_gender(gender, mean1, std1, mean2, std2):
    if gender == "male":
        value = np.random.normal(mean1, std1)
    elif gender == "female":
        value = np.random.normal(mean2, std2)
    return max(value, 0)

def generate_bimodal_bmi(bmi, mean1, std1, mean2, std2):
    if bmi > 25:
        value = np.random.normal(mean1,std1)
    elif bmi < 25:
        value = np.random.normal(mean2,std2)
    return max(value, 0)

In [152]:
individuals_df['gene 1'] = individuals_df.apply(lambda x: generate_positive_normal(4, 1), axis=1)
individuals_df['gene 2'] = individuals_df.apply(lambda x: generate_positive_normal(7, 5), axis=1)
individuals_df['gene 3'] = individuals_df.apply(lambda x: generate_positive_normal(9, 3), axis=1)
individuals_df['gene 4'] = individuals_df.apply(lambda x: generate_bimodal_distribution(3, 2, 10, 2), axis=1)
individuals_df['gene 5'] = individuals_df.apply(lambda x: generate_bimodal_distribution(5, 1, 1, 1), axis=1)
individuals_df['gene 6'] = individuals_df.apply(lambda x: generate_bimodal_distribution(12, 5, 3, 3), axis=1)
individuals_df['gene 7'] = individuals_df.apply(lambda x: generate_bimodal_distribution(8, 3, 2, 1), axis=1)
individuals_df['gene 8'] = individuals_df.apply(lambda x: generate_bimodal_gender(x["gender"], 2, 2, 7, 2), axis=1)
individuals_df['gene 9'] = individuals_df.apply(lambda x: generate_bimodal_bmi(x["bmi"], 9, 1, 3, 1), axis=1)
individuals_df['gene 10'] = individuals_df.apply(lambda x: generate_bimodal_bmi(x["bmi"], 22, 2, 23, 2), axis=1)
individuals_df.head()

Unnamed: 0,name,sample_ID,Age,gender,bmi,height,country,city,Education Level,SNP1,...,gene 1,gene 2,gene 3,gene 4,gene 5,gene 6,gene 7,gene 8,gene 9,gene 10
0,Sandra Williams,sid_0001,75,female,25.412903,161.082389,Canada,Lynnwood,high school,2,...,3.626308,12.145523,5.58485,1.431837,0.438891,0.0,0.725677,11.511007,10.166501,21.012894
1,Maria Lin,sid_0002,85,female,27.386392,178.389131,Canada,Anderson,bachelor,2,...,3.732063,8.981231,10.664523,10.473008,4.402025,1.618726,14.552084,6.163375,8.746771,25.741204
2,Mark Compton,sid_0003,37,male,25.41868,184.598896,United States,Fountain Valley,high school,2,...,2.933003,7.280951,5.050869,7.353544,0.624152,12.868966,3.411052,1.424234,10.903257,24.191687
3,Sally Moss,sid_0004,81,female,27.849098,155.127998,United States,Willetton,bachelor,2,...,3.787897,6.517843,16.339073,2.18864,6.418976,1.780819,2.458137,6.465608,7.649296,21.376292
4,John Moon,sid_0005,51,male,26.286107,181.656045,Australia,Bainbridge Island,high school,1,...,1.997489,4.829872,5.104732,11.996885,5.136001,1.548381,2.780779,6.1896,9.913498,20.449417


# Case Control

In [155]:
individuals_df["case"] = individuals_df.apply(lambda x: 1 if x['SNP1'] + x['SNP2'] + x['SNP5'] < 5 else 0, axis=1)

# Building the table

In [156]:
# Reordering the columns
individuals_df = individuals_df.reindex(columns=['name', 'sample_ID','Age','gender', 'bmi', 
                                                 'height', 'country','city','Education Level',
                                                 'SNP1', 'SNP2', 'SNP3', 'SNP4', 'SNP5',
                                                 'gene 1', 'gene 2', 'gene 3', 'gene 4', 'gene 5',
                                                 'gene 6', 'gene 7', 'gene 8', 'gene 9', 'gene 10', 'case'])
individuals_df.head()

Unnamed: 0,name,sample_ID,Age,gender,bmi,height,country,city,Education Level,SNP1,...,gene 2,gene 3,gene 4,gene 5,gene 6,gene 7,gene 8,gene 9,gene 10,case
0,Sandra Williams,sid_0001,75,female,25.412903,161.082389,Canada,Lynnwood,high school,2,...,12.145523,5.58485,1.431837,0.438891,0.0,0.725677,11.511007,10.166501,21.012894,1
1,Maria Lin,sid_0002,85,female,27.386392,178.389131,Canada,Anderson,bachelor,2,...,8.981231,10.664523,10.473008,4.402025,1.618726,14.552084,6.163375,8.746771,25.741204,1
2,Mark Compton,sid_0003,37,male,25.41868,184.598896,United States,Fountain Valley,high school,2,...,7.280951,5.050869,7.353544,0.624152,12.868966,3.411052,1.424234,10.903257,24.191687,0
3,Sally Moss,sid_0004,81,female,27.849098,155.127998,United States,Willetton,bachelor,2,...,6.517843,16.339073,2.18864,6.418976,1.780819,2.458137,6.465608,7.649296,21.376292,0
4,John Moon,sid_0005,51,male,26.286107,181.656045,Australia,Bainbridge Island,high school,1,...,4.829872,5.104732,11.996885,5.136001,1.548381,2.780779,6.1896,9.913498,20.449417,1


In [157]:
individuals_df.to_csv("our_fake_data.csv", index=False)

# FINAL DATASET