# Setting up

In [88]:
# Importing packages
import pandas as pd
import numpy as np
import random
from faker import Faker
import geonamescache
import random
import pandas as pd


# Generating Countries

In [89]:
# Initialize GeonamesCache
gc = geonamescache.GeonamesCache()

# Define the countries with their ISO country codes
countries_iso = {'US': 'United States', 'GB': 'United Kingdom', 'AU': 'Australia', 
                 'NZ': 'New Zealand', 'CA': 'Canada'}

In [90]:
# Function to get cities from GeonamesCache by country ISO code
def get_cities_by_country(country_iso):
    # Get all cities
    cities = gc.get_cities()
    # Filter cities by country ISO code
    country_cities = {geoid: city for geoid, city in cities.items() if city['countrycode'] == country_iso}
    return list(country_cities.values())

# Create a list of all cities for the specified countries
all_cities = []
for iso, country_name in countries_iso.items():
    # Get the ISO code for the country from geonamescache
    country_iso = gc.get_countries_by_names()[country_name]['iso']
    all_cities.extend(get_cities_by_country(country_iso))


In [91]:
# Simulate the city and country for each individual
gender_list = ["male", "female"]
individuals = [{'country': random.choice(list(countries_iso.values())),
                'city': random.choice(all_cities)['name'],
                'gender': random.choice(gender_list)} for _ in range(500)]

# Check the first few entries
individuals[:5]

# Create a DataFrame with the simulated data
individuals_df = pd.DataFrame(individuals)

individuals_df.head(10)

Unnamed: 0,country,city,gender
0,Canada,Tewksbury,male
1,Canada,Pittsfield,male
2,United Kingdom,Wharton,male
3,Australia,Five Corners,female
4,Australia,Chula Vista,male
5,Canada,South Suffolk,male
6,Canada,Fairfield,female
7,Canada,Hartley,female
8,United Kingdom,Lisle,male
9,United States,Adrian,female


# Generating names and gender

In [92]:
def choose_faker(country, gender):
    if country == "United States":
        faker = Faker('en_US')
    elif country == "United Kingdom":
        faker = Faker('en_GB')
    elif country == "Australia":
        faker = Faker('en_AU')
    elif country == "New Zealand":
        faker = Faker('en_NZ')
    elif country == "Canada":
        faker = Faker('en_CA')
    else:
        print("some problems here lol")
        return None

    if gender == "male":
        return faker.name_male()
    elif gender == "female":
        return faker.name_female()
    else:
        return faker.name()

In [93]:
name_list =[]
for index in range(len(individuals_df)):
    row = individuals_df.iloc[index]
    country_name = row["country"]
    gender= row["gender"]
    name_list.append(choose_faker(country_name, gender))

In [94]:
individuals_df["name"] = name_list
individuals_df.head()

Unnamed: 0,country,city,gender,name
0,Canada,Tewksbury,male,Collin Farley
1,Canada,Pittsfield,male,Jose Ellis
2,United Kingdom,Wharton,male,Joe Townsend
3,Australia,Five Corners,female,Mary Hayes
4,Australia,Chula Vista,male,Peter Bishop


# Generating Heights/BMI based on name/gender

In [95]:
mean_heights = {
    'United States' : {'male': 177, 'female': 163},
    'United Kingdom' : {'male': 178, 'female': 164},
    'Canada' : {'male': 178, 'female': 165},
    'Australia' : {'male': 179, 'female': 165},
    'New Zealand' : {'male': 178, 'female': 165},
}

st_dev_height = 6

In [96]:
# Function to generate height based on country and gender
def generate_height(country, gender):
    mean_height = mean_heights[country][gender]
    # Sample from a normal distribution around the mean height
    return np.random.normal(mean_height, st_dev_height)

In [97]:
# Generate heights for each individual in the DataFrame
individuals_df['height'] = individuals_df.apply(lambda x: generate_height(x['country'], x['gender']), axis=1)
individuals_df.head(20)

Unnamed: 0,country,city,gender,name,height
0,Canada,Tewksbury,male,Collin Farley,186.906352
1,Canada,Pittsfield,male,Jose Ellis,177.731554
2,United Kingdom,Wharton,male,Joe Townsend,166.997407
3,Australia,Five Corners,female,Mary Hayes,166.290902
4,Australia,Chula Vista,male,Peter Bishop,179.283643
5,Canada,South Suffolk,male,Ricky Fisher,167.700142
6,Canada,Fairfield,female,Isabella Smith,161.218051
7,Canada,Hartley,female,Sarah Lynch,170.622549
8,United Kingdom,Lisle,male,Dr Billy Mitchell,173.57972
9,United States,Adrian,female,Laura Winters,159.11029


In [98]:
mean_bmis = {
    'United States' : {'male': 28.8, 'female': 28.8},
    'United Kingdom' : {'male': 27.5, 'female': 27.1},
    'Canada' : {'male': 27.6, 'female': 26.8},
    'Australia' : {'male': 26.5, 'female': 24.4},
    'New Zealand' : {'male': 28, 'female': 27.8},
}

st_dev_bmi = 2

In [99]:
# Function to generate height based on country and gender
def generate_bmi(country, gender):
    mean_bmi = mean_bmis[country][gender]
    # Sample from a normal distribution around the mean height
    return np.random.normal(mean_bmi, st_dev_bmi)

In [100]:
# Generate heights for each individual in the DataFrame
individuals_df['bmi'] = individuals_df.apply(lambda x: generate_bmi(x['country'], x['gender']), axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height,bmi
0,Canada,Tewksbury,male,Collin Farley,186.906352,25.777256
1,Canada,Pittsfield,male,Jose Ellis,177.731554,27.279741
2,United Kingdom,Wharton,male,Joe Townsend,166.997407,26.911763
3,Australia,Five Corners,female,Mary Hayes,166.290902,21.703085
4,Australia,Chula Vista,male,Peter Bishop,179.283643,24.505262
5,Canada,South Suffolk,male,Ricky Fisher,167.700142,26.725507
6,Canada,Fairfield,female,Isabella Smith,161.218051,27.553141
7,Canada,Hartley,female,Sarah Lynch,170.622549,23.313577
8,United Kingdom,Lisle,male,Dr Billy Mitchell,173.57972,28.548937
9,United States,Adrian,female,Laura Winters,159.11029,27.674716


# SNP

In [119]:
# Set a random seed for reproducibility
np.random.seed(0)

snp1 = np.random.choice([2,2,2,2,2,2,2,1,1,0], size=500)
snp2 = np.random.choice([2,2,2,2,2,2,1,1,0,0], size=500)
snp3 = np.random.randint(0, 3, size=500)
snp4 = np.random.randint(0, 3, size=500)
snp5 = np.random.choice([2,2,2,2,1,1,1,0,0,0], size=500)


df_snp = pd.DataFrame({
    'SNP1': snp1,
    'SNP2': snp2,
    'SNP3': snp3,
    'SNP4': snp4,
    'SNP5': snp5
})

df_snp.head()

Unnamed: 0,SNP1,SNP2,SNP3,SNP4,SNP5
0,2,0,1,2,1
1,2,1,0,2,1
2,2,2,1,1,2
3,2,2,2,0,2
4,1,1,0,2,0


In [None]:
num_individuals = 500
np.random.seed(500)

# Random ages between 20 and 90
ages = np.random.randint(20, 90, num_individuals)

# Education levels
education_levels = ['primary', 'high school', 'bachelor', 'master', 'phd']

# Adjusting the distribution
education_probabilities = [0.1, 0.45, 0.25, 0.17, 0.03]

# Randomly assigning education levels
education = np.random.choice(education_levels, num_individuals, p=education_probabilities)

# Creating the DataFrame
data = pd.DataFrame({
    'Age': ages,
    'Education Level': education
})

data.head()

In [102]:
individuals_df = pd.concat([individuals_df, df_snp], axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height,bmi,SNP1,SNP2,SNP3,SNP4,SNP5
0,Canada,Tewksbury,male,Collin Farley,186.906352,25.777256,0,1,0,1,1
1,Canada,Pittsfield,male,Jose Ellis,177.731554,27.279741,2,0,2,0,0
2,United Kingdom,Wharton,male,Joe Townsend,166.997407,26.911763,0,2,1,2,2
3,Australia,Five Corners,female,Mary Hayes,166.290902,21.703085,0,1,1,1,1
4,Australia,Chula Vista,male,Peter Bishop,179.283643,24.505262,0,1,0,0,1
5,Canada,South Suffolk,male,Ricky Fisher,167.700142,26.725507,2,0,2,0,1
6,Canada,Fairfield,female,Isabella Smith,161.218051,27.553141,1,2,0,1,1
7,Canada,Hartley,female,Sarah Lynch,170.622549,23.313577,1,0,2,0,2
8,United Kingdom,Lisle,male,Dr Billy Mitchell,173.57972,28.548937,2,0,2,0,0
9,United States,Adrian,female,Laura Winters,159.11029,27.674716,0,1,1,2,0


# Pseudoanonymising names into sample_ID

In [103]:
def generate_sample_id(index):
    return f"sid_{str(index + 1).zfill(4)}"

individuals_df['sample_ID'] = individuals_df.index.to_series().apply(generate_sample_id)

individuals_df.head()

Unnamed: 0,country,city,gender,name,height,bmi,SNP1,SNP2,SNP3,SNP4,SNP5,sample_ID
0,Canada,Tewksbury,male,Collin Farley,186.906352,25.777256,0,1,0,1,1,sid_0001
1,Canada,Pittsfield,male,Jose Ellis,177.731554,27.279741,2,0,2,0,0,sid_0002
2,United Kingdom,Wharton,male,Joe Townsend,166.997407,26.911763,0,2,1,2,2,sid_0003
3,Australia,Five Corners,female,Mary Hayes,166.290902,21.703085,0,1,1,1,1,sid_0004
4,Australia,Chula Vista,male,Peter Bishop,179.283643,24.505262,0,1,0,0,1,sid_0005


# Education and Age

In [104]:
num_individuals = 500
np.random.seed(500)

# Random ages between 20 and 90
ages = np.random.randint(20, 90, num_individuals)

# Education levels
education_levels = ['primary', 'high school', 'bachelor', 'master', 'phd']

# Adjusting the distribution
education_probabilities = [0.1, 0.45, 0.25, 0.17, 0.03]

# Randomly assigning education levels
education = np.random.choice(education_levels, num_individuals, p=education_probabilities)

# Creating the DataFrame
data = pd.DataFrame({
    'Age': ages,
    'Education Level': education
})

data.head()

Unnamed: 0,Age,Education Level
0,75,high school
1,85,bachelor
2,37,high school
3,81,bachelor
4,51,high school


In [105]:
individuals_df = pd.concat([individuals_df, data], axis=1)
individuals_df.head(10)

Unnamed: 0,country,city,gender,name,height,bmi,SNP1,SNP2,SNP3,SNP4,SNP5,sample_ID,Age,Education Level
0,Canada,Tewksbury,male,Collin Farley,186.906352,25.777256,0,1,0,1,1,sid_0001,75,high school
1,Canada,Pittsfield,male,Jose Ellis,177.731554,27.279741,2,0,2,0,0,sid_0002,85,bachelor
2,United Kingdom,Wharton,male,Joe Townsend,166.997407,26.911763,0,2,1,2,2,sid_0003,37,high school
3,Australia,Five Corners,female,Mary Hayes,166.290902,21.703085,0,1,1,1,1,sid_0004,81,bachelor
4,Australia,Chula Vista,male,Peter Bishop,179.283643,24.505262,0,1,0,0,1,sid_0005,51,high school
5,Canada,South Suffolk,male,Ricky Fisher,167.700142,26.725507,2,0,2,0,1,sid_0006,37,high school
6,Canada,Fairfield,female,Isabella Smith,161.218051,27.553141,1,2,0,1,1,sid_0007,37,bachelor
7,Canada,Hartley,female,Sarah Lynch,170.622549,23.313577,1,0,2,0,2,sid_0008,61,bachelor
8,United Kingdom,Lisle,male,Dr Billy Mitchell,173.57972,28.548937,2,0,2,0,0,sid_0009,54,master
9,United States,Adrian,female,Laura Winters,159.11029,27.674716,0,1,1,2,0,sid_0010,62,high school


# Gene expression

In [107]:
def generate_positive_normal(mean, std):
    while True:
        num = np.random.normal(mean, std)
        if num > 0:
            return num

def generate_bimodal_distribution(mean1, std1, mean2, std2):
    value = np.random.choice([np.random.normal(mean1, std1), np.random.normal(mean2, std2)])
    return max(value, 0)

def generate_bimodal_gender(gender, mean1, std1, mean2, std2):
    if gender == "male":
        value = np.random.normal(mean1, std1)
    elif gender == "female":
        value = np.random.normal(mean2, std2)
    return max(value, 0)

def generate_bimodal_bmi(bmi, mean1, std1, mean2, std2):
    if bmi > 25:
        value = np.random.normal(mean1,std1)
    elif bmi < 25:
        value = np.random.normal(mean2,std2)
    return max(value, 0)

In [110]:
individuals_df['gene 1'] = individuals_df.apply(lambda x: generate_positive_normal(4, 1), axis=1)
individuals_df['gene 2'] = individuals_df.apply(lambda x: generate_positive_normal(7, 5), axis=1)
individuals_df['gene 3'] = individuals_df.apply(lambda x: generate_positive_normal(9, 3), axis=1)
individuals_df['gene 4'] = individuals_df.apply(lambda x: generate_bimodal_distribution(3, 2, 10, 2), axis=1)
individuals_df['gene 5'] = individuals_df.apply(lambda x: generate_bimodal_distribution(5, 1, 1, 1), axis=1)
individuals_df['gene 6'] = individuals_df.apply(lambda x: generate_bimodal_distribution(12, 5, 3, 3), axis=1)
individuals_df['gene 7'] = individuals_df.apply(lambda x: generate_bimodal_distribution(8, 3, 2, 1), axis=1)
individuals_df['gene 8'] = individuals_df.apply(lambda x: generate_bimodal_gender(x["gender"], 2, 2, 7, 2), axis=1)
individuals_df['gene 9'] = individuals_df.apply(lambda x: generate_bimodal_bmi(x["bmi"], 9, 1, 3, 1), axis=1)
individuals_df['gene 10'] = individuals_df.apply(lambda x: generate_bimodal_bmi(x["bmi"], 22, 2, 23, 2), axis=1)
individuals_df.head()

Unnamed: 0,name,sample_ID,Age,gender,bmi,height,country,city,Education Level,SNP1,...,gene 1,gene 2,gene 3,gene 4,gene 5,gene 6,gene 7,gene 8,gene 9,gene 10
0,Collin Farley,sid_0001,75,male,25.777256,186.906352,Canada,Tewksbury,high school,0,...,4.899147,0.676506,8.315109,4.627301,1.001678,3.355536,0.0,3.641007,9.355269,20.076151
1,Jose Ellis,sid_0002,85,male,27.279741,177.731554,Canada,Pittsfield,bachelor,2,...,3.10685,14.148518,9.21722,1.889618,1.721625,12.503965,1.752575,1.369876,6.89061,23.85476
2,Joe Townsend,sid_0003,37,male,26.911763,166.997407,United Kingdom,Wharton,high school,0,...,2.521621,4.33561,9.938187,0.558405,4.624382,9.06472,3.026812,1.046234,8.072452,20.511905
3,Mary Hayes,sid_0004,81,female,21.703085,166.290902,Australia,Five Corners,bachelor,0,...,2.698747,2.871349,8.066806,9.955909,3.314628,12.943536,1.021407,5.272147,4.133949,22.966024
4,Peter Bishop,sid_0005,51,male,24.505262,179.283643,Australia,Chula Vista,high school,0,...,4.202518,5.671911,9.130155,10.084573,0.976247,13.940195,10.713765,2.539063,1.50385,22.632662


# Case Control

# Building the table

In [106]:
# Reordering the columns
individuals_df = individuals_df.reindex(columns=['name', 'sample_ID','Age','gender', 'bmi', 'height', 'country','city','Education Level','SNP1', 'SNP2', 'SNP3', 'SNP4', 'SNP5'])
individuals_df.head(500)

Unnamed: 0,name,sample_ID,Age,gender,bmi,height,country,city,Education Level,SNP1,SNP2,SNP3,SNP4,SNP5
0,Collin Farley,sid_0001,75,male,25.777256,186.906352,Canada,Tewksbury,high school,0,1,0,1,1
1,Jose Ellis,sid_0002,85,male,27.279741,177.731554,Canada,Pittsfield,bachelor,2,0,2,0,0
2,Joe Townsend,sid_0003,37,male,26.911763,166.997407,United Kingdom,Wharton,high school,0,2,1,2,2
3,Mary Hayes,sid_0004,81,female,21.703085,166.290902,Australia,Five Corners,bachelor,0,1,1,1,1
4,Peter Bishop,sid_0005,51,male,24.505262,179.283643,Australia,Chula Vista,high school,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Amy Green-Fraser,sid_0496,68,female,26.045148,162.492075,United Kingdom,Watertown,high school,2,0,1,1,2
496,Bethany Norman,sid_0497,37,female,24.514920,169.537057,United Kingdom,Mount Prospect,master,2,2,0,2,2
497,Glen Holmes,sid_0498,31,male,28.217858,187.629829,United Kingdom,Hillsborough,high school,2,1,1,2,1
498,Kyle Carter,sid_0499,61,male,28.694099,179.788518,United States,Ozone Park,high school,1,1,1,0,1


# FINAL DATASET