In [79]:
from faker import Faker
import geonamescache
import random
import pandas as pd
import numpy as np

## Assigned the Countries (UK, US, Canada, Australia, New Zealand)

In [65]:
gc = geonamescache.GeonamesCache()
countries_iso = {'US': 'United States', 'GB': 'United Kingdom', 'AU': 'Australia', 
                 'NZ': 'New Zealand', 'CA': 'Canada'}

In [66]:
# Function to get cities from GeonamesCache by country ISO code
def get_cities_by_country(country_iso):
    # Get all cities
    cities = gc.get_cities()
    # Filter cities by country ISO code
    country_cities = {geoid: city for geoid, city in cities.items() if city['countrycode'] == country_iso}
    return list(country_cities.values())

# Create a list of all cities for the specified countries
all_cities = []
for iso, country_name in countries_iso.items():
    # Get the ISO code for the country from geonamescache
    country_iso = gc.get_countries_by_names()[country_name]['iso']
    all_cities.extend(get_cities_by_country(country_iso))

In [67]:
gender_list = ["male", "female"]

In [69]:
# Simulate the city and country for each individual
individuals = [{'country': random.choice(list(countries_iso.values())),
                'city': random.choice(all_cities)['name'],
                'gender': random.choice(gender_list)} for _ in range(500)]

individuals_df = pd.DataFrame(individuals[:10])
individuals_df.head()

Unnamed: 0,country,city,gender
0,Australia,Burleson,male
1,New Zealand,Cliffcrest,female
2,Australia,Bloomington,male
3,United States,Kirwan,female
4,United Kingdom,Santa Paula,female


## Name and gender (male, female)

In [70]:
def choose_faker(country, gender):
    if country == "United States":
        faker = Faker('en_US')
    elif country == "United Kingdom":
        faker = Faker('en_GB')
    elif country == "Australia":
        faker = Faker('en_AU')
    elif country == "New Zealand":
        faker = Faker('en_NZ')
    elif country == "Canada":
        faker = Faker('en_CA')
    else:
        print("some problems here lol")
        return None

    if gender == "male":
        return faker.name_male()
    elif gender == "female":
        return faker.name_female()
    else:
        print("some problems here lol")
        return None

In [71]:
name_list =[]
for index in range(len(individuals_df)):
    row = individuals_df.iloc[index]
    country_name = row["country"]
    gender= row["gender"]
    name_list.append(choose_faker(country_name, gender))

individuals_df["name"] = name_list
individuals_df.head(10)

Unnamed: 0,country,city,gender,name
0,Australia,Burleson,male,Edward Hawkins
1,New Zealand,Cliffcrest,female,Jordan Morton-McCallum
2,Australia,Bloomington,male,David Evans
3,United States,Kirwan,female,Brittany Cohen
4,United Kingdom,Santa Paula,female,Sylvia Doherty-Day
5,United States,Frankfort,female,Helen Morales
6,Australia,New Milton,male,Ryan Ross
7,New Zealand,Cole Harbour,female,Daniel Jones
8,United Kingdom,Bangor,female,Mrs Sara Rhodes
9,United States,Martinsburg,male,Eric Long


## Sample id

In [78]:
def generate_sample_id(index):
    return f"sid_{str(index + 1).zfill(4)}"

individuals_df['id'] = individuals_df.index.to_series().apply(generate_sample_id)

individuals_df.head()

Unnamed: 0,country,city,gender,name,id
0,Australia,Burleson,male,Edward Hawkins,sid_0001
1,New Zealand,Cliffcrest,female,Jordan Morton-McCallum,sid_0002
2,Australia,Bloomington,male,David Evans,sid_0003
3,United States,Kirwan,female,Brittany Cohen,sid_0004
4,United Kingdom,Santa Paula,female,Sylvia Doherty-Day,sid_0005


### do not use this, this is hash

In [76]:
# import hashlib

# key = "pandas"

# # Function to generate hash IDs
# def generate_hash_id(row):
#     data_string = row.to_string() + key
#     hash_object = hashlib.sha256(data_string.encode())  # Create a SHA-256 hash object
#     return hash_object.hexdigest()[:8]  # Return the first 8 characters of the hash as the ID

# individuals_df['id'] = individuals_df.apply(generate_hash_id, axis=1)

# print(individuals_df)

# unique_ids_count = individuals_df["id"].nunique()
# unique_ids_count


          country          city  gender                    name        id
0       Australia      Burleson    male          Edward Hawkins  6b456fae
1     New Zealand    Cliffcrest  female  Jordan Morton-McCallum  4645c9c5
2       Australia   Bloomington    male             David Evans  30a6d455
3   United States        Kirwan  female          Brittany Cohen  ec4b8937
4  United Kingdom   Santa Paula  female      Sylvia Doherty-Day  3ae9bbf1
5   United States     Frankfort  female           Helen Morales  6e255aaa
6       Australia    New Milton    male               Ryan Ross  5bc737e3
7     New Zealand  Cole Harbour  female            Daniel Jones  18f13471
8  United Kingdom        Bangor  female         Mrs Sara Rhodes  4334411f
9   United States   Martinsburg    male               Eric Long  fc92ef95


10

## Gene expression - Diabetes
- 3 normal distribution
4, 1
7, 5
9, 3
- 4 bi normal distribution
- 3 log distribtuion

In [92]:
def generate_positive_normal(mean, std):
    while True:
        num = np.random.normal(mean, std)
        if num > 0:
            return num

def generate_bimodal_distribution(mean1, std1, mean2, std2):
    value = np.random.choice([np.random.normal(mean1, std1), np.random.normal(mean2, std2)])
    return max(value, 0)

def generate_bimodal_gender(gender, mean1, std1, mean2, std2):
    if gender == "male":
        value = np.random.normal(mean1, std1)
    elif gender == "female":
        value = np.random.normal(mean2, std2)
    return max(value, 0)

In [93]:
individuals_df['gene 1'] = individuals_df.apply(lambda x: generate_positive_normal(4, 1), axis=1)
individuals_df['gene 2'] = individuals_df.apply(lambda x: generate_positive_normal(7, 5), axis=1)
individuals_df['gene 3'] = individuals_df.apply(lambda x: generate_positive_normal(9, 3), axis=1)
individuals_df['gene 4'] = individuals_df.apply(lambda x: generate_bimodal_distribution(3, 2, 10, 2), axis=1)
individuals_df['gene 5'] = individuals_df.apply(lambda x: generate_bimodal_distribution(5, 1, 1, 1), axis=1)
individuals_df['gene 6'] = individuals_df.apply(lambda x: generate_bimodal_distribution(12, 5, 3, 3), axis=1)
individuals_df['gene 7'] = individuals_df.apply(lambda x: generate_bimodal_distribution(8, 3, 2, 1), axis=1)
individuals_df['gene 8'] = individuals_df.apply(lambda x: generate_bimodal_gender(x["gender"], 2, 2, 7, 2), axis=1)
individuals_df['gene 9'] = individuals_df.apply(lambda x: generate_bimodal_gender(x["gender"], 9, 1, 3, 1), axis=1)
individuals_df['gene 10'] = individuals_df.apply(lambda x: generate_bimodal_gender(x["gender"], 2, 2, 2, 2), axis=1)
individuals_df.head()

Unnamed: 0,country,city,gender,name,id,gene 1,gene 2,gene 3,gene 4,gene 5,gene 6,gene 7,gene 8,gene 9,gene 10
0,Australia,Burleson,male,Edward Hawkins,sid_0001,4.094302,3.71315,10.793932,12.87788,4.372767,8.674096,7.11556,1.587884,8.533223,1.635237
1,New Zealand,Cliffcrest,female,Jordan Morton-McCallum,sid_0002,3.572754,9.373469,11.907446,1.917179,0.0,8.168822,4.858804,6.97664,2.239874,2.049752
2,Australia,Bloomington,male,David Evans,sid_0003,5.706054,5.218115,2.51851,7.294403,0.19934,12.588782,4.571158,2.146009,8.434324,1.070701
3,United States,Kirwan,female,Brittany Cohen,sid_0004,2.065258,2.206684,8.266153,8.709177,4.158358,17.767508,3.137957,6.359655,1.957369,2.677642
4,United Kingdom,Santa Paula,female,Sylvia Doherty-Day,sid_0005,6.782803,4.120628,10.5112,9.41234,1.421557,3.994561,12.988639,3.602498,2.465219,2.951582
