In [248]:
import numpy as np
import pandas as pd

# Example of sampling

In [486]:
# Define statistical properties of the population
mean_height = 180
sd_height = 10

mean_weight = 80
sd_weight = 12

mean_income = 650000
sd_income = 50000

# Diagonal matrix of standard deviations, in (Height, Weight, Income) order
std_matrix = np.diag([sd_height, sd_weight, sd_income])


# Specify the desired correlation between variables (same variable order)
correlation_matrix = np.array([
    [1, .8, 0],
    [.8, 1, 0],
    [0, 0, 1],
])

# Covariance = D @ R @ D, where D holds the standard deviations and R the correlations
covariance_matrix = std_matrix @ (correlation_matrix @ std_matrix)

# Sampling from the multivariate normal distribution.
# The global NumPy generator is seeded so that a fresh "Restart & Run All"
# reproduces the same sample; later cells that draw from np.random benefit too.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
num_samples = 10000
samples = np.random.multivariate_normal(
    (mean_height, mean_weight, mean_income), covariance_matrix, num_samples
)

# Adding samples to a pandas DataFrame and checking the realised correlations
samples_df = pd.DataFrame(samples, columns=["Height", "Weight", "Income"])
samples_df.corr()

Unnamed: 0,Height,Weight,Income
Height,1.0,0.798037,-0.001117
Weight,0.798037,1.0,0.001052
Income,-0.001117,0.001052,1.0


In [487]:
# Count the rows on each side of the height thresholds with boolean masks
taller_than_160 = (samples_df["Height"] > 160).sum()
shorter_than_200 = (samples_df["Height"] < 200).sum()

print(f"Percentage of people taller than 160 cm: {taller_than_160/num_samples*100}")
print(f"Percentage of people shorter than 200 cm: {shorter_than_200/num_samples*100}")

Percentage of people taller than 160 cm: 97.8
Percentage of people shorter than 200 cm: 97.78999999999999


In [488]:
# Show the descriptive statistics actually realised in the sample
# (compare with the population means / standard deviations defined above)
samples_df.describe()

Unnamed: 0,Height,Weight,Income
count,10000.0,10000.0,10000.0
mean,179.99155,79.888963,649598.269699
std,9.924684,11.984878,49868.644461
min,141.511664,29.063885,468939.192382
25%,173.309528,71.923774,615896.839617
50%,179.991657,79.742033,649462.917893
75%,186.743389,88.046169,683098.003017
max,216.023414,129.020317,844214.743765


In [491]:
# Tallest person in the sample.
# idxmax() returns an index *label*, so the row is fetched with the label-based
# .loc accessor; position-based .iloc only agrees while the index is the default 0..n-1.
print(samples_df.loc[samples_df["Height"].idxmax()])


Height       216.023414
Weight       112.395856
Income    675775.988989
Name: 1301, dtype: float64


In [492]:
# Shortest person in the sample.
# idxmin() returns an index *label*, so the row is fetched with the label-based
# .loc accessor; position-based .iloc only agrees while the index is the default 0..n-1.
print(samples_df.loc[samples_df["Height"].idxmin()])

Height       141.511664
Weight        35.004598
Income    679091.003243
Name: 9808, dtype: float64


# Utilize ML models

In [507]:
from sklearn.linear_model import LogisticRegression

# Loading dataset to create gender generator.
# The file is semicolon-separated; pandas opens the path itself, so no
# explicit file handle is needed.
data = pd.read_csv("cardio_train.csv", sep=";")
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [None]:
# Fitting a logistic-regression classifier to predict gender from height and weight

X = data[["height", "weight"]]
y = data["gender"]
# Fit on the bare array so the model is not tied to the training column names:
# predictions are later requested with plain [height, weight] lists, not a DataFrame.
clf = LogisticRegression().fit(X.to_numpy(), y)

In [512]:
# Assigning gender based on height and weight

# Labels in the same order as the probability columns returned by
# clf.predict_proba (which follow the order of clf.classes_).
# NOTE(review): assumes the training data encodes gender as 1 = female,
# 2 = male — confirm against the dataset documentation.
GENDER_CATEGORIES = ["female", "male"]


def generate_gender(x):
    """Draw a random gender label for a single sample row.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide the keys "Height" and "Weight".

    Returns
    -------
    str
        One of GENDER_CATEGORIES, drawn with the class probabilities that
        the fitted classifier ``clf`` predicts for this height/weight pair.
    """
    probability = clf.predict_proba([[x["Height"], x["Weight"]]])[0]
    return np.random.choice(GENDER_CATEGORIES, 1, p=probability)[0]


def generate_genders(frame):
    """Draw a random gender label for every row of ``frame`` in one batch.

    Same sampling rule as ``generate_gender``, but the classifier is queried
    once for the whole frame instead of once per row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns "Height" and "Weight".

    Returns
    -------
    numpy.ndarray
        One label from GENDER_CATEGORIES per row of ``frame``.

    Raises
    ------
    ValueError
        If the classifier does not return exactly one probability column
        per entry of GENDER_CATEGORIES.
    """
    probabilities = clf.predict_proba(frame[["Height", "Weight"]].to_numpy())
    if probabilities.shape[1] != len(GENDER_CATEGORIES):
        raise ValueError(
            "classifier returned %d probability columns, expected %d"
            % (probabilities.shape[1], len(GENDER_CATEGORIES))
        )
    # A uniform draw below P(first class) selects the first label, otherwise the second.
    draws = np.random.random(len(frame))
    return np.where(draws < probabilities[:, 0], GENDER_CATEGORIES[0], GENDER_CATEGORIES[1])


samples_df["Gender"] = generate_genders(samples_df)


In [520]:
# Report how many samples were assigned to each gender.

gender_counts = samples_df["Gender"].value_counts()

print(f"Number of females in dataset: {gender_counts.get('female', 0)}")
print(f"Number of males in dataset: {gender_counts.get('male', 0)}")

Number of females in dataset: 2076
Number of males in dataset: 7924


In [523]:
# Preview the first rows, now including the generated Gender column
samples_df.head(20)

Unnamed: 0,Height,Weight,Income,Gender
0,191.224422,97.941187,657794.273582,male
1,190.236523,87.007991,555339.84303,male
2,166.404665,64.866459,614950.745161,male
3,180.125193,80.592521,593325.768395,male
4,182.353297,84.84571,728360.526414,male
5,194.155231,80.428162,629966.695875,male
6,199.330397,98.981812,610362.864634,male
7,175.146898,72.357179,711794.827708,male
8,176.343555,74.698695,710905.880681,female
9,178.18377,79.28756,682831.165805,female


# Enter deep neural networks

In [524]:
!pip install tensorflow
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time

Collecting tensorflow
  Downloading tensorflow-2.4.1-cp36-cp36m-manylinux2010_x86_64.whl (394.3 MB)
[K     |████████████████████████████████| 394.3 MB 11 kB/s s eta 0:00:01   |█▏                              | 14.0 MB 9.0 MB/s eta 0:00:43     |██████████▊                     | 132.1 MB 10.7 MB/s eta 0:00:25     |█████████████▋                  | 167.4 MB 19.2 MB/s eta 0:00:12     |██████████████▎                 | 175.3 MB 4.8 MB/s eta 0:00:46 MB/s eta 0:00:46     |███████████████                 | 184.7 MB 4.8 MB/s eta 0:00:44     |█████████████████▍              | 214.7 MB 8.6 MB/s eta 0:00:21     |██████████████████▍             | 226.6 MB 41.1 MB/s eta 0:00:05     |██████████████████▉             | 232.4 MB 41.1 MB/s eta 0:00:04:00:08     |██████████████████████          | 270.6 MB 11.9 MB/s eta 0:00:11     |██████████████████████▊         | 280.2 MB 2.0 MB/s eta 0:00:57     |███████████████████████▍        | 287.7 MB 2.0 MB/s eta 0:00:53     |█████████████████████████▎      | 311

In [532]:
# Read the names corpus explicitly as UTF-8 (the same encoding that is assumed
# when the text is later split into characters) instead of the platform default.
with open("navn.txt", encoding="utf-8") as file:
    navn = file.read()

# Character vocabulary: every distinct character in the corpus, sorted
vocab = sorted(set(navn))
# Forward lookup layer: character -> integer id
ids_from_chars = preprocessing.StringLookup(vocabulary=list(vocab))
# Inverse lookup layer built from the forward layer's vocabulary: integer id -> character
chars_from_ids = preprocessing.StringLookup(vocabulary=ids_from_chars.get_vocabulary(), invert=True)

In [542]:
# Split the corpus into single characters, then map each character to its id
navn_chars = tf.strings.unicode_split(navn, input_encoding='UTF-8')
ids = ids_from_chars(navn_chars)

In [543]:
# Inspect the encoded id tensor
ids

<tf.Tensor: shape=(197,), dtype=int64, numpy=
array([10, 29, 31, 15,  2, 10, 29, 31, 15, 22,  2,  3, 27, 27, 15,  2,  3,
       26, 26, 15,  2,  9, 15, 24, 15,  2,  9, 15, 23, 15,  2,  9, 15, 35,
       15,  2, 11, 26, 23, 34, 23, 15,  2,  3, 27, 23, 26, 23, 19,  2, 13,
       29, 20, 23, 19,  2, 13, 29, 30, 22, 23, 19,  2,  8, 19, 15, 22,  2,
        8, 19, 15,  2, 13, 29, 20, 23, 15,  2, 13, 29, 30, 22, 23, 15,  2,
        6, 28, 21, 31, 23, 18,  2,  7, 15, 25, 29, 16,  2,  7, 15, 17, 29,
       16,  2,  3, 27, 23, 26,  2, 10, 29, 15, 22,  2, 10, 29, 15,  2, 11,
       26, 23, 34, 19, 31,  2,  4, 23, 26, 23, 30,  2,  4, 23, 26, 26, 23,
       30,  2, 12, 22, 23, 26, 23, 30,  2, 12, 22, 23, 26, 26, 23, 30,  2,
       14, 23, 26, 26, 23, 15, 27,  2,  8, 33, 17, 15, 32,  2,  8, 33, 25,
       15, 32,  2,  8, 23, 15, 27,  2,  5, 19, 28, 31, 23, 25,  2, 11, 32,
       25, 15, 31,  2, 11, 17, 32, 15, 31,  2])>

In [544]:
# Map the ids back to characters with the inverse lookup layer and inspect the result
chars = chars_from_ids(ids)
chars

<tf.Tensor: shape=(197,), dtype=string, numpy=
array([b'N', b'o', b'r', b'a', b'\n', b'N', b'o', b'r', b'a', b'h', b'\n',
       b'E', b'm', b'm', b'a', b'\n', b'E', b'l', b'l', b'a', b'\n', b'M',
       b'a', b'j', b'a', b'\n', b'M', b'a', b'i', b'a', b'\n', b'M', b'a',
       b'y', b'a', b'\n', b'O', b'l', b'i', b'v', b'i', b'a', b'\n', b'E',
       b'm', b'i', b'l', b'i', b'e', b'\n', b'S', b'o', b'f', b'i', b'e',
       b'\n', b'S', b'o', b'p', b'h', b'i', b'e', b'\n', b'L', b'e', b'a',
       b'h', b'\n', b'L', b'e', b'a', b'\n', b'S', b'o', b'f', b'i', b'a',
       b'\n', b'S', b'o', b'p', b'h', b'i', b'a', b'\n', b'I', b'n', b'g',
       b'r', b'i', b'd', b'\n', b'J', b'a', b'k', b'o', b'b', b'\n', b'J',
       b'a', b'c', b'o', b'b', b'\n', b'E', b'm', b'i', b'l', b'\n', b'N',
       b'o', b'a', b'h', b'\n', b'N', b'o', b'a', b'\n', b'O', b'l', b'i',
       b'v', b'e', b'r', b'\n', b'F', b'i', b'l', b'i', b'p', b'\n', b'F',
       b'i', b'l', b'l', b'i', b'p', b'\n', b'P', b'h'

In [545]:
# Concatenate the decoded characters back into one byte string
joined_chars = tf.strings.reduce_join(chars, axis=-1)
joined_chars.numpy()

b'Nora\nNorah\nEmma\nElla\nMaja\nMaia\nMaya\nOlivia\nEmilie\nSofie\nSophie\nLeah\nLea\nSofia\nSophia\nIngrid\nJakob\nJacob\nEmil\nNoah\nNoa\nOliver\nFilip\nFillip\nPhilip\nPhillip\nWilliam\nLucas\nLukas\nLiam\nHenrik\nOskar\nOcsar\n'