In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
# %matplotlib notebook

In [2]:
# Read the seed table (-999 marks a missing value), discard incomplete rows
# and keep only the first ten complete records.
df = (
    pd.read_csv('key_to_generate_synthetic_data.csv', na_values=-999)
    .dropna()
    .iloc[:10]
)
# One independent integer in [7, 13] per row; the row's own values are not used.
df['randint'] = [random.randint(7, 13) for _ in range(len(df))]
# Running 0..n-1 position of each row in load order.
df['proxy'] = range(len(df))
# Remove leading/trailing whitespace from the species names.
df['species'] = df['species'].str.strip()

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,age,species,randint,proxy
0,38.6,African elephant,8,0
1,4.5,African giant pouched rat,9,1
2,14.0,Arctic Fox,9,2
4,69.0,Asian elephant,11,3
5,27.0,Baboon,8,4


In [4]:
# Reorder the rows from youngest to oldest; the original index labels are kept.
df = df.sort_values(by='age')

In [5]:
# Preview the first rows again, now in ascending age order.
df.head()

Unnamed: 0,age,species,randint,proxy
1,4.5,African giant pouched rat,9,1
10,7.0,Chinchilla,13,9
2,14.0,Arctic Fox,9,2
6,19.0,Big brown bat,7,5
5,27.0,Baboon,8,4


In [6]:
# Print column dtypes and non-null counts for the ten-row seed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 1 to 4
Data columns (total 4 columns):
age        10 non-null float64
species    10 non-null object
randint    10 non-null int64
proxy      10 non-null int64
dtypes: float64(1), int64(2), object(1)
memory usage: 400.0+ bytes


In [7]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
stats = df.describe()

In [8]:
# Display the summary table computed by df.describe().
stats

Unnamed: 0,age,randint,proxy
count,10.0,10.0,10.0
mean,28.75,9.7,4.5
std,19.857115,2.162817,3.02765
min,4.5,7.0,0.0
25%,15.25,8.0,2.25
50%,27.5,9.0,4.5
75%,36.55,11.0,6.75
max,69.0,13.0,9.0


In [9]:
# plt.hist(df['age'])
# plt.show()

In [10]:
# Age cut-offs separating the four buckets that score() tests in order:
# x < q1, then x < q2, then x < q3, otherwise the last bucket.
q1, q2, q3 = 7.0, 20.0, 35.0

# [low, high] bounds of the uniform multiplier score() draws for each bucket,
# from the first bucket (r1) to the last (r4).
r1, r2 = [0.5, 2.5], [2.0, 4.5]
r3, r4 = [4.0, 6.5], [6.0, 8.0]

In [11]:
# Bootstrap the seed rows into the synthetic population: draw rows with
# replacement so the ten seed records can expand to N_SYNTHETIC_ROWS rows.
# A fixed random_state makes the draw identical on every Restart & Run All;
# change SAMPLE_SEED to obtain a different (but still repeatable) sample.
N_SYNTHETIC_ROWS = 1500
SAMPLE_SEED = 42
ds = df.sample(n=N_SYNTHETIC_ROWS, replace=True, random_state=SAMPLE_SEED)

In [12]:
# Mean sampled age inside each bucket; score() divides by these values.
# The buckets are half-open [lower, upper), matching the `x < q` chain in
# score(): an age exactly equal to a threshold (e.g. 7.0 == q1) belongs to the
# upper bucket. With strict comparisons on both sides such rows were left out
# of every mean.
m1 = np.mean(ds[ds['age'] < q1].age)
m2 = np.mean(ds[(ds['age'] >= q1) & (ds['age'] < q2)].age)
m3 = np.mean(ds[(ds['age'] >= q2) & (ds['age'] < q3)].age)
m4 = np.mean(ds[ds['age'] >= q3].age)

In [13]:
# Replace every sampled age by a uniform draw between age_range[0] and
# age_range[1] times its value (i.e. within +/-30% of the original age).
# The `random` generator is seeded first so that a fresh Run All reproduces
# the same jittered ages, and the same draws in any later cell that keeps
# consuming this generator in execution order.
# NOTE: ds['age'] is overwritten in place, so re-running this cell on its own
# jitters the already-jittered ages a second time.
age_range = [0.7, 1.3]
JITTER_SEED = 42
random.seed(JITTER_SEED)
ds['age'] = ds['age'].apply(lambda x: random.uniform(x*age_range[0], x*age_range[1]))

In [14]:
def score(x):
    """Return a randomised score for the age ``x``.

    The age is placed in the first bucket whose upper cut-off (``q1``, ``q2``,
    ``q3``) it is strictly below, or in the last bucket otherwise. A multiplier
    is drawn uniformly from that bucket's ``[low, high]`` range (``r1``..``r4``)
    and the result is ``multiplier * x`` divided by the bucket's mean age
    (``m1``..``m4``). All of these are module-level names read at call time.
    """
    if x < q1:
        bounds, bucket_mean = r1, m1
    elif x < q2:
        bounds, bucket_mean = r2, m2
    elif x < q3:
        bounds, bucket_mean = r3, m3
    else:
        bounds, bucket_mean = r4, m4

    multiplier = random.uniform(bounds[0], bounds[1])
    return multiplier * x / bucket_mean

In [15]:
# Evaluate score() once per row on the (jittered) age and store the result.
ds['score'] = ds['age'].map(score)

In [16]:
# ds.head()

In [17]:
# Print column dtypes and non-null counts for the sampled frame, including
# the newly added score column.
ds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1500 entries, 5 to 6
Data columns (total 5 columns):
age        1500 non-null float64
species    1500 non-null object
randint    1500 non-null int64
proxy      1500 non-null int64
score      1500 non-null float64
dtypes: float64(2), int64(2), object(1)
memory usage: 70.3+ KB


In [18]:
# ds.describe()

In [19]:
# sns.swarmplot(x=ds['age'], y=ds['score'])
# plt.show()

In [20]:
# sns.swarmplot(x=ds['species'], y=ds['score'])
# plt.show()

In [21]:
# fig = plt.figure()
# ax = Axes3D(fig)

# ax.scatter(ds['age'], ds['proxy'], ds['score'], s=40)
# plt.show()

In [22]:
# Export only the columns that make up the published synthetic data set;
# the helper columns (randint, proxy) and the row index are not written.
export_columns = ['age', 'species', 'score']
save_to_csv_df = ds[export_columns]
save_to_csv_df.to_csv('synthetic_data.csv', index=False)