# Dummy Data Examples

It's often useful to create some dummy dataframes to experiment with pandas functions.

In [19]:
import pandas as pd
import numpy as np

### Generating random test-data with numpy

In [25]:
# Seed NumPy's global (legacy) generator so that re-running this notebook from
# a fresh kernel reproduces the same dummy data in every cell that follows.
SEED = 42
np.random.seed(SEED)

# Standard-normal samples (mean 0, variance 1): 100 rows, 4 columns.
a = np.random.randn(100, 4)

# Uniformly distributed floats in the half-open interval [0.0, 1.0).
b = np.random.uniform(low=0.0, high=1.0, size=(100, 4))

# Uniformly distributed integers in [0, 1000); the upper bound is exclusive.
c = np.random.randint(0, 1000, size=(100, 4))

# Labels picked uniformly (with replacement) from a fixed list of choices.
d = np.random.choice(['C1', 'C2', 'C3'], size=(100, 4))

### Basic DataFrames

In [5]:
# 100 rows of standard-normal noise spread over four columns labelled A-D.
df = pd.DataFrame(
    data=np.random.randn(100, 4),
    columns=['A', 'B', 'C', 'D'],
)
df.head()

Unnamed: 0,A,B,C,D
0,-0.500661,1.367907,-0.191519,0.156243
1,2.471379,-0.495846,0.750718,0.256677
2,0.265474,1.138274,-0.795528,0.553536
3,-0.175151,0.711412,0.940609,1.778986
4,-0.274403,2.972132,1.34721,1.076192


In [18]:
# Four integer columns (values in [0, 1000)) plus two label columns whose
# values are drawn from a fixed set of categories. The labels are stored as
# plain strings, not as pandas `category` dtype.
df = pd.DataFrame(
    np.random.randint(0, 1000, size=(100, 4)),
    columns=['A', 'B', 'C', 'D'],
).assign(
    # Callables are evaluated in keyword order against the frame built so far,
    # so each label column gets exactly one label per existing row.
    CAT1=lambda frame: np.random.choice(['C1', 'C2', 'C3'], size=len(frame)),
    CAT2=lambda frame: np.random.choice(['C1', 'C2', 'C3'], size=len(frame)),
)
df.head()

Unnamed: 0,A,B,C,D,CAT1,CAT2
0,17,963,969,255,C1,C2
1,182,493,518,54,C2,C2
2,586,892,784,81,C3,C1
3,330,20,415,886,C3,C2
4,372,991,794,427,C3,C1


### Time-Series DataFrames

In [13]:
# Four consecutive calendar quarter-end dates, starting from the beginning of 2017.
# The 'Q' string alias is deprecated since pandas 2.2 (renamed to 'QE'), while
# 'QE' is rejected by older releases. The explicit offset object is accepted by
# both; startingMonth=12 anchors quarters on Mar/Jun/Sep/Dec, which is what the
# 'Q' alias stood for.
sr_dates = pd.Series(
    pd.date_range('2017', periods=4, freq=pd.offsets.QuarterEnd(startingMonth=12))
)
sr_dates

0   2017-03-31
1   2017-06-30
2   2017-09-30
3   2017-12-31
dtype: datetime64[ns]

In [15]:
# 100 timestamps at one-second spacing, timezone-aware (UTC), used as the index.
# - The start is written in ISO 8601 form: '3/6/2012' depends on month-first
#   parsing to mean 6 March 2012, which is easy to misread.
# - Lowercase 's' is the current alias for seconds; the uppercase 'S' spelling
#   is deprecated since pandas 2.2.
times = pd.date_range('2012-03-06 00:00', periods=100, freq='s', tz='UTC')

# A single unnamed column (label 0) of random integers in [0, 10).
df = pd.DataFrame(np.random.randint(10, size=(100, 1)), index=times)
df.head()

Unnamed: 0,0
2012-03-06 00:00:00+00:00,1
2012-03-06 00:00:01+00:00,8
2012-03-06 00:00:02+00:00,3
2012-03-06 00:00:03+00:00,0
2012-03-06 00:00:04+00:00,2


### MultiIndex DataFrames

In [9]:
# Two-level column header: every semester paired with every subject.
header = pd.MultiIndex.from_product(
    [
        ['Semester1', 'Semester2'],
        ['Maths', 'Science'],
    ]
)

# One row of marks per student, ordered like the columns of `header`.
data = [
    [12, 45, 67, 56],
    [78, 89, 45, 67],
    [45, 67, 89, 90],
    [67, 44, 56, 55],
]

df = pd.DataFrame(
    data,
    index=['Alisa', 'Bobby', 'Cathrine', 'Jack'],
    columns=header,
)
df

Unnamed: 0_level_0,Semester1,Semester1,Semester2,Semester2
Unnamed: 0_level_1,Maths,Science,Maths,Science
Alisa,12,45,67,56
Bobby,78,89,45,67
Cathrine,45,67,89,90
Jack,67,44,56,55


In [46]:
# Column axis: every combination of the two yes/no health indicators.
colidx = pd.MultiIndex.from_product(
    [('Yes', 'No'), ('Yes', 'No')],
    names=['Heart Disease', 'High Blood Pressure'],
)

# Row axis: every combination of sex and marital status.
rowidx = pd.MultiIndex.from_product(
    [('Female', 'Male'), ('Single', 'Married')],
    names=['Sex', 'Marital Status'],
)

# One random integer in [0, 100) per cell; the shape is derived from the two
# indexes so it always matches them.
df = pd.DataFrame(
    data=np.random.randint(100, size=(len(rowidx), len(colidx))),
    index=rowidx,
    columns=colidx,
)
df

Unnamed: 0_level_0,Heart Disease,Yes,Yes,No,No
Unnamed: 0_level_1,High Blood Pressure,Yes,No,Yes,No
Sex,Marital Status,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,Single,73,2,15,13
Female,Married,2,3,53,10
Male,Single,69,74,74,89
Male,Married,43,51,34,31


In [43]:
# Two-level column index: each group (A, B, C) carries an 'O' and an 'I' column.
# The same index can be spelled out tuple by tuple:
#   pd.MultiIndex.from_tuples([(x, y) for x in ['A', 'B', 'C'] for y in ['O', 'I']])
col_idx = pd.MultiIndex.from_product(
    [
        ['A', 'B', 'C'],
        ['O', 'I'],
    ]
)

# Ten rows of standard-normal noise, one column per entry of `col_idx`.
df = pd.DataFrame(
    data=np.random.randn(10, len(col_idx)),
    columns=col_idx,
)
df



Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,O,I,O,I,O,I
0,-1.050761,0.422522,0.732426,-2.138614,-0.715674,0.022829
1,0.272496,-0.454574,0.819042,0.624458,-0.90428,-0.383154
2,-1.029386,1.453976,0.153515,0.734942,-0.130665,0.26289
3,1.578337,-0.374483,0.15374,0.999857,-1.288229,-0.331237
4,0.630885,1.211861,-1.18635,-1.355882,-0.260152,-1.539952
5,-1.17668,-1.259541,0.138799,-0.658577,0.297062,0.526809
6,0.317991,1.373705,-0.286015,0.149082,1.251834,-0.563052
7,-0.701673,-0.805064,0.285746,-1.191175,0.208019,-1.541519
8,0.364734,-0.653906,-0.89344,-0.193085,-0.187358,-0.538942
9,0.960923,0.79923,-0.898155,-0.433945,-0.277014,-0.125753
