# Dummy Data Examples

It's often useful to create some dummy dataframes to experiment with pandas functions.

In [1]:
import pandas as pd
import numpy as np

### Generating random test data with NumPy

In [2]:
# Standard-normal samples: 100 rows by 4 columns.
a = np.random.randn(100, 4)

# Uniformly distributed floats taken from the interval [0.0, 1.0).
b = np.random.uniform(0.0, 1.0, (100, 4))

# Uniformly distributed integers from 0 up to (but excluding) 1000.
c = np.random.randint(low=0, high=1000, size=(100, 4))

# Labels picked uniformly at random from a fixed list of choices.
d = np.random.choice(['C1', 'C2', 'C3'], (100, 4))

### Basic DataFrames

In [3]:
# 100 rows of standard-normal values spread over four columns named A-D.
df = pd.DataFrame(data=np.random.randn(100, 4), columns=['A', 'B', 'C', 'D'])
df.head()

Unnamed: 0,A,B,C,D
0,0.246948,0.088561,-0.46913,1.733602
1,-0.292812,-1.363095,0.935763,1.818374
2,-1.45681,-1.143127,-0.220597,-0.212127
3,0.393341,0.717071,0.901208,-1.078046
4,0.110282,1.980008,-0.565658,2.103793


In [4]:
# Four integer columns (A-D) holding random values in the range 0..999.
df = pd.DataFrame(
    np.random.randint(low=0, high=1000, size=(100, 4)),
    columns=['A', 'B', 'C', 'D'],
)
# Two independently drawn label columns, one random label per row.
# The labels are plain strings, not the pandas `category` dtype.
for cat_col in ('CAT1', 'CAT2'):
    df[cat_col] = np.random.choice(['C1', 'C2', 'C3'], size=df.shape[0])
df.head()

Unnamed: 0,A,B,C,D,CAT1,CAT2
0,186,594,574,75,C3,C2
1,786,37,701,306,C3,C2
2,207,308,857,298,C1,C2
3,583,105,430,621,C1,C1
4,122,444,125,973,C2,C2


### Time-Series DataFrames

In [5]:
# Four consecutive calendar-quarter end dates, beginning with the first
# quarter end on or after 2017-01-01 (i.e. 2017-03-31).
#
# An explicit QuarterEnd offset replaces the 'Q' string alias: the alias has
# been deprecated since pandas 2.2 (renamed to 'QE'), while 'QE' itself is not
# understood by older releases. The offset object produces the same dates on
# both sides of that rename. startingMonth=12 matches what 'Q' (= 'Q-DEC')
# stood for: quarters ending in March, June, September and December.
sr_dates = pd.Series(
    pd.date_range('2017', periods=4, freq=pd.offsets.QuarterEnd(startingMonth=12))
)
sr_dates

0   2017-03-31
1   2017-06-30
2   2017-09-30
3   2017-12-31
dtype: datetime64[ns]

In [6]:
# 100 timestamps spaced one second apart, timezone-aware (UTC), starting at
# midnight on 6 March 2012.
#
# - The start date is written in ISO form (YYYY-MM-DD); the previous
#   '3/6/2012' literal relied on month-first parsing and is easy to misread
#   as 3 June.
# - The frequency uses the lowercase 's' alias; the uppercase 'S' spelling
#   has been deprecated since pandas 2.2.
times = pd.date_range('2012-03-06 00:00', periods=100, freq='s', tz='UTC')

# One unnamed column (label 0) of random single-digit integers (0..9),
# indexed by the timestamps above.
df = pd.DataFrame(np.random.randint(10, size=(100, 1)), index=times)
df.head()

Unnamed: 0,0
2012-03-06 00:00:00+00:00,6
2012-03-06 00:00:01+00:00,9
2012-03-06 00:00:02+00:00,1
2012-03-06 00:00:03+00:00,8
2012-03-06 00:00:04+00:00,0


### MultiIndex DataFrames

In [7]:
# Two-level column header: every semester paired with every subject.
header = pd.MultiIndex.from_product(
    [['Semester1', 'Semester2'], ['Maths', 'Science']]
)
# One row of values per person, in the same order as the header columns.
data = [
    [12, 45, 67, 56],
    [78, 89, 45, 67],
    [45, 67, 89, 90],
    [67, 44, 56, 55],
]

df = pd.DataFrame(
    data,
    index=['Alisa', 'Bobby', 'Cathrine', 'Jack'],
    columns=header,
)
df

Unnamed: 0_level_0,Semester1,Semester1,Semester2,Semester2
Unnamed: 0_level_1,Maths,Science,Maths,Science
Alisa,12,45,67,56
Bobby,78,89,45,67
Cathrine,45,67,89,90
Jack,67,44,56,55


In [8]:
# Column labels: every Yes/No combination of the two named levels.
colidx = pd.MultiIndex.from_product(
    [['Yes', 'No'], ['Yes', 'No']],
    names=['Heart Disease', 'High Blood Pressure'],
)
# Row labels: every combination of sex and marital status.
rowidx = pd.MultiIndex.from_product(
    [['Female', 'Male'], ['Single', 'Married']],
    names=['Sex', 'Marital Status'],
)

# Fill the grid with random integers in 0..99, one cell per row/column pair.
df = pd.DataFrame(
    np.random.randint(0, 100, size=(len(rowidx), len(colidx))),
    index=rowidx,
    columns=colidx,
)
df

Unnamed: 0_level_0,Heart Disease,Yes,Yes,No,No
Unnamed: 0_level_1,High Blood Pressure,Yes,No,Yes,No
Sex,Marital Status,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Female,Single,95,47,87,80
Female,Married,25,24,32,11
Male,Single,64,8,24,83
Male,Married,85,51,87,41


In [10]:
# Three outer labels, each paired with the same two inner labels: six columns.
# The same index can be built from explicit tuples:
#   pd.MultiIndex.from_tuples([(x, y) for x in ['A', 'B', 'C'] for y in ['O', 'I']])
col_idx = pd.MultiIndex.from_product([list('ABC'), list('OI')])

# Ten rows of standard-normal values, one column per (outer, inner) pair.
df = pd.DataFrame(data=np.random.randn(10, len(col_idx)), columns=col_idx)
df.head()



Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,O,I,O,I,O,I
0,-0.15321,1.874311,0.698654,0.856726,-1.640441,-1.386755
1,-0.998808,-0.132932,-0.227019,0.005667,0.287349,-0.198939
2,-0.279573,2.360184,-0.58959,0.849242,-0.936124,-1.089161
3,-1.236291,0.673723,0.235151,-0.635083,-0.049784,0.913887
4,1.559644,1.028673,-0.175795,0.220123,0.247714,-1.221123
