# Introduction to Data Structures
Reference: pandas documentation, http://pandas.pydata.org/pandas-docs/stable/dsintro.html

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Series

In [5]:
# Three common ways to construct a Series:
#   pd.Series(values, labels)         -> positional arguments are (data, index)
#   pd.Series(values, index=labels)   -> index supplied by keyword
#   pd.Series({label: value, ...})    -> built from a dictionary

# Build a Series from two parallel lists: the values and their labels
data = [1, 2, 3, 4, 5]
index = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(data=data, index=index)
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [11]:
# Build a Series from a dictionary: the keys are used as the index labels.
# The one float value (0.) is enough to make the whole Series float64.
data = dict(a=0., b=1, c=2, d=3, e=4)
s = pd.Series(data)
s

a    0
b    1
c    2
d    3
e    4
dtype: float64

In [100]:
# Inspect the index (the labels) attached to the Series.
# NOTE(review): `s` is rebound by several cells, so the result depends on
# which of them ran last.
s.index

Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')

In [17]:
# A Series can carry a name; it is passed via `name=` and shown in the repr
s = pd.Series(data=np.random.randn(10), name='nameOfSeries')
s

0    0.809086
1    0.513723
2    2.978490
3    0.423159
4   -0.269855
5    1.561544
6   -1.268294
7    0.668495
8    2.359559
9    0.210141
Name: nameOfSeries, dtype: float64

## Working with Series

In [13]:
# Rebuild the letter-indexed Series so this section does not depend on which
# cell last assigned `s`: the cell just above rebinds `s` to random data with
# an integer index, which would make the label lookups in this section fail
# on a top-to-bottom run.
s = pd.Series({'a': 0., 'b': 1, 'c': 2, 'd': 3, 'e': 4})

# Only the last expression of a cell is displayed; the earlier ones are
# evaluated silently.
s.iloc[0]       # take the i-th element by position (explicit, no label fallback)
s.iloc[0:3]     # slice by position
s[s > 3]        # filter with a boolean mask
np.power(s, 2)  # NumPy math operations apply element-wise and keep the index

a     0
b     1
c     4
d     9
e    16
dtype: float64

In [14]:
# Assumes `s` is the letter-indexed Series ('a'..'e') — TODO confirm it has
# not been rebound by another cell before running this one.
'f' in s                 # membership test checks the index labels
s.iloc[4] == s['e']      # same element reached by position (.iloc) and by label
# s['f']                 # would throw an exception: 'f' is not in the index
s.get('f', -999.25)      # if not found, return the given value (default None)

-999.25

In [3]:
#numpy vectorized operations
#this crashed my computer when range went past 7
for i in range(2,5):
    print i
    mult = int(pow(10,i))
#     print mult
    s = pd.Series(np.random.randn(mult))
    x = %timeit -o s ** s;
    y = %timeit -o [x**x for x in s];
    print y.best/x.best

2
10000 loops, best of 3: 73.2 µs per loop
10000 loops, best of 3: 49.5 µs per loop
0.676212428563
3
10000 loops, best of 3: 94.5 µs per loop
1000 loops, best of 3: 354 µs per loop
3.74856641356
4
1000 loops, best of 3: 340 µs per loop
100 loops, best of 3: 3.53 ms per loop
10.382936299


## Working with Dates

In [120]:
# Twelve weekly timestamps anchored on Mondays, counted from 2016-01-01.
# freq can be e.g. 'H', 'BH', 'D', 'W', 'W-DAY', 'M', 'BM'
dates = pd.date_range(start='20160101', periods=12, freq='W-MON')
dates

DatetimeIndex(['2016-01-04', '2016-01-11', '2016-01-18', '2016-01-25',
               '2016-02-01', '2016-02-08', '2016-02-15', '2016-02-22',
               '2016-02-29', '2016-03-07', '2016-03-14', '2016-03-21'],
              dtype='datetime64[ns]', freq='W-MON')

## DataFrame

In [75]:
# Create a Series of n random values (it gets the default integer index 0..n-1)
n = 10
s = pd.Series(np.random.randn(n))
# Create the index labels we want to use: 10, 11, ..., n + 9
ind = np.arange(n) + 10
# Pitfall: passing a Series together with a different index does NOT raise and
# does NOT relabel the data. pandas aligns on the labels the Series already
# has (0..n-1); none of them appear in `ind`, so every value in `df` is NaN.
df = pd.DataFrame(s, index=ind)
# To attach the new labels, hand over the raw values instead of the Series
pd.DataFrame(s.values, index=ind)

Unnamed: 0,0
10,0.501542
11,-1.549232
12,0.534121
13,-0.347528
14,0.103708
15,-0.47669
16,-0.347374
17,-0.627531
18,0.740117
19,-2.154983


In [90]:
# Create a DataFrame from a dict: each key becomes a column name and each
# list supplies that column's values.
# (Named `column_data` rather than `dict` so the built-in `dict` type is not
# shadowed for the rest of the kernel session.)
column_data = {'a': [0, 1], 'b': [2, 3], 'c': [4, 5], 'd': [6, 7], 'e': [8, 9]}
pd.DataFrame(column_data)

Unnamed: 0,a,b,c,d,e
0,0,2,4,6,8
1,1,3,5,7,9


In [92]:
# 5x5 DataFrame of zeros with labelled rows and named columns
array_zeros = np.zeros(shape=(5, 5))
df = pd.DataFrame(
    data=array_zeros,
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['first', 'second', 'third', 'fourth', 'fifth'],
)
df

Unnamed: 0,first,second,third,fourth,fifth
a,0,0,0,0,0
b,0,0,0,0,0
c,0,0,0,0,0
d,0,0,0,0,0
e,0,0,0,0,0


In [97]:
# Show the row labels and the column labels of `df`.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(df.index)
print(df.columns)

Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')
Index([u'first', u'second', u'third', u'fourth', u'fifth'], dtype='object')
