In [1]:
# pandas is a library that provides useful data structures and data analysis tools.
# It is well suited for many different kinds of data:
# . Tabular data with heterogeneously-typed columns, as in an SQL table or Excel spreadsheet
# . Ordered and unordered (not necessarily fixed-frequency) time series data.
# . Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels
# . Any other form of observational / statistical data sets. The data actually need not be labeled 
#   at all to be placed into a pandas data structure. 

import numpy as np
import pandas as pd

In [2]:
# Pandas Data Structure


In [3]:
# . Series
#   . A one-dimensional labeled array capable of holding any data type.
# . Creating a Series from an array of values and an array of labels:
data = np.array([1, 2, 3, 4])
index = np.array(list('abcd'))
s = pd.Series(data=data, index=index)
print(s)
# . "data" may also be a list, tuple, dict, numpy.ndarray, ...


a    1
b    2
c    3
d    4
dtype: int32


In [4]:
# A plain list gets a default integer index (0, 1, 2, ...).
s = pd.Series(list(range(1, 5)))
print(s, end='\n\n')
# A dict supplies both the labels (its keys) and the values.
x = dict(a=0, b=1)
s = pd.Series(x)
print(s)

0    1
1    2
2    3
3    4
dtype: int64

a    0
b    1
dtype: int64


In [5]:
# . The index must have the same length as the data
s = pd.Series(list(range(4)), index=list('abcd'))
print(s)

a    0
b    1
c    2
d    3
dtype: int64


In [6]:
# . Series elements can be accessed by their index label
print(s['b'])
print('')
# . Several labels can be selected at once by passing a *list* of labels.
#   pandas does not accept a set as an indexer (deprecated in pandas 1.5,
#   TypeError since pandas 2.0), and a set has no defined order anyway,
#   so the set is converted to a sorted list before indexing.
S = {'a', 'c'}
print(s[sorted(S)])

1

a    0
c    2
dtype: int64


In [7]:
# . Data frame
#  . 2-dimensional labeled data structure with columns of potentially different types
#  . like a spreadsheet or SQL table, or a dict of Series objects
#  . accepts many different kinds of input:
#    . Dict of 1D ndarrays, lists, dicts, or Series
#    . 2-D numpy.ndarray
#    . Structured or record ndarray
#    . A Series
#    . Another DataFrame


In [8]:
# From a dict of Series (or a dict of dicts).
# The dict keys become the column labels; the row index is the union of the
# Series' labels, and positions a Series has no value for are filled with NaN.
d = {
    'a': pd.Series([1, 2, 3, 4], index=list('abcd')),
    'b': pd.Series([5, 6, 7, 8], index=list('cabx')),
}
df = pd.DataFrame(d)
print(df)

     a    b
a  1.0  6.0
b  2.0  7.0
c  3.0  5.0
d  4.0  NaN
x  NaN  8.0


In [9]:
# The `index` and `columns` parameters.
# `index` picks which row labels to keep from the dict of Series `d`.
df = pd.DataFrame(d, index=list('abc'))
print(df, end='\n\n')
# `columns` picks which columns to build; a requested column that is not a
# key of `d` (here 'd') is filled with NaN.
df = pd.DataFrame(d, index=list('abc'), columns=['a', 'd'])
print(df)

   a  b
a  1  6
b  2  7
c  3  5

   a    d
a  1  NaN
b  2  NaN
c  3  NaN


In [10]:
# From a list of dicts: each dict is one row and its keys are the column labels.
# A key that is missing from a row shows up as NaN in that row.
x = [dict(a=1, b=2), dict(a=3, b=4, c=5)]
# Default integer row labels.
print(pd.DataFrame(x), end='\n\n')
# Explicit row labels.
print(pd.DataFrame(x, index=['first', 'second']), end='\n\n')
# Explicit row labels, restricted to a subset of the columns.
print(pd.DataFrame(x, index=['first', 'second'], columns=['a', 'c']), end='\n\n')


   a  b    c
0  1  2  NaN
1  3  4  5.0

        a  b    c
first   1  2  NaN
second  3  4  5.0

        a    c
first   1  NaN
second  3  5.0

