In [56]:
# Reference: 
# online free docs:          https://pandas.pydata.org/pandas-docs/stable/
# book old edition free:  https://www.safaribooksonline.com/library/view/python-data-science/9781491912126/
# book new edition pay: https://smile.amazon.com/Python-Data-Science-Handbook-Essential/dp/1491912057/

In [57]:
# pandas is a Python library used for data manipulation and analysis
# its two key data structures are the Series and the DataFrame

In [58]:
import numpy as np
import pandas as pd

In [59]:
##### pandas dataframes #####
# 2-dimensional data structure whose rows are labeled by an index and whose columns are labeled by column names
# analogous to spreadsheet or SQL table

In [60]:
##### creating dataframes #####

In [61]:
# create a pandas DataFrame from a nested list with default index and column labels
# lists, dictionaries, NumPy arrays, pd.Series and other pd.DataFrame objects can all be used to create a DataFrame

In [62]:
# nested list -> 2x3 frame; no labels are passed, so rows and columns are numbered from 0
lst = [
    [10, 20, 30],
    [100, 200, 300],
]
df = pd.DataFrame(lst)
df

Unnamed: 0,0,1,2
0,10,20,30
1,100,200,300


In [63]:
# same nested list, now with explicit row labels; columns keep the default 0..2 numbering
lst = [
    [10, 20, 30],
    [100, 200, 300],
]
idx = ['X', 'Y']
df = pd.DataFrame(lst, index=idx)
df

Unnamed: 0,0,1,2
X,10,20,30
Y,100,200,300


In [64]:
# nested list with both row labels (index) and column labels supplied explicitly
lst = [
    [10, 20, 30],
    [100, 200, 300],
]
idx = ['X', 'Y']
col = ['aa', 'bb', 'cc']
df = pd.DataFrame(lst, index=idx, columns=col)
df

Unnamed: 0,aa,bb,cc
X,10,20,30
Y,100,200,300


In [65]:
# build a dataframe from a dictionary of lists: each key becomes a column name,
# and the row labels are supplied explicitly through index
lst1 = [10, 20, 30]
lst2 = [100, 200, 300]
df = pd.DataFrame({'X': lst1, 'Y': lst2}, index=['aa', 'bb', 'cc'])
df

Unnamed: 0,X,Y
aa,10,100
bb,20,200
cc,30,300


In [66]:
# dictionary of dictionaries: outer keys become column names, inner keys become row labels
dct1 = dict(aa=10, bb=20, cc=30)
dct2 = dict(aa=100, bb=200, cc=300)
df = pd.DataFrame({'X': dct1, 'Y': dct2})
df

Unnamed: 0,X,Y
aa,10,100
bb,20,200
cc,30,300


In [67]:
# build a dataframe from a dictionary of pandas series that share the same index labels
# seeds NumPy's global RNG; nothing in this cell draws random numbers,
# the next random draw is the np.random.normal call further down
np.random.seed(0)
ser1 = pd.Series(data=[10, 20, 30], index=['aa', 'bb', 'cc'])
ser2 = pd.Series(data=[100, 200, 300], index=['aa', 'bb', 'cc'])
df = pd.DataFrame({'X': ser1, 'Y': ser2})
df

Unnamed: 0,X,Y
aa,10,100
bb,20,200
cc,30,300


In [68]:
##### dataframe attributes #####

In [69]:
# 3x5 frame of standard-normal draws used by the attribute and method examples below.
# The seed is set in this cell so that re-running it on its own reproduces the same
# values, instead of relying on the seed call in the earlier series cell.
np.random.seed(0)
df = pd.DataFrame(
    data=np.random.normal(size=(3, 5)),
    index=['X', 'Y', 'Z'],
    columns=['aa', 'bb', 'cc', 'dd', 'ee'],
)

In [70]:
df.shape  # (number of rows, number of columns) as a tuple

(3, 5)

In [71]:
df.size  # total number of elements (rows * columns)

15

In [72]:
df.index  # row labels

Index(['X', 'Y', 'Z'], dtype='object')

In [73]:
df.columns  # column labels

Index(['aa', 'bb', 'cc', 'dd', 'ee'], dtype='object')

In [74]:
df.values  # the frame's data as a numpy array, without the row/column labels

array([[ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ,  1.86755799],
       [-0.97727788,  0.95008842, -0.15135721, -0.10321885,  0.4105985 ],
       [ 0.14404357,  1.45427351,  0.76103773,  0.12167502,  0.44386323]])

In [75]:
##### dataframe methods #####

In [76]:
df.head() # returns the first n rows; no n is passed here, and all 3 rows of this frame are shown

Unnamed: 0,aa,bb,cc,dd,ee
X,1.764052,0.400157,0.978738,2.240893,1.867558
Y,-0.977278,0.950088,-0.151357,-0.103219,0.410599
Z,0.144044,1.454274,0.761038,0.121675,0.443863


In [77]:
df.info()  # prints the index range, per-column non-null counts and dtypes, and memory usage

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, X to Z
Data columns (total 5 columns):
aa    3 non-null float64
bb    3 non-null float64
cc    3 non-null float64
dd    3 non-null float64
ee    3 non-null float64
dtypes: float64(5)
memory usage: 144.0+ bytes


In [78]:
df.describe()  # per-column summary statistics: count, mean, std, min, quartiles, max

Unnamed: 0,aa,bb,cc,dd,ee
count,3.0,3.0,3.0,3.0,3.0
mean,0.310273,0.93484,0.529473,0.753116,0.90734
std,1.378204,0.527224,0.599579,1.29335,0.83174
min,-0.977278,0.400157,-0.151357,-0.103219,0.410599
25%,-0.416617,0.675123,0.30484,0.009228,0.427231
50%,0.144044,0.950088,0.761038,0.121675,0.443863
75%,0.954048,1.202181,0.869888,1.181284,1.155711
max,1.764052,1.454274,0.978738,2.240893,1.867558


In [79]:
df.transpose() # swaps rows and columns; df.T is the shorthand

Unnamed: 0,X,Y,Z
aa,1.764052,-0.977278,0.144044
bb,0.400157,0.950088,1.454274
cc,0.978738,-0.151357,0.761038
dd,2.240893,-0.103219,0.121675
ee,1.867558,0.410599,0.443863
