In [184]:
# Reference: 
# online free docs:          https://pandas.pydata.org/pandas-docs/stable/
#                                    https://docs.scipy.org/doc/numpy/reference/ufuncs.html#available-ufuncs
#                                    https://pandas.pydata.org/pandas-docs/stable/basics.html#basics-stats
#                                    https://pandas.pydata.org/pandas-docs/stable/api.html#api-series-stats
#                                    https://pandas.pydata.org/pandas-docs/stable/api.html#api-dataframe-stats
# book old edition free:  https://www.safaribooksonline.com/library/view/python-data-science/9781491912126/
# book new edition pay: https://smile.amazon.com/Python-Data-Science-Handbook-Essential/dp/1491912057/

In [185]:
import numpy as np
import pandas as pd

In [186]:
##### universal functions (or ufuncs) #####
# https://docs.scipy.org/doc/numpy/reference/ufuncs.html#available-ufuncs
# pandas inherits much of this functionality from NumPy.
# so we can use pretty much all the ufuncs we learned about with NumPy.

In [187]:
# 3x5 frame of standard-normal draws, rows X-Z and columns a-e
# NOTE(review): no seed is set in the cells above, so these values differ on every fresh run
df = pd.DataFrame(
    np.random.randn(3, 5),
    index=list('XYZ'),
    columns=list('abcde'),
)
df

Unnamed: 0,a,b,c,d,e
X,0.333674,1.494079,-0.205158,0.313068,-0.854096
Y,-2.55299,0.653619,0.864436,-0.742165,2.269755
Z,-1.454366,0.045759,-0.187184,1.532779,1.469359


In [188]:
# NumPy ufuncs work directly on a DataFrame: exp is applied element-wise
# and the result keeps df's row and column labels
np.exp(df)

Unnamed: 0,a,b,c,d,e
X,1.396088,4.455232,0.814518,1.367614,0.425668
Y,0.077849,1.922485,2.373667,0.476082,9.677026
Z,0.233548,1.046822,0.829291,4.63103,4.346447


In [189]:
# arithmetic and ufuncs compose: every value is multiplied by pi, divided by 4,
# and then passed through the element-wise sine
np.sin(df * np.pi/4)

Unnamed: 0,a,b,c,d,e
X,0.259078,0.92209,-0.160435,0.243413,-0.621617
Y,-0.907158,0.491099,0.627958,-0.550443,0.97764
Z,-0.909573,0.035931,-0.146485,0.933424,0.914403


In [190]:
##### arithmetic operations #####
# pandas will automatically align the operands based on the union of their index/columns;
# missing values are marked as NaN

In [191]:
# all-ones float frame (rows X-Z, columns a-e) used by the scalar arithmetic examples
df = pd.DataFrame(
    np.ones(shape=(3, 5)),
    index=list('XYZ'),
    columns=list('abcde'),
)
df

Unnamed: 0,a,b,c,d,e
X,1.0,1.0,1.0,1.0,1.0
Y,1.0,1.0,1.0,1.0,1.0
Z,1.0,1.0,1.0,1.0,1.0


In [192]:
# unary minus negates every element; row and column labels are unchanged
-df

Unnamed: 0,a,b,c,d,e
X,-1.0,-1.0,-1.0,-1.0,-1.0
Y,-1.0,-1.0,-1.0,-1.0,-1.0
Z,-1.0,-1.0,-1.0,-1.0,-1.0


In [193]:
# a scalar operand is applied to every element of the frame
df*10

Unnamed: 0,a,b,c,d,e
X,10.0,10.0,10.0,10.0,10.0
Y,10.0,10.0,10.0,10.0,10.0
Z,10.0,10.0,10.0,10.0,10.0


In [194]:
# Series + Series with identical labels: values are added label by label
ser1 = pd.Series([10, 20, 30], index=list('abc'))
ser2 = pd.Series([100, 200, 300], index=list('abc'))
print(ser1, ser2, sep='\n')
ser1 + ser2

a    10
b    20
c    30
dtype: int64
a    100
b    200
c    300
dtype: int64


a    110
b    220
c    330
dtype: int64

In [195]:
# Series + Series with partially overlapping labels: the result is indexed by
# the union a-g, and labels found in only one operand come out as NaN
ser1 = pd.Series(np.random.randint(-10, 10, size=5), index=list('abcde'))
ser2 = pd.Series(np.random.randint(-15, 15, size=5), index=list('cdefg'))
print(ser1, ser2, sep='\n')
ser1 + ser2  # same as ser1.add(ser2)

a    5
b   -6
c   -1
d    0
e   -9
dtype: int32
c   -14
d    -8
e    -6
f    10
g   -12
dtype: int32


a     NaN
b     NaN
c   -15.0
d    -8.0
e   -15.0
f     NaN
g     NaN
dtype: float64

In [196]:
# DataFrame + DataFrame with identical row and column labels: element-wise sum
df1 = pd.DataFrame(
    [[1, 2, 3, 4], [5, 6, 7, 8]],
    index=list('XY'),
    columns=list('abcd'),
)
df2 = pd.DataFrame(
    [[10, 20, 30, 40], [50, 60, 70, 80]],
    index=list('XY'),
    columns=list('abcd'),
)
print(df1, df2, sep='\n')
df1 + df2

   a  b  c  d
X  1  2  3  4
Y  5  6  7  8
    a   b   c   d
X  10  20  30  40
Y  50  60  70  80


Unnamed: 0,a,b,c,d
X,11,22,33,44
Y,55,66,77,88


In [197]:
# DataFrame + DataFrame with partially overlapping labels: rows and columns are
# each aligned on their union, and only cells present in both frames get a value
df1 = pd.DataFrame(
    np.random.randint(-10, 10, size=(3, 5)),
    index=list('XYZ'),
    columns=list('abcde'),
)
df2 = pd.DataFrame(
    np.random.randint(-15, 15, size=(3, 5)),
    index=list('XYW'),
    columns=list('cdefg'),
)
print(df1, df2, sep='\n')
df1 + df2  # same as df1.add(df2)

   a  b  c  d   e
X -4  1  4  8 -10
Y  4 -7  2  0   1
Z -6 -4 -6  5  -7
   c   d   e   f  g
X -3 -11   5  -7 -1
Y  0   5 -12  11  8
W  0  -2   6   6  1


Unnamed: 0,a,b,c,d,e,f,g
W,,,,,,,
X,,,1.0,-3.0,-5.0,,
Y,,,2.0,5.0,-11.0,,
Z,,,,,,,


In [198]:
# the second frame has a single row: only row X can line up, and within it
# only the shared columns c, d, e
df1 = pd.DataFrame(
    np.random.randint(-10, 10, size=(3, 5)),
    index=list('XYZ'),
    columns=list('abcde'),
)
df2 = pd.DataFrame(
    np.random.randint(-15, 15, size=(1, 5)),
    index=list('X'),
    columns=list('cdefg'),
)
print(df1, df2, sep='\n')
df1 + df2  # same as df1.add(df2)

   a   b  c  d   e
X  7  -5 -1 -7 -10
Y -5 -10  7  8  -6
Z -8   6 -7 -8   0
   c  d  e  f  g
X -2  1 -8  6 -6


Unnamed: 0,a,b,c,d,e,f,g
X,,,-3.0,-6.0,-18.0,,
Y,,,,,,,
Z,,,,,,,


In [199]:
# the second frame has a single column: all rows line up, but only column c
# exists in both frames, so every other column of the sum is missing
df1 = pd.DataFrame(
    np.random.randint(-10, 10, size=(3, 5)),
    index=list('XYZ'),
    columns=list('abcde'),
)
df2 = pd.DataFrame(
    np.random.randint(-15, 15, size=(3, 1)),
    index=list('XYZ'),
    columns=list('c'),
)
print(df1, df2, sep='\n')
df1 + df2  # same as df1.add(df2)

    a  b  c  d  e
X -10  0  8  1 -8
Y  -8 -7 -7  8  4
Z  -7  7  8  4 -1
    c
X  11
Y -14
Z -11


Unnamed: 0,a,b,c,d,e
X,,,19,,
Y,,,-21,,
Z,,,-3,,


In [200]:
##### aggregations #####
# count, sum, mean, median, mode, min, max, std, var, quantile, etc
# https://pandas.pydata.org/pandas-docs/stable/basics.html#basics-stats
# https://pandas.pydata.org/pandas-docs/stable/api.html#api-series-stats
# https://pandas.pydata.org/pandas-docs/stable/api.html#api-dataframe-stats

In [201]:
# seed NumPy's global generator so the frame used by the aggregation examples
# has the same values on every run
np.random.seed(0)
df = pd.DataFrame(
    np.random.randn(3, 5),
    index=list('XYZ'),
    columns=list('abcde'),
)
df

Unnamed: 0,a,b,c,d,e
X,1.764052,0.400157,0.978738,2.240893,1.867558
Y,-0.977278,0.950088,-0.151357,-0.103219,0.410599
Z,0.144044,1.454274,0.761038,0.121675,0.443863


In [202]:
# per-column summary statistics: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,a,b,c,d,e
count,3.0,3.0,3.0,3.0,3.0
mean,0.310273,0.93484,0.529473,0.753116,0.90734
std,1.378204,0.527224,0.599579,1.29335,0.83174
min,-0.977278,0.400157,-0.151357,-0.103219,0.410599
25%,-0.416617,0.675123,0.30484,0.009228,0.427231
50%,0.144044,0.950088,0.761038,0.121675,0.443863
75%,0.954048,1.202181,0.869888,1.181284,1.155711
max,1.764052,1.454274,0.978738,2.240893,1.867558


In [203]:
# count the non-missing entries down the rows, giving one count per column
df.count(axis='index')

a    3
b    3
c    3
d    3
e    3
dtype: int64

In [204]:
# sum aggregation across the columns, giving one total per row
df.sum(axis='columns')

X    7.251399
Y    0.128833
Z    2.924893
dtype: float64

In [205]:
# aggregation on a single column: the median of column 'a'
df['a'].median()

0.144043571160878

In [206]:
# aggregation on a single row: the mean of row 'X' over all of its columns
df.loc['X'].mean()

1.4502797455584104

In [207]:
# correlation between two columns, returned as a 2x2 correlation matrix
# note: df1.corrwith(df2) gives pairwise correlation between rows or columns of two dataframes
df[['a','b']].corr()

Unnamed: 0,a,b
a,1.0,-0.607805
b,-0.607805,1.0


In [208]:
##### uniques #####

In [209]:
# small integer frame with repeated values, used by the unique-value examples;
# each dict key is a row label and each list holds that row's values for A-E
df = pd.DataFrame.from_dict(
    {
        'X': [10, 20, 3, 4, 5],
        'Y': [10, 20, 30, 40, 50],
        'Z': [10, 21, 31, 40, 50],
        'W': [100, 20, 100, 100, 500],
    },
    orient='index',
    columns=list('ABCDE'),
)
df

Unnamed: 0,A,B,C,D,E
X,10,20,3,4,5
Y,10,20,30,40,50
Z,10,21,31,40,50
W,100,20,100,100,500


In [210]:
# distinct values in column 'A', returned as a NumPy array
df['A'].unique()

array([ 10, 100], dtype=int64)

In [211]:
# number of distinct values in column 'A'
df['A'].nunique()

2

In [212]:
# each distinct value in column 'A' together with the number of times it appears
df['A'].value_counts()

10     3
100    1
Name: A, dtype: int64

In [213]:
##### apply method #####

In [214]:
# mixed frame for the apply examples: columns A-D are integers, column E holds strings;
# each dict key is a row label and each list holds that row's values for A-E
df = pd.DataFrame.from_dict(
    {
        'X': [10, 20, 3, 4, 'abc'],
        'Y': [10, 20, 30, 40, 'abcd'],
        'Z': [10, 21, 31, 40, 'abcde'],
        'W': [100, 20, 100, 100, 'abcdef'],
    },
    orient='index',
    columns=list('ABCDE'),
)
df

Unnamed: 0,A,B,C,D,E
X,10,20,3,4,abc
Y,10,20,30,40,abcd
Z,10,21,31,40,abcde
W,100,20,100,100,abcdef


In [215]:
# apply calls the given function once per element of column 'A' (here: square each value)
df['A'].apply(lambda x: x**2)

X      100
Y      100
Z      100
W    10000
Name: A, dtype: int64

In [216]:
# any callable works with apply: the built-in len gives the length of each string in column 'E'
df['E'].apply(len)

X    3
Y    4
Z    5
W    6
Name: E, dtype: int64

In [217]:
##### sorting #####

In [218]:
# integer frame whose column A is out of order, used by the sorting example;
# each dict key is a row label and each list holds that row's values for A-E
df = pd.DataFrame.from_dict(
    {
        'X': [11, 20, 3, 4, 5],
        'Y': [101, 20, 30, 40, 50],
        'Z': [99, 21, 31, 40, 50],
        'W': [10, 20, 100, 100, 500],
    },
    orient='index',
    columns=list('ABCDE'),
)
df

Unnamed: 0,A,B,C,D,E
X,11,20,3,4,5
Y,101,20,30,40,50
Z,99,21,31,40,50
W,10,20,100,100,500


In [219]:
# order the rows by the values in column 'A' (smallest first); df is not reassigned here
df.sort_values(by='A')

Unnamed: 0,A,B,C,D,E
W,10,20,100,100,500
X,11,20,3,4,5
Z,99,21,31,40,50
Y,101,20,30,40,50


In [220]:
##### vectorized string operations #####

In [221]:
# Vectorized string methods are reached through the .str accessor:
#   len(), ljust(), rjust(), center(), zfill(), strip(), rstrip(), lstrip()
#   lower(), upper(), find(), rfind(), index(), rindex(), capitalize(), swapcase()
#   translate(), startswith(), endswith(), isalnum(), isalpha(), isdigit(), isspace(), istitle()
#   islower(), isupper(), isnumeric(), isdecimal(), split(), rsplit(), partition(), rpartition()
# The missing entry (NaN) stays NaN in every result instead of raising.
ser = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam', 'Michael Palin', np.nan])
print(ser, ser.str.lower(), ser.str.len(), sep='\n')

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3     Michael Palin
4               NaN
dtype: object
0    graham chapman
1       john cleese
2     terry gilliam
3     michael palin
4               NaN
dtype: object
0    14.0
1    11.0
2    13.0
3    13.0
4     NaN
dtype: float64


In [222]:
# regular expression methods (calls underlying re. methods)
# match(), extract(), findall(), replace(), contains(), count(), split(), rsplit()
ser = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam', 'Michael Palin', np.nan])
# Case-insensitive pattern: the first and the last character must each be a
# consonant letter. A negated vowel class such as [^aeiou] is not enough,
# because it also accepts capital vowels, digits, spaces and punctuation.
consonant_edges = r'(?i)^[b-df-hj-np-tv-z].*[b-df-hj-np-tv-z]$'
print(ser)
# expand=True is spelled out so the result is always a one-column DataFrame
print(ser.str.extract(r'([A-Za-z]+)', expand=True))  # first contiguous run of letters
print(ser.str.findall(consonant_edges))  # starts and ends with a consonant

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3     Michael Palin
4               NaN
dtype: object
         0
0   Graham
1     John
2    Terry
3  Michael
4      NaN
0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3     [Michael Palin]
4                 NaN
dtype: object


In [223]:
# Other .str helpers:
#   get(), slice(), slice_replace(), cat(), repeat(), normalize(), pad(), wrap(), join(), get_dummies()
ser = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam', 'Michael Palin', np.nan])
print(ser)
print(ser.str.slice(0, 3))      # first three characters of each name
print(ser.str.split().str[-1])  # last whitespace-separated word of each name

0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3     Michael Palin
4               NaN
dtype: object
0    Gra
1    Joh
2    Ter
3    Mic
4    NaN
dtype: object
0    Chapman
1     Cleese
2    Gilliam
3      Palin
4        NaN
dtype: object
