# Pandas reference
https://learning.oreilly.com/library/view/python-for-data/9781449323592/ch05.html

In [150]:
import pandas as pd
import numpy as np
import seaborn as sns

# For visualization:
from matplotlib.pyplot import figure, plot, semilogy, grid, legend
%matplotlib inline

In [85]:
%config IPCompleter.greedy=True

# Series

In [86]:
# Series is indexed ndarray.
a = pd.Series([1,1,2,3,5,8], index=['a','b','c','d','e','f'])
a.name = 'fib'
a.index.name = 'ind'
a

ind
a    1
b    1
c    2
d    3
e    5
f    8
Name: fib, dtype: int64

In [87]:
a[a>3]

ind
e    5
f    8
Name: fib, dtype: int64

In [88]:
# can access with index-name instead of index .. BUT WHY ?
a[['a','b']]

ind
a    1
b    1
Name: fib, dtype: int64

In [89]:
a.isnull()

ind
a    False
b    False
c    False
d    False
e    False
f    False
Name: fib, dtype: bool

In [90]:
a.index = np.arange(6) # reassign index.

In [91]:
a

0    1
1    1
2    2
3    3
4    5
5    8
Name: fib, dtype: int64

In [92]:
# create series from list/ndarray
a = pd.Series([1,2,3])
print(a)

a = pd.Series(np.arange(1,100,10))
print(type(a)); print(a)

# get ndarray from series - without index.
v = a.values
print(type(v), v)

0    1
1    2
2    3
dtype: int64
<class 'pandas.core.series.Series'>
0     1
1    11
2    21
3    31
4    41
5    51
6    61
7    71
8    81
9    91
dtype: int64
<class 'numpy.ndarray'> [ 1 11 21 31 41 51 61 71 81 91]


# Dataframe

In [93]:
# create dataframe from dict
d = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
df = pd.DataFrame(d)
print(df.columns)
print(df.shape)
(nrows, ncols) = df.shape
print ('rows: ', nrows, 'cols: ', ncols)
display(df)

Index(['state', 'year', 'pop'], dtype='object')
(6, 3)
rows:  6 cols:  3


Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [94]:
# each col in DF is Series
st = df['state'] # or st.state
print(type(st), st)

<class 'pandas.core.series.Series'> 0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object


In [95]:
# access row by position
print (df.loc[3]) # df[3] won't work
print (st.loc[2], st[2]) # no need to use loc for series? 

state    Nevada
year       2001
pop         2.4
Name: 3, dtype: object
Ohio Ohio


In [96]:
# create DF from arrays -- list of of lists basically
a =   [['Ohio', 2000, 1.5],
       ['Ohio', 2001, 1.7],
       ['Ohio', 2002, 3.6],
       ['Nevada', 2001, 2.4],
       ['Nevada', 2002, 2.9],
       ['Nevada', 2003, 3.2]]
df = pd.DataFrame(a, columns = ['State', 'Year', 'Population'])
display(df)

Unnamed: 0,State,Year,Population
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [97]:
# get array back
print (df.values)

[['Ohio' 2000 1.5]
 ['Ohio' 2001 1.7]
 ['Ohio' 2002 3.6]
 ['Nevada' 2001 2.4]
 ['Nevada' 2002 2.9]
 ['Nevada' 2003 3.2]]


In [98]:
# drop some elements in row - note 1 and 2 are missing in the index
df1 = df.drop([1,2])
display(df1)

Unnamed: 0,State,Year,Population
0,Ohio,2000,1.5
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [99]:
# drop a col. Axis 0 is row, 1 is col
df2 = df.drop('State', axis=1) # or axis='column'
display(df2)

Unnamed: 0,Year,Population
0,2000,1.5
1,2001,1.7
2,2002,3.6
3,2001,2.4
4,2002,2.9
5,2003,3.2


In [100]:
#filtering 
df1 = df[df.Population>3]
display (df1)
print(df1.iloc[1]) # integer location. Note that index 5 is in int-index 1.

# multiple conditions - may be cleaner to define filter and apply.
filt = (df.Year > 2002) & (df.Population > 3)
print ('filter: ', filt)
df1 = df[filt]
display(df1)

Unnamed: 0,State,Year,Population
2,Ohio,2002,3.6
5,Nevada,2003,3.2


State         Nevada
Year            2003
Population       3.2
Name: 5, dtype: object
filter:  0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool


Unnamed: 0,State,Year,Population
5,Nevada,2003,3.2


In [18]:
# apply
df['PopPerc'] = df['Population'].apply(lambda x: 100.0*x/df['Population'].sum() ) # note - this is incorrect %.
display(df)

Unnamed: 0,State,Year,Population,PopPerc
0,Ohio,2000,1.5,9.803922
1,Ohio,2001,1.7,11.111111
2,Ohio,2002,3.6,23.529412
3,Nevada,2001,2.4,15.686275
4,Nevada,2002,2.9,18.954248
5,Nevada,2003,3.2,20.915033


In [19]:
print (df.Population.min(), df.Population.max(), df.Population.mean())

1.5 3.6 2.5500000000000003


## File IO

In [20]:
seindf = pd.read_csv('~/Dropbox/data/seinfeld/seinfeld.csv')
print(seindf.shape)
display(seindf.head(3))               

(172, 5)


Unnamed: 0,Season,Episode,Title,Rating,Votes
0,1,2,The Stakeout,7.8,649
1,1,3,The Robbery,7.7,565
2,1,4,Male Unbonding,7.6,561


In [21]:
# write to file
import sys
df.to_csv('~/Dropbox/data/test/test.csv')
# print to sceen with custom seperator
df.to_csv(sys.stdout, sep=':', index=False, header=False)


Ohio:2000:1.5:9.80392156862745
Ohio:2001:1.7:11.11111111111111
Ohio:2002:3.6:23.52941176470588
Nevada:2001:2.4:15.686274509803921
Nevada:2002:2.9:18.954248366013072
Nevada:2003:3.2:20.915032679738562


In [22]:
df.to_json(sys.stdout)

{"State":{"0":"Ohio","1":"Ohio","2":"Ohio","3":"Nevada","4":"Nevada","5":"Nevada"},"Year":{"0":2000,"1":2001,"2":2002,"3":2001,"4":2002,"5":2003},"Population":{"0":1.5,"1":1.7,"2":3.6,"3":2.4,"4":2.9,"5":3.2},"PopPerc":{"0":9.8039215686,"1":11.1111111111,"2":23.5294117647,"3":15.6862745098,"4":18.954248366,"5":20.9150326797}}

# merge (again)

In [82]:
# merging DF on common colounms
dfa = pd.DataFrame({'A1': ['a','b','c'],
                   'A2' : [1,2,3]})

dfb = pd.DataFrame({'A1': ['b','c','d','e'],
                   'B2' : [100,200,300,400]})

print("-- dfa --")
display(dfa)

print ('-- dfb --')
display(dfb)

print ('-- merge (default: inner) --')
display(dfa.merge(dfb))

print ('-- merge left -- ')
display(dfa.merge(dfb,how='left',on='A1'))
                  
print ('-- merge right --')
display(dfa.merge(dfb,how='right',on='A1'))

print ('-- merge inner ---')                 
display(dfa.merge(dfb,how='inner',on='A1'))

print ('-- merge outer ---')                 
display(dfa.merge(dfb,how='outer',on='A1'))

# Fill NA with values -- use astype() to change dtype
dfm = dfa.merge(dfb,how='outer',on='A1')
dfm.B2 = dfm.B2.fillna(0).astype(int)
display(dfm)

-- dfa --


Unnamed: 0,A1,A2
0,a,1
1,b,2
2,c,3


-- dfb --


Unnamed: 0,A1,B2
0,b,100
1,c,200
2,d,300
3,e,400


-- merge (default: inner) --


Unnamed: 0,A1,A2,B2
0,b,2,100
1,c,3,200


-- merge left -- 


Unnamed: 0,A1,A2,B2
0,a,1,
1,b,2,100.0
2,c,3,200.0


-- merge right --


Unnamed: 0,A1,A2,B2
0,b,2.0,100
1,c,3.0,200
2,d,,300
3,e,,400


-- merge inner ---


Unnamed: 0,A1,A2,B2
0,b,2,100
1,c,3,200


-- merge outer ---


Unnamed: 0,A1,A2,B2
0,a,1.0,
1,b,2.0,100.0
2,c,3.0,200.0
3,d,,300.0
4,e,,400.0


Unnamed: 0,A1,A2,B2
0,a,1.0,0
1,b,2.0,100
2,c,3.0,200
3,d,,300
4,e,,400


In [24]:
dfa = pd.DataFrame({'A1': ['a','b'],
                   'A2' : [1,2]})

dfb = pd.DataFrame({'B1': ['b','c'],
                    'B2' : [100,200]})

dfa.merge(dfb, left_on='A1', right_on='B1', how='outer')

Unnamed: 0,A1,A2,B1,B2
0,a,1.0,,
1,b,2.0,b,100.0
2,,,c,200.0


In [25]:
dfa = pd.DataFrame({'A1': ['a','b','c'],
                   'A2' : [1,2,3]})

dfb = pd.DataFrame({ 'A1': ['a','c','b'],
                    'A2' : [1,3,2]})

dfb_s = canonicalize (dfb)

display(dfa, dfb, dfb_s)
#print(v)
dfa.equals(dfb)

NameError: name 'canonicalize' is not defined

## Melting


In [53]:
# melting DF - make wide -> narrow
df_w = pd.DataFrame({'A1': ['a','b','c'],
                   'V2' : [1,2,3],
                   'V3' : [11,22,33],
                   'V4': [21,22,23]})

c = df_w.columns
cl = list(c)
v = ['V2','V3','V4']
[ cl.remove(x) for x in v ] 
print (f' columns: {c}\n id_vars: {cl}\n value_vars: {v}')

df_n = df_w.melt(id_vars=cl,
                value_vars = v,
                var_name='VAR',
                value_name='VAL')

display(df_w, df_n)

 columns: Index(['A1', 'V2', 'V3', 'V4'], dtype='object')
 id_vars: ['A1']
 value_vars: ['V2', 'V3', 'V4']


Unnamed: 0,A1,V2,V3,V4
0,a,1,11,21
1,b,2,22,22
2,c,3,33,23


Unnamed: 0,A1,VAR,VAL
0,a,V2,1
1,b,V2,2
2,c,V2,3
3,a,V3,11
4,b,V3,22
5,c,V3,33
6,a,V4,21
7,b,V4,22
8,c,V4,23


In [54]:
#from 
#https://www.youtube.com/watch?v=kJsiiPK5sxs
#https://github.com/knathanieltucker/pandas-tutorial/blob/master/notebooks/Row-Column%20Transformations.ipynb
    
tips = sns.load_dataset('tips')
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


## Group by

In [55]:
tips_gb = tips.groupby(['day', 'sex']).agg({'size' : 'sum'})
tips_gb

Unnamed: 0_level_0,Unnamed: 1_level_0,size
day,sex,Unnamed: 2_level_1
Thur,Male,73
Thur,Female,79
Fri,Male,21
Fri,Female,19
Sat,Male,156
Sat,Female,63
Sun,Male,163
Sun,Female,53


In [56]:
#tibs_gb2 = tips_gb.pivot(columns=['sex'], values='size')
#tips_gb2
tips_gb1 = tips_gb.unstack()
tips_gb1
#tips_gb1['size', 'Male'] <-- access multi index col.

Unnamed: 0_level_0,size,size
sex,Male,Female
day,Unnamed: 1_level_2,Unnamed: 2_level_2
Thur,73,79
Fri,21,19
Sat,156,63
Sun,163,53


In [57]:
# flatten multi index col names 
tips_gb1.columns = ['__'.join(col).strip() for col in tips_gb1.columns.values]
tips_gb1

Unnamed: 0_level_0,size__Male,size__Female
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Thur,73,79
Fri,21,19
Sat,156,63
Sun,163,53


In [58]:
tips_gb1['mf_ratio'] = tips_gb1.size__Male / tips_gb1.size__Female
tips_gb1

Unnamed: 0_level_0,size__Male,size__Female,mf_ratio
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Thur,73,79,0.924051
Fri,21,19,1.105263
Sat,156,63,2.47619
Sun,163,53,3.075472


In [59]:
tips_gb2 = tips_gb.unstack(0)
tips_gb2

Unnamed: 0_level_0,size,size,size,size
day,Thur,Fri,Sat,Sun
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Male,73,21,156,163
Female,79,19,63,53


In [60]:
tips_gb2.columns = ['__'.join(col).strip() for col in tips_gb2.columns.values]
tips_gb2

Unnamed: 0_level_0,size__Thur,size__Fri,size__Sat,size__Sun
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,73,21,156,163
Female,79,19,63,53


In [61]:
df1 = pd.DataFrame({'first': ['John', 'Mary'],
                        'last': ['Doe', 'Bo'],
                        'height': [5.5, 6.0],
                        'weight': [130, 150]})
df1

Unnamed: 0,first,last,height,weight
0,John,Doe,5.5,130
1,Mary,Bo,6.0,150


In [62]:
# melting can be done with stacking
df1.melt(id_vars=['first', 'last'])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [63]:
#samething with stack
df1.set_index(['first', 'last'], inplace=True)
df1.stack().reset_index()

Unnamed: 0,first,last,level_2,0
0,John,Doe,height,5.5
1,John,Doe,weight,130.0
2,Mary,Bo,height,6.0
3,Mary,Bo,weight,150.0


In [64]:
def filterdata(data, low, high):
    ###
    ### YOUR CODE HERE
    ###
    df = pd.DataFrame(data, columns=['price', 'city'])
    # multiple conditions. (condition1) & (condition)
    df1 = df[(df['price'] >= low) & (df['price'] <= high)]
    
    # convert df to list
    o = df1.values.tolist()
    print (f'ret {len(o)} results')
    
    #print (o[0:10])
    #print('values', df1.values.tolist())
    return (o)

In [78]:
# adding row to dataframe
df = pd.DataFrame(np.array([[1,2], [3, 4], [4, 5]]), 
                  columns = ['A', 'B'])
print ('--- dataframe')
print(df.shape)
display(df)

data =  [10, 11]
print (f'adding {data}')
s = pd.Series(data, index = df.columns)
df = df.append(s, ignore_index = True)

print ('-- modified dataframe')
print(df.shape)
display(df)


--- dataframe
(3, 2)


Unnamed: 0,A,B
0,1,2
1,3,4
2,4,5


adding [10, 11]
-- modified dataframe
(4, 2)


Unnamed: 0,A,B
0,1,2
1,3,4
2,4,5
3,10,11


In [85]:
# Iterating over rows in DF
# NOTE - THIS IS NOT RECOMMENDED and considered an anti-pattern
for i, r in df.iterrows ():
    print(f'--- i: {i} type(row): {type(r)}')
    display(r)

--- i: 0 type(row): <class 'pandas.core.series.Series'>


A    1
B    2
Name: 0, dtype: int64

--- i: 1 type(row): <class 'pandas.core.series.Series'>


A    3
B    4
Name: 1, dtype: int64

--- i: 2 type(row): <class 'pandas.core.series.Series'>


A    4
B    5
Name: 2, dtype: int64

--- i: 3 type(row): <class 'pandas.core.series.Series'>


A    10
B    11
Name: 3, dtype: int64

# sample dataset

In [1]:
# Import seaborn
import seaborn as sns
# Check out available datasets
print(sns.get_dataset_names())

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'geyser', 'iris', 'mpg', 'penguins', 'planets', 'tips', 'titanic']


  gh_list = BeautifulSoup(http)


In [29]:
mpg_df = sns.load_dataset('mpg')
print(mpg_df.shape)
display(mpg_df.head(3))
mpg_df.describe()

(398, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [30]:
id_vars = ['name']
val_vars = ['mpg', 'cylinders', 'origin']
mpg_narrow_df = pd.melt(mpg_df, id_vars = id_vars, value_vars = val_vars)
mpg_narrow_df

Unnamed: 0,name,variable,value
0,chevrolet chevelle malibu,mpg,18
1,buick skylark 320,mpg,15
2,plymouth satellite,mpg,18
3,amc rebel sst,mpg,16
4,ford torino,mpg,17
...,...,...,...
1189,ford mustang gl,origin,usa
1190,vw pickup,origin,europe
1191,dodge rampage,origin,usa
1192,ford ranger,origin,usa


## Date & time

In [60]:
dstr = [["2021-04-01 3:15PM"], ["2021/01/01" ]]
df = pd.DataFrame(dstr, columns=['date'])
display(df)
print('type: ',type(df.loc[0][0]))

# convert str to timestamp
df['date'] = pd.to_datetime(df['date'])
display(df)
print('type: ',type(df.loc[1][0]))

# timestamp arth
d = df.loc[0][0] - df.loc[1][0]
print (f'\ndiff between {df.loc[0][0]} and {df.loc[1][0]} is \'{d}\'')

Unnamed: 0,date
0,2021-04-01 3:15PM
1,2021/01/01


type:  <class 'str'>


Unnamed: 0,date
0,2021-04-01 15:15:00
1,2021-01-01 00:00:00


type:  <class 'pandas._libs.tslibs.timestamps.Timestamp'>

diff between 2021-04-01 15:15:00 and 2021-01-01 00:00:00 is '90 days 15:15:00'


In [69]:
# check if required cols are preset in DF

values = [['a', 1, 2, 3], ['b', 10, 11, 12]]
colnames = ['col1', 'col2', 'col2', 'col3']
df = pd.DataFrame(values, columns=colnames)
display (df)

cols = df.columns
req_cols = ['col1', 'col2']
has_cols =  all(elem in cols  for elem in req_cols)
print (f'cols {req_cols}  present ? ', has_cols)

req_cols = ['col1', 'col8']
has_cols =  all(elem in cols  for elem in req_cols)
print (f'cols {req_cols}  present ? ', has_cols)


Unnamed: 0,col1,col2,col2.1,col3
0,a,1,2,3
1,b,10,11,12


cols ['col1', 'col2']  present ?  True
cols ['col1', 'col8']  present ?  False


* concatinate dataframes

In [79]:
values = [['x', 101, 201, 301]]
df1 = pd.DataFrame(values, columns = df.columns) # number and name of columns should match
print (df.shape, df1.shape)

# note the []
df = pd.concat([df, df1])

print (df.shape)
display (df.head())

(4, 4) (1, 4)
(5, 4)


Unnamed: 0,col1,col2,col2.1,col3
0,a,1,2,3
1,b,10,11,12
0,x,101,201,301
0,x,101,201,x
0,x,101,201,301


In [113]:
# Sort DF
mpg_df = sns.load_dataset('mpg')
print('\nMPG ', mpg_df.shape); display(mpg_df.head(3))

# filter for us made cars after 80s
f = (mpg_df['origin'] == 'usa') & (mpg_df['model_year'] >= 80)

# sort by cols in descending order
mpg_df = mpg_df[f].sort_values(by=['cylinders', 'horsepower'], ascending=False)

print('\nUS MPG ',mpg_df.shape); display(mpg_df.head(10))



MPG  (398, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite



US MPG  (40, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
364,26.6,8,350.0,105.0,3725,19.0,81,usa,oldsmobile cutlass ls
389,22.0,6,232.0,112.0,2835,14.7,82,usa,ford granada l
341,23.5,6,173.0,110.0,2725,12.6,81,usa,chevrolet citation
363,22.4,6,231.0,110.0,3415,15.8,81,usa,buick century
386,25.0,6,181.0,110.0,2945,16.4,82,usa,buick century limited
316,19.1,6,225.0,90.0,3381,18.7,80,usa,dodge aspen
365,20.2,6,200.0,88.0,3060,17.1,81,usa,ford granada gl
366,17.6,6,225.0,85.0,3465,16.6,81,usa,chrysler lebaron salon
387,38.0,6,262.0,85.0,3015,17.0,82,usa,oldsmobile cutlass ciera (diesel)
323,27.9,4,156.0,105.0,2800,14.4,80,usa,dodge colt
