### Introduction to Pandas

In [4]:
# by convention we import everything needed at the top
import numpy as np
import pandas as pd

In [5]:
# A pandas Series is a one-dimensional, mutable collection holding one data type
l = [8.0, 7, -5, 3, 99]  # mixed ints and floats end up as float64 (see dtype below)
o = pd.Series(data=l)
o.index   # the automatically generated default index (0..4)
o.values  # the underlying data as a numpy array
o.dtype   # the single data type shared by all members
o  # a Series is always shown together with its index

0     8.0
1     7.0
2    -5.0
3     3.0
4    99.0
dtype: float64

In [6]:
# Building a Series with our own labels
vals = [4, 7, -5, 3]         # the data: a plain list
ind = ['d', 'b', 'a', 'd']   # the labels: another list (note that 'd' appears twice)
# the positional form pd.Series(vals, ind) also works
s = pd.Series(data=vals, index=ind)  # naming the index argument is more conventional
s  # our own labels exist in addition to the default ordinal positions
# s['d':'d'] # label slicing CANNOT handle a repeated label
s['d']  # a repeated label returns ALL matching members

d    4
d    3
dtype: int64

In [7]:
# Index labels should be unique so that each label selects exactly one member.
# A list (not a set) is used on purpose: labels are paired with vals by position,
# and a set has no defined order, so the label-to-value pairing could change
# from one kernel session to the next.
ind = ['b', 'a', 'd', 'c']
s = pd.Series(vals, index=ind)  # vals comes from the previous cell
s['d']  # a unique label returns a single member, not a sub-Series
s

b    4
a    7
d   -5
c    3
dtype: int64

In [8]:
# Cast explicitly before storing a float: writing 99.99 into an int64 Series
# relies on silent upcasting, which newer pandas versions deprecate.
s = s.astype('float64')
s['d'] = 99.99  # mutate the member with label 'd'
s.iloc[0]  # the zeroth member by position (positional s[0] on a labelled Series is deprecated)
s

b     4.00
a     7.00
d    99.99
c     3.00
dtype: float64

In [9]:
# assigning to a label that does not exist yet adds a new member to the Series
s['e'] = 0.000001
s.iloc[4]  # read the new member by position; positional s[4] on a labelled Series is deprecated
s

b     4.000000
a     7.000000
d    99.990000
c     3.000000
e     0.000001
dtype: float64

#### Slicing Series

In [10]:
s.loc['d':'e':2]  # label slice [start:stop:step] - careful, this is NOT stop-before
s.iloc[1:4]       # positional slice [start:stop-before:step] on the underlying ordinal members

a     7.00
d    99.99
c     3.00
dtype: float64

In [11]:
# a boolean condition selects the members for which it holds
s.loc[s > 2]  # members greater than 2
s.loc[s < 0]  # members below zero
s.pow(0.5)    # element-wise square root; NaN means Not a Number

b    2.000000
a    2.645751
d    9.999500
c    1.732051
e    0.001000
dtype: float64

In [12]:
# Use iloc for positional assignment: on a string-labelled Series an integer key
# passed to plain [] is ambiguous (position or new label) and deprecated in pandas.
s.iloc[2] = np.nan  # mark the member at ordinal position 2 as missing
s

b    4.000000
a    7.000000
d         NaN
c    3.000000
e    0.000001
dtype: float64

In [13]:
# Build a Series from a dict, choosing which labels appear and in which order.
sdata = {'Cork': 35000, 'Dublin': 71000, 'Galway': 16000, 'Athlone': 5000}
# An ordered sequence (list or tuple) keeps the row order reproducible;
# a set has no defined order, so the rows could come out differently on each run.
idata = ['Cork', 'Galway', 'Dublin', 'Athlone', 'Shannon']
p = pd.Series(sdata, index=idata)
p  # any index label missing from the data ('Shannon') takes NaN

Cork       35000.0
Galway     16000.0
Dublin     71000.0
Athlone     5000.0
Shannon        NaN
dtype: float64

In [14]:
p.loc['Cork']             # read a single member by its label
p.loc['Shannon'] = 32345  # fill in the member that was NaN
p

Cork       35000.0
Galway     16000.0
Dublin     71000.0
Athlone     5000.0
Shannon    32345.0
dtype: float64

### Pandas DataFrame

In [15]:
# DataFrame is a collection of Series
# Each column of a DataFrame is a Series
# Each column can therefore only contain ONE data type
# The DataFrame columns can be of different data types
# A DataFrame looks somewhat like a spreadsheet


In [16]:
# three equally long lists, one per future DataFrame column
towns_l = [
    'Cork', 'Dublin', 'Galway', 'Athlone',
    'Shannon', 'Rosscarbery', 'Athenry',
]
years_l = list(range(2017, 2024))  # 2017 .. 2023 inclusive
pop_l = [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 1.7]
# the dict keys become the column names
data_d = {
    'Town': towns_l,
    'Year': years_l,
    'Pop': pop_l,
}
data_d

{'Town': ['Cork',
  'Dublin',
  'Galway',
  'Athlone',
  'Shannon',
  'Rosscarbery',
  'Athenry'],
 'Year': [2017, 2018, 2019, 2020, 2021, 2022, 2023],
 'Pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 1.7]}

In [17]:
# the row index is optional; here the town names double as row labels
df = pd.DataFrame(data=data_d, index=towns_l)
df  # displayed as a nicely formatted table

Unnamed: 0,Town,Year,Pop
Cork,Cork,2017,1.5
Dublin,Dublin,2018,1.7
Galway,Galway,2019,3.6
Athlone,Athlone,2020,2.4
Shannon,Shannon,2021,2.9
Rosscarbery,Rosscarbery,2022,3.2
Athenry,Athenry,2023,1.7


In [18]:
df.head(n=3)  # just the first three rows
df.tail(n=2)  # just the last two rows (only this final expression is displayed)

Unnamed: 0,Town,Year,Pop
Rosscarbery,Rosscarbery,2022,3.2
Athenry,Athenry,2023,1.7


In [19]:
# the row labels of an existing DataFrame can be replaced by assigning to .index
index_l = [
    'one', 'two', 'three', 'four',
    'five', 'six', 'seven',
]
df.index = index_l  # changes df itself
df

Unnamed: 0,Town,Year,Pop
one,Cork,2017,1.5
two,Dublin,2018,1.7
three,Galway,2019,3.6
four,Athlone,2020,2.4
five,Shannon,2021,2.9
six,Rosscarbery,2022,3.2
seven,Athenry,2023,1.7


In [20]:
# a DataFrame can also be built from only the columns we are interested in,
# in the order we list them
df2 = pd.DataFrame(data=data_d, index=index_l, columns=['Pop', 'Year'])
df2

Unnamed: 0,Pop,Year
one,1.5,2017
two,1.7,2018
three,3.6,2019
four,2.4,2020
five,2.9,2021
six,3.2,2022
seven,1.7,2023


### Accessing Data within DataFrames

In [21]:
df2.index = towns_l  # label the rows with the town names
df2
df2['Pop']  # square brackets with a column name get the entire column
# 'loc' locates a row by its index label
df2.loc['Shannon']

Pop        2.9
Year    2021.0
Name: Shannon, dtype: float64

In [22]:
# 'iloc' selects a row by its underlying ordinal position, ignoring the labels
row_pos = 3  # positions count 0, 1, 2, 3, ...
df2.iloc[row_pos]

Pop        2.4
Year    2020.0
Name: Athlone, dtype: float64

In [23]:
# with an integer index, loc and iloc can give different rows for the same number
num_l = [4, 5, 6, 3, 2, 1, 0]
df3 = pd.DataFrame(data=data_d, index=num_l)
df3
# compare df3.loc[2] with df3.iloc[2]
df3.loc[2]   # the row whose index LABEL is 2
df3.iloc[2]  # the row at ordinal POSITION 2, i.e. the third row

Town    Galway
Year      2019
Pop        3.6
Name: 6, dtype: object

#### Access and Mutate Members of a DataFrame

In [24]:
# a new column is added to a DataFrame by assigning to a new column name
df2['Tour'] = np.arange(7)
df2.loc['Galway']['Tour'] + 9  # only an expression - the result is not stored in the DataFrame
# assigning a Series overrides the column; values are matched to rows by index label
new_values = [3.2, 5.3, 2.9]
tour_labels = ['Galway', 'Shannon', 'Dublin']
df2['Tour'] = pd.Series(data=new_values, index=tour_labels)
df2  # rows without a matching label take the value NaN

Unnamed: 0,Pop,Year,Tour
Cork,1.5,2017,
Dublin,1.7,2018,2.9
Galway,3.6,2019,3.2
Athlone,2.4,2020,
Shannon,2.9,2021,5.3
Rosscarbery,3.2,2022,
Athenry,1.7,2023,


### Indexing and Filtering DataFrames

In [28]:
i = ['Waterford', 'Clonakilty', 'Athenry', 'Meath']  # row labels
c = ['red', 'green', 'gold', 'white']                # column labels
grid = np.arange(16).reshape(len(i), len(c))  # the numbers 0..15 as 4 rows by 4 columns
df4 = pd.DataFrame(data=grid, index=i, columns=c)
df4

Unnamed: 0,red,green,gold,white
Waterford,0,1,2,3
Clonakilty,4,5,6,7
Athenry,8,9,10,11
Meath,12,13,14,15


In [37]:
# boolean filtering returns a new DataFrame holding only the matching rows
whichRow = df4[df4['gold'] == 6]
# whichRow['green'] = 55 would not be a reliable way to change df4 itself
# Set single cells in ONE indexing step. Chained forms such as
# df4.loc['Clonakilty']['green'] = 55 first build an intermediate row object and
# assign into that, which pandas does not guarantee will reach df4.
df4.loc['Clonakilty', 'green'] = 55             # by row label and column label
df4.iloc[1, df4.columns.get_loc('white')] = 99  # by row position and column position
df4

Unnamed: 0,red,green,gold,white
Waterford,0,1,2,3
Clonakilty,4,55,6,99
Athenry,8,9,10,11
Meath,12,13,14,15
