### Introducing Pandas

In [2]:
# by convention we use the first cell to import stuff
import numpy as np
import pandas as pd

In [9]:
# A pandas Series is a one-dimensional, labelled column of values.
# Every member shares one data type, so the ints below are stored as floats next to 6.0.
o = pd.Series(data=[6.0, 7, -5, 3, 99])
o.dtype   # the single data type shared by all members
o.values  # the underlying values, held in a 1-dimensional numpy array
o.index   # no index was supplied, so a default one is created
o         # only this last expression is displayed as the cell output

0     6.0
1     7.0
2    -5.0
3     3.0
4    99.0
dtype: float64

#### Building Series

In [15]:
# two plain Python lists: one holding the data, one holding the labels
values = [4, 7, -5, 3]
ind = ['d', 'b', 'a', 'c']
# build a Series from the lists, using our own labels instead of the default index
s = pd.Series(data=values, index=ind)
s.loc['a']       # read the member stored under label 'a'
s.loc['e'] = 99  # label 'e' does not exist yet, so a new member is appended
s.loc['b'] = 77  # label 'b' already exists, so its value is overwritten
s

d     4
b    77
a    -5
c     3
e    99
dtype: int64

#### Slicing Series

In [22]:
# CAREFUL: a label-based slice INCLUDES the stop label ('c' is part of the result),
# unlike positional slicing, which stops before the end position
print(s.loc['d':'c'])
# slicing only returns a selection - the Series itself is NOT mutated
s

d     4
b    77
a    -5
c     3
dtype: int64


d     4
b    77
a    -5
c     3
e    99
dtype: int64

In [27]:
# a comparison yields a boolean mask, which can be used to filter the Series
s.loc[s > 5]  # members greater than 5
s.loc[s < 0]  # negative members
# arithmetic is applied member by member; the data type changes automatically
# when needed (the square roots are floats, and the negative member becomes NaN)
s.pow(0.5)

d    2.000000
b    8.774964
a         NaN
c    1.732051
e    9.949874
dtype: float64

In [28]:
# NaN means 'Not a Number' - yet NaN is itself a floating-point value, which is why the result above has dtype float64

In [29]:
# a dict mapping each town to a value
sdata = {
    'Cork': 35000,
    'Dublin': 71000,
    'Galway': 16000,
    'Athlone': 5000,
}
# the labels we want: every town in the dict (in order) plus 'Shannon', which has no value
idata = [*sdata, 'Shannon']

In [30]:
# build a Series from the dict, keeping only the labels listed in idata (in that order)
p = pd.Series(data=sdata, index=idata)
# missing values are automatically set to NaN ('Shannon' has no entry in the dict)
p

Cork       35000.0
Dublin     71000.0
Galway     16000.0
Athlone     5000.0
Shannon        NaN
dtype: float64

In [31]:
# replace the missing (NaN) value stored under the label 'Shannon'
p.loc['Shannon'] = 32323
p

Cork       35000.0
Dublin     71000.0
Galway     16000.0
Athlone     5000.0
Shannon    32323.0
dtype: float64

#### Pandas Data Frame

In [32]:
# A Data Frame is a collection of pandas Series, one per column.
# All of the columns share a single index, and each column may have a header,
# so the result looks a bit like a spreadsheet.
town_list = ['Cork', 'Dublin', 'Galway', 'Athlone', 'Shannon', 'Roscarberry']
year_list = list(range(2017, 2023))  # the six years 2017 to 2022 inclusive
pop_list = [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
# a dict whose keys will become the column headers
data = dict(town=town_list, year=year_list, pop=pop_list)

In [33]:
# each key of the dict becomes a column, and each column is made up from a pandas Series
df = pd.DataFrame(data=data)
df

Unnamed: 0,town,year,pop
0,Cork,2017,1.5
1,Dublin,2018,1.7
2,Galway,2019,3.6
3,Athlone,2020,2.4
4,Shannon,2021,2.9
5,Roscarberry,2022,3.2


In [36]:
df.head(n=3)  # the first three rows (not shown: only the last expression of a cell is displayed)
df.tail(n=2)  # the last two rows

Unnamed: 0,town,year,pop
4,Shannon,2021,2.9
5,Roscarberry,2022,3.2


In [44]:
# we can choose which columns to keep and the order they appear in,
# and we can supply our own row labels
index_l = 'one two three four five six'.split()
df = pd.DataFrame(data=data, index=index_l, columns=['pop', 'year'])
df.loc[:, 'pop']  # a single column is a pandas Series
df

Unnamed: 0,pop,year
one,1.5,2017
two,1.7,2018
three,3.6,2019
four,2.4,2020
five,2.9,2021
six,3.2,2022


#### Accessing data within Data Frames

In [43]:
# we can use the index labels to access members
df.loc['two'] # .loc selects a row by its index LABEL; the row comes back as a Series

pop        1.7
year    2018.0
Name: two, dtype: float64

In [45]:
# in addition we can access a row by its numerical position
df.iloc[3] # iloc is integer location (zero-based), so position 3 is the fourth row

pop        2.4
year    2020.0
Name: four, dtype: float64

In [53]:
# start again with a fresh Data Frame, this time labelled with our own numbers
num_list = [4, 5, 6, 3, 2, 1]
df4 = pd.DataFrame(data=data, index=num_list)
# careful - our own numbers are the index labels,
# but every row still has an underlying ordinal (zero-based) position
df4
df4.loc[6]   # the row whose index LABEL is 6
df4.iloc[2]  # the row at ordinal POSITION 2 (here that is the same row)

town    Galway
year      2019
pop        3.6
Name: 6, dtype: object

In [55]:
# assigning to a column name that does not exist yet adds a new column
df4['Tour'] = np.arange(6, dtype=float)  # the floats 0.0 to 5.0, one per row
df4

Unnamed: 0,town,year,pop,Tour
4,Cork,2017,1.5,0.0
5,Dublin,2018,1.7,1.0
6,Galway,2019,3.6,2.0
3,Athlone,2020,2.4,3.0
2,Shannon,2021,2.9,4.0
1,Roscarberry,2022,3.2,5.0


In [74]:
# an expression only computes a new value - nothing is stored unless we assign it
df4.loc[4, 'Tour'] + 3  # one .loc[row, column] lookup instead of chained [][] indexing
# to overwrite values we assign to the column
new_values = [3.2, 5.3, 2.9]
# here we DO overwrite the values - they persist in df4
# Careful - the new Series is matched up by index label, so the rows
# it does not mention (labels 5, 3 and 1) are now NaN
df4['Tour'] = pd.Series(new_values, index=[4, 6, 2])
df4

Unnamed: 0,town,year,pop,Tour
4,Cork,2017,1.5,3.2
5,Dublin,2018,1.7,
6,Galway,2019,3.6,5.3
3,Athlone,2020,2.4,
2,Shannon,2021,2.9,2.9
1,Roscarberry,2022,3.2,


#### Indexing and Filtering Data Frames

In [102]:
# row labels and column headers for a small demonstration grid
i = ['Waterford', 'Clonakilty', 'Athenry', 'Meath']
c = ['one', 'two', 'three', 'four']
# fill the grid with consecutive integers, one row per label and one column per header
df5 = pd.DataFrame(
    data=np.arange(len(i) * len(c)).reshape(len(i), len(c)),
    index=i,
    columns=c,
)
df5

Unnamed: 0,one,two,three,four
Waterford,0,1,2,3
Clonakilty,4,5,6,7
Athenry,8,9,10,11
Meath,12,13,14,15


In [109]:
# we can spot outlying values
# filtering returns a separate frame; .copy() makes that explicit so editing it is safe
whichRow = df5[df5['three'] == 6].copy()  # we now know which row(s)
whichRow['three'] = 0  # changes only the copy - df5 itself is untouched
whichRow
# to change df5 itself, use ONE .loc[row, column] assignment.
# Chained indexing (df5.loc[row][column] = value) writes to a temporary object
# and may silently leave df5 unchanged.
df5.loc['Clonakilty', 'three'] = 0
df5

Unnamed: 0,one,two,three,four
Waterford,0,1,2,3
Clonakilty,4,5,0,7
Athenry,8,9,10,11
Meath,12,13,14,15
