### Introducing Pandas

In [1]:
# we need to import the libraries
import numpy as np
import pandas as pd

In [8]:
# A pandas Series is a one-dimensional array of values with an attached index.
o = pd.Series(data=[4, 7, -5, 3, 99])  # all members share one data type (ints here)
# Only the last expression in a cell is rendered; the two attribute lookups
# below are evaluated but their results are not displayed.
o.index   # no index was supplied, so we get the default 0, 1, 2, ... (start, stop-before, step)
o.values  # the values of a Series are held in a NumPy array
o         # shown with the default index in the left-hand column

0     4
1     7
2    -5
3     3
4    99
dtype: int64

In [14]:
# A Series can carry its own labels instead of the default 0, 1, 2, ... index.
index_list  = ['d', 'b', 'a', 'c']  # labels, as a plain Python list
values_list = [4, 7, -5, 3]         # values, also a plain Python list
o2 = pd.Series(data=values_list, index=index_list)
o2['a']       # look a member up by its label: -5 is stored under 'a' (not displayed)
o2['b'] = 77  # 'b' already exists, so its value is overwritten in place
o2['e'] = 99  # 'e' does not exist yet, so a new member is appended
o2

d     4
b    77
a    -5
c     3
e    99
dtype: int64

In [17]:
# Series can be sliced and indexed by label.
slice_list = ['e', 'c', 'a']  # labels may be picked in any order; they need not be adjacent
# careful - a label slice INCLUDES its stop label (positional slices stop before it)
o2['d':'c']     # evaluated but not displayed, since it is not the cell's last expression
o2[slice_list]  # the members come back in the order the labels were listed

e    99
c     3
a    -5
dtype: int64

In [21]:
# As before, arithmetic and comparisons are applied to every member at once.
o2 ** 3          # the cubes are computed but neither stored nor displayed
o3 = o2[o2 > 6]  # keep only members greater than 6; bind the result to a new name to retain it
o2               # the original Series is unchanged by either operation

d     4
b    77
a    -5
c     3
e    99
dtype: int64

### Using Series for Data Sets

In [27]:
# A dict can supply both the labels (keys) and the values of a Series.
sdata = {
    'Cork': 35000,
    'Dublin': 71000,
    'Galway': 16000,
    'Athlone': 5000,
}
# Passing an explicit index as well decides which labels appear, and in what order.
idata = ['Cork', 'Dublin', 'Galway', 'Athlone', 'Shannon']
p = pd.Series(data=sdata, index=idata)
p  # 'Shannon' has no entry in sdata so it shows as NaN; NaN is a float, hence dtype float64

Cork       35000.0
Dublin     71000.0
Galway     16000.0
Athlone     5000.0
Shannon        NaN
dtype: float64

### Pandas DataFrame

In [34]:
# A DataFrame is a collection of pandas Series that all share the same index.
# Each Series is a column with its own header, so it looks rather like a spreadsheet.
# Within a column every value has the same data type; different columns can
# have different data types.
town_list = ['Cork', 'Dublin', 'Galway', 'Athlone', 'Shannon', 'Roscarberry']
year_list = list(range(2017, 2023))  # 2017 ... 2022 inclusive
pop_list  = [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
data = dict(town=town_list, year=year_list, pop=pop_list)  # column header -> column values
df = pd.DataFrame(data=data)  # no index given, so rows get the default 0, 1, 2, ...
df.describe()  # summary statistics (count, mean, std, quartiles, ...) for the numeric columns

Unnamed: 0,year,pop
count,6.0,6.0
mean,2019.5,2.55
std,1.870829,0.836062
min,2017.0,1.5
25%,2018.25,1.875
50%,2019.5,2.65
75%,2020.75,3.125
max,2022.0,3.6


In [39]:
# head(n) returns the first n rows of the frame; tail(n) returns the last n rows.
# Only the last expression in a cell is rendered, so just the tail(2) result is shown.
df.head(3)
df.tail(2)

Unnamed: 0,town,year,pop
4,Shannon,2021,2.9
5,Roscarberry,2022,3.2


In [50]:
# The columns argument picks which columns to keep and the order they appear in;
# the index argument supplies our own row labels in place of 0, 1, 2, ...
index_l = 'one two three four five six'.split()
df2 = pd.DataFrame(data=data, index=index_l, columns=['pop', 'year'])
df2

Unnamed: 0,pop,year
one,1.5,2017
two,1.7,2018
three,3.6,2019
four,2.4,2020
five,2.9,2021
six,3.2,2022


In [54]:
# Row labels can come from one of our existing structures - here the town names.
df3 = pd.DataFrame(data=data, index=town_list, columns=['pop', 'year'])
df3['pop']  # remember each column is itself a Series (evaluated here but not displayed)
df3

Unnamed: 0,pop,year
Cork,1.5,2017
Dublin,1.7,2018
Galway,3.6,2019
Athlone,2.4,2020
Shannon,2.9,2021
Roscarberry,3.2,2022


### Accessing rows and columns with loc, iloc and axis

In [56]:
# 'loc' selects by index LABEL
df3.loc['Athlone'] # the row labelled 'Athlone', returned as a Series keyed by column name

pop        2.4
year    2020.0
Name: Athlone, dtype: float64

In [57]:
# 'iloc' selects by ordinal POSITION (counting from zero), whatever the index labels are
df3.iloc[3] # the fourth row (positions 0, 1, 2, 3), which is the 'Athlone' row here

pop        2.4
year    2020.0
Name: Athlone, dtype: float64

In [65]:
# Can we use numbers for index values? Yes - but labels and positions then look alike.
num_list = [4, 5, 6, 3, 2, 1]
df4 = pd.DataFrame(data=data, index=num_list)
df4.loc[2]   # by LABEL: label 2 sits on the fifth row -> Shannon (evaluated, not displayed)
df4.iloc[2]  # by POSITION: the third row, whose label is 6 -> Galway
# Probably not a good idea to use numeric labels that differ from the row positions!

town    Galway
year      2019
pop        3.6
Name: 6, dtype: object

In [80]:
# A new column is added to an existing DataFrame by assigning to a new column name.
df2['Debt'] = np.arange(6, dtype=float)  # 0.0 ... 5.0, one value per row
new_values = [-1.2, 1.5, -2.7]
# Assigning a Series aligns it with the frame's index and replaces the WHOLE column:
# rows 'one', 'three' and 'five' take the new values and every other row becomes NaN
# (the values written by the first assignment are not kept).
df2['Debt'] = pd.Series(data=new_values, index=['one', 'three', 'five'])
df2

Unnamed: 0,pop,year,Debt
one,1.5,2017,-1.2
two,1.7,2018,
three,3.6,2019,1.5
four,2.4,2020,
five,2.9,2021,-2.7
six,3.2,2022,


In [84]:
# .T gives the transpose of a DataFrame: rows become columns and columns become rows
df2.T
# the result is not assigned to anything, so df2 itself keeps its original layout;
# bind it to a name (e.g. df2_t = df2.T) if the transposed frame is needed later

Unnamed: 0,one,two,three,four,five,six
pop,1.5,1.7,3.6,2.4,2.9,3.2
year,2017.0,2018.0,2019.0,2020.0,2021.0,2022.0
Debt,-1.2,,1.5,,-2.7,


### Indexing and Filtering DataFrames

In [97]:
# Boolean conditions can be used to spot - and overwrite - outlying data members.
i = ['Waterford', 'Clonakilty', 'Athenry', 'Meath']  # row labels
c = ['one', 'two', 'three', 'four']                  # column headers
df5 = pd.DataFrame(data=np.arange(16).reshape((4, 4)), index=i, columns=c)  # 0..15, filled row by row
# NB a boolean mask inside df5[...] selects whole ROWS, so every column of each row
# whose 'three' value is greater than nine is set to zero - not just column 'three'.
# To change only that column, use df5.loc[df5['three'] > 9, 'three'] = 0 instead.
df5[df5['three'] > 9] = 0
df5

Unnamed: 0,one,two,three,four
Waterford,0,1,2,3
Clonakilty,4,5,6,7
Athenry,0,0,0,0
Meath,0,0,0,0
