### Introducing Pandas

In [2]:
# by convention we use the first cell to import stuff
import numpy as np
import pandas as pd

In [9]:
# A pandas Series is a one-dimensional labelled array.
# The list mixes the float 6.0 with ints, so pandas upcasts every element
# to float64 and all members end up sharing a single dtype.
o = pd.Series(data=[6.0, 7, -5, 3, 99])
o.dtype   # the common element type
o.values  # the underlying data as a 1-dimensional numpy array
o.index   # no index was supplied, so a default one is created
o         # only this last expression is displayed by the notebook

0     6.0
1     7.0
2    -5.0
3     3.0
4    99.0
dtype: float64

#### Building Series

In [15]:
# Build a Series from two plain Python lists: one of data, one of labels.
values = [4, 7, -5, 3]
ind = ['d', 'b', 'a', 'c']
s = pd.Series(data=values, index=ind)  # our labels replace the default index
s.loc['a']       # look up the member whose index label is 'a'
s.loc['e'] = 99  # assigning to an unknown label adds a new member
s.loc['b'] = 77  # assigning to an existing label overwrites that member
s

d     4
b    77
a    -5
c     3
e    99
dtype: int64

#### Slicing Series

In [22]:
# CAREFUL: slicing by label includes BOTH end points, so the member labelled
# 'c' is part of the result (positional start:stop slicing would stop before it).
print(s.loc['d':'c'])
# Slicing only returns a selection; s itself is NOT mutated and keeps every member.
s

d     4
b    77
a    -5
c     3
dtype: int64


d     4
b    77
a    -5
c     3
e    99
dtype: int64

In [27]:
# We can do maths and we can implement logic on a whole Series at once:
# comparisons give boolean masks for filtering, arithmetic works element-wise.
s.loc[s > 5]  # keep only the members greater than 5
s.loc[s < 0]  # keep only the negative members
s.pow(0.5)    # square root: the data type changes to float automatically,
              # and the root of the negative member comes out as NaN

d    2.000000
b    8.774964
a         NaN
c    1.732051
e    9.949874
dtype: float64

In [28]:
# NaN means "not a number" - but NaN is itself a floating-point value, so the Series stays numeric (float64)

In [29]:
# Values keyed by town name, held in an ordinary Python dict.
sdata = {
    'Cork': 35000,
    'Dublin': 71000,
    'Galway': 16000,
    'Athlone': 5000,
}
# Index labels for later use: every key of sdata in order, plus 'Shannon',
# which has no entry in the dict.
idata = list(sdata) + ['Shannon']

In [30]:
# Build a Series from the dict, taking the labels (and their order) from idata.
p = pd.Series(data=sdata, index=idata)
p  # missing values (labels with no dict entry) are automatically set to NaN

Cork       35000.0
Dublin     71000.0
Galway     16000.0
Athlone     5000.0
Shannon        NaN
dtype: float64

In [31]:
# Fill in the missing entry by assigning to its label; the Series is float64,
# so the value is shown as 32323.0.
p.loc['Shannon'] = 32323
p

Cork       35000.0
Dublin     71000.0
Galway     16000.0
Athlone     5000.0
Shannon    32323.0
dtype: float64

#### Pandas Data Frame

In [32]:
# A DataFrame is a collection of pandas Series used as columns.
# All of the columns share one index, and each column may have a header,
# so the result looks a bit like a spreadsheet.
town_list = ['Cork', 'Dublin', 'Galway', 'Athlone', 'Shannon', 'Roscarberry']
year_list = list(range(2017, 2023))  # 2017 to 2022 inclusive
pop_list = [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
# Map each column header to the list holding that column's values.
data = dict(town=town_list, year=year_list, pop=pop_list)

In [33]:
# Each key of the dict becomes a column header and each list becomes that column.
df = pd.DataFrame.from_dict(data)
df  # each column is now made up of a pandas Series

Unnamed: 0,town,year,pop
0,Cork,2017,1.5
1,Dublin,2018,1.7
2,Galway,2019,3.6
3,Athlone,2020,2.4
4,Shannon,2021,2.9
5,Roscarberry,2022,3.2


In [36]:
# head(n) gives the first n rows and tail(n) the last n rows.
df.head(n=3)
df.tail(n=2)  # only this last expression is displayed by the notebook

Unnamed: 0,town,year,pop
4,Shannon,2021,2.9
5,Roscarberry,2022,3.2


In [38]:
# The columns argument picks which columns we need and the order they appear in;
# the index argument supplies our own row labels in place of the default ones.
index_l = 'one two three four five six'.split()
df = pd.DataFrame(
    data=data,
    columns=['pop', 'year'],
    index=index_l,
)
# Selecting a single column gives back a pandas Series. Bracket access is
# needed for this column because df.pop is already a DataFrame method.
df['pop']

one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
six      3.2
Name: pop, dtype: float64