## Pandas for Data Analysis
NumPy and pandas work together to offer performant tools for data analysis.

In [1]:
# we normally import at the top
import numpy as np # this provides array and arange
import pandas as pd # provides series and data frame

In [6]:
# A Series is pandas' one-dimensional data structure: a mutable, indexed
# collection whose values all share a single data type.
l = [8.0, 7, -5, 3, 99]   # a plain list mixing a float with ints
m = ['a', 'b', 'c', 'd']  # a plain list of strings
o, p = pd.Series(l), pd.Series(m)

# Only the final expression of a cell is rendered; the earlier bare
# expressions are evaluated and their results discarded.
o
p
o.values  # the underlying numpy array
o.index   # the default index: a range starting at 0
o         # every Series carries an index

0     8.0
1     7.0
2    -5.0
3     3.0
4    99.0
dtype: float64

In [27]:
# Build a Series whose index labels are supplied explicitly (labels may repeat).
vals = (4, 6, 3, 9, 6, 2, 5, 7)
ind = ('d', 'b', 'a', 'c', 'e', 'd', 'g', 'h')
s = pd.Series(data=vals, index=ind)
s.loc['d']  # 'd' occurs twice in the index, so every matching member is returned
# CAREFUL: slicing by label runs from start THROUGH stop (the stop label is
# included), unlike ordinary Python slicing, which stops before the end.
s.loc['b':'g']  # slice by label (a repeated label cannot be used as an endpoint)

b    6
a    3
c    9
e    6
d    2
g    5
dtype: int64

In [25]:
# Replace the member labelled 'c' with one third of its current value.
# NOTE: running this cell again divides the value by three again.
s.loc['c'] = s.loc['c'] / 3

In [32]:
# A Series member can be reached by ordinal position or by index label.
# Positional access goes through .iloc: with a non-integer index, a bare
# integer key such as s[1] is deprecated and newer pandas versions treat it
# as a label lookup instead of a position.
s.iloc[1]  # ordinal position 1 (zero-based)
s['b']     # index label 'b'

6

In [42]:
# We can add new members to a Series: assigning to a label that does not
# exist yet appends it.
s.loc['z'] = -3
s.loc['e'] = np.nan  # overwrite an existing member with NaN ("not a number")
s

d    4.0
b    6.0
a    3.0
c    9.0
e    NaN
d    2.0
g    5.0
h    7.0
i    2.0
z   -3.0
dtype: float64

In [43]:
# A boolean condition on the Series acts as a filter: only the members for
# which it holds are returned.
s.loc[s > 2]
s.loc[s < 0]
s.pow(0.5)  # square root; NaN ("not a number") is itself a numeric value

d    2.000000
b    2.449490
a    1.732051
c    3.000000
e         NaN
d    1.414214
g    2.236068
h    2.645751
i    1.414214
z         NaN
dtype: float64

In [44]:
# Working with structured data: build a Series from a dict plus an explicit index.
sdata = {'Cork': 35000, 'Dublin': 71000, 'Galway': 16000, 'Athlone': 5000}
# Use an ordered container (list or tuple) for the index. A set has no
# guaranteed iteration order, so the row order of the resulting Series could
# change from one kernel session to the next.
idata = ['Cork', 'Shannon', 'Athlone', 'Galway', 'Dublin']
p = pd.Series(sdata, index=idata)
p  # index labels that are missing from sdata ('Shannon') take NaN

Cork       35000.0
Shannon        NaN
Athlone     5000.0
Galway     16000.0
Dublin     71000.0
dtype: float64

In [47]:
p.loc['Cork']             # read a single member by its label
p.loc['Shannon'] = 32344  # assign a value to the 'Shannon' label
p

Cork       35000.0
Shannon    32344.0
Athlone     5000.0
Galway     16000.0
Dublin     71000.0
dtype: float64

### Pandas Data Frame structure

In [48]:
# A DataFrame is a collection of Series:
#   - each column is a Series, so a single column holds one data type
#   - different columns may hold different data types
#   - the overall layout looks somewhat like a spreadsheet
towns_l = [
    'Cork', 'Dublin', 'Galway', 'Athlone',
    'Shannon', 'Rosscarbery', 'Athenry',
]
years_l = list(range(2017, 2024))  # 2017 through 2023 inclusive
pop_l = [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 1.7]

In [50]:
# Gather the three lists into a plain Python dictionary keyed by column name.
data_d = dict(towns=towns_l, year=years_l, pop=pop_l)
data_d

{'towns': ['Cork',
  'Dublin',
  'Galway',
  'Athlone',
  'Shannon',
  'Rosscarbery',
  'Athenry'],
 'year': [2017, 2018, 2019, 2020, 2021, 2022, 2023],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 1.7]}

In [54]:
# A pandas DataFrame presents the same dictionary as a labelled table.
df = pd.DataFrame(data=data_d, index=towns_l)  # supplying an index is optional
df
# The row labels can be replaced after the DataFrame has been built.
index_l = [
    'one', 'two', 'three', 'four',
    'five', 'six', 'seven',
]
df.index = index_l
df

Unnamed: 0,towns,year,pop
one,Cork,2017,1.5
two,Dublin,2018,1.7
three,Galway,2019,3.6
four,Athlone,2020,2.4
five,Shannon,2021,2.9
six,Rosscarbery,2022,3.2
seven,Athenry,2023,1.7


In [59]:
df.head(n=2)  # just the first two rows
df.tail(n=3)  # just the last three rows

Unnamed: 0,towns,year,pop
five,Shannon,2021,2.9
six,Rosscarbery,2022,3.2
seven,Athenry,2023,1.7


In [63]:
# Alternatively, a DataFrame can be made from primitive structures while
# choosing which dictionary keys become columns, and in what order.
df2 = pd.DataFrame(data_d, index=towns_l, columns=['pop', 'year'])
df2

Unnamed: 0,pop,year
Cork,1.5,2017
Dublin,1.7,2018
Galway,3.6,2019
Athlone,2.4,2020
Shannon,2.9,2021
Rosscarbery,3.2,2022
Athenry,1.7,2023


In [65]:
df2.loc[:, 'year']  # a single column of a DataFrame is a Series

Cork           2017
Dublin         2018
Galway         2019
Athlone        2020
Shannon        2021
Rosscarbery    2022
Athenry        2023
Name: year, dtype: int64

### Using loc and iloc

In [67]:
# .loc looks a row up by its index label.
df2.loc['Shannon', :]
# .iloc looks a row up by its underlying ordinal (zero-based) position.
df2.iloc[3, :]

pop        2.4
year    2020.0
Name: Athlone, dtype: float64

In [71]:
# Numbers can serve as our own index labels too.
num_l = [4, 5, 6, 3, 2, 1, 0]
df3 = pd.DataFrame(data=data_d, index=num_l)
df3
# With integer labels, .loc and .iloc now answer different questions:
df3.loc[2, :]   # Shannon - the row whose index label is 2
df3.iloc[2, :]  # Galway  - the row at ordinal position 2

towns    Galway
year       2019
pop         3.6
Name: 6, dtype: object

### Accessing and Mutating Dataframe members

In [78]:
# New columns can be added to a DataFrame by assignment.
df2['tour'] = np.arange(len(df2))  # one value per row, whatever the row count
# Read a single cell with one .loc[row, column] lookup instead of chaining two
# indexers; the arithmetic produces a new value and does not mutate df2.
df2.loc['Galway', 'tour'] + 9
new_val = [3.2, 5.3, 2.9]
# Assigning a Series matches its values to rows by index label.
df2['tour'] = pd.Series(new_val, index=['Galway', 'Shannon', 'Dublin'])
df2  # rows for which no new value was provided become NaN

Unnamed: 0,pop,year,tour
Cork,1.5,2017,
Dublin,1.7,2018,2.9
Galway,3.6,2019,3.2
Athlone,2.4,2020,
Shannon,2.9,2021,5.3
Rosscarbery,3.2,2022,
Athenry,1.7,2023,


### Indexing and Filtering Dataframe

In [80]:
# Row labels and column labels for a small 4 x 4 demonstration frame.
i = ['Waterford', 'Clonakilty', 'Athenry', 'Meath']
c = ['red', 'green', 'gold', 'white']
df4 = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),  # the integers 0..15 laid out row by row
    index=i,
    columns=c,
)
df4

Unnamed: 0,red,green,gold,white
Waterford,0,1,2,3
Clonakilty,4,5,6,7
Athenry,8,9,10,11
Meath,12,13,14,15


In [82]:
# Keep only the rows whose 'gold' value equals 6.
whichRow = df4.loc[df4['gold'].eq(6)]
whichRow

Unnamed: 0,red,green,gold,white
Clonakilty,4,5,6,7


In [84]:
# Set individual cells with a single indexer call so the write lands on df4
# itself and the mutation persists. Chained forms such as
# df4.loc[row][col] = value assign into an intermediate object and are not
# guaranteed to update the original DataFrame.
df4.loc['Clonakilty', 'green'] = 55             # by row label and column label
df4.iloc[1, df4.columns.get_loc('white')] = 99  # by row position and column position
df4

Unnamed: 0,red,green,gold,white
Waterford,0,1,2,3
Clonakilty,4,55,6,99
Athenry,8,9,10,11
Meath,12,13,14,15
