### Introducing Pandas
Pandas is Python And Numerical Data Analysis

In [5]:
import numpy as np
import pandas as pd # for stand-along remember to pip install pandas

In [59]:
# Pandas procides additional data structures
l = [8.0, 7, -5.0, 3, 99]
# a Pandas Series is a mutable collection of the same data type
# p = pd.Series(l, index=range(0,10,2)) # we may set an index on creation
p = pd.Series(l)
p.values # the values of a Series are a Numpy array
p.index # a range-like object - there is always an index
# p[0:3] # by default the index is zero-based, 0, 1, 2.... We can apply our own index
ind = ('d','b','a','c', 'd') # a tuple (same length as the target series)
s = pd.Series(l, index=ind) # we use the tuple as an index
s.index
s['b':'c'] # normal slicing, but inclusive
s['d'] # every match is found - even with repeated index names

d     8.0
d    99.0
dtype: float64

In [69]:
# building Pandas Series
vals = (4,7,-5,3)
ind = {'d', 'b', 'a', 'c'} # set ensures unique members
s = pd.Series(vals, index=ind)
s['d'] # due to the index being a set we cannot presume the index order


-5

In [79]:
s['d'] = 99.99 # we can mutate members of a Series
s.iloc[0] # iloc lets us use the inderlying index location
s

  s['d'] = 99.99 # we can mutate members of a Series


a     4.00
c     7.00
d    99.99
b     3.00
dtype: float64

In [85]:
# as well as mutating members, we can add members to the Series
s['e'] = -321
s['d':'e':2] # step uses the underlying ordinality

d     99.99
e   -321.00
dtype: float64

In [93]:
# We can use comparative logic to return members
s[s>2] # any comparitor is legal here
# we may apply mathematics to all the members
s**0.5  # apply to every member NaN represents 'not a number'

a    2.000000
c    2.645751
d    9.999500
b    1.732051
e         NaN
dtype: float64

In [95]:
s.iloc[2] = np.nan
s

a      4.0
c      7.0
d      NaN
b      3.0
e   -321.0
dtype: float64

### Pandas Data Analysis with Data Frames

In [115]:
# here is some data
sdata = {'Cork':35000, 'Dublin':71000, 'Galway':16000, 'Athlone':5000, 'Genoa':500}
idata = {'Cork', 'Dublin', 'Shannon', 'Galway', 'Athlone'} # set, list, tuple all good
p = pd.Series(sdata, index=idata)
p # any missing data members are assigned NaN

Athlone     5000.0
Dublin     71000.0
Shannon        NaN
Cork       35000.0
Galway     16000.0
dtype: float64

In [117]:
p['Cork'] # case sensitive index
p['Shannon'] = 32345
p

Athlone     5000.0
Dublin     71000.0
Shannon    32345.0
Cork       35000.0
Galway     16000.0
dtype: float64

In [None]:
# Pandas also provides a DataFrame class
# Each column of a DataFrame is a Series
# Therefore each column may only contain a single data type
# the columns of a DataFrame may be of different types
# A DataFrame is somewha like a spreadsheet

In [119]:
towns_l = ['Cork', 'Dublin', 'Galway', 'Athlone', 'Shannon', 'Rosscarbery', 'Athenry']
years_l = [2017, 2018, 2019, 2020, 2021, 2022, 2023]
pop_l   = [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 1.7]
data_d = {'Towns':towns_l, 'Years':years_l, 'Pop':pop_l}
data_d # we have a dictionary

{'Towns': ['Cork',
  'Dublin',
  'Galway',
  'Athlone',
  'Shannon',
  'Rosscarbery',
  'Athenry'],
 'Years': [2017, 2018, 2019, 2020, 2021, 2022, 2023],
 'Pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 1.7]}

In [123]:
# we may construct a DataFrame from this dictionary
df = pd.DataFrame(data_d, index=towns_l) # index is optional
df # we have a pretty reopresentation of the data

Unnamed: 0,Towns,Years,Pop
Cork,Cork,2017,1.5
Dublin,Dublin,2018,1.7
Galway,Galway,2019,3.6
Athlone,Athlone,2020,2.4
Shannon,Shannon,2021,2.9
Rosscarbery,Rosscarbery,2022,3.2
Athenry,Athenry,2023,1.7


In [137]:
# CAREFUL [[]]
df[['Pop', 'Years']] # we may examine single columns of the DataFrame
# a single columns is a Series. A list or columns returns a DataFrame

Unnamed: 0,Pop,Years
Cork,1.5,2017
Dublin,1.7,2018
Galway,3.6,2019
Athlone,2.4,2020
Shannon,2.9,2021
Rosscarbery,3.2,2022
Athenry,1.7,2023


In [145]:
df.head(3) # we may choose to show a few members
df.tail(2)

Unnamed: 0,Towns,Years,Pop
Rosscarbery,Rosscarbery,2022,3.2
Athenry,Athenry,2023,1.7


In [147]:
# we may re-index at any time
new_ind= ('one','two','three','four','five','six','seven')
df.index = new_ind
df

Unnamed: 0,Towns,Years,Pop
one,Cork,2017,1.5
two,Dublin,2018,1.7
three,Galway,2019,3.6
four,Athlone,2020,2.4
five,Shannon,2021,2.9
six,Rosscarbery,2022,3.2
seven,Athenry,2023,1.7


In [167]:
# we may use xisting data to populate a new DataFrame with specific columns
df2 = pd.DataFrame(data_d, columns=['Pop', 'Years'], index=new_ind) # columns in this order
df2
df3 = df2[['Pop']] # we may also make a DataFrame from an existing DataFrame
df3

Unnamed: 0,Pop
one,1.5
two,1.7
three,3.6
four,2.4
five,2.9
six,3.2
seven,1.7


#### Accessing Data within DataFrames

In [195]:
df2.index = towns_l
df2['Pop'] # returns just that column
df2.iloc[3] # iloc lets us use the underlying ordinality of the members (just that row)
df2.loc['Shannon'] # locate a member from the index - return the matching row

Pop         2.4
Years    2020.0
Name: Athlone, dtype: float64

In [209]:
# we may use both loc and iloc together
num_l = [4,5,6,3,2,1,0]
df4 = pd.DataFrame(data_d, index=num_l)
df4
# df4.loc[2] # our custom index member 2
df4.iloc[2] # the underlying ordinal index

Towns    Galway
Years      2019
Pop         3.6
Name: 6, dtype: object