## Introducing Pandas
Pandas is a Python library for numerical data analysis.

In [193]:
import numpy as np
import pandas as pd

In [194]:
# Beyond the plain Python containers, pandas supplies its own data structures.
# A Series is a mutable, one-dimensional collection holding a single data type
l = [9.0, 7, -5.0, 3, 99]
p = pd.Series(data=l)
p.to_numpy() # the members, as a numpy array
p.index # no index was supplied, so members are labelled by position

RangeIndex(start=0, stop=5, step=1)

In [195]:
# A Series may carry custom labels in place of the default positions.
# Labels are often strings, but may be any data type, and need not be unique ('c' appears twice)
ind = tuple('cbacd')
s = pd.Series(data=l, index=ind)
s.dtype
s.loc['b':'d'] # label slices include BOTH endpoints: here from 'b' through 'd'

b     7.0
a    -5.0
c     3.0
d    99.0
dtype: float64

In [196]:
s.loc['c'] # a repeated label returns every matching member, as a Series

c    9.0
c    3.0
dtype: float64

In [197]:
# A boolean mask selects just the members that satisfy a condition
s.loc[s.gt(2)]
t = s.pow(2) # arithmetic is applied element-wise, to every member of the Series
s # NB such operations return a new Series; the original is not mutated

c     9.0
b     7.0
a    -5.0
c     3.0
d    99.0
dtype: float64

In [198]:
# NaN ("not a number") is a special floating-point value marking the absence of a number
s.loc['a'] = np.nan
s.pow(2) # operates on all values except NaN, which stays NaN
s.count() # counts only the values that are not NaN
s.mean() # NaN members are left out of the average

29.5

### Indexing and Ordinality

In [200]:
s.index # the labels (not displayed: only a cell's last expression is shown)
s.iloc[3] # iloc ignores the labels and uses the underlying ordinal position: 3 is the fourth member

3.0

In [201]:
s.loc['g'] = 99 # a label not yet in the index is appended as a new member
s.loc['b':'g'] # label slice, both endpoints included

b     7.0
a     NaN
c     3.0
d    99.0
g    99.0
dtype: float64

### Pandas DataFrame

In [203]:
# here is some data
sdata = {'Cork':35000, 'Dublin':71000, 'Galway':16000, 'Athlone':5000, 'Genoa':500} # dict
# The index is an ordered list rather than a set: a set has no defined order, so the
# row order of the resulting Series could change from one kernel run to the next
idata = ['Athlone', 'Dublin', 'Shannon', 'Galway', 'Cork'] # list
p = pd.Series(sdata, index=idata)
p # index labels missing from sdata ('Shannon') are assigned NaN; keys not in the index ('Genoa') are left out

Athlone     5000.0
Dublin     71000.0
Shannon        NaN
Galway     16000.0
Cork       35000.0
dtype: float64

In [204]:
p.loc['Cork']
p.loc['Shannon'] = 32345 # the label already exists, so its value is overwritten rather than a member appended
p

Athlone     5000.0
Dublin     71000.0
Shannon    32345.0
Galway     16000.0
Cork       35000.0
dtype: float64

In [205]:
# A DataFrame is laid out like a spreadsheet.
# Every column is a Series, so a single column holds one data type,
# while different columns may differ in their data type
towns_l = ['Cork', 'Dublin', 'Galway', 'Athlone', 'Shannon', 'Rosscarbery', 'Athenry']
years_l = list(range(2017, 2024))
pop_l   = [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 1.7]
data_d = dict(Towns=towns_l, Years=years_l, Pop=pop_l)
df = pd.DataFrame(data=data_d, index=towns_l)
df.dtypes # one data type per column
df

Unnamed: 0,Towns,Years,Pop
Cork,Cork,2017,1.5
Dublin,Dublin,2018,1.7
Galway,Galway,2019,3.6
Athlone,Athlone,2020,2.4
Shannon,Shannon,2021,2.9
Rosscarbery,Rosscarbery,2022,3.2
Athenry,Athenry,2023,1.7


In [206]:
# working with DataFrame structures
# Careful - remember double square brackets:
# the outer pair indexes the DataFrame, the inner pair is a list of column names
df[['Years','Pop']] # the listed columns are returned in the order given

Unnamed: 0,Years,Pop
Cork,2017,1.5
Dublin,2018,1.7
Galway,2019,3.6
Athlone,2020,2.4
Shannon,2021,2.9
Rosscarbery,2022,3.2
Athenry,2023,1.7


In [207]:
df.head(3) # the first three rows (not displayed: only a cell's last expression is shown)
df.tail(2) # the last two rows

Unnamed: 0,Towns,Years,Pop
Rosscarbery,Rosscarbery,2022,3.2
Athenry,Athenry,2023,1.7


In [208]:
# The row labels can be replaced wholesale by assigning a new index (one label per row)
df.index = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']
df.loc['three'] # 'loc' looks a row up by its index label - the row comes back as a Series
df.iloc[3] # 'iloc' still works on the underlying ordinality: position 3 is the row labelled 'four'

Towns    Athlone
Years       2020
Pop          2.4
Name: four, dtype: object

In [209]:
# we may create new DataFrames from existing data
# .copy() makes df2 an independent frame, so adding columns to it later
# does not touch df and does not raise pandas' SettingWithCopyWarning
df2 = df[['Towns', 'Pop']].copy() # remember [[]] to select a list of columns
df2

Unnamed: 0,Towns,Pop
one,Cork,1.5
two,Dublin,1.7
three,Galway,3.6
four,Athlone,2.4
five,Shannon,2.9
six,Rosscarbery,3.2
seven,Athenry,1.7


In [210]:
# we may add new columns to a DataFrame
df2['Tour'] = np.arange(len(df2)) # one value per row, whatever the length of the frame
# Read a single member with ONE indexing step rather than chaining two [] lookups
df2.loc['three', 'Tour'] # by label: row 'three', column 'Tour'
df2.iloc[3, df2.columns.get_loc('Tour')] # by position: iloc needs the column's ordinal as well

3

In [229]:
# Use `df.loc[row_indexer, "col"] = values` to mutate individual members:
# row and column are given in one .loc call, so the assignment lands in df itself
df.loc['six', 'Pop'] = np.nan # mark the 'Pop' value of row 'six' as missing
df

Unnamed: 0,Towns,Years,Pop
one,Cork,2017,1.5
two,Dublin,2018,1.7
three,Galway,2019,3.6
four,Athlone,2020,2.4
five,Shannon,2021,2.9
six,Rosscarbery,2022,
seven,Athenry,2023,1.7
