### Introduction to Pandas

In [2]:
# by convention we import things in the top cell
import numpy as np
import pandas as pd # convention is call it pd

In [7]:
# A Series is pandas' one-dimensional labelled array.
# Every member shares one dtype, so the mixed int/float literals below are all stored as floats.
o = pd.Series(data=[8.0, 7, -5, 3, 99])
o.dtype   # the single dtype shared by all members
o         # no index was supplied, so pandas generated a zero-based one
o.values  # the underlying data as a numpy array
o.index   # the generated index is a RangeIndex

RangeIndex(start=0, stop=5, step=1)

#### Building Series

In [16]:
vals = [4, 7, -5, 3]          # a plain Python list of integers
ind  = ['d', 'b', 'a', 'c']   # a plain Python list of labels
# Build a Series from the two lists, explicitly supplying the index labels.
s = pd.Series(vals, index=ind)
s       # members can be reached by our own labels as well as by position
s['c']  # access a member by its label
# The Series was built from integers, so it has an integer dtype. Writing a float
# into it relies on pandas silently upcasting the whole Series, which newer pandas
# versions deprecate. Convert to float explicitly first so the assignment is
# well-defined on every pandas version.
s = s.astype('float64')
s['b'] = 99.4
s
# Assigning to a label that does not exist yet appends a new member.
s['e'] = 0.001
s

d     4.000
b    99.400
a    -5.000
c     3.000
e     0.001
dtype: float64

#### Slicing Series

In [20]:
# Label slices are start:stop:step and, unlike positional slices, INCLUDE the stop label.
s.loc['b':'e':2]

b    99.4
c     3.0
dtype: float64

In [24]:
# A boolean mask keeps only the members for which the condition is True.
s.loc[s > 2]  # members greater than two
s.loc[s < 0]  # negative members
# Arithmetic is element-wise; the square root of a negative member comes out as NaN.
s ** 0.5

d    2.000000
b    9.969955
a         NaN
c    1.732051
e    0.031623
dtype: float64

In [26]:
# A Series can also be built from a dict: the keys supply the labels.
sdata = {
    'Cork': 35000,
    'Dublin': 71000,
    'Galway': 16000,
    'Athlone': 5000,
}
# The index decides which labels appear and in what order.
idata = ['Cork', 'Dublin', 'Shannon', 'Athlone', 'Galway']
p = pd.Series(data=sdata, index=idata)
p  # 'Shannon' has no entry in sdata, so its value is automatically set to NaN

Cork       35000.0
Dublin     71000.0
Shannon        NaN
Athlone     5000.0
Galway     16000.0
dtype: float64

In [28]:
p.loc['Cork']             # read one member by its label
p.loc['Shannon'] = 32345  # overwrite the member stored under 'Shannon'
p

Cork       35000.0
Dublin     71000.0
Shannon    32345.0
Athlone     5000.0
Galway     16000.0
dtype: float64

### Pandas DataFrame

In [32]:
# A DataFrame is a collection of Series: each column of a DataFrame is a Series.
# A single column therefore holds ONE data type,
# while different columns are free to hold different data types.
# The overall result looks a bit like a spreadsheet.
towns_l = [
    'Cork', 'Dublin', 'Galway', 'Athlone',
    'Shannon', 'Rosscarbery', 'Athenry',
]
years_l = list(range(2017, 2024))  # 2017 .. 2023 inclusive
pop_l = [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 1.7]
# One dict entry per future column: column name -> list of values.
data = dict(Town=towns_l, Year=years_l, Pop=pop_l)
data

{'Town': ['Cork',
  'Dublin',
  'Galway',
  'Athlone',
  'Shannon',
  'Rosscarbery',
  'Athenry'],
 'Year': [2017, 2018, 2019, 2020, 2021, 2022, 2023],
 'Pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 1.7]}

In [43]:
# The row labels can be supplied via `index` when the DataFrame is built.
# Because an index is passed here, the town names are used as the row labels
# instead of a default zero-based index.
df = pd.DataFrame(data=data, index=towns_l)
df  # each column of the DataFrame is a Series

Unnamed: 0,Town,Year,Pop
Cork,Cork,2017,1.5
Dublin,Dublin,2018,1.7
Galway,Galway,2019,3.6
Athlone,Athlone,2020,2.4
Shannon,Shannon,2021,2.9
Rosscarbery,Rosscarbery,2022,3.2
Athenry,Athenry,2023,1.7


In [39]:
df.head(n=3)  # only the first three rows
df.tail(n=2)  # only the last two rows

Unnamed: 0,Town,Year,Pop
5,Rosscarbery,2022,3.2
6,Athenry,2023,1.7


In [86]:
index_l = 'one two three four five six seven'.split()
# The row labels of an existing DataFrame can be replaced AFTER it has been created.
df.index = index_l
df
# Alternatively, build a fresh DataFrame that keeps only the columns we want.
df2 = pd.DataFrame(data=data, index=index_l, columns=['Year', 'Pop'])
df2

Unnamed: 0,Year,Pop
one,2017,1.5
two,2018,1.7
three,2019,3.6
four,2020,2.4
five,2021,2.9
six,2022,3.2
seven,2023,1.7


### Accessing Data within DataFrames

In [87]:
# Relabel the rows with the town names so that rows can be looked up by name.
df2.index = towns_l
df2
# `loc` is label-based: it returns the row whose index label is 'Athlone'.
df2.loc['Athlone']

Year    2020.0
Pop        2.4
Name: Athlone, dtype: float64

In [88]:
# `iloc` is position-based: it finds a row by its ordinal location, counting from zero.
df2.iloc[3]  # positions 0, 1, 2, 3 -> the member at position three

Year    2020.0
Pop        2.4
Name: Athlone, dtype: float64

In [89]:
# Integer labels that deliberately do NOT match the row positions.
num_l = [4, 5, 6, 3, 2, 1, 0]
df3 = pd.DataFrame(data=data, index=num_l)
df3
df3.loc[6]   # label 6 sits on the third row -> Galway
df3.iloc[6]  # position 6 is the last row -> Athenry

Town    Athenry
Year       2023
Pop         1.7
Name: 0, dtype: object

In [94]:
# DataFrames are mutable: assigning to a new column name adds that column.
df2['Tour'] = np.arange(7, dtype=float)
# Reading a value and doing arithmetic on it produces a new number;
# it does NOT alter the original DataFrame.
df2.loc['Galway', 'Tour'] + 99.0
# Assigning through `at` DOES alter the original DataFrame.
df2.at['Galway', 'Tour'] = 99
df2
# A whole column can be overwritten from a Series; values are matched by label.
new_values = [3.2, 5.3, 2.9]
df2['Tour'] = pd.Series(data=new_values, index=['Galway', 'Shannon', 'Dublin'])
df2  # rows whose label is missing from the new Series become NaN

Unnamed: 0,Year,Pop,Tour
Cork,2017,1.5,
Dublin,2018,1.7,2.9
Galway,2019,3.6,3.2
Athlone,2020,2.4,
Shannon,2021,2.9,5.3
Rosscarbery,2022,3.2,
Athenry,2023,1.7,


### Indexing and Filtering DataFrames

In [96]:
# Row labels and column labels for a small demo grid.
i = ['Waterford', 'Clonakilty', 'Athenry', 'Meath']
c = ['red', 'green', 'gold', 'white']
# Fill the grid with 0..15, one row after another.
df4 = pd.DataFrame(
    data=np.arange(16).reshape((len(i), len(c))),
    index=i,
    columns=c,
)
df4

Unnamed: 0,red,green,gold,white
Waterford,0,1,2,3
Clonakilty,4,5,6,7
Athenry,8,9,10,11
Meath,12,13,14,15


In [105]:
# A boolean mask on one column selects the matching row(s) as a new DataFrame.
whichRow = df4[df4['gold'] == 6]
whichRow
# Values can be mutated in place. Assigning through `whichRow`, or through the
# chained form df4.loc[row][col] = ..., writes to an intermediate object; that
# write is not guaranteed to reach df4 and never does under Copy-on-Write.
# Give the row label and column label to a single `.loc` call instead.
df4.loc['Clonakilty', 'green'] = 55
df4

Unnamed: 0,red,green,gold,white
Waterford,0,1,2,3
Clonakilty,4,55,6,7
Athenry,8,9,10,11
Meath,12,13,14,15


In [106]:
# `iloc` can assign as well, but it needs BOTH positions in a single call: the
# chained form df4.iloc[1]['white'] = ... writes to a temporary row object and
# may leave df4 unchanged. `get_loc` translates the column label to its position.
df4.iloc[1, df4.columns.get_loc('white')] = 77
df4

Unnamed: 0,red,green,gold,white
Waterford,0,1,2,3
Clonakilty,4,55,6,77
Athenry,8,9,10,11
Meath,12,13,14,15
