### Using Pandas for Data Analysis

In [50]:
# most data analysis needs the following three libraries
import numpy as np # gives us arrays 
import pandas as pd # gives us series and data frames
import matplotlib.pyplot as plt # lets us plot charts

In [13]:
# The name "pandas" comes from "panel data" (and is also a play on "Python data analysis")
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])  # built from a list of floating point numbers
data  # a Series is like a NumPy array, but it carries an explicit index

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [14]:
# CAREFUL - a Series can ONLY contain ONE data type (dtype), shared by all of its values
# all float, all int, all string object etc.
# we can slice a series
# NB only the LAST expression in a cell is displayed, so the slice itself is not shown
data[1:4] # [start:stop-before:step]
data[3] # a single value, looked up by its index label

1.0

In [15]:
# we can see just the index
data.index # (not displayed - only the last expression in a cell is shown)
# or just the values
data.values # ooh look it is a Numpy array!!

array([0.25, 0.5 , 0.75, 1.  ])

In [16]:
# A Series index does NOT have to be numeric.
# Start from a Python dictionary - a collection whose members are looked up by key
# (remember: [] makes a list, {} makes a dictionary)
population_dict = {
    'London': 38332521,
    'Reading': 26448193,
    'Birmingham': 19651127,
    'Bath': 19552860,
    'Southampton': 12882135,
    'Oxford': 1200,
}
# the dictionary keys become the index and the dictionary values become the data
population = pd.Series(data=population_dict)
population  # a STRING-indexed collection

London         38332521
Reading        26448193
Birmingham     19651127
Bath           19552860
Southampton    12882135
Oxford             1200
dtype: int64

In [17]:
# we can slice by index label too
# NB unlike positional slicing, a label slice INCLUDES the end label ('Bath' is in the result)
population['London':'Bath'] # we can still slice

London        38332521
Reading       26448193
Birmingham    19651127
Bath          19552860
dtype: int64

In [18]:
# A Series can also be assembled from two separate Python lists:
# one holding the values and one holding the matching index labels
values_list = [5, 4, 3, 2]
index_list = ['c', 'd', 'a', 'b']

# pair each value with the label at the same position
g = pd.Series(data=values_list, index=index_list)
g

c    5
d    4
a    3
b    2
dtype: int64

#### Using Pandas Data Frame

In [19]:
# some fictional area figures, keyed by city name
area_dict = {
    'London': 423967,
    'Birmingham': 695662,
    'Reading': 141297,
    'Bath': 170312,
    'Southampton': 149995,
    'Oxford': 20100,
}

In [20]:
area_dict, population_dict  # both dictionaries (not the last expression, so not displayed)
# turn the area dictionary into a Series, just as was done for the population
area = pd.Series(data=area_dict)

# A DataFrame looks rather like a spreadsheet:
#   - it is made up of columns, and each column is a Series
#   - each Series is made from a list or a dictionary collection
#   - each collection is made of int, float, string etc
# Here two separate Series are brought together as the columns of one DataFrame.
cities = pd.DataFrame(data={'population': population, 'area': area})
cities

Unnamed: 0,population,area
Bath,19552860,170312
Birmingham,19651127,695662
London,38332521,423967
Oxford,1200,20100
Reading,26448193,141297
Southampton,12882135,149995


In [21]:
# It IS possible to build everything in one long-winded statement,
# but statements like this tend to be harder to read -
# overall we aim for clarity over brevity
long_and_difficult = pd.DataFrame(data={
    'population': {
        'London': 38332521,
        'Reading': 26448193,
        'Birmingham': 19651127,
        'Bath': 19552860,
        'Southampton': 12882135,
        'Oxford': 1200,
    },
    'area': {
        'London': 423967,
        'Birmingham': 695662,
        'Reading': 141297,
        'Bath': 170312,
        'Southampton': 149995,
        'Oxford': 20100,
    },
})
long_and_difficult

Unnamed: 0,population,area
London,38332521,423967
Reading,26448193,141297
Birmingham,19651127,695662
Bath,19552860,170312
Southampton,12882135,149995
Oxford,1200,20100


In [22]:
# DataFrames give us really easy access to data analysis statistics
# (only the last expression in the cell - here max() - is displayed)
cities.mean() # 1.947801e+07 is 1.947801 * 10^7 i.e. 19478010
cities.max() # also available: mean, min, sum

population    38332521
area            695662
dtype: int64

In [23]:
# we can see the row labels and the column labels
cities.index
cities.columns
# we can select by column name
cities['area'] # show a COLUMN of our data - ie a Series

Bath           170312
Birmingham     695662
London         423967
Oxford          20100
Reading        141297
Southampton    149995
Name: area, dtype: int64

In [24]:
# we have basic analysis: describe() summarises each column here
# (count, mean, std, min, quartiles, max)
cities.describe()

Unnamed: 0,population,area
count,6.0,6.0
mean,19478010.0,266888.833333
std,12860510.0,248269.719766
min,1200.0,20100.0
25%,14549820.0,143471.5
50%,19601990.0,160153.5
75%,24748930.0,360553.25
max,38332520.0,695662.0


### We can Load External Data into Pandas Data Frames

In [51]:
# you may need a relative path pd.read_csv('../../data/nba.csv')
df = pd.read_csv('nba.csv') # or Excel, or JSON etc.
# NB only the LAST expression in this cell is displayed; the others are evaluated and discarded
df # NaN (Not a Number) marks a missing value
df.head(50) # head (or tail) to show a specific few values
df['Name']  # this is a SERIES (see it has an index)
df.columns # we see 'Salary ' has a trailing space
sub = ['Name', 'Salary ']
df[sub] # we provide a LIST of the column names
# df.info()
df.describe()
df

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [26]:
df.describe()  # remember the parentheses - describe is a method, so it must be called!!
df.info() # info() prints its summary directly rather than returning a value to display

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [27]:
# group the rows by the value in the 'College' column
college = df.groupby('College')
college.groups # a dictionary of all the groupings
# we can show any of the groups like this
# (only the last one - "Arizona State" - is displayed)
college.get_group("Arizona")
college.get_group("Kentucky")
college.get_group("Arizona State")

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
92,Jeff Ayres,Los Angeles Clippers,19.0,PF,29.0,6-9,250.0,Arizona State,111444.0
249,James Harden,Houston Rockets,13.0,SG,26.0,6-5,220.0,Arizona State,15756438.0


In [28]:
# we can group any column - here's Team groupings
team = df.groupby('Team')
team.groups # a dictionary of all the groupings (not displayed)
team.get_group("Boston Celtics") # just the rows whose Team is "Boston Celtics"

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
8,Terry Rozier,Boston Celtics,12.0,PG,22.0,6-2,190.0,Louisville,1824360.0
9,Marcus Smart,Boston Celtics,36.0,PG,22.0,6-4,220.0,Oklahoma State,3431040.0


#### CAREFUL - the original DataFrame remains unchanged

In [29]:
# how can we see those players aged 27 or more??
# comparison operators: > < >= <= == !=   (use ~ to negate a boolean Series)
df[['Age']] >= 27 # a one-column DataFrame of True and False values
df[ df[['Age']] >= 27  ] # masking with a DataFrame keeps every row and fills non-matches with NaN - not useful
df_age = df['Age'] # a single column, i.e. a Series
df_age >= 27 # a boolean Series: True where the player is aged 27 or more
df[df_age >= 27] # a boolean Series used as a mask keeps ONLY the matching rows
# NB none of the expressions above modify df; to keep a result, assign it to a new name
df # the original data frame is unchanged

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


#### Using loc and iloc to access data members by row

In [30]:
# loc selects by index LABEL
# iloc selects by INTEGER position (iloc = integer location)
df
df.loc[4] # loc uses the index name (might be a string)
df.iloc[4] # in this data set, the index and the location are the same number


Name         Jonas Jerebko
Team        Boston Celtics
Number                 8.0
Position                PF
Age                   29.0
Height                6-10
Weight               231.0
College                NaN
Salary           5000000.0
Name: 4, dtype: object

In [32]:
# Mini Challenge (using this data set) - until 2:30
# NB only the last expression in the cell (df.Age.max()) is displayed
# What is the mean weight of all the players?
df['Weight'].mean()
# Who is the lightest player?
df[['Weight']].min() # this gives the lightest WEIGHT only - who is this??
# Who is the oldest in this data set?
df[['Age']].max() # this gives the greatest AGE only, not the player's name
# Group the teams then find the number of players in Utah Jazz
team = df.groupby('Team')
team.get_group("Utah Jazz").count() # count() gives the number of non-missing values in EACH column
# optional (if time)
# What is the data type of 'Height'
# Who is the oldest in 'Arizona'?
# NB the line below is the greatest age across ALL players - it does not filter on 'Arizona'
df.Age.max() # use dot notation if the heading has no spaces
# Who earns most in 'Boston Celtics'?
# df['Salary '].max() # use square-bracket notation if there are spaces in the heading

40.0

In [40]:
# group the data by college, then take the mean age of each group
# NB mean() only makes sense for numeric columns: also selecting the text column 'Name'
# raises a TypeError in pandas 2.x (older versions silently dropped it), so select 'Age' only
df.groupby('College')[['Age']].mean() # or count or min etc.

Unnamed: 0_level_0,Age
College,Unnamed: 1_level_1
Alabama,29.000000
Arizona,27.384615
Arizona State,27.500000
Arkansas,27.333333
Baylor,25.000000
...,...
Western Michigan,25.000000
Wichita State,25.000000
Wisconsin,25.800000
Wyoming,23.000000


In [46]:
# aggregate min, mean, max for Salary
df[['Salary ']].agg(['min', 'mean', 'max'])
# df['Salary '].min()

Unnamed: 0,Salary
min,30888.0
mean,4842684.0
max,25000000.0


In [48]:
# aggregate min, mean, max for Salary and Age
df[['Salary ','Age']].agg(['min', 'mean', 'max'])

Unnamed: 0,Salary,Age
min,30888.0,19.0
mean,4842684.0,26.938731
max,25000000.0,40.0


In [49]:
# what if we need to count unique members
df.agg({'Team':['nunique']}) # nunique is the number of unique members

Unnamed: 0,Team
nunique,30
