# Pandas More Utility Functions

A demonstration of advanced `pandas` syntax to accompany Lecture 4.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import plotly.express as px

## Dataset: California baby names

In today's lecture, we'll work with the `babynames` dataset, which contains information about the names of infants born in California.

The cell below pulls census data from a government website and then loads it into a usable form. The code shown here is outside of the scope of Data 100, but you're encouraged to dig into it if you are interested!

In [4]:
import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "babynamesbystate.zip"
if not os.path.exists(local_filename): # If the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.head()

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


### Exercises
We want to obtain the first three baby names with `count > 250`.

1.Code this using head()

2.Code this using loc

3.Code this using iloc

4.Code this using []


In [5]:

# Answer Here
conditioned_df=babynames[babynames["Count"]>250]
conditioned_df.head(3)

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


In [6]:
# Answer Here
#conditioned_df.loc(conditioned_df,:)
#print(conditioned_df)
conditioned_df.loc[conditioned_df.index[0:3], :]

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


In [7]:
# Answer Here
conditioned_df.iloc[0:3]

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


In [8]:
# Answer Here
conditioned_df[0:3]

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


### `.isin` for Selection based on a list, array, or `Series`

In [9]:
# Note: The parentheses surrounding the code make it possible to break the code into multiple lines for readability
babynames[(babynames["Name"] == "Bella") |
              (babynames["Name"] == "Alex") |
              (babynames["Name"] == "Narges") |
              (babynames["Name"] == "Lisa")]


Unnamed: 0,State,Sex,Year,Name,Count
6289,CA,F,1923,Bella,5
7512,CA,F,1925,Bella,8
12368,CA,F,1932,Lisa,5
14741,CA,F,1936,Lisa,8
17084,CA,F,1939,Lisa,5
...,...,...,...,...,...
399773,CA,M,2019,Alex,438
402648,CA,M,2020,Alex,379
405452,CA,M,2021,Alex,334
408335,CA,M,2022,Alex,345


In [10]:
# A more concise method to achieve the above: .isin
#Answer Here
babynames[babynames["Name"].isin(["Bella","Lisa","Alex","Narges"])]


Unnamed: 0,State,Sex,Year,Name,Count
6289,CA,F,1923,Bella,5
7512,CA,F,1925,Bella,8
12368,CA,F,1932,Lisa,5
14741,CA,F,1936,Lisa,8
17084,CA,F,1939,Lisa,5
...,...,...,...,...,...
399773,CA,M,2019,Alex,438
402648,CA,M,2020,Alex,379
405452,CA,M,2021,Alex,334
408335,CA,M,2022,Alex,345


### `.str` Functions for Defining a Condition

In [11]:
# What if we only want names that start with "J"?
#Answer Here
babynames[babynames["Name"].str.startswith("J")]

Unnamed: 0,State,Sex,Year,Name,Count
16,CA,F,1910,Josephine,66
44,CA,F,1910,Jean,35
46,CA,F,1910,Jessie,32
59,CA,F,1910,Julia,28
66,CA,F,1910,Juanita,25
...,...,...,...,...,...
413714,CA,M,2023,Jj,5
413715,CA,M,2023,Johnathon,5
413716,CA,M,2023,Jorden,5
413717,CA,M,2023,Jozef,5


# Custom Sort

In [12]:
# Sort a Series Containing Names
Sorted_by_name=babynames.sort_values(by="Name")
Sorted_by_name["Name"]

Unnamed: 0,Name
387660,Aadan
369654,Aadan
372774,Aadan
401876,Aadarsh
388799,Aaden
...,...
232190,Zyrah
220708,Zyrah
217445,Zyrah
197542,Zyrah


In [13]:
# Sort a DataFrame – there are lots of Michaels in California
Sorted_by_name[Sorted_by_name["Name"]=="Micheal"]

Unnamed: 0,State,Sex,Year,Name,Count
341602,CA,M,1998,Micheal,70
362402,CA,M,2006,Micheal,45
273925,CA,M,1959,Micheal,329
303098,CA,M,1981,Micheal,195
392702,CA,M,2016,Micheal,10
...,...,...,...,...,...
410049,CA,M,2022,Micheal,9
331562,CA,M,1994,Micheal,84
314485,CA,M,1987,Micheal,168
413060,CA,M,2023,Micheal,8


### Approach 1: Create a temporary column

In [14]:
# Create a Series of the length of each name
series_len_each_name=babynames["Name"].str.len()
# Add the Series as a new column to the DataFrame
babynames["name_lengths"]=series_len_each_name
# Sort the DataFrame by the new column
babynames=babynames.sort_values(by="name_lengths")
babynames

Unnamed: 0,State,Sex,Year,Name,Count,name_lengths
83016,CA,F,1979,Ji,5,2
331174,CA,M,1993,Vu,5,2
298821,CA,M,1978,Al,13,2
277555,CA,M,1962,Ty,55,2
404824,CA,M,2020,Jj,6,2
...,...,...,...,...,...,...
337819,CA,M,1996,Franciscojavier,8,15
325562,CA,M,1991,Franciscojavier,6,15
316193,CA,M,1987,Franciscojavier,5,15
317627,CA,M,1988,Franciscojavier,10,15


In [16]:
# drop new column
babynames=babynames.drop(columns="name_lengths")
babynames

Unnamed: 0,State,Sex,Year,Name,Count
83016,CA,F,1979,Ji,5
331174,CA,M,1993,Vu,5
298821,CA,M,1978,Al,13
277555,CA,M,1962,Ty,55
404824,CA,M,2020,Jj,6
...,...,...,...,...,...
337819,CA,M,1996,Franciscojavier,8
325562,CA,M,1991,Franciscojavier,6
316193,CA,M,1987,Franciscojavier,5
317627,CA,M,1988,Franciscojavier,10


### Approach 2: Sorting using the `key` argument

---



In [18]:
# Answer Here
babynames.sort_values(by="Name",key=lambda x:x.str.len(),ascending=True)

Unnamed: 0,State,Sex,Year,Name,Count
83016,CA,F,1979,Ji,5
253011,CA,M,1931,Al,17
258584,CA,M,1941,Ed,24
352911,CA,M,2002,An,7
253015,CA,M,1931,Ed,17
...,...,...,...,...,...
325441,CA,M,1991,Ryanchristopher,7
331009,CA,M,1993,Johnchristopher,5
317627,CA,M,1988,Franciscojavier,10
331124,CA,M,1993,Ryanchristopher,5


### Approach 3: Sorting Using the `map` Function

We can also use the Python map function if we want to use an arbitrarily defined function. Suppose we want to sort by the number of occurrences of "dr" plus the number of occurences of "ea".

In [25]:

# Define a function to count occurrences of 'dr' and 'ea'
def dr_ea_count(string):
  return string.count("dr")+string.count("ea")
# Apply the function to each name in the "Name" column and add as a new column
babynames["dr_ea_count"]=babynames['Name'].map(dr_ea_count)

# Sort the DataFrame by the new column in descending order
babynames=babynames.sort_values(by="dr_ea_count",ascending=False)
# Display the top rows
babynames.head()

Unnamed: 0,State,Sex,Year,Name,Count,newcol,dr_ea_count
101982,CA,F,1986,Deandrea,6,3,3
131037,CA,F,1994,Leandrea,5,3,3
311780,CA,M,1985,Deandrea,6,3,3
108738,CA,F,1988,Deandrea,5,3,3
115965,CA,F,1990,Deandrea,5,3,3


In [26]:
# Drop the `dr_ea_count` column
babynames=babynames.drop(columns="dr_ea_count")
babynames

Unnamed: 0,State,Sex,Year,Name,Count,newcol
101982,CA,F,1986,Deandrea,6,3
131037,CA,F,1994,Leandrea,5,3
311780,CA,M,1985,Deandrea,6,3
108738,CA,F,1988,Deandrea,5,3
115965,CA,F,1990,Deandrea,5,3
...,...,...,...,...,...,...
330415,CA,M,1993,Candelario,7,0
72776,CA,F,1975,Christiana,9,0
95618,CA,F,1984,Jacquelene,9,0
238751,CA,F,2022,Ameliarose,6,0


## Grouping

Group rows that share a common feature, then aggregate data across the group.

In this example, we count the total number of babies born in each year (considering only a small subset of the data, for simplicity).

<img src="images/groupby.png" width="800"/>

In [35]:
# DataFrame with baby gril names only
gril_df=babynames[['Year',"Count"]]
gril_df
# Answer Here
#Groupby similar features like year and apply aggregate
# Answer Here
gril_df_grouped=gril_df.groupby("Year").agg(sum)
# Sort by Count
# Sort by Count in descending order
# Answer Here
gril_df_grouped=gril_df_grouped.sort_values(by="Count",ascending=False)
gril_df_grouped.head()

  gril_df_grouped=gril_df.groupby("Year").agg(sum)


Unnamed: 0_level_0,Count
Year,Unnamed: 1_level_1
1990,552669
1991,549339
1992,541091
1993,524993
1989,512613


In [37]:
# print first 10 entries
gril_df_grouped.head(10)


Unnamed: 0_level_0,Count
Year,Unnamed: 1_level_1
1990,552669
1991,549339
1992,541091
1993,524993
1989,512613
1994,509327
2007,497657
2006,495026
1995,494644
2005,484533


In [38]:
# the total baby count in each year
# Answer Here
gril_df_grouped


Unnamed: 0_level_0,Count
Year,Unnamed: 1_level_1
1990,552669
1991,549339
1992,541091
1993,524993
1989,512613
...,...
1914,26926
1913,22094
1912,17946
1911,9983


There are many different aggregation functions we can use, all of which are useful in different applications.

In [49]:
# What is the earliest year in which each name appeared?
# Answer Here
earliest_year=babynames.groupby("Name")["Year"].agg(min)

earliest_year.sort_values()

  earliest_year=babynames.groupby("Name")["Year"].agg(min)


Unnamed: 0_level_0,Year
Name,Unnamed: 1_level_1
Willie,1910
Clarice,1910
Clarence,1910
Clara,1910
Claire,1910
...,...
Sosefina,2023
Keydi,2023
Khailani,2023
Kaiyr,2023


In [59]:
# What is the largest single-year count of each name?
# Answer Here
largest=babynames.groupby("Name")["Count"].agg(max)
largest

  largest=babynames.groupby("Name")["Count"].agg(max)


Unnamed: 0_level_0,Count
Name,Unnamed: 1_level_1
Aadan,7
Aadarsh,6
Aaden,158
Aadhav,8
Aadhini,6
...,...
Zymir,5
Zyon,21
Zyra,28
Zyrah,6


In [67]:
#Can you find the most popular baby name in the state of California (CA) for each year? use idxmax function.
#Provide a list of years along with the corresponding most popular names."
result = babynames.groupby("Year")['Count'].idxmax()
print(babynames.loc[result,["Year","Name","Count"]].head())
#Answer Here

        Year  Name  Count
0       1910  Mary    295
233     1911  Mary    390
484     1912  Mary    534
243717  1913  John    614
1120    1914  Mary    773


## Case Study: Name "Popularity"

In this exercise, let's find the name with sex "F" that has dropped most in popularity since its peak usage. We'll start by filtering `babynames` to only include names corresponding to sex "F".

In [None]:
#Answer Here
f_babynames=babynames[babynames["Sex"]=="F"]
f_babynames.head()


Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


In [None]:
# We sort the data by year
f_babynames=f_babynames.sort_values(by="Year",ascending=True)
f_babynames

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
148,CA,F,1910,Merle,9
149,CA,F,1910,Rosalie,9
150,CA,F,1910,Rosie,9
151,CA,F,1910,Teresa,9
...,...,...,...,...,...
240783,CA,F,2023,Zayna,22
240784,CA,F,2023,Aashvi,21
240785,CA,F,2023,Aida,21
240759,CA,F,2023,Eimy,22


To build our intuition on how to answer our research question, let's visualize the prevalence of the name "Jennifer" over time.

In [None]:
# We'll talk about how to generate plots in a later lecture
fig = px.line(f_babynames[f_babynames["Name"] == "Jennifer"],
              x = "Year", y = "Count")
fig.update_layout(font_size = 18,
                  autosize=False,
                 width=1000,
                  height=400)

We'll need a mathematical definition for the change in popularity of a name.

Define the metric "ratio to peak" (RTP). We'll calculate this as the count of the name in 2022 (the most recent year for which we have data) divided by the largest count of this name in *any* year.

A demo calculation for Jennifer:

In [None]:
# Find the highest Jennifer 'count'
largest_count_jennifer=f_babynames[f_babynames["Name"]=="Jennifer"].sort_values(by="Count",ascending=False).max()["Count"]

largest_count_jennifer

6065

In [None]:
# Remember that we sorted f_babynames by year.
# This means that grabbing the final entry gives us the most recent count of Jennifers: 114
# In 2022, the most recent year for which we have data, 114 Jennifers were born
latest_year=f_babynames["Year"].max()
f_babynames[(f_babynames["Name"] == "Jennifer") & (f_babynames["Year"]==latest_year)]

Unnamed: 0,State,Sex,Year,Name,Count
239956,CA,F,2023,Jennifer,88


In [None]:
# Compute the RTP
def RTP(series):
  return series.iloc[-1]/max(series)

We can also write a function that produces the `ratio_to_peak`for a given `Series`. This will allow us to use `.groupby` to speed up our computation for all names in the dataset.

In [None]:
# define the function for RTP
"""
Compute the RTP for a Series containing the counts per year for a single name
"""

df1=new_df.groupby("Name")["Count"].agg(RTP)

df1

Unnamed: 0_level_0,Count
Name,Unnamed: 1_level_1
Aadhini,1.000000
Aadhira,0.500000
Aadhya,0.760000
Aadya,0.758621
Aahana,0.269231
...,...
Zyanya,0.800000
Zyla,1.000000
Zylah,1.000000
Zyra,1.000000


In [None]:
# Construct a Series containing our Jennifer count data
series_jennifer=f_babynames[f_babynames["Name"] == "Jennifer"]['Count']
series_jennifer
# Then, find the RTP using the function define above
RTP(series_jennifer)


0.014509480626545754

Now, let's use `.groupby` to compute the RTPs for *all* names in the dataset.

You may see a warning message when running the cell below. As discussed in lecture, `pandas` can't apply an aggregation function to non-numeric data (it doens't make sense to divide "CA" by a number). By default, `.groupby` will drop any columns that cannot be aggregated.

In [None]:
# Results in a TypeError
rtp_table = f_babynames.groupby("Name").agg(RTP)
rtp_table

TypeError: unsupported operand type(s) for /: 'str' and 'str'

In [None]:
# Find the RTP fro all names at once using groupby as describe in lec slides
df1=f_babynames.groupby("Name")["Count"].agg(RTP)
df1

Unnamed: 0_level_0,Count
Name,Unnamed: 1_level_1
Aadhini,1.000000
Aadhira,0.500000
Aadhya,0.760000
Aadya,0.758621
Aahana,0.269231
...,...
Zyanya,0.800000
Zyla,1.000000
Zylah,1.000000
Zyra,1.000000


To avoid the warning message above, we explicitly extract only the columns relevant to our analysis before using `.agg`.

In [None]:
# Recompute the RTPs, but only performing the calculation on the "Count" column
df1=f_babynames.groupby("Name")[["Count"]].agg(RTP)
df1

Unnamed: 0_level_0,Count
Name,Unnamed: 1_level_1
Aadhini,1.000000
Aadhira,0.500000
Aadhya,0.760000
Aadya,0.758621
Aahana,0.269231
...,...
Zyanya,0.800000
Zyla,1.000000
Zylah,1.000000
Zyra,1.000000


In [None]:
# Rename "Count" to "Count RTP" for clarity
Count_renamed=df1.rename(columns={"Count":"Count_RTP"})


In [None]:
# What name has fallen the most in popularity?

# df1=new_df.groupby("Name")["Count_RTP"].agg(RTP)
# df1.tail(1)
Count_renamed=Count_renamed.sort_values(by="Count_RTP")
Count_renamed.head(1)


Unnamed: 0_level_0,Count_RTP
Name,Unnamed: 1_level_1
Debra,0.001512


We can visualize the decrease in the popularity of the name "?:"

In [None]:
def plot_name(*names):
    fig = px.line(f_babynames[f_babynames["Name"].isin(names)],
                  x = "Year", y = "Count", color="Name",
                  title=f"Popularity for: {names}")
    fig.update_layout(font_size = 18,
                  autosize=False,
                  width=1000,
                  height=400)
    return fig
# pass the name into plot_name
plot_name("Debra")

In [None]:
# Find the 10 names that have decreased the most in popularity
# Answer Here
top10=Count_renamed.head(10).index

In [None]:
plot_name(*top10)

For fun, try plotting your name or your friends' names.