# Lecture 4 –Fall 2023

A demonstration of advanced `pandas` syntax to accompany Lecture 4.

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import numpy as np
import pandas as pd
import plotly.express as px

## Dataset: California baby names

In today's lecture, we'll work with the `babynames` dataset, which contains information about the names of infants born in California.

The cell below pulls census data from a government website and then loads it into a usable form. The code shown here is outside of the scope of Data 100, but you're encouraged to dig into it if you are interested!

In [3]:
# import urllib.request
# import os.path
# import zipfile

# data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
# local_filename = "/content/drive/MyDrive/data/babynamesbystate.zip"
# if not os.path.exists(local_filename): # If the data exists don't download again
#     with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
#         f.write(resp.read())

# zf = zipfile.ZipFile(local_filename, 'r')

# ca_name = 'STATE.CA.TXT'
# field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
# with zf.open(ca_name) as fh:
#     babynames = pd.read_csv(fh, header=None, names=field_names)

# babynames.head()

In [4]:
import zipfile

# Specify the path to your ZIP file using forward slashes
zip_file_path = "datafiles/babynamesbystate.zip"

# Specify the name of the file you want to read from the ZIP archive
ca_name = 'STATE.CA.TXT'

# Open the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zf:
    # Check if the specified file exists in the ZIP archive
    if ca_name in zf.namelist():
        # Read the CSV file directly from the ZIP archive
        with zf.open(ca_name) as fh:
            # Define field names for the DataFrame
            field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
            # Read CSV using Pandas
            orgbabynames = pd.read_csv(fh, header=None, names=field_names)
            # Display the first 15 rows of the DataFrame
            # orgbabynames.head(15)
    else:
        print(f"File '{ca_name}' not found in the ZIP archive.")

# orgbabynames.head(15)
babynames = orgbabynames.copy()
babynames.head(10)

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134
5,CA,F,1910,Ruth,128
6,CA,F,1910,Evelyn,126
7,CA,F,1910,Alice,118
8,CA,F,1910,Virginia,101
9,CA,F,1910,Elizabeth,93


### Exercises
We want to obtain the first three baby names with `count > 250`.

1.Code this using, loc and head()

2.Code this using, loc and iloc()

3.Code this using [] and head ()


In [5]:
# using loc and head():
babynames_count = babynames[(babynames['Count'] > 250)]
babynames_count.head(3)

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
233,CA,F,1911,Mary,390
484,CA,F,1912,Mary,534


In [6]:
# with loc and iloc
babynames_count = babynames[(babynames['Count'] > 250)].reset_index()
babynames_count.iloc[0:3] #exclusive in nature


Unnamed: 0,index,State,Sex,Year,Name,Count
0,0,CA,F,1910,Mary,295
1,233,CA,F,1911,Mary,390
2,484,CA,F,1912,Mary,534


In [7]:
# with [] and head()
babynames_count = babynames[(babynames['Count'] > 250)].reset_index()
babynames_count[0:3] # also exclusive but loc is inclusive

Unnamed: 0,index,State,Sex,Year,Name,Count
0,0,CA,F,1910,Mary,295
1,233,CA,F,1911,Mary,390
2,484,CA,F,1912,Mary,534


### `.isin` for Selection based on a list, array, or `Series`

In [8]:
# Note: The parentheses surrounding the code make it possible to break the code into multiple lines for readability

( babynames[(babynames["Name"] == "Bella") |
              (babynames["Name"] == "Alex") |
              (babynames["Name"] == "Narges") |
              (babynames["Name"] == "Lisa")])

Unnamed: 0,State,Sex,Year,Name,Count
6289,CA,F,1923,Bella,5
7512,CA,F,1925,Bella,8
12368,CA,F,1932,Lisa,5
14741,CA,F,1936,Lisa,8
17084,CA,F,1939,Lisa,5
...,...,...,...,...,...
393248,CA,M,2018,Alex,495
396111,CA,M,2019,Alex,438
398983,CA,M,2020,Alex,379
401788,CA,M,2021,Alex,333


In [9]:
# A more concise method to achieve the above: .isin
names = ['Bella', 'Alex', 'Narges', 'Lisa']
babynames[(babynames['Name'].isin(names))]

Unnamed: 0,State,Sex,Year,Name,Count
6289,CA,F,1923,Bella,5
7512,CA,F,1925,Bella,8
12368,CA,F,1932,Lisa,5
14741,CA,F,1936,Lisa,8
17084,CA,F,1939,Lisa,5
...,...,...,...,...,...
393248,CA,M,2018,Alex,495
396111,CA,M,2019,Alex,438
398983,CA,M,2020,Alex,379
401788,CA,M,2021,Alex,333


### `.str` Functions for Defining a Condition

In [10]:
# What if we only want names that start with "J"?
babynames[babynames["Name"].str.startswith('J')]

Unnamed: 0,State,Sex,Year,Name,Count
16,CA,F,1910,Josephine,66
44,CA,F,1910,Jean,35
46,CA,F,1910,Jessie,32
59,CA,F,1910,Julia,28
66,CA,F,1910,Juanita,25
...,...,...,...,...,...
407245,CA,M,2022,Jibreel,5
407246,CA,M,2022,Joseangel,5
407247,CA,M,2022,Josejulian,5
407248,CA,M,2022,Juelz,5


## Adding, Removing, and Modifying Columns

### Add a Column
To add a column, use `[]` to reference the desired new column, then assign it to a `Series` or array of appropriate length.

In [11]:
# Create a Series of the length of each name
name_length = babynames['Name'].str.len()

# Add a column named "name_lengths" that includes the length of each name
babynames['name_lengths'] = babynames['Name'].str.len()
babynames

Unnamed: 0,State,Sex,Year,Name,Count,name_lengths
0,CA,F,1910,Mary,295,4
1,CA,F,1910,Helen,239,5
2,CA,F,1910,Dorothy,220,7
3,CA,F,1910,Margaret,163,8
4,CA,F,1910,Frances,134,7
...,...,...,...,...,...,...
407423,CA,M,2022,Zayvier,5,7
407424,CA,M,2022,Zia,5,3
407425,CA,M,2022,Zora,5,4
407426,CA,M,2022,Zuriel,5,6


### Modify a Column
To modify a column, use `[]` to access the desired column, then re-assign it to a new array or Series.

In [12]:
# Modify the "name_lengths" column to be one less than its original value
#modifying the name_length column by adding 1
babynames['name_lengths'] -= 1

# Display the updated DataFrame
babynames

Unnamed: 0,State,Sex,Year,Name,Count,name_lengths
0,CA,F,1910,Mary,295,3
1,CA,F,1910,Helen,239,4
2,CA,F,1910,Dorothy,220,6
3,CA,F,1910,Margaret,163,7
4,CA,F,1910,Frances,134,6
...,...,...,...,...,...,...
407423,CA,M,2022,Zayvier,5,6
407424,CA,M,2022,Zia,5,2
407425,CA,M,2022,Zora,5,3
407426,CA,M,2022,Zuriel,5,5


### Rename a Column Name
Rename a column using the `.rename()` method.

In [13]:
# Rename "name_lengths" to "Length"
babynames = babynames.rename(columns= {'name_lengths':'Length'})
babynames

Unnamed: 0,State,Sex,Year,Name,Count,Length
0,CA,F,1910,Mary,295,3
1,CA,F,1910,Helen,239,4
2,CA,F,1910,Dorothy,220,6
3,CA,F,1910,Margaret,163,7
4,CA,F,1910,Frances,134,6
...,...,...,...,...,...,...
407423,CA,M,2022,Zayvier,5,6
407424,CA,M,2022,Zia,5,2
407425,CA,M,2022,Zora,5,3
407426,CA,M,2022,Zuriel,5,5


### Delete a Column
Remove a column using `.drop()`.

In [14]:
# Remove our new "Length" column
babynames = babynames.drop(columns=['Length'])
babynames

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134
...,...,...,...,...,...
407423,CA,M,2022,Zayvier,5
407424,CA,M,2022,Zia,5
407425,CA,M,2022,Zora,5
407426,CA,M,2022,Zuriel,5


## Custom sorting

In [15]:
# Sort a Series Containing Names
sorted_names_series = babynames['Name'].sort_values()
sorted_names_series

366001      Aadan
384005      Aadan
369120      Aadan
398211    Aadarsh
370306      Aaden
           ...   
220691      Zyrah
197529      Zyrah
217429      Zyrah
232167      Zyrah
404544      Zyrus
Name: Name, Length: 407428, dtype: object

In [16]:
# Sort a DataFrame – there are lots of Michaels in California
sorted_names_df = babynames.sort_values(by='Name')
sorted_names_df

Unnamed: 0,State,Sex,Year,Name,Count
366001,CA,M,2008,Aadan,7
384005,CA,M,2014,Aadan,5
369120,CA,M,2009,Aadan,6
398211,CA,M,2019,Aadarsh,6
370306,CA,M,2010,Aaden,62
...,...,...,...,...,...
220691,CA,F,2017,Zyrah,6
197529,CA,F,2011,Zyrah,5
217429,CA,F,2016,Zyrah,5
232167,CA,F,2020,Zyrah,5


### Approach 1: Create a temporary column

In [17]:
# Create a Series of the length of each name
name_lengths = babynames['Name'].str.len()
# name_length

# Add a column named "name_lengths" that includes the length of each name
babynames['name_lengths'] = name_lengths
# babynames

# Sort by the temporary column
sorted_length = babynames.sort_values(by='name_lengths')
sorted_length

Unnamed: 0,State,Sex,Year,Name,Count,name_lengths
326570,CA,M,1993,An,8,2
292150,CA,M,1976,Al,13,2
252556,CA,M,1937,Al,21,2
401470,CA,M,2020,Jr,5,2
260022,CA,M,1948,Ed,43,2
...,...,...,...,...,...,...
339472,CA,M,1998,Franciscojavier,6,15
327358,CA,M,1993,Johnchristopher,5,15
337477,CA,M,1997,Ryanchristopher,5,15
312543,CA,M,1987,Franciscojavier,5,15


In [18]:
# Drop the 'name_length' column
babynames = babynames.drop(columns=['name_lengths'])
babynames

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134
...,...,...,...,...,...
407423,CA,M,2022,Zayvier,5
407424,CA,M,2022,Zia,5
407425,CA,M,2022,Zora,5
407426,CA,M,2022,Zuriel,5


### Approach 2: Sorting using the `key` argument

In [19]:
def custom_key(x):
    return x - 1

# babynames
key_sort = babynames.sort_values(by='Count', key=lambda x:x-1)
key_sort

# '''
# The key argument in the sort_values method of a Pandas DataFrame is intended to transform the data before sorting, 
# not to change the data itself. The lambda function you’ve provided, lambda x: x-1, is subtracting 1 from each element in the ‘Count’ column, 
# but this transformed data is only used for sorting purposes and does not alter the original data in the DataFrame.
# '''

Unnamed: 0,State,Sex,Year,Name,Count
407427,CA,M,2022,Zylo,5
300815,CA,M,1981,Broderick,5
300816,CA,M,1981,Brooke,5
300817,CA,M,1981,Bud,5
300818,CA,M,1981,Cha,5
...,...,...,...,...,...
283146,CA,M,1970,Michael,8196
281850,CA,M,1969,Michael,8245
317387,CA,M,1990,Michael,8246
267017,CA,M,1956,Michael,8258


### Approach 3: Sorting Using the `map` Function

We can also use the Python map function if we want to use an arbitrarily defined function. Suppose we want to sort by the number of occurrences of "dr" plus the number of occurences of "ea".

In [20]:
# First, define a function to count the number of times "sa" or "me" appear in each name
def dr_ea_count(string):
    return string.count('dr') + string.count('ea')

# Then, use `map` to apply `dr_ea_count` to each name in the "Name" column
babynames['dr_ea'] = babynames['Name'].map(dr_ea_count)

# Sort the DataFrame by the new "dr_ea_count" column so we can see our handiwork
babynames.sort_values(by= 'dr_ea', ascending=False)


Unnamed: 0,State,Sex,Year,Name,Count,dr_ea
131029,CA,F,1994,Leandrea,5,3
101976,CA,F,1986,Deandrea,6,3
308131,CA,M,1985,Deandrea,6,3
115957,CA,F,1990,Deandrea,5,3
108731,CA,F,1988,Deandrea,5,3
...,...,...,...,...,...,...
139084,CA,F,1997,Areli,49,0
139083,CA,F,1997,Xena,50,0
139082,CA,F,1997,Unique,50,0
139081,CA,F,1997,Tierra,50,0


In [21]:
babynames = babynames.drop(columns='dr_ea')
babynames


Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134
...,...,...,...,...,...
407423,CA,M,2022,Zayvier,5
407424,CA,M,2022,Zia,5
407425,CA,M,2022,Zora,5
407426,CA,M,2022,Zuriel,5


## Grouping

Group rows that share a common feature, then aggregate data across the group.

In this example, we count the total number of babies born in each year (considering only a small subset of the data, for simplicity).

<img src="images/groupby.png" width="800"/>

In [22]:
# DataFrame with baby gril names only
f_babynames = babynames[(babynames['Sex'] == 'F')]
# f_babynames

#Groupby similar features like year and apply aggregate
grouped_by_year = f_babynames.groupby('Year')[['Count']].agg(sum)

# Sort by Count
grouped_by_year.sort_values(by='Count')
grouped_by_year


  grouped_by_year = f_babynames.groupby('Year')[['Count']].agg(sum)


Unnamed: 0_level_0,Count
Year,Unnamed: 1_level_1
1910,5950
1911,6602
1912,9804
1913,11860
1914,13815
...,...
2018,189208
2019,184228
2020,173763
2021,173913


In [23]:
# print first 10 entries
grouped_by_year.head(10)

Unnamed: 0_level_0,Count
Year,Unnamed: 1_level_1
1910,5950
1911,6602
1912,9804
1913,11860
1914,13815
1915,18643
1916,19555
1917,20864
1918,23052
1919,23288


In [24]:
#the total baby count in each year
grouped_by_year


Unnamed: 0_level_0,Count
Year,Unnamed: 1_level_1
1910,5950
1911,6602
1912,9804
1913,11860
1914,13815
...,...
2018,189208
2019,184228
2020,173763
2021,173913


There are many different aggregation functions we can use, all of which are useful in different applications.

In [25]:
# What is the earliest year in which each name appeared?
f_babynames = f_babynames.sort_values(by='Year')
grouped_by_name = f_babynames.groupby('Name')[['Year']].first()
grouped_by_name.sort_values(by='Year')

Unnamed: 0_level_0,Year
Name,Unnamed: 1_level_1
Lena,1910
Clarice,1910
Mamie,1910
Rosie,1910
Jeanette,1910
...,...
Nainika,2022
Naliah,2022
Namari,2022
Asenat,2022


In [26]:
# What is the largest single-year count of each name?
f_babynames = f_babynames.sort_values(by='Count', ascending= False)
grouped_by_name = f_babynames.groupby('Name')[['Count']].max()
grouped_by_name.sort_values(by='Count', ascending=False)

Unnamed: 0_level_0,Count
Name,Unnamed: 1_level_1
Jessica,6951
Linda,6759
Jennifer,6065
Ashley,4979
Lisa,4939
...,...
Jemimah,5
Jema,5
Jeilyn,5
Shailee,5


In [27]:
#Can you find the most popular baby name in the state of California (CA) for each year? use idxmax function.
#Provide a list of years along with the corresponding most popular names."
result = babynames.groupby("Year")['Count'].idxmax()
result
#Answer Here

Year
1910         0
1911       233
1912       484
1913    240064
1914      1120
         ...  
2018    221194
2019    396004
2020    398869
2021    401665
2022    404545
Name: Count, Length: 113, dtype: int64

## Case Study: Name "Popularity"

In this exercise, let's find the name with sex "F" that has dropped most in popularity since its peak usage. We'll start by filtering `babynames` to only include names corresponding to sex "F".

In [28]:
# Extracting the female babyname data
f_babynames = babynames[(babynames['Sex'] == 'F')]
f_babynames = f_babynames[['Name','Year','Count']].sort_values(by='Year')

To build our intuition on how to answer our research question, let's visualize the prevalence of the name "Jennifer" over time.

In [29]:
# # We'll talk about how to generate plots in a later lecture
# fig = px.line(f_babynames[f_babynames["Name"] == "Jennifer"],
#               x = "Year", y = "Count")
# fig.update_layout(font_size = 18,
#                   autosize=False,
#                  width=1000,
#                   height=400)

We'll need a mathematical definition for the change in popularity of a name.

Define the metric "ratio to peak" (RTP). We'll calculate this as the count of the name in 2022 (the most recent year for which we have data) divided by the largest count of this name in *any* year.

A demo calculation for Jennifer:

In [30]:
# Find the highest Jennifer 'count'
# f_babynames[f_babynames['Name'] == 'Jennifer']
peak_jenn_count = f_babynames[(f_babynames['Name'] == 'Jennifer')]['Count'].max()
peak_jenn_count


6065

In [31]:
# Remember that we sorted f_babynames by year.
# This means that grabbing the final entry gives us the most recent count of Jennifers: 114
# In 2022, the most recent year for which we have data, 114 Jennifers were born
recent_jenn_count = f_babynames[(f_babynames['Name'] == 'Jennifer')]['Count'].iloc[-1]
recent_jenn_count

114

In [32]:
# Compute the RTP
rtp = recent_jenn_count / peak_jenn_count
rtp


0.018796372629843364

We can also write a function that produces the `ratio_to_peak`for a given `Series`. This will allow us to use `.groupby` to speed up our computation for all names in the dataset.

In [33]:
# define the function for RTP
# """
# Compute the RTP for a Series containing the counts per year for a single name
# """
def rtp(series):
    return series.iloc[-1] / series.max()


In [34]:
# Construct a Series containing our Jennifer count data
jenn_names = f_babynames[(f_babynames['Name'] == 'Jennifer')]

# Then, find the RTP using the function define above
rtp_series = jenn_names.groupby('Year')['Count'].agg(rtp)
rtp_series

Year
1934    1.0
1938    1.0
1939    1.0
1940    1.0
1941    1.0
       ... 
2018    1.0
2019    1.0
2020    1.0
2021    1.0
2022    1.0
Name: Count, Length: 86, dtype: float64

Now, let's use `.groupby` to compute the RTPs for *all* names in the dataset.

You may see a warning message when running the cell below. As discussed in lecture, `pandas` can't apply an aggregation function to non-numeric data (it doens't make sense to divide "CA" by a number). By default, `.groupby` will drop any columns that cannot be aggregated.

In [35]:
# Results in a TypeError
rtp_table = f_babynames.groupby("Name").agg(rtp) # count coloumn should be specified here
rtp_table

Unnamed: 0_level_0,Year,Count
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aadhini,1.0,1.000000
Aadhira,1.0,0.500000
Aadhya,1.0,0.660000
Aadya,1.0,0.586207
Aahana,1.0,0.269231
...,...,...
Zyanya,1.0,0.466667
Zyla,1.0,1.000000
Zylah,1.0,1.000000
Zyra,1.0,1.000000


In [36]:
# Find the RTP fro all names at once using groupby as describe in lec slides
rtp_table = f_babynames.groupby("Name")['Count'].agg(rtp)
rtp_table

Name
Aadhini    1.000000
Aadhira    0.500000
Aadhya     0.660000
Aadya      0.586207
Aahana     0.269231
             ...   
Zyanya     0.466667
Zyla       1.000000
Zylah      1.000000
Zyra       1.000000
Zyrah      0.833333
Name: Count, Length: 13782, dtype: float64

To avoid the warning message above, we explicitly extract only the columns relevant to our analysis before using `.agg`.

In [37]:
# Recompute the RTPs, but only performing the calculation on the "Count" column
rtp_table = f_babynames.groupby("Name")['Count'].agg(rtp)
rtp_table = rtp_table.to_frame()

In [38]:
# Rename "Count" to "Count RTP" for clarity
rtp_table = rtp_table.rename(columns= {'Count' : 'RTP Count'})
rtp_table

Unnamed: 0_level_0,RTP Count
Name,Unnamed: 1_level_1
Aadhini,1.000000
Aadhira,0.500000
Aadhya,0.660000
Aadya,0.586207
Aahana,0.269231
...,...
Zyanya,0.466667
Zyla,1.000000
Zylah,1.000000
Zyra,1.000000


In [39]:
rtp_table = rtp_table.sort_values(by='RTP Count').reset_index()
rtp_table

Unnamed: 0,Name,RTP Count
0,Debra,0.001260
1,Debbie,0.002815
2,Carol,0.003180
3,Tammy,0.003249
4,Susan,0.003305
...,...,...
13777,Fidelia,1.000000
13778,Naveyah,1.000000
13779,Finlee,1.000000
13780,Roseline,1.000000


In [65]:
# What name has fallen the most in popularity?
# OR    rtp_table[rtp_table['RTP Count'] == rtp_table['RTP Count'].min()]
fallen_name = rtp_table.iloc[0]
fallen_name

Name           Debra
RTP Count    0.00126
Name: 0, dtype: object

We can visualize the decrease in the popularity of the name "?:"

In [66]:
def plot_name(names):
    fig = px.line(f_babynames[f_babynames["Name"].isin(names)],
                  x = "Year", y = "Count", color="Name",
                  title=f"Popularity for: {names}")
    fig.update_layout(font_size = 18,
                  autosize=False,
                  width=1000,
                  height=400)
    return fig

# pass the name into plot_name
plot_name(fallen_name)

In [67]:
# Find the 10 names that have decreased the most in popularity
top10 = rtp_table.head(10)
top10 = top10['Name']
top10

0      Debra
1     Debbie
2      Carol
3      Tammy
4      Susan
5     Cheryl
6    Shannon
7       Tina
8    Michele
9      Terri
Name: Name, dtype: object

In [68]:
plot_name(top10)

For fun, try plotting your name or your friends' names.