# Lecture 4 –Fall 2023

A demonstration of advanced `pandas` syntax to accompany Lecture 4.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

## Dataset: California baby names

In today's lecture, we'll work with the `babynames` dataset, which contains information about the names of infants born in California.

The cell below pulls census data from a government website and then loads it into a usable form. The code shown here is outside of the scope of Data 100, but you're encouraged to dig into it if you are interested!

In [2]:
import urllib.request
import os.path
import zipfile
# https://www.ssa.gov/oact/babynames/state/namesbystate.zip
data_url = " "
local_filename = "babynamesbystate.zip"
if not os.path.exists(local_filename): # If the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

# print(f"Downloaded {local_filename} to {os.getcwd()}")
zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'STATE.CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.head(15)

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134
5,CA,F,1910,Ruth,128
6,CA,F,1910,Evelyn,126
7,CA,F,1910,Alice,118
8,CA,F,1910,Virginia,101
9,CA,F,1910,Elizabeth,93


### Exercises
We want to obtain the first three baby names with `count > 250`.

1.Code this using, loc and head()

2.Code this using, loc and iloc()

3.Code this using [] and head ()


In [3]:
# Answer Here
# Code this using, loc and head()
counts = 250
babynames_first_three = babynames.loc[babynames["Count"] > 250].head(3)
print(f'First 3 Baby Names thats counts Above {counts} from Data Frame : \n\n', babynames_first_three)

First 3 Baby Names thats counts Above 250 from Data Frame : 

     State Sex  Year  Name  Count
0      CA   F  1910  Mary    295
233    CA   F  1911  Mary    390
484    CA   F  1912  Mary    534


In [4]:
# Answer Here
# Code this using, loc and iloc()
# babynames_first_three = babynames.iloc[babynames["Count"] > 250].head(3)
babynames_first_three_iloc = babynames.loc[babynames['Count'] > 250].iloc[:3]
print(f'First 3 Baby Names thats counts Above {counts} from Data Frame : \n\n', babynames_first_three_iloc)

First 3 Baby Names thats counts Above 250 from Data Frame : 

     State Sex  Year  Name  Count
0      CA   F  1910  Mary    295
233    CA   F  1911  Mary    390
484    CA   F  1912  Mary    534


In [None]:
# Answer Here
# Code this using [] and head ()
babynames_first_three = babynames[babynames["Count"] > 250].head(3)
print(f'First 3 Baby Names thats counts Above {counts} from Data Frame : \n\n', babynames_first_three)

First 3 Baby Names thats counts Above 250 from Data Frame : 

     State Sex  Year  Name  Count
0      CA   F  1910  Mary    295
233    CA   F  1911  Mary    390
484    CA   F  1912  Mary    534


### `.isin` for Selection based on a list, array, or `Series`

In [None]:
# Note: The parentheses surrounding the code make it possible to break the code into multiple lines for readability

( babynames[(babynames["Name"] == "Bella") |
              (babynames["Name"] == "Alex") |
              (babynames["Name"] == "Narges") |
              (babynames["Name"] == "Lisa")])


Unnamed: 0,State,Sex,Year,Name,Count
6289,CA,F,1923,Bella,5
7512,CA,F,1925,Bella,8
12368,CA,F,1932,Lisa,5
14741,CA,F,1936,Lisa,8
17084,CA,F,1939,Lisa,5
...,...,...,...,...,...
393248,CA,M,2018,Alex,495
396111,CA,M,2019,Alex,438
398983,CA,M,2020,Alex,379
401788,CA,M,2021,Alex,333


In [None]:
# A more concise method to achieve the above: .isin
#Answer Here
babyname_list = ["Bella", "Alex", "Narges", "Lisa"]
babynames_present = babynames[babynames["Name"].isin(babyname_list)]
print(f'Babyname present in List {babyname_list} :\n\n',babynames_present)

Babyname present in List ['Bella', 'Alex', 'Narges', 'Lisa'] :

        State Sex  Year   Name  Count
6289      CA   F  1923  Bella      5
7512      CA   F  1925  Bella      8
12368     CA   F  1932   Lisa      5
14741     CA   F  1936   Lisa      8
17084     CA   F  1939   Lisa      5
...      ...  ..   ...    ...    ...
393248    CA   M  2018   Alex    495
396111    CA   M  2019   Alex    438
398983    CA   M  2020   Alex    379
401788    CA   M  2021   Alex    333
404663    CA   M  2022   Alex    344

[317 rows x 5 columns]


### `.str` Functions for Defining a Condition

In [None]:
# What if we only want names that start with "J"?
#Answer Here
name_start = 'J'
babynames_j = babynames[babynames["Name"].str.startswith(name_start)]
print(f'Babyname starting with {name_start} :\n\n', babynames_j)

Babyname starting with J :

        State Sex  Year        Name  Count
16        CA   F  1910   Josephine     66
44        CA   F  1910        Jean     35
46        CA   F  1910      Jessie     32
59        CA   F  1910       Julia     28
66        CA   F  1910     Juanita     25
...      ...  ..   ...         ...    ...
407245    CA   M  2022     Jibreel      5
407246    CA   M  2022   Joseangel      5
407247    CA   M  2022  Josejulian      5
407248    CA   M  2022       Juelz      5
407249    CA   M  2022      Jujhar      5

[34751 rows x 5 columns]


## Adding, Removing, and Modifying Columns

### Add a Column
To add a column, use `[]` to reference the desired new column, then assign it to a `Series` or array of appropriate length.

In [None]:
# Remove a Column from dataFrame
# babynames.drop('Length', axis=1, inplace=True)
babynames.head()


Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


In [None]:
#

In [None]:
# Create a Series of the length of each name
babynams_name_lengths = babynames["Name"].str.len()
# print(f'Name Lengths :\n\n',babynams_name_lengths)

# Add a column named "name_lengths" that includes the length of each name
babynames["name_lengths"] = babynams_name_lengths
print(f'Name Lengths :\n\n',babynames)



Name Lengths :

        State Sex  Year      Name  Count  name_lengths
0         CA   F  1910      Mary    295             4
1         CA   F  1910     Helen    239             5
2         CA   F  1910   Dorothy    220             7
3         CA   F  1910  Margaret    163             8
4         CA   F  1910   Frances    134             7
...      ...  ..   ...       ...    ...           ...
407423    CA   M  2022   Zayvier      5             7
407424    CA   M  2022       Zia      5             3
407425    CA   M  2022      Zora      5             4
407426    CA   M  2022    Zuriel      5             6
407427    CA   M  2022      Zylo      5             4

[407428 rows x 6 columns]


### Modify a Column
To modify a column, use `[]` to access the desired column, then re-assign it to a new array or Series.

In [None]:
# Modify the "name_lengths" column to be one less than its original value
babynames_copy = babynames.copy()
babynames_copy["name_lengths"] = babynames_copy["name_lengths"] - 1
print(f'Name Lengths in Copied Data Frame :\n\n',babynames_copy)
print(f'\n\nName Lengths in Origional Data Frame :\n\n', babynames)

Name Lengths in Copied Data Frame :

        State Sex  Year      Name  Count  name_lengths
0         CA   F  1910      Mary    295             3
1         CA   F  1910     Helen    239             4
2         CA   F  1910   Dorothy    220             6
3         CA   F  1910  Margaret    163             7
4         CA   F  1910   Frances    134             6
...      ...  ..   ...       ...    ...           ...
407423    CA   M  2022   Zayvier      5             6
407424    CA   M  2022       Zia      5             2
407425    CA   M  2022      Zora      5             3
407426    CA   M  2022    Zuriel      5             5
407427    CA   M  2022      Zylo      5             3

[407428 rows x 6 columns]


Name Lengths in Origional Data Frame :

        State Sex  Year      Name  Count  name_lengths
0         CA   F  1910      Mary    295             4
1         CA   F  1910     Helen    239             5
2         CA   F  1910   Dorothy    220             7
3         CA   F  1910  Marg

### Rename a Column Name
Rename a column using the `.rename()` method.

In [None]:
# Rename "name_lengths" to "Length"
babyname_rename = babynames.rename(columns={"name_lengths": "Length"}, inplace=True)
print(f'Rename Column :\n\n', babyname_rename)
babynames.head()

Rename Column :

 None


Unnamed: 0,State,Sex,Year,Name,Count,Length
0,CA,F,1910,Mary,295,4
1,CA,F,1910,Helen,239,5
2,CA,F,1910,Dorothy,220,7
3,CA,F,1910,Margaret,163,8
4,CA,F,1910,Frances,134,7


### Delete a Column
Remove a column using `.drop()`.

In [None]:
# Remove our new "Length" column
babynames.drop("Length", axis=1, inplace=True)
babynames.head()

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


## Custom sorting

In [None]:
# Sort a Series Containing Names
sort_name_series = babynames["Name"].sort_values()
print(f'Sorted Series :\n\n', sort_name_series)

Sorted Series :

 366001      Aadan
384005      Aadan
369120      Aadan
398211    Aadarsh
370306      Aaden
           ...   
220691      Zyrah
197529      Zyrah
217429      Zyrah
232167      Zyrah
404544      Zyrus
Name: Name, Length: 407428, dtype: object


In [None]:
# Sort a DataFrame – there are lots of Michaels in California
# Answer Here
sort_name_df = babynames.sort_values("Name")
print(f'Sorted DataFrame :\n\n', sort_name_df)

Sorted DataFrame :

        State Sex  Year     Name  Count
366001    CA   M  2008    Aadan      7
384005    CA   M  2014    Aadan      5
369120    CA   M  2009    Aadan      6
398211    CA   M  2019  Aadarsh      6
370306    CA   M  2010    Aaden     62
...      ...  ..   ...      ...    ...
220691    CA   F  2017    Zyrah      6
197529    CA   F  2011    Zyrah      5
217429    CA   F  2016    Zyrah      5
232167    CA   F  2020    Zyrah      5
404544    CA   M  2021    Zyrus      5

[407428 rows x 5 columns]


### Approach 1: Create a temporary column

In [None]:
# Create a Series of the length of each name
babynams_name_lengths = babynames["Name"].str.len()
# print(f'Name Lengths :\n\n',babynams_name_lengths)
# Add a column named "name_lengths" that includes the length of each name
babynames["name_lengths"] = babynams_name_lengths
print(f'Name Lengths :\n\n',babynames)

# Sort by the temporary column
sort_by_name_length = babynames.sort_values("name_lengths")
print(f'\n\nSorted DataFrame :\n\n', sort_by_name_length)


Name Lengths :

        State Sex  Year      Name  Count  name_lengths
0         CA   F  1910      Mary    295             4
1         CA   F  1910     Helen    239             5
2         CA   F  1910   Dorothy    220             7
3         CA   F  1910  Margaret    163             8
4         CA   F  1910   Frances    134             7
...      ...  ..   ...       ...    ...           ...
407423    CA   M  2022   Zayvier      5             7
407424    CA   M  2022       Zia      5             3
407425    CA   M  2022      Zora      5             4
407426    CA   M  2022    Zuriel      5             6
407427    CA   M  2022      Zylo      5             4

[407428 rows x 6 columns]


Sorted DataFrame :

        State Sex  Year             Name  Count  name_lengths
326570    CA   M  1993               An      8             2
292150    CA   M  1976               Al     13             2
252556    CA   M  1937               Al     21             2
401470    CA   M  2020               Jr  

In [None]:
# Drop the 'name_length' column
babynames.drop("name_lengths", axis=1, inplace=True)
babynames.head()


Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


### Approach 2: Sorting using the `key` argument

In [None]:
# Answer Here
# sorting_key = babynames["Name"].str.len()
sort_by_name_length_key = babynames.sort_values('Name' , key=lambda x: x.str.len(), ascending=False).head()
print(f'\n\nSorted DataFrame using " key " argument :\n\n', sort_by_name_length_key)



Sorted DataFrame using " key " argument :

        State Sex  Year             Name  Count
334166    CA   M  1996  Franciscojavier      8
337301    CA   M  1997  Franciscojavier      5
339472    CA   M  1998  Franciscojavier      6
321792    CA   M  1991  Ryanchristopher      7
327358    CA   M  1993  Johnchristopher      5


### Approach 3: Sorting Using the `map` Function

We can also use the Python map function if we want to use an arbitrarily defined function. Suppose we want to sort by the number of occurrences of "dr" plus the number of occurences of "ea".

In [None]:
# # First, define a function to count the number of times "sa" or "me" appear in each name
# def dr_ea_count(string):
#     return name.str.count("dr").sum() + name.str.count("ea").sum()
#     # return name.str.count("dr") + name.str.count("ea")

# # Then, use `map` to apply `dr_ea_count` to each name in the "Name" column
#     babynames["dr_ea_count"] = babynames["Name"].map(dr_ea_count)
#     # print(babynames.head())
# # Sort the DataFrame by the new "dr_ea_count" column so we can see our handiwork
# # babynames = babynames.sort_values(by='dr_ea_count' , ascending=False).head()
# print(f'\n\nSorted DataFrame using " map " argument :\n\n', babynames)


def dr_ea_count(string):
  return string.count('dr') + string.count('ea')
# Use map to apply dr_ea_count to each name in the "Name" column
babynames["dr_ea_count"] = babynames["Name"].map(dr_ea_count)
babynames = babynames.sort_values(by = "dr_ea_count", ascending=False)
babynames.head()


Unnamed: 0,State,Sex,Year,Name,Count,dr_ea_count
131029,CA,F,1994,Leandrea,5,3
101976,CA,F,1986,Deandrea,6,3
308131,CA,M,1985,Deandrea,6,3
115957,CA,F,1990,Deandrea,5,3
108731,CA,F,1988,Deandrea,5,3


In [None]:
# Drop the `dr_ea_count` column
babynames.drop("dr_ea_count", axis=1, inplace=True)
babynames.head()


Unnamed: 0,State,Sex,Year,Name,Count
131029,CA,F,1994,Leandrea,5
101976,CA,F,1986,Deandrea,6
308131,CA,M,1985,Deandrea,6
115957,CA,F,1990,Deandrea,5
108731,CA,F,1988,Deandrea,5


## Grouping

Group rows that share a common feature, then aggregate data across the group.

In this example, we count the total number of babies born in each year (considering only a small subset of the data, for simplicity).

<img src="images/groupby.png" width="800"/>

In [None]:
# DataFrame with baby gril names only
f_babynames = babynames[babynames["Sex"] == "F"]
print(f'Baby Girl Names Only Data Frame :\n\n', f_babynames)

#Groupby similar features like year and apply aggregate
groupby_similar_features = f_babynames.groupby("Year").agg({"Count": "sum"})
print(f'\n\nGroupby similar features like year :\n\n', groupby_similar_features)

# Sort by Count
sort_similar_features = groupby_similar_features.sort_values("Count", ascending=False)
print(f'\n\nSort similar features using Count :\n\n', sort_similar_features)

Baby Girl Names Only Data Frame :

        State Sex  Year      Name  Count
131029    CA   F  1994  Leandrea      5
101976    CA   F  1986  Deandrea      6
115957    CA   F  1990  Deandrea      5
108731    CA   F  1988  Deandrea      5
147303    CA   F  1999  Andreana     11
...      ...  ..   ...       ...    ...
139085    CA   F  1997   Azucena     49
139084    CA   F  1997     Areli     49
139083    CA   F  1997      Xena     50
139082    CA   F  1997    Unique     50
139081    CA   F  1997    Tierra     50

[239537 rows x 5 columns]


Groupby similar features like year :

        Count
Year        
1910    5950
1911    6602
1912    9804
1913   11860
1914   13815
...      ...
2018  189208
2019  184228
2020  173763
2021  173913
2022  172454

[113 rows x 1 columns]


Sort similar features using Count :

        Count
Year        
1990  262411
1991  261491
1992  256772
1993  249575
1989  243982
...      ...
1914   13815
1913   11860
1912    9804
1911    6602
1910    5950

[113 rows x 1

In [None]:
# print first 10 entries
f_babynames.head(10)

Unnamed: 0,State,Sex,Year,Name,Count
131029,CA,F,1994,Leandrea,5
101976,CA,F,1986,Deandrea,6
115957,CA,F,1990,Deandrea,5
108731,CA,F,1988,Deandrea,5
147303,CA,F,1999,Andreana,11
80382,CA,F,1978,Aundrea,5
80381,CA,F,1978,Audrea,5
65240,CA,F,1972,Andrea,843
191465,CA,F,2010,Leandra,12
80367,CA,F,1978,Andreana,5


In [None]:
#the total baby count in each year
total_baby_count = f_babynames.groupby("Year").agg({"Count": "sum"})
print(f'Total Baby Count in each year :\n\n', total_baby_count)

Total Baby Count in each year :

        Count
Year        
1910    5950
1911    6602
1912    9804
1913   11860
1914   13815
...      ...
2018  189208
2019  184228
2020  173763
2021  173913
2022  172454

[113 rows x 1 columns]


There are many different aggregation functions we can use, all of which are useful in different applications.

In [None]:
# What is the earliest year in which each name appeared?
earliest_year = f_babynames.groupby("Name").agg({"Year": "min"})
print(f'Earliest Year in which each name appeared :\n\n', earliest_year)

Earliest Year in which each name appeared :

          Year
Name         
Aadhini  2022
Aadhira  2017
Aadhya   2007
Aadya    2006
Aahana   2007
...       ...
Zyanya   1986
Zyla     2008
Zylah    2008
Zyra     2012
Zyrah    2011

[13782 rows x 1 columns]


In [None]:
# What is the largest single-year count of each name?
largest_count = f_babynames.groupby("Name").agg({"Count": "max"})
print(f'Largest single-year count of each name :\n\n', largest_count)

Largest single-year count of each name :

          Count
Name          
Aadhini      6
Aadhira     10
Aadhya      50
Aadya       29
Aahana      26
...        ...
Zyanya      15
Zyla        20
Zylah       14
Zyra        16
Zyrah        6

[13782 rows x 1 columns]


In [None]:
#Can you find the most popular baby name in the state of California (CA) for each year? use idxmax function.
result = babynames.groupby("Year")['Count'].idxmax()
result = babynames.loc[result]
result = result[["Year", "Name"]]

#Provide a list of years along with the corresponding most popular names."
result = result.rename(columns={"Name": "Most Popular Baby Name"})

print(f'Most popular baby name in the state of California (CA) for each year :\n\n', result)

Most popular baby name in the state of California (CA) for each year :

         Year Most Popular Baby Name
0       1910                   Mary
233     1911                   Mary
484     1912                   Mary
240064  1913                   John
1120    1914                   Mary
...      ...                    ...
221194  2018                   Emma
396004  2019                   Noah
398869  2020                   Noah
401665  2021                   Noah
404545  2022                   Liam

[113 rows x 2 columns]


## Case Study: Name "Popularity"

In this exercise, let's find the name with sex "F" that has dropped most in popularity since its peak usage. We'll start by filtering `babynames` to only include names corresponding to sex "F".

In [None]:
#Answer Here
f_babynames = babynames[babynames["Sex"] == "F"]
f_babynames.head()

Unnamed: 0,State,Sex,Year,Name,Count
131029,CA,F,1994,Leandrea,5
101976,CA,F,1986,Deandrea,6
115957,CA,F,1990,Deandrea,5
108731,CA,F,1988,Deandrea,5
147303,CA,F,1999,Andreana,11


In [None]:
# We sort the data by year
f_babynames = f_babynames.sort_values("Year")
f_babynames.head()

Unnamed: 0,State,Sex,Year,Name,Count
161,CA,F,1910,Freda,8
41,CA,F,1910,Clara,37
42,CA,F,1910,Marian,37
43,CA,F,1910,Violet,36
45,CA,F,1910,Laura,34


To build our intuition on how to answer our research question, let's visualize the prevalence of the name "Jennifer" over time.

In [None]:
# We'll talk about how to generate plots in a later lecture
fig = px.line(f_babynames[f_babynames["Name"] == "Jennifer"],
              x = "Year", y = "Count")
fig.update_layout(font_size = 18,
                  autosize=False,
                 width=1000,
                  height=400)

We'll need a mathematical definition for the change in popularity of a name.

Define the metric "ratio to peak" (RTP). We'll calculate this as the count of the name in 2022 (the most recent year for which we have data) divided by the largest count of this name in *any* year.

A demo calculation for Jennifer:

In [None]:
# Find the highest Jennifer 'count'
highest_jenn_count = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"].max()
print(f'The highest Jennifer " count " is : {highest_jenn_count}')


The highest Jennifer " count " is : 6065


In [None]:
# Remember that we sorted f_babynames by year.
# This means that grabbing the final entry gives us the most recent count of Jennifers: 114
# In 2022, the most recent year for which we have data, 114 Jennifers were born

most_recent_count = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"].iloc[-1]
print(f'The most recent count of Jennifers in 2022 is : {most_recent_count}')


The most recent count of Jennifers in 2022 is : 114


In [None]:
# Compute the RTP
ratio_to_peak_jene = most_recent_count / highest_jenn_count
print(f'The Ratio To Peak ( RTP ) for Jennifer is : {ratio_to_peak_jene}')


The Ratio To Peak ( RTP ) for Jennifer is : 0.018796372629843364


We can also write a function that produces the `ratio_to_peak`for a given `Series`. This will allow us to use `.groupby` to speed up our computation for all names in the dataset.

In [None]:
# define the function for RTP
"""
Compute the RTP for a Series containing the counts per year for a single name

"""

def ratio_to_peak(name):
  return name.iloc[-1] / name.max()


In [None]:
# Construct a Series containing our Jennifer count data
jenn_count = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"]

# Then, find the RTP using the function define above
jenn_ratio = ratio_to_peak(jenn_count)
print(f'The Ratio To Peak ( RTP ) for Jennifer is : {jenn_ratio}')

The Ratio To Peak ( RTP ) for Jennifer is : 0.018796372629843364


Now, let's use `.groupby` to compute the RTPs for *all* names in the dataset.

You may see a warning message when running the cell below. As discussed in lecture, `pandas` can't apply an aggregation function to non-numeric data (it doens't make sense to divide "CA" by a number). By default, `.groupby` will drop any columns that cannot be aggregated.

In [None]:
# Results in a TypeError
# rtp_table = f_babynames.groupby("Name").agg(ratio_to_peak)
# rtp_table

In [None]:
# Find the RTP for all names at once using groupby as describe in lec slides
rtp_table = f_babynames.groupby("Name")[['Year','Count']].agg(ratio_to_peak)
rtp_table


Unnamed: 0_level_0,Year,Count
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aadhini,1.0,1.000000
Aadhira,1.0,0.500000
Aadhya,1.0,0.660000
Aadya,1.0,0.586207
Aahana,1.0,0.269231
...,...,...
Zyanya,1.0,0.466667
Zyla,1.0,1.000000
Zylah,1.0,1.000000
Zyra,1.0,1.000000


To avoid the warning message above, we explicitly extract only the columns relevant to our analysis before using `.agg`.

In [None]:
# Recompute the RTPs, but only performing the calculation on the "Count" column
rtp_table = f_babynames.groupby("Name")[["Count"]].agg(ratio_to_peak)
print(f'The Ratio To Peak ( RTP ) using groupby :\n\n', rtp_table)


The Ratio To Peak ( RTP ) using groupby :

             Count
Name             
Aadhini  1.000000
Aadhira  0.500000
Aadhya   0.660000
Aadya    0.586207
Aahana   0.269231
...           ...
Zyanya   0.466667
Zyla     1.000000
Zylah    1.000000
Zyra     1.000000
Zyrah    0.833333

[13782 rows x 1 columns]


In [None]:
# Rename "Count" to "Count RTP" for clarity
rtp_table = rtp_table.rename(columns={"Count": "Count RTP"})
print(f'The Ratio To Peak ( RTP ) table after Rename :\n\n', rtp_table)

The Ratio To Peak ( RTP ) table after Rename :

          Count RTP
Name              
Aadhini   1.000000
Aadhira   0.500000
Aadhya    0.660000
Aadya     0.586207
Aahana    0.269231
...            ...
Zyanya    0.466667
Zyla      1.000000
Zylah     1.000000
Zyra      1.000000
Zyrah     0.833333

[13782 rows x 1 columns]


In [None]:
# What name has fallen the most in popularity?
name_most_popular = rtp_table['Count RTP'].idxmax()
print(f'The name that has fallen the most in popularity is : {name_most_popular}')


The name that has fallen the most in popularity is : Aadhini


We can visualize the decrease in the popularity of the name "?:"

In [None]:
def plot_name(*names):
    fig = px.line(f_babynames[f_babynames["Name"].isin(names)],
                  x = "Year", y = "Count", color="Name",
                  title=f"Popularity for: {names}")
    fig.update_layout(font_size = 18,
                  autosize=False,
                  width=1000,
                  height=400)
    return fig
# pass the name into plot_name
plot_name("-")

In [None]:
# Find the 10 names that have decreased the most in popularity
top10_decreased_popularity = rtp_table.sort_values("Count RTP", ascending=True).head(10)
top10_decreased_popularity

Unnamed: 0_level_0,Count RTP
Name,Unnamed: 1_level_1
Debra,0.00126
Debbie,0.002815
Carol,0.00318
Tammy,0.003249
Susan,0.003305
Cheryl,0.003819
Shannon,0.004242
Tina,0.00428
Michele,0.0045
Terri,0.004753


In [None]:
plot_name(*top10_decreased_popularity.index)

For fun, try plotting your name or your friends' names.

In [3]:
# create Data Frame of my Friends Name
my_friends_name = pd.DataFrame([['Ahsan Akhtar',1,24],
                                ['Muzaffar Ali',2,24],
                                ['Adil Saeed',3,24],
                                ['Ameer Hamza',4,24],
                                ['Omer Farooq',5,24],

                                ['Ahsan Akhtar',2,25],
                                ['Muzaffar Ali',4,25],
                                ['Adil Saeed',5,25],
                                ['Ameer Hamza',1,25],
                                ['Omer Farooq',3,25],

                                ['Ahsan Akhtar',4,26],
                                ['Muzaffar Ali',5,26],
                                ['Adil Saeed',2,26],
                                ['Ameer Hamza',3,26],
                                ['Omer Farooq',1,26],

                                ['Ahsan Akhtar',3,27],
                                ['Muzaffar Ali',1,27],
                                ['Adil Saeed',4,27],
                                ['Ameer Hamza',5,27],
                                ['Omer Farooq',2,27],

                                ['Ahsan Akhtar',5,28],
                                ['Muzaffar Ali',3,28],
                                ['Adil Saeed',1,28],
                                ['Ameer Hamza',2,28],
                                ['Omer Farooq',4,28],

                                ],

                               columns=['Name', 'Study Hours', 'Age'])
print(f'My Friends Data Frame :\n\n', my_friends_name)



My Friends Data Frame :

             Name  Study Hours  Age
0   Ahsan Akhtar            1   24
1   Muzaffar Ali            2   24
2     Adil Saeed            3   24
3    Ameer Hamza            4   24
4    Omer Farooq            5   24
5   Ahsan Akhtar            2   25
6   Muzaffar Ali            4   25
7     Adil Saeed            5   25
8    Ameer Hamza            1   25
9    Omer Farooq            3   25
10  Ahsan Akhtar            4   26
11  Muzaffar Ali            5   26
12    Adil Saeed            2   26
13   Ameer Hamza            3   26
14   Omer Farooq            1   26
15  Ahsan Akhtar            3   27
16  Muzaffar Ali            1   27
17    Adil Saeed            4   27
18   Ameer Hamza            5   27
19   Omer Farooq            2   27
20  Ahsan Akhtar            5   28
21  Muzaffar Ali            3   28
22    Adil Saeed            1   28
23   Ameer Hamza            2   28
24   Omer Farooq            4   28


In [4]:
# Plot My Friends Name with Study Or Not
def plot_friends_name(*names):
    fig = px.line(my_friends_name[my_friends_name["Name"].isin(names)],
                  x = "Age", y = "Study Hours", color="Name",
                  title=f"My Friends Name: \n{names}")
    fig.update_layout(font_size = 16,
                  autosize=False,
                  width=900,
                  height=500)
    return fig
# pass the name into plot_name
plot_friends_name("-")

In [5]:
my_friends_name_Lists = my_friends_name.sort_values("Study Hours", ascending=True)
my_friends_name_Lists


Unnamed: 0,Name,Study Hours,Age
0,Ahsan Akhtar,1,24
22,Adil Saeed,1,28
16,Muzaffar Ali,1,27
14,Omer Farooq,1,26
8,Ameer Hamza,1,25
19,Omer Farooq,2,27
23,Ameer Hamza,2,28
12,Adil Saeed,2,26
1,Muzaffar Ali,2,24
5,Ahsan Akhtar,2,25


In [6]:
# Now Plot The Diagram
plot_friends_name(*my_friends_name_Lists.Name)

In [None]:
plot_name(*my_friends_name_Lists.Age)