# Lecture 5

 Summer 2024

A demonstration of advanced `pandas` syntax to accompany Lecture 5.

In [1]:
import pandas as pd

In [2]:
import zipfile

# Path to the ZIP archive of baby-name files (relative path, forward slashes)
zip_file_path = "datafiles/babynamesbystate.zip"

# Name of the archive member to read
ca_name = 'STATE.CA.TXT'

# The file has no header row, so the column names are supplied explicitly
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']

with zipfile.ZipFile(zip_file_path, 'r') as zf:
    # Fail fast if the member is missing. Merely printing a message would let
    # execution continue and crash below with a NameError (or, worse, silently
    # reuse a stale `orgbabynames` left over from an earlier run).
    if ca_name not in zf.namelist():
        raise FileNotFoundError(f"File '{ca_name}' not found in the ZIP archive.")
    # Read the CSV directly from the archive without extracting it to disk
    with zf.open(ca_name) as fh:
        orgbabynames = pd.read_csv(fh, header=None, names=field_names)

# Keep the raw frame untouched and work on a copy
babynames = orgbabynames.copy()
babynames.head(10)

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134
5,CA,F,1910,Ruth,128
6,CA,F,1910,Evelyn,126
7,CA,F,1910,Alice,118
8,CA,F,1910,Virginia,101
9,CA,F,1910,Elizabeth,93


## Pivot Tables

### `Groupby` with multiple columns

We want to build a table showing the total number of babies born of each sex in each year. One way is to `groupby` using both columns of interest:

In [3]:
# Total number of babies of each sex for each year, using a two-key groupby.
# The aggregation is named by the string 'sum' rather than the builtin `sum`:
# passing the builtin to `.agg` is deprecated in recent pandas and emits a
# FutureWarning.
table = babynames.groupby(['Year', 'Sex'])['Count'].agg('sum')
table

  table = babynames.groupby(['Year', 'Sex'])['Count'].agg(sum)


Year  Sex
1910  F        5950
      M        3213
1911  F        6602
      M        3381
1912  F        9804
              ...  
2020  M      189119
2021  F      173913
      M      188669
2022  F      172454
      M      187569
Name: Count, Length: 226, dtype: int64

### `pivot_table`

In [4]:
# Show the frame again (pandas truncates the middle rows) before pivoting it
babynames

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134
...,...,...,...,...,...
407423,CA,M,2022,Zayvier,5
407424,CA,M,2022,Zia,5
407425,CA,M,2022,Zora,5
407426,CA,M,2022,Zuriel,5


In [5]:
# Pivot table of the LARGEST single-name count for each sex in each year
# (rows = Year, columns = Sex). Note this is the per-group maximum, not the
# total; use aggfunc='sum' to reproduce the totals from the groupby above.
# The aggregation is given as the string 'max' because passing the builtin
# `max` is deprecated in recent pandas and emits a FutureWarning.
pt = babynames.pivot_table(
    index='Year',
    columns='Sex',
    values='Count',
    aggfunc='max'
)
pt

  pt = babynames.pivot_table(


Sex,F,M
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1910,295,237
1911,390,214
1912,534,501
1913,584,614
1914,773,769
...,...,...
2018,2751,2572
2019,2608,2681
2020,2353,2630
2021,2402,2613


![pivot_picture.png](attachment:pivot_picture.png)

### `pivot_table` with Multiple values

In [6]:
# Form a pivot table with several value columns, as described in the lecture
# slides: for each year and sex, take the maximum of 'Count' and the maximum
# (alphabetically last) 'Name'.
# The aggregation is given as the string 'max' because passing the builtin
# `max` is deprecated in recent pandas and emits a FutureWarning.
pt = babynames.pivot_table(
    index='Year',
    columns='Sex',
    values=['Count', 'Name'],
    aggfunc='max'
)
pt

  pt = babynames.pivot_table(


Unnamed: 0_level_0,Count,Count,Name,Name
Sex,F,M,F,M
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1910,295,237,Yvonne,William
1911,390,214,Zelma,Willis
1912,534,501,Yvonne,Woodrow
1913,584,614,Zelma,Yoshio
1914,773,769,Zelma,Yoshio
...,...,...,...,...
2018,2751,2572,Zyra,Zyon
2019,2608,2681,Zyra,Zyon
2020,2353,2630,Zyrah,Zyon
2021,2402,2613,Zyra,Zyrus


---

## Join Tables

What if we want to know the popularity of presidential candidates' first names in California in 2022? What can we do?

In [7]:
# Read the election results once; `orgelections` stays as the untouched
# original and all further work happens on an independent copy.
orgelections = pd.read_csv('datafiles/elections1.csv')
elections = orgelections.copy(deep=True)
elections.head(n=10)

Unnamed: 0,Year,Candidate,Party,Popular vote,Result,%
0,1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122
1,1824,John Quincy Adams,Democratic-Republican,113142,win,42.789878
2,1828,Andrew Jackson,Democratic,642806,win,56.203927
3,1828,John Quincy Adams,National Republican,500897,loss,43.796073
4,1832,Andrew Jackson,Democratic,702735,win,54.574789
5,1832,Henry Clay,National Republican,484205,loss,37.603628
6,1832,William Wirt,Anti-Masonic,100715,loss,7.821583
7,1836,Hugh Lawson White,Whig,146109,loss,10.005985
8,1836,Martin Van Buren,Democratic,763291,win,52.272472
9,1836,William Henry Harrison,Whig,550816,loss,37.721543


In [8]:
# Keep only the baby-name rows recorded for the year 2022
babynames_2022 = babynames.loc[babynames['Year'].eq(2022)]
babynames_2022

Unnamed: 0,State,Sex,Year,Name,Count
235835,CA,F,2022,Olivia,2178
235836,CA,F,2022,Emma,2080
235837,CA,F,2022,Camila,2046
235838,CA,F,2022,Mia,1882
235839,CA,F,2022,Sophia,1762
...,...,...,...,...,...
407423,CA,M,2022,Zayvier,5
407424,CA,M,2022,Zia,5
407425,CA,M,2022,Zora,5
407426,CA,M,2022,Zuriel,5


In [9]:
# Split each candidate's full name on whitespace and keep the first token
# as a new 'First name' column.
elections['First name'] = elections['Candidate'].str.split().str[0]
elections

Unnamed: 0,Year,Candidate,Party,Popular vote,Result,%,First name
0,1824,Andrew Jackson,Democratic-Republican,151271,loss,57.210122,Andrew
1,1824,John Quincy Adams,Democratic-Republican,113142,win,42.789878,John
2,1828,Andrew Jackson,Democratic,642806,win,56.203927,Andrew
3,1828,John Quincy Adams,National Republican,500897,loss,43.796073,John
4,1832,Andrew Jackson,Democratic,702735,win,54.574789,Andrew
...,...,...,...,...,...,...,...
177,2016,Jill Stein,Green,1457226,loss,1.073699,Jill
178,2020,Joseph Biden,Democratic,81268924,win,51.311515,Joseph
179,2020,Donald Trump,Republican,74216154,loss,46.858542,Donald
180,2020,Jo Jorgensen,Libertarian,1865724,loss,1.177979,Jo


Joining tables in pandas with `pd.merge`

In [10]:
# Inner-join the 2022 baby names with the election results, matching a
# baby's 'Name' to a candidate's 'First name'; only names present in both
# tables survive.
merged = babynames_2022.merge(
    elections,
    how='inner',
    left_on='Name',
    right_on='First name',
)

merged

Unnamed: 0,State,Sex,Year_x,Name,Count,Year_y,Candidate,Party,Popular vote,Result,%,First name
0,CA,F,2022,Cynthia,47,2008,Cynthia McKinney,Green,161797,loss,0.123442,Cynthia
1,CA,F,2022,Lenora,19,1988,Lenora Fulani,New Alliance,217221,loss,0.237804,Lenora
2,CA,F,2022,Evan,11,2016,Evan McMullin,Independent,732273,loss,0.539546,Evan
3,CA,F,2022,Hillary,10,2016,Hillary Clinton,Democratic,65853514,loss,48.521539,Hillary
4,CA,F,2022,James,8,1844,James Polk,Democratic,1339570,win,50.749477,James
...,...,...,...,...,...,...,...,...,...,...,...,...
147,CA,M,2022,Clinton,6,1888,Clinton B. Fisk,Prohibition,249819,loss,2.196299,Clinton
148,CA,M,2022,Lyndon,6,1964,Lyndon Johnson,Democratic,43127041,win,61.344703,Lyndon
149,CA,M,2022,Woodrow,6,1912,Woodrow Wilson,Democratic,6296284,win,41.933422,Woodrow
150,CA,M,2022,Woodrow,6,1916,Woodrow Wilson,Democratic,9126868,win,49.367987,Woodrow


In [11]:
# Every row comes from the 2022 California baby-name data, so 'State' and
# 'Year_x' (the baby-name year) are constant and can be dropped.
# errors='ignore' keeps this cell re-runnable: because it overwrites `merged`,
# a second run would otherwise raise KeyError on the already-removed columns.
merged = merged.drop(columns=['State', 'Year_x'], errors='ignore')

In [12]:
# Restrict to candidates from the 2020 election ('Year_y' is the election year)
merged = merged.loc[merged['Year_y'].eq(2020)]
merged

Unnamed: 0,Sex,Name,Count,Year_y,Candidate,Party,Popular vote,Result,%,First name
45,M,Joseph,785,2020,Joseph Biden,Democratic,81268924,win,51.311515,Joseph
124,M,Donald,33,2020,Donald Trump,Republican,74216154,loss,46.858542,Donald
128,M,Howard,18,2020,Howard Hawkins,Green,405035,loss,0.255731,Howard


In [13]:
# Order the rows from the most to the least common baby name
merged = merged.sort_values('Count', ascending=False)
merged

Unnamed: 0,Sex,Name,Count,Year_y,Candidate,Party,Popular vote,Result,%,First name
45,M,Joseph,785,2020,Joseph Biden,Democratic,81268924,win,51.311515,Joseph
124,M,Donald,33,2020,Donald Trump,Republican,74216154,loss,46.858542,Donald
128,M,Howard,18,2020,Howard Hawkins,Green,405035,loss,0.255731,Howard


### An insight found!
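- `Joseph` (Joseph Biden) won the 2020 presidential election, and among the 2020 candidates whose first names matched the baby-name data, his was by far the most popular for baby boys born in California in 2022 (785 babies, versus 33 for `Donald` and 18 for `Howard`).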