# Dataframe exercises

## Exercise - US Baby Names

The folder '3-data-analysis/data/names' contains files with the names registered in the US from 1880 to 2021. Use Pandas to do the following tasks.

### Join the data from all the files into a single dataframe (e.g., one name per row)

In [None]:
import pandas as pd

# Read each yearly file into a one-column frame (the column label is the year)
# indexed by (name, gender).
yearly_frames = [
    pd.read_csv(
        f'./../../data/names/yob{year}.txt',
        header=None,
        names=['name', 'gender', year],
    ).set_index(['name', 'gender'])
    for year in range(1880, 2022)
]

# Align all years in a single outer join on (name, gender). Merging the years
# one at a time re-copies the ever-growing frame on every iteration.
# sort_index() gives a deterministic row order, which matters for the
# tie-breaking of idxmax/idxmin in the cells below.
df_names = pd.concat(yearly_frames, axis=1).sort_index()

# A (name, gender) pair missing from a year's file has no count for that year.
df_names = df_names.fillna(0)
df_names

### Find the name most registered since 1880

In [None]:
# Total registrations of every (name, gender) row over all year columns.
totals_per_name = df_names.sum(axis=1)

name = totals_per_name.idxmax()
number_of_registrations = totals_per_name.max()

print(f'The name {name} was registered {number_of_registrations} times since 1880')

### Find the name least registered since 1880

In [None]:
# Total registrations of every (name, gender) row over all year columns.
totals_per_name = df_names.sum(axis=1)

# idxmin reports the first row label that reaches the minimum total.
name = totals_per_name.idxmin()
number_of_registrations = totals_per_name.min()

print(f'The name {name} was registered {number_of_registrations} times since 1880')

### Find the name most registered by year

In [None]:
# For every year column, the (name, gender) row label with the largest count.
df_names.idxmax(axis='index')

### Find the name most registered in the year 2000

In [None]:
# Work on the single column holding the counts for the year 2000.
registrations_2000 = df_names[2000]

name_most_registerd = registrations_2000.idxmax()
number_of_registrations = registrations_2000.loc[name_most_registerd]

print(f'The name {name_most_registerd} was registered {number_of_registrations} times in the year 2000')

### Find the name least registered in the year 2000

In [None]:
# Missing counts were filled with 0 when the frame was built, so every name
# that was NOT registered in 2000 ties for the minimum and a plain idxmin
# would report a name that does not appear in that year at all. Only names
# with at least one registration in 2000 are considered here.
# idxmin returns the index of the first occurrence of the minimum, so when
# several names share the lowest count only the first one is reported.
registrations_2000 = df_names[2000]
registrations_2000[registrations_2000 > 0].idxmin()

### Find the name most registered since 1880 by gender

In [None]:
# Sum each (name, gender) row over all years, then pick the row label with the
# largest total within each gender.
totals_per_name = df_names.sum(axis=1)
totals_per_name.groupby(level='gender').idxmax()

### Plot the evolution of the registration of the name `Mary`

In [None]:
# Selecting one (name, gender) row yields a Series indexed by year.
mary_by_year = df_names.loc[('Mary', 'F')]
mary_by_year.plot()

### Plot the evolution of the registration of the 10 most registered names

In [None]:
# Row labels of the ten (name, gender) pairs with the largest all-time totals.
all_time_totals = df_names.sum(axis=1)
idx_top_10 = all_time_totals.sort_values(ascending=False).index[:10]

# Transpose so the years run along the x axis and each name becomes one line.
top_10_by_year = df_names.loc[idx_top_10].transpose()
top_10_by_year.plot(figsize=(15, 5))

In [None]:
# Optional: plot the running (cumulative) total of the top 10 names instead.
# After transposing, the years are the rows, so the running sum goes down them.
top_10_by_year = df_names.loc[idx_top_10].T
top_10_by_year.cumsum(axis=0).plot(figsize=(15, 5))

## Exercise - US Baby Names by State

The folder '4-data-analysis/data/namesbystate' contains files with "State-specific data on the relative frequency of given names in the population of U.S. births where the individual has a Social Security Number"

### Load the data into a single dataframe: what is the best structure?

In [None]:
import glob

def process_state_file(filename):
    """Load one state file and return its counts in wide (one column per year) form.

    The file is read as a header-less CSV whose columns are, in order:
    state, gender, year, name, count.

    Parameters
    ----------
    filename : str or path-like
        Path of the state file to read.

    Returns
    -------
    pandas.DataFrame
        One row per (state, name, gender), used as the index, and one column
        per year found in the file, labelled with the integer year. A cell is
        NaN when the file has no record for that combination in that year.
    """
    df_long = pd.read_csv(
        filename,
        header=None,
        names=['state', 'gender', 'year', 'name', 'count'],
    )

    # Pivot the years into columns in one step. Outer-merging one year at a
    # time re-copies the growing frame for every year and leaves it fragmented.
    df_state = (
        df_long
        .set_index(['state', 'name', 'gender', 'year'])['count']
        .unstack('year')
    )

    # unstack labels the column axis 'year'; drop that label so the columns are
    # plain year values, as callers expect.
    df_state.columns.name = None
    return df_state


# Build one wide frame per state file and concatenate them in a single call:
# growing df_names with pd.concat inside the loop re-copies every previously
# loaded state on each iteration. The file list is sorted so the row order
# does not depend on the order the file system happens to return.
state_frames = []
for file in sorted(glob.glob('./../../data/namesbystate/*.TXT')):
    print(file)
    state_frames.append(process_state_file(file))

if state_frames:
    df_names = pd.concat(state_frames)
else:
    # No file matched the pattern: fall back to an empty frame that still has
    # the (state, name, gender) index.
    df_names = pd.DataFrame(columns=['name', 'gender', 'state']).set_index(['state', 'name', 'gender'])

# A (state, name, gender) row with no record for a year has no count there.
df_names = df_names.fillna(0)

# Copy to get a de-fragmented frame.
df_names = df_names.copy()
df_names

### Get the registration of females for NY

In [None]:
# Turn the state/name/gender index levels back into ordinary columns so they
# can be filtered with boolean masks in the following cells.
df_names = df_names.reset_index()

In [None]:
# Rows for female registrations in the state of New York.
is_new_york = df_names['state'] == 'NY'
is_female = df_names['gender'] == 'F'
mask = is_new_york & is_female

df_names.loc[mask]

### How many females were registered in NY since the beginning of the records?

In [None]:
# Keep only the numeric columns of the masked rows, total each row,
# then add the row totals together.
ny_female_counts = df_names[mask].select_dtypes('number')
ny_female_counts.sum(axis=1).sum()

### Get the registration of females named Olivia for NY

In [None]:
# Rows for the female name Olivia in the state of New York.
is_new_york = df_names['state'] == 'NY'
is_female = df_names['gender'] == 'F'
is_olivia = df_names['name'] == 'Olivia'
mask = is_new_york & is_female & is_olivia

df_names.loc[mask]

### What was the minimum, maximum, average and 10th percentile of registrations of the name Olivia in NY?

In [None]:
df_olivia = df_names[mask]

# Restrict to the numeric columns once and reuse them for every statistic.
olivia_counts = df_olivia.select_dtypes('number')

# Each statistic is computed across the columns of a row; .values[0] takes the
# result for the first selected row.
min_registration = olivia_counts.min(axis=1).values[0]
max_registration = olivia_counts.max(axis=1).values[0]
avg_registration = olivia_counts.mean(axis=1).values[0]
p10_registration = olivia_counts.quantile(0.1, axis=1).values[0]

print(f'Min: {min_registration}, Max: {max_registration}, Avg: {avg_registration}, P10: {p10_registration}')

### Find how many times "Olivia" was registered in NY in 2000

In [None]:
# Female Olivia rows for New York; read only the column for the year 2000.
is_ny_female = (df_names['state'] == 'NY') & (df_names['gender'] == 'F')
mask = is_ny_female & (df_names['name'] == 'Olivia')
df_names.loc[mask, 2000]

### Find the year and state where Olivia was registered the most times in a single year.

In [None]:
# Female Olivia rows for every state.
mask = (df_names['gender'] == 'F') & (df_names['name'] == 'Olivia')

# Move the descriptive columns into the index so only year columns remain.
df_olivia = df_names.loc[mask].set_index(['state', 'name', 'gender'])

df_olivia

In [None]:
# Largest value of each year column first, then the largest among those.
max_per_year = df_olivia.max()
max_registration = max_per_year.max()
max_registration

In [None]:
# Blank out every cell that differs from the maximum, then stack the frame into
# a Series keyed by the row label and the year.
hits = df_olivia.where(df_olivia == max_registration)
hits.stack()

### Find the year where Olivia was registered the most times (all states)

### Find the states where the names Mary and John were registered the most times

In [None]:
# Rows whose name is Mary or John, for any state and gender.
mask = df_names['name'].isin(['Mary', 'John'])

# Work on an independent copy so columns can be added without touching df_names.
df_john_mary = df_names.loc[mask].copy()
df_john_mary

In [None]:
# Add a 'total' column with the sum of the numeric columns of each row.
numeric_counts = df_john_mary.select_dtypes('number')
df_john_mary['total'] = numeric_counts.sum(axis=1)

In [None]:
# Taking .max() of each group picks the largest value of every column
# independently, so the reported state would be the alphabetically last one
# rather than the state holding the largest total. Instead, locate the row with
# the largest total for each (name, gender) and read the state from that row.
idx_max_total = df_john_mary.groupby(['name', 'gender'])['total'].idxmax()
df_john_mary.loc[idx_max_total, ['state', 'name', 'gender', 'total']].set_index(['name', 'gender'])

### Plot the evolution of the names Mary and John through the years

In [None]:
# Remove the helper 'total' column so that only the year columns stay numeric.
df_john_mary = df_john_mary.drop(columns='total')

In [None]:
# Add up the states for each (name, gender) pair, leaving one row per pair.
yearly_by_name_gender = (
    df_john_mary
    .drop(columns='state')
    .groupby(['name', 'gender'])
    .sum()
)

# Transpose so each (name, gender) pair is drawn as one line.
yearly_by_name_gender.T.plot(figsize=(15, 5))

### Find the name most registered in a state for a single year

In [None]:
# Largest value of each numeric column first, then the largest among those.
max_per_year = df_names.select_dtypes('number').max()
max_registration = max_per_year.max()
max_registration

In [None]:
# Keep only the cells equal to the maximum, stack them into a Series, and
# flatten the result into an array.
hits = df_names.where(df_names == max_registration)
res = hits.stack().reset_index().to_numpy()
res

In [None]:
# Unpack the three entries of the first row of res.
first_hit = res[0]
idx, year, value = first_hit

In [None]:
# Show the state, the name and the count in the matching year for that row.
columns_of_interest = ['state', 'name', year]
df_names.loc[idx, columns_of_interest]


### Plot the evolution of the top 5 (cumulative, i.e., all-time) female names

In [None]:
mask = df_names['gender'] == 'F'

# Add up the states for each female name, then total each name over all years.
female_counts = df_names.loc[mask].drop(columns=['gender', 'state'])
totals_by_name = female_counts.groupby('name').sum().sum(axis=1)

# The five names with the largest all-time totals.
top5_names = totals_by_name.sort_values(ascending=False).head(5)

top5_names

In [None]:
# Female rows whose name is one of the five selected names.
female_mask = df_names['name'].isin(top5_names.index) & (df_names['gender'] == 'F')

# Add up the states for each name, then plot the running total across the years.
yearly_by_name = (
    df_names.loc[female_mask]
    .drop(columns=['gender', 'state'])
    .groupby('name')
    .sum()
)
yearly_by_name.cumsum(axis=1).T.plot(figsize=(15, 5))

### Plot the cumulative registration of the name Olivia in the states NY, NV, NM, NJ, and NH.