# Baby Names

This dataset contains information on how many babies were born with each name. Both the raw number and proportion of births for the year are included.

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import mitosheet
from pandas_profiling import ProfileReport
url = 'https://raw.githubusercontent.com/zgulde/tidytuesday/master/data/2022/2022-03-22/babynames.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,year,sex,name,n,prop
0,1880,F,Mary,7065,0.072384
1,1880,F,Anna,2604,0.026679
2,1880,F,Emma,2003,0.020521
3,1880,F,Elizabeth,1939,0.019866
4,1880,F,Minnie,1746,0.017888


- What were the 10 most popular names the year you were born?
- What year was your name the most popular?
- Where does your name rank for the year that you were born?
- Does your name increase or decrease in popularity compared to the year after the year you were born? The year before? What about 5 years before/after?
- Visualize your name's popularity over time.

In [28]:
def describe_data(df):
    '''
    This function takes in a pandas dataframe and prints out the shape, datatypes, number of missing values, 
    columns and their data types, summary statistics of numeric columns in the dataframe, as well as the value counts for categorical variables.
    '''
    # Print out the "shape" of our dataframe - rows and columns
    print(f'This dataframe has {df.shape[0]} rows and {df.shape[1]} columns.')
    print('')
    print('--------------------------------------')
    print('--------------------------------------')
    
    # print the datatypes and column names with non-null counts
    print(df.info())
    print('')
    print('--------------------------------------')
    print('--------------------------------------')

    # print the number of missing values per column and the total
    print('Null Values: ')
    missing_total = df.isnull().sum().sum()
    missing_count = df.isnull().sum() # the count of missing values
    value_count = df.isnull().count() # the count of all values
    missing_percentage = round(missing_count / value_count * 100, 2) # percentage of missing values
    missing_df = pd.DataFrame({'count': missing_count, 'percentage': missing_percentage}) # create df
    print(missing_df)
    print(f' \n Total Number of Missing Values: {missing_total} \n')
    df_total = df[df.columns[:]].count().sum()
    proportion_of_nulls = round((missing_total / df_total), 4)
    print(f' Proportion of Nulls in Dataframe: {proportion_of_nulls}\n') 
    print('--------------------------------------')
    print('--------------------------------------')

    # print out summary stats for our dataset
    print('Here are the summary statistics of our dataset')
    print(df.describe().applymap(lambda x: f"{x:0.3f}"))
    print('')
    print('--------------------------------------')
    print('--------------------------------------')

    print('Relative Frequencies: \n')
    # Display top 5 values of each variable within reasonable limit
    limit = 25
    for col in df.columns:
        if df[col].nunique() < limit:
            print(f'Column: {col} \n {round(df[col].value_counts(normalize=True).nlargest(5), 3)} \n')
        else: 
            print(f'Column: {col} \n')
            print(f'Range of Values: [{df[col].min()} - {df[col].max()}] \n')
        print('------------------------------------------')
        print('--------------------------------------')

In [75]:
#describe_data(df)

In [6]:
#report = ProfileReport(df)

In [76]:
## Searching in whole column
#for i in range(len(df.year)):
#    if 1993 == df.year[i]:
#          
#        # indx will store the tuple having that 
#        # particular value in column.
#        indx = i
#  
## below line will print that tuple
#df.iloc[indx]

In [36]:
birth = pd.DataFrame(df.loc[df['year'] == 1993])

In [60]:
top_10 = birth.sort_values(by='n', ascending=False).head(10)
top_10

Unnamed: 0,year,sex,name,n,prop
1160653,1993,M,Michael,49550,0.023996
1160654,1993,M,Christopher,38228,0.018513
1160655,1993,M,Matthew,35769,0.017322
1144856,1993,F,Jessica,34988,0.01775
1144857,1993,F,Ashley,34850,0.01768
1160656,1993,M,Joshua,33578,0.016261
1160657,1993,M,Tyler,29796,0.014429
1160658,1993,M,Brandon,28732,0.013914
1160659,1993,M,Daniel,28682,0.01389
1160660,1993,M,Nicholas,28105,0.01361


- What were the 10 most popular names the year you were born?
- What year was your name the most popular?
- Where does your name rank for the year that you were born?
- Does your name increase or decrease in popularity compared to the year after the year you were born? The year before? What about 5 years before/after?
- Visualize your name's popularity over time.

In [61]:
nick = pd.DataFrame(birth.loc[birth['name'] == 'Nicholas'])
nick

Unnamed: 0,year,sex,name,n,prop
1146516,1993,F,Nicholas,91,4.6e-05
1160660,1993,M,Nicholas,28105,0.01361


In [64]:
me = pd.DataFrame(df.loc[df['name'] == 'Nicholas'])
me.sort_values(by='n').tail(10)

Unnamed: 0,year,sex,name,n,prop
1292894,1998,M,Nicholas,26634,0.013139
1265485,1997,M,Nicholas,27257,0.013647
1109794,1991,M,Nicholas,27357,0.012909
1238793,1996,M,Nicholas,27720,0.013837
1135051,1992,M,Nicholas,27755,0.013225
1186581,1994,M,Nicholas,27761,0.013622
1084853,1990,M,Nicholas,27900,0.01297
1160660,1993,M,Nicholas,28105,0.01361
1060391,1989,M,Nicholas,28225,0.01347
1212575,1995,M,Nicholas,29155,0.014498


In [70]:
df[df.name =='Nicholas'].groupby('year').n.sum().sort_values().tail(5)

year
1992    27835
1990    27970
1993    28196
1989    28346
1995    29211
Name: n, dtype: int64

In [73]:
# WHere does your name rank for the year you were born?
year_1993 = df[df.year == 1993].copy()
year_1993['rank'] = year_1993.n.rank(ascending=False)
year_1993[year_1993.name == 'Nicholas']

Unnamed: 0,year,sex,name,n,prop,rank
1146516,1993,F,Nicholas,91,4.6e-05,2830.0
1160660,1993,M,Nicholas,28105,0.01361,10.0


year
1880      73
1881      56
1882      66
1883      71
1884      69
        ... 
2013    7147
2014    6772
2015    6245
2016    5744
2017    5327
Name: n, Length: 138, dtype: int64

In [78]:
plt.figure(figsize=(13,7))
subset = df[df.name == 'Nicholas']
subset = subset.groupby('year').n.sum()
plt.plot(subset.year, subset.n)
plt.xlabel('Year')
plt.ylabel('# of Births') 
plt.gca().yaxis.sex_major_formatter('{:,.0f}'.format)

AttributeError: 'Series' object has no attribute 'year'

In [84]:
type(me)

pandas.core.frame.DataFrame

In [86]:
plt.figure(figsize=(13,7))
subset = me.groupby(me.year).n.sum()
plt.plot(me.year, me.n)
plt.xlabel('Year')
plt.ylabel('# of Births') 
# plt.gca().yaxis.sex_major_formatter('{:,.0f}'.format)

Text(0, 0.5, '# of Births')

In [40]:
#df[df.year.eq(1993).any(1)]

In [74]:
x = df
#x.columns = x.columns.str.lower().str.replace(' ', '_')
y = x.columns.values
z = y.tolist()
print("Note: It take Case Sensitive Values.")
keyWord = input("Type a Keyword to Search: ")
try:
    for k in range(len(z)-1):
        l = x[x[z[k]].str.match(keyWord)]
        print(l.head(10))
        k = k+1
except:
    print("")

Note: It take Case Sensitive Values.
Type a Keyword to Search: Nicholas

