## Evaluation of infant names from 1880-2010: determine the most popular boys' names and see how this list changes over time

## import modules

In [None]:
import pandas as pd
import math
import glob
import numpy as np
import matplotlib.pyplot as plt

## import data from files into a list of dataframes

In [None]:
# Collect the yearly name files. glob.glob() returns matches in arbitrary
# order, and the years are later assigned by list position, so the file
# list is sorted to keep the tables in file-name order.
path = 'yob*.csv'
files = sorted(glob.glob(path))

# One dataframe per file; the files carry no header row.
df = [pd.read_csv(name, index_col=None, header=None) for name in files]

#Inspect loaded files
print(len(df))
if df:
    # head is a method: it has to be called to show the first rows.
    print(df[0].head())
    df[0].info()
else:
    # Nothing matched, so there is no first table to inspect.
    print('No files matched pattern: ' + path)

## Add column names and combine data into one dataframe

In [None]:
# Add column names and tag every yearly table with its year.
# NOTE(review): years are assigned by list position starting at 1880, so `df`
# must be ordered by year with no gaps -- confirm the load order guarantees it.
for year, frame in enumerate(df, start=1880):
    frame.columns = ['Name', 'Sex', 'Count']
    frame['Year'] = year

#Concatenate the list of dataframes into one dataframe
names = pd.concat(df)
print('CONCATENTATED NAMES DATAFRAME')
names.info()
print(names.head())

## Calculate the popularity of each name as a percentage of all births recorded in that year

In [None]:
# Work on a copy so the combined `names` frame is left without the new column.
names2 = names.copy()

# Broadcast each year's total count (both sexes) back onto every row of that year.
total_births_by_year = names2.groupby('Year')['Count'].transform('sum')

# Share of the year's total, expressed in percent.
names2['pct_name'] = names2['Count'].div(total_births_by_year).mul(100)

print('NAMES DATAFRAME WITH PCT NAME ADDED')
print(names2.tail())
print(names2.shape)

## Create a dataframe of boys names

In [None]:
# Keep only the rows whose 'Sex' is 'M', then index them by year.
male = names2['Sex'].eq('M')
names_m = names2.loc[male]
mnames_year = names_m.set_index('Year')

print('MALE NAME DATAFRAME')
print(mnames_year.tail())
mnames_year.info()




## Select most popular boys names

In [None]:
# Keep only the name/year rows whose pct_name is at least 0.4.
top = mnames_year['pct_name'] >= 0.4

# Drop the columns that are no longer needed and turn 'Year' back into a
# regular column.
mnames_year_top = (
    mnames_year[top]
    .drop(columns=['Sex', 'Count'])
    .reset_index()
)

print('TOP MALE NAME DATAFRAME')
print(mnames_year_top.tail())
print(mnames_year_top.shape)
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line).
mnames_year_top.info()



## Pivot the dataframe to make the names columns

In [None]:
# Reshape to wide form: one row per year, one column per name, cells holding
# pct_name. Year/name combinations that are absent from the long table are
# filled with 0.
mnames_tidy = pd.pivot_table(
    mnames_year_top,
    values='pct_name',
    index=['Year'],
    columns=['Name'],
).fillna(0)

print(mnames_tidy.tail())
mnames_tidy.info()


## Plot the top boys names in groups of five


In [None]:
#Plot the top boys names in groups of five
# Move 'Year' out of the index into the first column: the plotting functions
# read it by name and address the name columns by position (1, 2, ...).
# Guarded so that re-running this cell does not insert an extra 'index'
# column, which would shift every name column by one position.
if 'Year' not in mnames_tidy.columns:
    mnames_tidy = mnames_tidy.reset_index()

def _plot_name_group(plot_number, save=True):
    """Scatter-plot one group of five name columns of ``mnames_tidy`` by year.

    Plot ``k`` covers the columns at positions ``5*k - 4`` to ``5*k`` of the
    module-level ``mnames_tidy`` table, which is expected to have a 'Year'
    column at position 0 followed by one column per name.

    Parameters
    ----------
    plot_number : int
        1-based number of the plot (1 to 16); selects the column group and is
        used in the title and in the output file name.
    save : bool
        When true, the figure is written to
        'scatter_top_boys_names<plot_number>.pdf' before it is shown.
    """
    group_size = 5
    first_col = (plot_number - 1) * group_size + 1
    # Clamp to the real table width so a table with fewer name columns yields
    # a smaller group instead of an IndexError.
    stop_col = min(first_col + group_size, mnames_tidy.shape[1])
    if first_col >= stop_col:
        print('No name columns available for plot {}'.format(plot_number))
        return

    # The style only has to be selected once, before the figure is created.
    plt.style.use('ggplot')
    # Start from a fresh figure so that series of an earlier plot cannot carry
    # over when show() leaves the previous figure open.
    plt.figure()

    years = mnames_tidy['Year']
    for position in range(first_col, stop_col):
        # The label is what plt.legend() displays; without it the legend
        # has no entries.
        plt.scatter(
            years,
            mnames_tidy.iloc[:, position],
            label=mnames_tidy.columns[position],
        )

    plt.subplots_adjust(left=0.1)
    plt.ylabel('Percent of Names')
    plt.title('Top boys Names (plot {} of 16)'.format(plot_number))
    plt.legend(loc='best', fontsize='small', markerscale=0.7)
    plt.margins(0.2)
    if save:
        plt.savefig('scatter_top_boys_names{}.pdf'.format(plot_number))
    plt.show()


def top_boys_names_1():
    """Plot name columns 1-5 (plot 1 of 16) and save the figure as PDF."""
    _plot_name_group(1)


def top_boys_names_2():
    """Plot name columns 6-10 (plot 2 of 16) and save the figure as PDF."""
    _plot_name_group(2)


def top_boys_names_3():
    """Plot name columns 11-15 (plot 3 of 16) and save the figure as PDF.

    The file is written as 'scatter_top_boys_names3.pdf', matching the naming
    of the other plots (it used to be 'scatter_top_boys_name3.pdf').
    """
    _plot_name_group(3)


def top_boys_names_4():
    """Plot name columns 16-20 (plot 4 of 16) and save the figure as PDF."""
    _plot_name_group(4)


def top_boys_names_5():
    """Plot name columns 21-25 (plot 5 of 16) and save the figure as PDF."""
    _plot_name_group(5)


def top_boys_names_6():
    """Plot name columns 26-30 (plot 6 of 16) and save the figure as PDF."""
    _plot_name_group(6)


def top_boys_names_7():
    """Plot name columns 31-35 (plot 7 of 16) and save the figure as PDF."""
    _plot_name_group(7)


def top_boys_names_8():
    """Plot name columns 36-40 (plot 8 of 16) and save the figure as PDF."""
    _plot_name_group(8)


def top_boys_names_9():
    """Plot name columns 41-45 (plot 9 of 16) and save the figure as PDF."""
    _plot_name_group(9)


def top_boys_names_10():
    """Plot name columns 46-50 (plot 10 of 16) and save the figure as PDF."""
    _plot_name_group(10)


def top_boys_names_11():
    """Plot name columns 51-55 (plot 11 of 16) and save the figure as PDF."""
    _plot_name_group(11)


def top_boys_names_12():
    """Plot name columns 56-60 (plot 12 of 16) and save the figure as PDF.

    The group starts at column 56; starting at 55 would repeat the last
    column of plot 11 and put six series on this plot.
    """
    _plot_name_group(12)


def top_boys_names_13():
    """Plot name columns 61-65 (plot 13 of 16) and save the figure as PDF."""
    _plot_name_group(13)


def top_boys_names_14():
    """Plot name columns 66-70 (plot 14 of 16) and save the figure as PDF."""
    _plot_name_group(14)


def top_boys_names_15():
    """Plot name columns 71-75 (plot 15 of 16) and save the figure as PDF."""
    _plot_name_group(15)


def top_boys_names_16():
    """Plot name columns 76-80 (plot 16 of 16); the figure is only shown.

    NOTE(review): saving was disabled for this plot in the original code;
    confirm whether a PDF should be written for it as well.
    """
    _plot_name_group(16, save=False)

# Render the sixteen plots one after another, in order.
for draw_plot in (
    top_boys_names_1,
    top_boys_names_2,
    top_boys_names_3,
    top_boys_names_4,
    top_boys_names_5,
    top_boys_names_6,
    top_boys_names_7,
    top_boys_names_8,
    top_boys_names_9,
    top_boys_names_10,
    top_boys_names_11,
    top_boys_names_12,
    top_boys_names_13,
    top_boys_names_14,
    top_boys_names_15,
    top_boys_names_16,
):
    draw_plot()






