## Evaluation of infant names from 1880 to 2010: determine the most popular girls' names and see how this list changes over time

## import modules

In [None]:
import pandas as pd
import math
import glob
import numpy as np
import matplotlib.pyplot as plt

## import data from files into a list of dataframes

In [None]:
path = 'yob*.csv'
# glob.glob() returns matches in arbitrary, OS-dependent order. Sort them so
# the list of dataframes has a deterministic order; the year of each dataframe
# is assigned later purely by its position in this list.
# NOTE(review): assumes the file names sort chronologically (e.g. yobYYYY.csv) - confirm.
files = sorted(glob.glob(path))
if not files:
    # Fail early with a clear message instead of an IndexError on df[0] below.
    raise FileNotFoundError('No files matching {!r} were found'.format(path))

# One dataframe per file; the files carry no header row.
df = [pd.read_csv(file, index_col=None, header=None) for file in files]

# Inspect loaded files
print(len(df))
print(df[0].head())  # call head(); printing the bare attribute only shows the bound method
df[0].info()

## Add column names and combine data into one dataframe

In [None]:
# Add column names and the year to every dataframe.
# Years are assigned by position: the first dataframe gets 1880, the next
# 1881, and so on, so `df` must be in chronological order.
for year, frame in enumerate(df, start=1880):
    frame.columns = ['Name', 'Sex', 'Count']
    frame['Year'] = year

# Concatenate the list of dataframes into one dataframe
names = pd.concat(df)
print('CONCATENTATED NAMES DATAFRAME')
names.info()
print(names.head())

## Calculate the popularity of each name as a percentage of all births in its year

In [None]:
# Work on a copy so the combined dataframe stays untouched.
names2 = names.copy()

# Sum of 'Count' over all rows of a year, repeated on every row of that year.
total_births_by_year = names2.groupby('Year')['Count'].transform('sum')

# Each name's share of its year's total, expressed in percent.
names2['pct_name'] = names2['Count'].div(total_births_by_year).mul(100)

print('NAMES DATAFRAME WITH PCT NAME ADDED')
print(names2.tail())
print(names2.shape)

## Create a dataframe of girls names

In [None]:
# Keep only the rows whose Sex is 'F' and index them by year.
female = names2['Sex'].eq('F')
names_f = names2.loc[female]
fnames_year = names_f.set_index('Year')

print('FEMALE NAME DATAFRAME')
print(fnames_year.tail())
fnames_year.info()




## Select most popular girls names

In [None]:
# A name/year row is kept when pct_name is at least 0.4.
top = fnames_year['pct_name'] >= 0.4

# Drop the columns that are no longer needed and turn the Year index back
# into an ordinary column. drop() returns a new dataframe, so the filtered
# selection is never modified in place.
fnames_year_top = (
    fnames_year[top]
    .drop(columns=['Sex', 'Count'])
    .reset_index()
)

print('TOP FEMALE NAME DATAFRAME')
print(fnames_year_top.tail())
print(fnames_year_top.shape)
# info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
fnames_year_top.info()



In [None]:
# Reshape to wide form: one row per year, one column per name, with pct_name
# as the cell value. Year/name combinations that have no row in
# fnames_year_top come out as NaN and are replaced by 0.
fnames_tidy = fnames_year_top.pivot_table(
    index='Year', columns='Name', values='pct_name'
).fillna(0)
print(fnames_tidy.tail())
fnames_tidy.info()


## Plot the top girls names in groups of five


In [None]:
# Plot the top girls names in groups of five (plots 1-15).

# Turn the Year index into an ordinary column so that it sits at position 0
# and the name columns start at position 1. The guard keeps this cell safe to
# re-run: a second reset_index() would insert an extra 'index' column and
# shift every name column by one position.
if 'Year' not in fnames_tidy.columns:
    fnames_tidy = fnames_tidy.reset_index()


def _plot_name_group(first_col, last_col, plot_no, save_path=None):
    """Scatter-plot one group of name columns of ``fnames_tidy`` against Year.

    Parameters
    ----------
    first_col, last_col : int
        Positional column indices (both inclusive) of the names to draw.
    plot_no : int
        Number shown in the title ("plot <plot_no> of 30").
    save_path : str, optional
        File name the figure is saved under before it is shown. When omitted
        the figure is only displayed.

    Nothing is drawn (and nothing is saved) when the table has no column in
    the requested range.
    """
    # Clamp the range so a table with fewer columns than expected draws the
    # names that do exist instead of raising IndexError.
    stop = min(last_col + 1, fnames_tidy.shape[1])
    if first_col >= stop:
        return

    plt.style.use('ggplot')
    years = fnames_tidy['Year']
    for col in range(first_col, stop):
        # Label every series with its column name; without labels the legend
        # below would be empty.
        plt.scatter(years, fnames_tidy.iloc[:, col],
                    label=fnames_tidy.columns[col])

    plt.subplots_adjust(left=0.1)
    plt.ylabel('Percent of Names')
    plt.title('Top Girls Names (plot {} of 30)'.format(plot_no))
    plt.legend(loc='best', fontsize='small', markerscale=0.7)
    plt.margins(0.2)
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()


def top_girls_names_1():
    """Plot name columns 1-5 and save the figure."""
    _plot_name_group(1, 5, 1, 'scatter_top_girls_names1.pdf')


def top_girls_names_2():
    """Plot name columns 6-10 and save the figure."""
    _plot_name_group(6, 10, 2, 'scatter_top_girls_names2.pdf')


def top_girls_names_3():
    """Plot name columns 11-15 and save the figure."""
    # NOTE: this file name lacks the 's' used by the other plots; it is kept
    # unchanged so that existing references to the output file keep working.
    _plot_name_group(11, 15, 3, 'scatter_top_girls_name3.pdf')


def top_girls_names_4():
    """Plot name columns 16-20 and save the figure."""
    _plot_name_group(16, 20, 4, 'scatter_top_girls_names4.pdf')


def top_girls_names_5():
    """Plot name columns 21-25 and save the figure."""
    _plot_name_group(21, 25, 5, 'scatter_top_girls_names5.pdf')


def top_girls_names_6():
    """Plot name columns 26-30 and save the figure."""
    _plot_name_group(26, 30, 6, 'scatter_top_girls_names6.pdf')


def top_girls_names_7():
    """Plot name columns 31-35 and save the figure."""
    _plot_name_group(31, 35, 7, 'scatter_top_girls_names7.pdf')


def top_girls_names_8():
    """Plot name columns 36-40 and save the figure."""
    _plot_name_group(36, 40, 8, 'scatter_top_girls_names8.pdf')


def top_girls_names_9():
    """Plot name columns 41-45 and save the figure."""
    _plot_name_group(41, 45, 9, 'scatter_top_girls_names9.pdf')


def top_girls_names_10():
    """Plot name columns 46-50 and save the figure."""
    _plot_name_group(46, 50, 10, 'scatter_top_girls_names10.pdf')


def top_girls_names_11():
    """Plot name columns 51-55 and save the figure."""
    _plot_name_group(51, 55, 11, 'scatter_top_girls_names11.pdf')


def top_girls_names_12():
    """Plot name columns 56-60 and save the figure."""
    # Starts at 56, not 55: column 55 already belongs to plot 11 and would
    # otherwise be drawn twice.
    _plot_name_group(56, 60, 12, 'scatter_top_girls_names12.pdf')


def top_girls_names_13():
    """Plot name columns 61-65 and save the figure."""
    _plot_name_group(61, 65, 13, 'scatter_top_girls_names13.pdf')


def top_girls_names_14():
    """Plot name columns 66-70 and save the figure."""
    _plot_name_group(66, 70, 14, 'scatter_top_girls_names14.pdf')


def top_girls_names_15():
    """Plot name columns 71-75 and save the figure."""
    _plot_name_group(71, 75, 15, 'scatter_top_girls_names15.pdf')


top_girls_names_1()
top_girls_names_2()
top_girls_names_3()
top_girls_names_4()
top_girls_names_5()
top_girls_names_6()
top_girls_names_7()
top_girls_names_8()
top_girls_names_9()
top_girls_names_10()
top_girls_names_11()
top_girls_names_12()
top_girls_names_13()
top_girls_names_14()
top_girls_names_15()








In [None]:
def _plot_name_group(first_col, last_col, plot_no, save_path=None):
    """Scatter-plot one group of name columns of ``fnames_tidy`` against Year.

    Parameters
    ----------
    first_col, last_col : int
        Positional column indices (both inclusive) of the names to draw.
    plot_no : int
        Number shown in the title ("plot <plot_no> of 30").
    save_path : str, optional
        File name the figure is saved under before it is shown. When omitted
        the figure is only displayed.

    Nothing is drawn (and nothing is saved) when the table has no column in
    the requested range.
    """
    # Clamp the range so a table with fewer columns than expected draws the
    # names that do exist instead of raising IndexError.
    stop = min(last_col + 1, fnames_tidy.shape[1])
    if first_col >= stop:
        return

    plt.style.use('ggplot')
    years = fnames_tidy['Year']
    for col in range(first_col, stop):
        # Label every series with its column name; without labels the legend
        # below would be empty.
        plt.scatter(years, fnames_tidy.iloc[:, col],
                    label=fnames_tidy.columns[col])

    plt.subplots_adjust(left=0.1)
    plt.ylabel('Percent of Names')
    plt.title('Top Girls Names (plot {} of 30)'.format(plot_no))
    plt.legend(loc='best', fontsize='small', markerscale=0.7)
    plt.margins(0.2)
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()


# Plots 16-30 are displayed only; none of them is written to a file.

def top_girls_names_16():
    """Plot name columns 76-80."""
    _plot_name_group(76, 80, 16)


def top_girls_names_17():
    """Plot name columns 81-85."""
    _plot_name_group(81, 85, 17)


def top_girls_names_18():
    """Plot name columns 86-90."""
    _plot_name_group(86, 90, 18)


def top_girls_names_19():
    """Plot name columns 91-95."""
    _plot_name_group(91, 95, 19)


def top_girls_names_20():
    """Plot name columns 96-100."""
    _plot_name_group(96, 100, 20)


def top_girls_names_21():
    """Plot name columns 101-105."""
    _plot_name_group(101, 105, 21)


def top_girls_names_22():
    """Plot name columns 106-110."""
    _plot_name_group(106, 110, 22)


def top_girls_names_23():
    """Plot name columns 111-115."""
    _plot_name_group(111, 115, 23)


def top_girls_names_24():
    """Plot name columns 116-120."""
    _plot_name_group(116, 120, 24)


def top_girls_names_25():
    """Plot name columns 121-125."""
    _plot_name_group(121, 125, 25)


def top_girls_names_26():
    """Plot name columns 126-130."""
    _plot_name_group(126, 130, 26)


def top_girls_names_27():
    """Plot name columns 131-135."""
    _plot_name_group(131, 135, 27)


def top_girls_names_28():
    """Plot name columns 136-140."""
    _plot_name_group(136, 140, 28)


def top_girls_names_29():
    """Plot name columns 141-145."""
    _plot_name_group(141, 145, 29)


def top_girls_names_30():
    """Plot name columns 146-150."""
    _plot_name_group(146, 150, 30)


top_girls_names_16()
top_girls_names_17()
top_girls_names_18()
top_girls_names_19()
top_girls_names_20()
top_girls_names_21()
top_girls_names_22()
top_girls_names_23()
top_girls_names_24()
top_girls_names_25()
top_girls_names_26()
top_girls_names_27()
top_girls_names_28()
top_girls_names_29()
top_girls_names_30()






