In [16]:
import numpy as np
import pandas as pd
from data_loader import load_names_from_web, holdout_split, year_split
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from scipy.stats import kendalltau

In [17]:
# Load the raw names table through the project's data_loader helper.
# NOTE(review): the keyword names suggest national-level data with pre-1937 rows
# hidden and previously downloaded files reused — confirm against data_loader.
dfraw = load_names_from_web(category='national', hide_pre_1937=True, use_existing_files=True)
# Split twice with year_split: first peel off a final holdout set, then a test set.
# NOTE(review): holdout_split is imported above but not used in the visible cells;
# confirm year_split is the intended splitter for both steps.
traintestval, holdout = year_split(dfraw)
trainval, test = year_split(traintestval)

In [18]:
# Show the test split as this cell's output for a quick visual check.
test

Unnamed: 0,state,year,name,M/F,count
0,US,2000,Emily,F,25957
1,US,2000,Hannah,F,23085
2,US,2000,Madison,F,19968
3,US,2000,Ashley,F,17998
4,US,2000,Sarah,F,17712
...,...,...,...,...,...
1932848,US,1937,Wyley,M,5
1932849,US,1937,Ynes,M,5
1932850,US,1937,Yoshito,M,5
1932851,US,1937,Zacarias,M,5


In [19]:
def get_all_known_names(data):
    """Return one row per distinct (state, name, M/F) combination found in ``data``.

    The combinations come out in groupby order (sorted by the key columns) with a
    fresh 0..n-1 index; every other column of ``data`` is dropped.
    """
    key_columns = ['state', 'name', 'M/F']

    # Group sizes are only a vehicle for enumerating the distinct keys;
    # the size column itself is discarded by the final column selection.
    rows_per_key = data.groupby(key_columns).size()
    return rows_per_key.reset_index()[key_columns]

In [20]:
def select_top_names(data, first_year_to_predict, cutoff=10):
    '''
    Keep only the names that reached ``cutoff`` in at least one known year.

    A name qualifies when any of its rows dated before ``first_year_to_predict``
    has a count of at least ``cutoff``; later years are ignored when choosing
    names so the selection cannot leak information from the prediction period.
    Every row of ``data`` (all years) belonging to a qualifying
    (state, name, M/F) combination is returned.

    Possible variation: apply the cutoff to the most recent known year only.
    '''
    key_columns = ['state', 'name', 'M/F']

    is_known_year = data['year'] < first_year_to_predict
    reaches_cutoff = data['count'] >= cutoff
    top_names = data.loc[is_known_year & reaches_cutoff, key_columns].drop_duplicates()

    # Join back onto the full table so each selected name keeps its whole history.
    return top_names.merge(data, how='left', on=key_columns)

In [21]:
def evaluate(predictor, data_held_out, first_year_to_predict, metric='msle'):
    """Score ``predictor`` on every year from ``first_year_to_predict`` onward.

    The predictor only ever sees rows dated before ``first_year_to_predict``.
    For each later year (up to the latest year in ``data_held_out``) the
    selected metric is printed, followed by a side-by-side table of the five
    highest true and predicted counts for each gender.

    Parameters
    ----------
    predictor : object with ``predict(historical_data, years_to_predict)``
        Must return a frame with 'state', 'name', 'M/F', 'year' and 'y' columns.
    data_held_out : DataFrame
        Needs 'state', 'name', 'M/F', 'year' and 'count' columns.
    first_year_to_predict : int
        First year that is hidden from the predictor.
    metric : {'msle', 'rank', 'rank_mae', 'kendalltau'}
        Any other value prints no score; the top-5 table is still displayed.
    """
    merge_keys = ['state', 'name', 'M/F', 'year']

    last_year = data_held_out['year'].max()
    years_to_predict = range(first_year_to_predict, last_year + 1)

    # only allow the model to see data from before the year to predict
    is_history = data_held_out['year'] < first_year_to_predict
    historical_data = data_held_out[is_history]

    predictions = predictor.predict(historical_data, years_to_predict)
    known_names = get_all_known_names(historical_data)

    for target_year in years_to_predict:

        print(f'Predictions for {target_year}:')

        observed = known_names.assign(year=target_year).merge(
            data_held_out, how='left', on=merge_keys)

        # Names with no row in the target year are scored as 2, following
        # FiveThirtyEight: an unlisted count lies between 0 and 4, so 2 on average.
        observed['y'] = observed['count'].fillna(2)

        score_df = observed.merge(
            predictions, how='left', on=merge_keys, suffixes=('_true', '_pred'))

        actual = score_df['y_true']
        predicted = score_df['y_pred']

        if metric == 'msle':
            loss = mean_squared_log_error(actual, predicted)
            print(f'Loss: {loss}')
        elif metric == 'rank':
            n_rows = len(actual)
            total_rank_gap = np.sum(np.abs(actual.rank() - predicted.rank()))
            score = total_rank_gap / (n_rows * (n_rows - 1))
            print(f'Score: {score}')
        elif metric == 'rank_mae':
            loss = mean_absolute_error(actual.rank(), predicted.rank())
            print(f'Loss: {loss}')
        elif metric == 'kendalltau':
            tau, _ = kendalltau(actual.rank(), predicted.rank())
            print(f'Tau: {tau}')

        # Column order of the summary: F true, F predicted, M true, M predicted.
        leaderboards = []
        for gender in ('F', 'M'):
            gender_rows = score_df[score_df['M/F'] == gender]
            for value_column in ('y_true', 'y_pred'):
                ranked = gender_rows[['name', value_column]].sort_values(
                    by=value_column, ascending=False)
                leaderboards.append(ranked.reset_index(drop=True))

        top = pd.concat(leaderboards, axis=1, ignore_index=True)
        display(top.head(5))

In [22]:
class DummyPredictor():
    """Baseline predictor that reuses the last known year's counts.

    ``strategy='naive'`` repeats each name's count from the year before the
    first predicted year (2 when the name has no row that year).
    ``strategy='mean'`` gives every name the mean count of that same year.
    """

    def __init__(self, strategy='naive'):
        self.strategy = strategy

    def predict(self, historical_data, years_to_predict):
        """Return one prediction row per known name and requested year.

        The result has the name keys, 'year' and the predicted value 'y'
        (plus the copied 'count' column for the naive strategy).
        """
        key_columns = ['state', 'name', 'M/F']
        known_names = get_all_known_names(historical_data)

        # Every requested year is predicted from the same reference year.
        reference_year = years_to_predict[0] - 1
        is_reference = historical_data['year'] == reference_year
        reference_rows = historical_data[is_reference].drop(columns=['year'])

        yearly_frames = []
        for target_year in years_to_predict:
            frame = known_names.assign(year=target_year)

            if self.strategy == 'naive':
                frame = frame.merge(reference_rows, how='left', on=key_columns)
                frame['y'] = frame['count'].fillna(2)
            elif self.strategy == 'mean':
                frame['y'] = reference_rows['count'].mean()

            yearly_frames.append(frame)

        return pd.concat(yearly_frames, ignore_index=True)

In [27]:
# Baseline run: score the naive DummyPredictor on the test split with the 'rank' metric.
first_year_to_predict = 2003  # first year hidden from the predictor
cutoff = 100  # minimum pre-2003 count a name needs to be kept by select_top_names
# Unused for the dummy baseline (it has no fit step); kept for reference:
# data_to_fit = select_top_names(trainval, first_year_to_predict=first_year_to_predict, cutoff=0)
data_to_eval = select_top_names(test, first_year_to_predict=first_year_to_predict, cutoff=cutoff)
evaluate(predictor=DummyPredictor(strategy='naive'), data_held_out=data_to_eval, first_year_to_predict=first_year_to_predict, metric='rank')

Predictions for 2003:
Score: 0.027288780081125986


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emily,25691.0,Emily,24468.0,Jacob,29650.0,Jacob,30583.0
1,Emma,22714.0,Madison,21773.0,Michael,27126.0,Michael,28256.0
2,Madison,20201.0,Hannah,18827.0,Joshua,25103.0,Joshua,25997.0
3,Hannah,17638.0,Emma,16554.0,Matthew,24009.0,Matthew,25154.0
4,Olivia,16152.0,Alexis,15636.0,Andrew,22160.0,Ethan,22112.0


Predictions for 2004:
Score: 0.03493179140208993


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emily,25040.0,Emily,24468.0,Jacob,27895.0,Jacob,30583.0
1,Emma,21617.0,Madison,21773.0,Michael,25465.0,Michael,28256.0
2,Madison,20628.0,Hannah,18827.0,Joshua,24210.0,Joshua,25997.0
3,Olivia,16107.0,Emma,16554.0,Matthew,22881.0,Matthew,25154.0
4,Hannah,15610.0,Alexis,15636.0,Ethan,22208.0,Ethan,22112.0


Predictions for 2005:
Score: 0.04152661595169166


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emily,23949.0,Emily,24468.0,Jacob,25843.0,Jacob,30583.0
1,Emma,20353.0,Madison,21773.0,Michael,23826.0,Michael,28256.0
2,Madison,19577.0,Hannah,18827.0,Joshua,23259.0,Joshua,25997.0
3,Abigail,15755.0,Emma,16554.0,Matthew,21477.0,Matthew,25154.0
4,Olivia,15699.0,Alexis,15636.0,Ethan,21317.0,Ethan,22112.0


Predictions for 2006:
Score: 0.04762864552562127


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emily,21410.0,Emily,24468.0,Jacob,24855.0,Jacob,30583.0
1,Emma,19126.0,Madison,21773.0,Michael,22647.0,Michael,28256.0
2,Madison,18632.0,Hannah,18827.0,Joshua,22332.0,Joshua,25997.0
3,Isabella,18228.0,Emma,16554.0,Ethan,20521.0,Matthew,25154.0
4,Ava,16943.0,Alexis,15636.0,Matthew,20335.0,Ethan,22112.0


Predictions for 2007:
Score: 0.05360115785996065


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emily,19359.0,Emily,24468.0,Jacob,24299.0,Jacob,30583.0
1,Isabella,19140.0,Madison,21773.0,Michael,22014.0,Michael,28256.0
2,Emma,18381.0,Hannah,18827.0,Ethan,21034.0,Joshua,25997.0
3,Ava,18053.0,Emma,16554.0,Joshua,20661.0,Matthew,25154.0
4,Madison,17965.0,Alexis,15636.0,Daniel,20258.0,Ethan,22112.0


Predictions for 2008:
Score: 0.060049393309967246


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emma,18823.0,Emily,24468.0,Jacob,22611.0,Jacob,30583.0
1,Isabella,18625.0,Madison,21773.0,Michael,20642.0,Michael,28256.0
2,Emily,17441.0,Hannah,18827.0,Ethan,20220.0,Joshua,25997.0
3,Olivia,17086.0,Emma,16554.0,Joshua,19219.0,Matthew,25154.0
4,Ava,17047.0,Alexis,15636.0,Daniel,19018.0,Ethan,22112.0


Predictions for 2009:
Score: 0.06718221988352223


Unnamed: 0,0,1,2,3,4,5,6,7
0,Isabella,22316.0,Emily,24468.0,Jacob,21196.0,Jacob,30583.0
1,Emma,17908.0,Madison,21773.0,Ethan,19849.0,Michael,28256.0
2,Olivia,17441.0,Hannah,18827.0,Michael,18956.0,Joshua,25997.0
3,Sophia,16948.0,Emma,16554.0,Alexander,18247.0,Matthew,25154.0
4,Ava,15874.0,Alexis,15636.0,William,17925.0,Ethan,22112.0


Predictions for 2010:
Score: 0.07379042851703842


Unnamed: 0,0,1,2,3,4,5,6,7
0,Isabella,22929.0,Emily,24468.0,Jacob,22144.0,Jacob,30583.0
1,Sophia,20652.0,Madison,21773.0,Ethan,18011.0,Michael,28256.0
2,Emma,17357.0,Hannah,18827.0,Michael,17374.0,Joshua,25997.0
3,Olivia,17033.0,Emma,16554.0,Jayden,17192.0,Matthew,25154.0
4,Ava,15438.0,Alexis,15636.0,William,17064.0,Ethan,22112.0


Predictions for 2011:
Score: 0.07965084517469759


Unnamed: 0,0,1,2,3,4,5,6,7
0,Sophia,21856.0,Emily,24468.0,Jacob,20389.0,Jacob,30583.0
1,Isabella,19927.0,Madison,21773.0,Mason,19522.0,Michael,28256.0
2,Emma,18814.0,Hannah,18827.0,William,17354.0,Joshua,25997.0
3,Olivia,17331.0,Emma,16554.0,Jayden,17010.0,Matthew,25154.0
4,Ava,15512.0,Alexis,15636.0,Noah,16876.0,Ethan,22112.0


Predictions for 2012:
Score: 0.08636040406835459


Unnamed: 0,0,1,2,3,4,5,6,7
0,Sophia,22325.0,Emily,24468.0,Jacob,19094.0,Jacob,30583.0
1,Emma,20964.0,Madison,21773.0,Mason,18987.0,Michael,28256.0
2,Isabella,19126.0,Hannah,18827.0,Ethan,17687.0,Joshua,25997.0
3,Olivia,17328.0,Emma,16554.0,Noah,17366.0,Matthew,25154.0
4,Ava,15545.0,Alexis,15636.0,William,16904.0,Ethan,22112.0


In [24]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor

class MyPredictor():
    """Gradient-boosted predictor of next-year counts for each name.

    ``preprocess`` turns a history table into one feature row per
    (state, name, M/F); ``fit`` trains on a window of past years, each paired
    with the features known the year before; ``predict`` rolls forward one year
    at a time, feeding its own predictions back in as assumed history.
    """

    # Columns that together identify one name series.
    _KEYS = ['state', 'name', 'M/F']
    # Features taken from the latest known year's row; 0 when that row is absent.
    _LATEST_YEAR_FEATURES = ['thisyear_count', 'diff', 'shift', 'pct_change', 'accel', 'diff2']

    def __init__(self):
        # Feature set that scored best so far. Adding 'first_letter_1_pct_change'
        # helped in some years only; scaled variants were tried and dropped.
        cols_to_keep = ['this_year', 'M/F', 'sum', 'median_age', 'thisyear_count', 'diff']
        categorical_columns = ['M/F']
        max_leaf_nodes = 16
        max_iter = 100
        loss = 'absolute_error'  # performed better than the default loss

        # HistGradientBoostingRegressor takes a boolean mask aligned with its input columns.
        categorical_mask = [column in categorical_columns for column in cols_to_keep]

        self.pipe = make_pipeline(
            ColumnTransformer(
                transformers=[
                    ('cols_to_keep', 'passthrough', cols_to_keep),
                ], remainder='drop'),
            HistGradientBoostingRegressor(
                random_state=0,
                categorical_features=categorical_mask,
                max_leaf_nodes=max_leaf_nodes,
                max_iter=max_iter,
                loss=loss
            )
        )

        self.gender_encoding = {'M': 0, 'F': 1}

    def preprocess(self, df, latest_known_year):
        """Build one feature row per (state, name, M/F) from the history in ``df``.

        Parameters
        ----------
        df : DataFrame
            Needs 'state', 'name', 'M/F', 'year' and 'count' columns. Not modified.
        latest_known_year : int
            Year treated as "now": ``median_age`` is measured from it and the
            ``thisyear_*`` / diff features are read from its rows.

        Returns
        -------
        DataFrame
            Key columns (with 'M/F' mapped through ``self.gender_encoding``) plus
            median_year, sum, first_letter_1, median_age, thisyear_count, diff,
            shift, pct_change, accel, diff2, this_year, first_letter_1_pct and
            first_letter_1_pct_change.
        """
        keys = self._KEYS
        latest_features = self._LATEST_YEAR_FEATURES

        # sort_values returns a new frame, so the caller's data is never mutated.
        df = df.sort_values(by='year')

        counts = df.groupby(keys)['count']
        df['cumsum'] = counts.cumsum()
        df['sum'] = counts.transform('sum')
        # Row-to-row changes within each name series (rows are in year order).
        df['diff'] = counts.diff()
        df['diff2'] = counts.diff(2)
        df['shift'] = counts.shift()
        df['pct_change'] = counts.pct_change()
        df['accel'] = df.groupby(keys)['diff'].diff()

        df['first_letter_1'] = df['name'].str[0].str.lower()

        # Share of each year's births (per state and gender) going to each first
        # letter, and how that share changed from the previous year.
        total_names_per_year = df.groupby(['year', 'state', 'M/F'])['count'].sum()
        letter_names_per_year = df.groupby(['year', 'state', 'M/F', 'first_letter_1'])['count'].sum()
        percentage_of_total_per_year = (letter_names_per_year / total_names_per_year).rename('first_letter_1_pct')
        percentage_change_per_year = (
            percentage_of_total_per_year
            .groupby(['state', 'M/F', 'first_letter_1'])
            .pct_change()
            .rename('first_letter_1_pct_change')
        )

        # Median birth year of a name: the first year in which the running total
        # reaches half of the name's overall total. The explicit copy lets us add
        # a column without writing into a filtered view of ``df``.
        medians = (
            df[df['cumsum'] >= df['sum'] / 2]
            .drop_duplicates(subset=keys, keep='first')
            .copy()
        )
        medians['median_age'] = latest_known_year - medians['year']
        medians = medians.drop(['count', 'cumsum', 'diff', 'shift', 'pct_change', 'accel', 'diff2'], axis=1)

        thisyear = (
            df.loc[df['year'] == latest_known_year,
                   keys + ['count', 'diff', 'shift', 'pct_change', 'accel', 'diff2']]
            .rename(columns={'count': 'thisyear_count'})
        )

        features = medians.merge(thisyear, how='left', on=keys).rename(columns={'year': 'median_year'})

        # Rough assumption: a name with no row for the latest year has count 0 and
        # also 0 diff/shift/pct_change. Not always true if the prior year had a
        # count, but often true.
        features[latest_features] = features[latest_features].fillna(0)

        # Including the year itself improved accuracy: the model needs to know
        # how far along in time it is.
        features['this_year'] = latest_known_year

        letter_keys_left = ['this_year', 'state', 'M/F', 'first_letter_1']
        letter_keys_right = ['year', 'state', 'M/F', 'first_letter_1']
        features = features.merge(percentage_of_total_per_year, how='left',
                                  left_on=letter_keys_left, right_on=letter_keys_right)
        features = features.merge(percentage_change_per_year, how='left',
                                  left_on=letter_keys_left, right_on=letter_keys_right)

        # Encode M/F as 0/1 so it works with various models (even
        # HistGradientBoostingRegressor, which accepts categorical values, still
        # needs those values to be numbers, not strings).
        features['M/F'] = features['M/F'].map(self.gender_encoding)

        return features

    def fit(self, historical_data, first_year_to_predict, years_to_fit=1, weight_decay=0.9):
        """Train the pipeline on the ``years_to_fit`` years before ``first_year_to_predict``.

        Each training year ``t`` contributes one sample per name: features built
        from rows dated before ``t``, target = the name's count in ``t`` (0 when
        absent). Samples are weighted by ``weight_decay ** (first_year_to_predict - t)``
        so recent years count more.

        Raises
        ------
        ValueError
            If ``years_to_fit`` is smaller than 1 (there would be nothing to train on).
        """
        if years_to_fit < 1:
            raise ValueError('years_to_fit must be at least 1')

        keys = self._KEYS

        # first things first, we don't want to know about future data
        historical_data = historical_data[historical_data['year'] < first_year_to_predict]

        # Collect per-year pieces and concatenate once at the end, instead of
        # growing frames inside the loop from dtype-less empty containers.
        feature_frames = []
        target_series = []

        # each year_to_fit is the year that supplies y for that iteration
        for year_to_fit in range(first_year_to_predict - years_to_fit, first_year_to_predict):

            # the features may only use what was known before year_to_fit
            known = historical_data[historical_data['year'] < year_to_fit]
            features = self.preprocess(known, latest_known_year=year_to_fit - 1)

            targets = (
                historical_data.loc[historical_data['year'] == year_to_fit, keys + ['count']]
                .rename(columns={'count': 'y'})
            )
            targets['M/F'] = targets['M/F'].map(self.gender_encoding)

            labelled = features.merge(targets, how='left', on=keys)

            X = labelled.drop(columns=['y'])
            X['sample_weight'] = weight_decay ** (first_year_to_predict - year_to_fit)

            feature_frames.append(X)
            target_series.append(labelled['y'].fillna(0))

        X_all = pd.concat(feature_frames, ignore_index=True)
        y_all = pd.concat(target_series, ignore_index=True)

        # Show the assembled training table (features, weights and target) in the notebook.
        display(X_all.assign(y=y_all))

        sample_weights = X_all['sample_weight']
        X_all = X_all.drop(columns=['sample_weight'])

        # Step-specific fit parameters are routed with the '<step name>__<param>' convention.
        # See: https://stackoverflow.com/questions/36205850/sklearn-pipeline-applying-sample-weights-after-applying-a-polynomial-feature-t
        self.pipe.fit(X_all, y_all, histgradientboostingregressor__sample_weight=sample_weights)

    def predict(self, historical_data, years_to_predict):
        """Predict counts for each year in ``years_to_predict``, one year at a time.

        After each year the raw predictions are appended to the history as if
        they had been observed, so later years build on earlier predictions.

        Returns
        -------
        DataFrame
            All feature columns plus 'year' and the prediction 'y', with 'M/F'
            decoded back to 'M'/'F'. Predictions below 4.5 are replaced by 2 in
            the returned frame only; the rolled-forward history keeps raw values.
        """
        gender_decoding = {code: label for label, code in self.gender_encoding.items()}

        yearly_predictions = []

        for year_to_predict in years_to_predict:

            features = self.preprocess(historical_data, latest_known_year=year_to_predict - 1)
            features['y'] = self.pipe.predict(features)
            features['year'] = year_to_predict

            # All feature columns are kept in the output; they are useful to inspect.
            yearly_predictions.append(features)

            assumed_new_year = (
                features[['state', 'year', 'name', 'M/F', 'y']]
                .rename(columns={'y': 'count'})
            )
            assumed_new_year['M/F'] = assumed_new_year['M/F'].map(gender_decoding)
            historical_data = pd.concat([historical_data, assumed_new_year], ignore_index=True)

        predictions = pd.concat(yearly_predictions, ignore_index=True)

        # callers expect the original 'M'/'F' labels, so undo the numeric encoding
        predictions['M/F'] = predictions['M/F'].map(gender_decoding)

        predictions.loc[predictions['y'] < 4.5, 'y'] = 2

        display(predictions)

        return predictions

In [26]:
# Fit MyPredictor on the train/validation split, then score it on the test split
# with the same settings as the DummyPredictor baseline cell.
first_year_to_predict = 2003  # first year hidden from the predictor
cutoff = 100  # minimum pre-2003 count a name needs to be kept by select_top_names
my_predictor = MyPredictor()
# Open question: the name selection below uses every year before first_year_to_predict,
# so earlier fitting iterations "know" which names become popular later. Consider
# redoing the selection inside each loop cycle of the fitting process.
data_to_fit = select_top_names(trainval, first_year_to_predict=first_year_to_predict, cutoff=cutoff)
data_to_eval = select_top_names(test, first_year_to_predict=first_year_to_predict, cutoff=cutoff)
# years_to_fit=30 trains on 1973-2002, each year paired with features from the years before it.
my_predictor.fit(historical_data=data_to_fit, first_year_to_predict=first_year_to_predict, years_to_fit=30)
evaluate(predictor=my_predictor, data_held_out=data_to_eval, first_year_to_predict=first_year_to_predict, metric='rank')

Unnamed: 0,state,name,M/F,median_year,sum,first_letter_1,median_age,thisyear_count,diff,shift,pct_change,accel,diff2,this_year,first_letter_1_pct,first_letter_1_pct_change,sample_weight,y
0,US,Payne,0,1939,5,p,33,0.0,0.0,0.0,0.000000,0.0,0.0,1972,0.025762,-0.054110,0.042391,0.0
1,US,Shelba,1,1940,1917,s,32,7.0,0.0,7.0,0.000000,1.0,-1.0,1972,0.099260,-0.016319,0.042391,5.0
2,US,Memphis,0,1940,24,m,32,0.0,0.0,0.0,0.000000,0.0,0.0,1972,0.098125,0.005348,0.042391,0.0
3,US,Shelva,1,1940,1010,s,32,8.0,2.0,6.0,0.333333,4.0,0.0,1972,0.099260,-0.016319,0.042391,0.0
4,US,Harlie,1,1940,5,h,32,0.0,0.0,0.0,0.000000,0.0,0.0,1972,0.021586,0.111901,0.042391,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159401,US,Imanol,0,2001,278,i,0,169.0,120.0,49.0,2.448980,122.0,118.0,2001,0.018344,0.090769,0.900000,100.0
159402,US,Jaheim,0,2001,564,j,0,554.0,544.0,10.0,54.400000,0.0,0.0,2001,0.178923,-0.000809,0.900000,752.0
159403,US,Jaheem,0,2001,208,j,0,131.0,122.0,9.0,13.555556,118.0,126.0,2001,0.178923,-0.000809,0.900000,182.0
159404,US,Jahiem,0,2001,155,j,0,155.0,0.0,0.0,0.000000,0.0,0.0,2001,0.178923,-0.000809,0.900000,235.0


Unnamed: 0,state,name,M/F,median_year,sum,first_letter_1,median_age,thisyear_count,diff,shift,pct_change,accel,diff2,this_year,first_letter_1_pct,first_letter_1_pct_change,y,year
0,US,Shelva,F,1940,1028.000000,s,62,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2002,0.082283,-0.017463,2.000000,2003
1,US,Shelba,F,1941,1964.000000,s,61,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2002,0.082283,-0.017463,2.000000,2003
2,US,Melvyn,M,1942,3100.000000,m,60,18.000000,6.000000,12.000000,0.500000,2.000000,10.000000,2002,0.069485,-0.029089,14.187686,2003
3,US,Dick,M,1944,14088.000000,d,58,6.000000,0.000000,6.000000,0.000000,0.000000,0.000000,2002,0.077368,-0.018639,4.769475,2003
4,US,Shirley,M,1944,3411.000000,s,58,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2002,0.042288,-0.026673,2.000000,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56085,US,Omarion,M,2007,6430.678760,o,4,676.408689,1.112580,675.296109,0.001648,0.611602,1.613557,2011,0.008798,0.000010,677.160867,2012
56086,US,Yahir,M,2007,5514.921815,y,4,578.055233,1.864757,576.190475,0.003236,0.292101,3.437413,2011,0.001118,-0.000414,578.055233,2012
56087,US,Amarion,M,2007,3407.414264,a,4,339.310133,-3.256607,342.566741,-0.009506,-1.007580,-5.505634,2011,0.102741,-0.000917,338.770867,2012
56088,US,Bode,M,2007,1846.338820,b,4,209.864690,0.170796,209.693894,0.000815,1.739791,-1.398199,2011,0.055938,-0.001871,209.480420,2012


Predictions for 2003:
Score: 0.026429941537304144


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emily,25691.0,Emily,25172.075006,Jacob,29650.0,Ethan,25505.557681
1,Emma,22714.0,Madison,25172.075006,Michael,27126.0,Joshua,25182.685787
2,Madison,20201.0,Emma,17433.909035,Joshua,25103.0,Joseph,25182.111052
3,Hannah,17638.0,Olivia,17190.282175,Matthew,24009.0,Andrew,25175.313704
4,Olivia,16152.0,Abigail,17190.282175,Andrew,22160.0,Jacob,25175.313704


Predictions for 2004:
Score: 0.03562770018329519


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emily,25040.0,Madison,25505.459652,Jacob,27895.0,Joseph,25505.660521
1,Emma,21617.0,Emily,25407.765326,Michael,25465.0,Ethan,25505.557681
2,Madison,20628.0,Abigail,17435.014844,Joshua,24210.0,Andrew,25505.557681
3,Olivia,16107.0,Alexis,17433.562155,Matthew,22881.0,Matthew,25184.786971
4,Hannah,15610.0,Olivia,17433.562155,Ethan,22208.0,Joshua,25175.313704


Predictions for 2005:
Score: 0.04076747452700457


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emily,23949.0,Madison,25331.963817,Jacob,25843.0,Joseph,25335.305355
1,Emma,20353.0,Emily,25176.543259,Michael,23826.0,Andrew,25335.202514
2,Madison,19577.0,Elizabeth,17191.470858,Joshua,23259.0,Matthew,25184.889812
3,Abigail,15755.0,Alexis,17188.829486,Matthew,21477.0,Michael,25182.920375
4,Olivia,15699.0,Samantha,17187.723678,Ethan,21317.0,Jacob,25182.685787


Predictions for 2006:
Score: 0.04678188762761295


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emily,21410.0,Emily,25179.44709,Jacob,24855.0,Michael,25185.02156
1,Emma,19126.0,Madison,25179.44709,Michael,22647.0,Joshua,25184.786971
2,Madison,18632.0,Hannah,17030.065942,Joshua,22332.0,Jacob,25184.786971
3,Isabella,18228.0,Elizabeth,16989.357971,Ethan,20521.0,Matthew,25182.788627
4,Ava,16943.0,Alexis,16987.683597,Matthew,20335.0,Ethan,25182.685787


Predictions for 2007:
Score: 0.05178813904390971


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emily,19359.0,Madison,25179.44709,Jacob,24299.0,Michael,25185.653182
1,Isabella,19140.0,Emily,25179.44709,Michael,22014.0,Matthew,25182.788627
2,Emma,18381.0,Hannah,17018.520085,Ethan,21034.0,Jacob,25182.685787
3,Ava,18053.0,Abigail,17016.027859,Joshua,20661.0,Joshua,25182.685787
4,Madison,17965.0,Olivia,17016.027859,Daniel,20258.0,Ethan,25182.685787


Predictions for 2008:
Score: 0.05905413884197218


Unnamed: 0,0,1,2,3,4,5,6,7
0,Emma,18823.0,Emily,25179.44709,Jacob,22611.0,Michael,25185.653182
1,Isabella,18625.0,Madison,25179.44709,Michael,20642.0,Joseph,25182.920375
2,Emily,17441.0,Abigail,17031.17175,Ethan,20220.0,Matthew,25182.788627
3,Olivia,17086.0,Olivia,17031.17175,Joshua,19219.0,Andrew,25182.788627
4,Ava,17047.0,Emma,17025.687305,Daniel,19018.0,Joshua,25182.685787


Predictions for 2009:
Score: 0.06433020194516201


Unnamed: 0,0,1,2,3,4,5,6,7
0,Isabella,22316.0,Madison,25179.44709,Jacob,21196.0,Michael,25185.653182
1,Emma,17908.0,Emily,25179.44709,Ethan,19849.0,Joseph,25182.920375
2,Olivia,17441.0,Elizabeth,17034.209571,Michael,18956.0,Matthew,25182.788627
3,Sophia,16948.0,Sarah,17024.684337,Alexander,18247.0,Andrew,25182.788627
4,Ava,15874.0,Ashley,17024.581497,William,17925.0,Joshua,25182.685787


Predictions for 2010:
Score: 0.07226581922419872


Unnamed: 0,0,1,2,3,4,5,6,7
0,Isabella,22929.0,Emily,25179.44709,Jacob,22144.0,Michael,25185.653182
1,Sophia,20652.0,Madison,25179.44709,Ethan,18011.0,Joseph,25182.920375
2,Emma,17357.0,Elizabeth,17023.027742,Michael,17374.0,Matthew,25182.788627
3,Olivia,17033.0,Sarah,17018.622925,Jayden,17192.0,Joshua,25182.788627
4,Ava,15438.0,Ashley,17018.520085,William,17064.0,Andrew,25182.788627


Predictions for 2011:
Score: 0.07712176197363672


Unnamed: 0,0,1,2,3,4,5,6,7
0,Sophia,21856.0,Madison,25179.44709,Jacob,20389.0,Michael,25185.653182
1,Isabella,19927.0,Emily,25179.44709,Mason,19522.0,Joseph,25182.920375
2,Emma,18814.0,Elizabeth,17020.535516,William,17354.0,Andrew,25182.788627
3,Olivia,17331.0,Hannah,17016.4189,Jayden,17010.0,Joshua,25182.788627
4,Ava,15512.0,Sarah,17016.130699,Noah,16876.0,Matthew,25182.788627


Predictions for 2012:
Score: 0.08612464072795174


Unnamed: 0,0,1,2,3,4,5,6,7
0,Sophia,22325.0,Madison,25179.44709,Jacob,19094.0,Michael,25185.653182
1,Emma,20964.0,Emily,25179.44709,Mason,18987.0,Joseph,25185.653182
2,Isabella,19126.0,Elizabeth,17020.535516,Ethan,17687.0,Joshua,25182.788627
3,Olivia,17328.0,Olivia,17016.4189,Noah,17366.0,Andrew,25182.788627
4,Ava,15545.0,Emma,17016.4189,William,16904.0,Matthew,25182.788627
