In [1]:
import numpy as np
import pandas as pd
from data_loader import load_names_from_web, holdout_split
from sklearn.metrics import mean_squared_log_error

In [2]:
dfraw = load_names_from_web(category='national', hide_pre_1937=True, use_existing_files=True)
traintestval, holdout = holdout_split(dfraw)
trainval, test = holdout_split(traintestval)

In [3]:
test

Unnamed: 0,state,year,name,M/F,count
0,US,2000,Madison,F,19968
1,US,2000,Olivia,F,12854
2,US,2000,Sydney,F,10244
3,US,2000,Jennifer,F,9389
4,US,2000,Amanda,F,8557
...,...,...,...,...,...
212633,US,2019,Zaheen,M,5
212634,US,2019,Zahi,M,5
212635,US,2019,Zaymar,M,5
212636,US,2019,Zeo,M,5


In [4]:
def get_all_known_names(data):

    names = data.groupby(['state', 'name', 'M/F']).size().reset_index()
    names = names[['state', 'name', 'M/F']]
    return names

In [5]:
def select_top_names(data, first_year_to_predict, cutoff=10):
    '''
    Select names that have had at least one year at or above the cutoff count?
    Could also try to redo this to do the cutoff for the most recent known year's data
    '''

    data_noleak = data[data['year'] < first_year_to_predict]
    names = data_noleak[data_noleak['count'] >= cutoff][['state', 'name', 'M/F']].drop_duplicates()
    # display(names)
    data = names.merge(data, how='left', on=['state', 'name', 'M/F'])
    # display(data)

    return data

In [6]:
def evaluate(predictor, data_held_out, first_year_to_predict, metric='msle'):

    # display('data_held_out:')
    # display(data_held_out)

    most_recent_year = data_held_out['year'].max()
    
    years_to_predict = range(first_year_to_predict, most_recent_year+1)

    # only allow the model to see data from before the year to predict
    historical_data = data_held_out[data_held_out['year'] < first_year_to_predict]

    # display('historical_data:')
    # display(historical_data)

    # get our model's predictions
    predictions = predictor.predict(historical_data, years_to_predict)

    all_known_names = get_all_known_names(historical_data)

    for year_to_predict in years_to_predict:

        print(f'Predictions for {year_to_predict}:')

        names_to_predict = all_known_names.copy()
        names_to_predict['year'] = year_to_predict
        # display(names_to_predict)

        observed = names_to_predict.merge(data_held_out, how='left', on=['state', 'name', 'M/F', 'year'])

        # for now, fill in missing values with 2, same as FiveThirtyEight did;
        # reasoning: missing values could be 0 to 4, so average is 2
        observed['y'] = observed['count'].fillna(2)
        # observed = observed.rename(columns={'count': 'count_true'})
        # display(observed)

        score_df = observed.merge(predictions, how='left', on=['state', 'name', 'M/F', 'year'], suffixes=('_true', '_pred'))
        # display(score_df)

        y_true = score_df['y_true']
        y_pred = score_df['y_pred']

        if metric == 'msle':
            loss = mean_squared_log_error(y_true, y_pred)
            print(f'Loss: {loss}')

        if metric == 'rank':
            y_true = y_true.rank()
            y_pred = y_pred.rank()
            score = np.sum(np.abs(y_true-y_pred))/(len(y_true)*(len(y_true)-1))
            print(f'Score: {score}')
        
        top_F_true = score_df[score_df['M/F'] == 'F'][['name', 'y_true']].sort_values(by='y_true', ascending=False).reset_index(drop=True)
        top_F_pred = score_df[score_df['M/F'] == 'F'][['name', 'y_pred']].sort_values(by='y_pred', ascending=False).reset_index(drop=True)
        top_M_true = score_df[score_df['M/F'] == 'M'][['name', 'y_true']].sort_values(by='y_true', ascending=False).reset_index(drop=True)
        top_M_pred = score_df[score_df['M/F'] == 'M'][['name', 'y_pred']].sort_values(by='y_pred', ascending=False).reset_index(drop=True)
        top = pd.concat([top_F_true, top_F_pred, top_M_true, top_M_pred], axis=1, ignore_index=True)
        display(top.head(5))

In [7]:
class DummyPredictor():

    def __init__(self, strategy='naive'):
        self.strategy = strategy
    
    def predict(self, historical_data, years_to_predict):

        all_known_names = get_all_known_names(historical_data)

        predictions = []
        previous_year_data = historical_data[historical_data['year'] == years_to_predict[0] - 1].drop(columns=['year'])

        for year_to_predict in years_to_predict:

            prediction = all_known_names.copy()
            prediction['year'] = year_to_predict

            if self.strategy == 'naive':
                prediction = prediction.merge(previous_year_data, how='left', on=['state', 'name', 'M/F'])
                prediction['y'] = prediction['count'].fillna(2)
                # display(prediction)
            elif self.strategy == 'mean':
                prediction['y'] = previous_year_data['count'].mean()
                # display(prediction)

            predictions.append(prediction)

        predictions = pd.concat(predictions, ignore_index=True)
        # display(predictions)

        return predictions

In [8]:
first_year_to_predict = 2003
cutoff = 100
# data_to_fit = select_top_names(trainval, first_year_to_predict=first_year_to_predict, cutoff=0)
data_to_eval = select_top_names(test, first_year_to_predict=first_year_to_predict, cutoff=cutoff)
evaluate(predictor=DummyPredictor(strategy='naive'), data_held_out=data_to_eval, first_year_to_predict=first_year_to_predict, metric='rank')

Predictions for 2003:
Score: 0.03204052376706701


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,20201.0,Madison,21773.0,Michael,27126.0,Michael,28256.0
1,Olivia,16152.0,Olivia,14630.0,Ethan,21361.0,Ethan,22112.0
2,Sydney,8911.0,Sydney,9034.0,Anthony,20149.0,Anthony,19590.0
3,Hailey,8797.0,Jennifer,8539.0,David,18631.0,David,18697.0
4,Jennifer,7566.0,Hailey,7992.0,Zachary,15540.0,Zachary,16622.0


Predictions for 2004:
Score: 0.040194518743844804


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,20628.0,Madison,21773.0,Michael,25465.0,Michael,28256.0
1,Olivia,16107.0,Olivia,14630.0,Ethan,22208.0,Ethan,22112.0
2,Hailey,8685.0,Sydney,9034.0,Anthony,19612.0,Anthony,19590.0
3,Sydney,8368.0,Jennifer,8539.0,David,18400.0,David,18697.0
4,Jennifer,6545.0,Hailey,7992.0,Zachary,13711.0,Zachary,16622.0


Predictions for 2005:
Score: 0.05132340343700285


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,19577.0,Madison,21773.0,Michael,23826.0,Michael,28256.0
1,Olivia,15699.0,Olivia,14630.0,Ethan,21317.0,Ethan,22112.0
2,Hailey,8592.0,Sydney,9034.0,Anthony,19383.0,Anthony,19590.0
3,Sydney,7376.0,Jennifer,8539.0,David,18133.0,David,18697.0
4,Savannah,6484.0,Hailey,7992.0,Logan,13473.0,Zachary,16622.0


Predictions for 2006:
Score: 0.058113785889524074


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,18632.0,Madison,21773.0,Michael,22647.0,Michael,28256.0
1,Olivia,15505.0,Olivia,14630.0,Ethan,20521.0,Ethan,22112.0
2,Hailey,8423.0,Sydney,9034.0,Anthony,19411.0,Anthony,19590.0
3,Savannah,7096.0,Jennifer,8539.0,David,17653.0,David,18697.0
4,Sydney,6516.0,Hailey,7992.0,Logan,15150.0,Zachary,16622.0


Predictions for 2007:
Score: 0.06422673263287938


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,17965.0,Madison,21773.0,Michael,22014.0,Michael,28256.0
1,Olivia,16587.0,Olivia,14630.0,Ethan,21034.0,Ethan,22112.0
2,Hailey,8437.0,Sydney,9034.0,Anthony,19633.0,Anthony,19590.0
3,Savannah,7050.0,Jennifer,8539.0,David,17556.0,David,18697.0
4,Sydney,6030.0,Hailey,7992.0,Logan,15329.0,Zachary,16622.0


Predictions for 2008:
Score: 0.07240549407644388


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17086.0,Madison,21773.0,Michael,20642.0,Michael,28256.0
1,Madison,17043.0,Olivia,14630.0,Ethan,20220.0,Ethan,22112.0
2,Hailey,7869.0,Sydney,9034.0,Anthony,18409.0,Anthony,19590.0
3,Savannah,6129.0,Jennifer,8539.0,David,16338.0,David,18697.0
4,Gabriella,6036.0,Hailey,7992.0,Logan,13859.0,Zachary,16622.0


Predictions for 2009:
Score: 0.080627961049632


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17441.0,Madison,21773.0,Ethan,19849.0,Michael,28256.0
1,Madison,15249.0,Olivia,14630.0,Michael,18956.0,Ethan,22112.0
2,Hailey,7320.0,Sydney,9034.0,Anthony,16321.0,Anthony,19590.0
3,Gabriella,6150.0,Jennifer,8539.0,David,15450.0,David,18697.0
4,Savannah,5602.0,Hailey,7992.0,Logan,14463.0,Zachary,16622.0


Predictions for 2010:
Score: 0.08712843015564996


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17033.0,Madison,21773.0,Ethan,18011.0,Michael,28256.0
1,Madison,13191.0,Olivia,14630.0,Michael,17374.0,Ethan,22112.0
2,Hailey,7024.0,Sydney,9034.0,Anthony,15503.0,Anthony,19590.0
3,Gabriella,6182.0,Jennifer,8539.0,David,14215.0,David,18697.0
4,Evelyn,5840.0,Hailey,7992.0,Logan,14027.0,Zachary,16622.0


Predictions for 2011:
Score: 0.09472299435324558


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17331.0,Madison,21773.0,Michael,16815.0,Michael,28256.0
1,Madison,12399.0,Olivia,14630.0,Ethan,16693.0,Ethan,22112.0
2,Evelyn,6708.0,Sydney,9034.0,Anthony,14283.0,Anthony,19590.0
3,Zoey,6403.0,Jennifer,8539.0,David,13244.0,David,18697.0
4,Hailey,6281.0,Hailey,7992.0,Logan,12996.0,Zachary,16622.0


Predictions for 2012:
Score: 0.10091606790091082


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17328.0,Madison,21773.0,Ethan,17687.0,Michael,28256.0
1,Madison,11410.0,Olivia,14630.0,Michael,16202.0,Ethan,22112.0
2,Zoey,7476.0,Sydney,9034.0,Anthony,13215.0,Anthony,19590.0
3,Evelyn,6877.0,Jennifer,8539.0,David,12553.0,David,18697.0
4,Hailey,5914.0,Hailey,7992.0,Logan,12485.0,Zachary,16622.0


Predictions for 2013:
Score: 0.10854122596924297


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18446.0,Madison,21773.0,Ethan,16254.0,Michael,28256.0
1,Madison,10602.0,Olivia,14630.0,Michael,15530.0,Ethan,22112.0
2,Evelyn,7668.0,Sydney,9034.0,David,12381.0,Anthony,19590.0
3,Zoey,7250.0,Jennifer,8539.0,Logan,12360.0,David,18697.0
4,Savannah,5252.0,Hailey,7992.0,Anthony,12287.0,Zachary,16622.0


Predictions for 2014:
Score: 0.11577303427096262


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19829.0,Madison,21773.0,Ethan,15737.0,Michael,28256.0
1,Madison,10328.0,Olivia,14630.0,Michael,15487.0,Ethan,22112.0
2,Evelyn,8750.0,Sydney,9034.0,Logan,13661.0,Anthony,19590.0
3,Zoey,7430.0,Jennifer,8539.0,David,12243.0,David,18697.0
4,Savannah,5485.0,Hailey,7992.0,Anthony,11597.0,Zachary,16622.0


Predictions for 2015:
Score: 0.1199877041776659


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19717.0,Madison,21773.0,Ethan,15095.0,Michael,28256.0
1,Madison,10099.0,Olivia,14630.0,Michael,14474.0,Ethan,22112.0
2,Evelyn,9370.0,Sydney,9034.0,Logan,12934.0,Anthony,19590.0
3,Zoey,6964.0,Jennifer,8539.0,David,11821.0,David,18697.0
4,Savannah,5465.0,Hailey,7992.0,Anthony,10650.0,Zachary,16622.0


Predictions for 2016:
Score: 0.1251303881633771


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19397.0,Madison,21773.0,Michael,14120.0,Michael,28256.0
1,Evelyn,10115.0,Olivia,14630.0,Ethan,13843.0,Ethan,22112.0
2,Madison,9047.0,Sydney,9034.0,Logan,11268.0,Anthony,19590.0
3,Zoey,6472.0,Jennifer,8539.0,David,11142.0,David,18697.0
4,Savannah,5383.0,Hailey,7992.0,Owen,10301.0,Zachary,16622.0


Predictions for 2017:
Score: 0.13105540112935088


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18763.0,Madison,21773.0,Logan,14042.0,Michael,28256.0
1,Evelyn,10741.0,Olivia,14630.0,Michael,12700.0,Ethan,22112.0
2,Madison,7901.0,Sydney,9034.0,Ethan,12471.0,Anthony,19590.0
3,Zoey,6069.0,Jennifer,8539.0,David,10222.0,David,18697.0
4,Savannah,5252.0,Hailey,7992.0,Owen,9360.0,Zachary,16622.0


Predictions for 2018:
Score: 0.13403029084573115


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18039.0,Madison,21773.0,Logan,12427.0,Michael,28256.0
1,Evelyn,10427.0,Olivia,14630.0,Ethan,11918.0,Ethan,22112.0
2,Madison,7084.0,Sydney,9034.0,Michael,11729.0,Anthony,19590.0
3,Zoey,5937.0,Jennifer,8539.0,David,9795.0,David,18697.0
4,Ellie,5216.0,Hailey,7992.0,Owen,9330.0,Zachary,16622.0


Predictions for 2019:
Score: 0.13911033023898184


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18549.0,Madison,21773.0,Ethan,11291.0,Michael,28256.0
1,Evelyn,10433.0,Olivia,14630.0,Michael,10589.0,Ethan,22112.0
2,Madison,6290.0,Sydney,9034.0,Logan,10551.0,Anthony,19590.0
3,Zoey,5676.0,Jennifer,8539.0,Owen,9341.0,David,18697.0
4,Ellie,5427.0,Hailey,7992.0,Mateo,9011.0,Zachary,16622.0


Predictions for 2020:
Score: 0.1412052819589401


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17664.0,Madison,21773.0,Michael,9805.0,Michael,28256.0
1,Evelyn,9503.0,Olivia,14630.0,Ethan,9533.0,Ethan,22112.0
2,Madison,6238.0,Sydney,9034.0,Logan,9140.0,Anthony,19590.0
3,Ellie,5592.0,Jennifer,8539.0,Mateo,8990.0,David,18697.0
4,Zoey,5373.0,Hailey,7992.0,Owen,8658.0,Zachary,16622.0


Predictions for 2021:
Score: 0.1478150148890171


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17798.0,Madison,21773.0,Mateo,9159.0,Michael,28256.0
1,Evelyn,9475.0,Olivia,14630.0,Michael,9103.0,Ethan,22112.0
2,Madison,5952.0,Sydney,9034.0,Ethan,8842.0,Anthony,19590.0
3,Ellie,5850.0,Jennifer,8539.0,Logan,8816.0,David,18697.0
4,Zoey,5197.0,Hailey,7992.0,Owen,8755.0,Zachary,16622.0


Predictions for 2022:
Score: 0.15419310851208895


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,16573.0,Madison,21773.0,Mateo,10321.0,Michael,28256.0
1,Evelyn,9289.0,Olivia,14630.0,Michael,8829.0,Ethan,22112.0
2,Ellie,5761.0,Sydney,9034.0,Owen,8546.0,Anthony,19590.0
3,Madison,5479.0,Jennifer,8539.0,Ethan,8271.0,David,18697.0
4,Stella,4830.0,Hailey,7992.0,David,7741.0,Zachary,16622.0


In [23]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor

class MyPredictor():

    def __init__(self):
        
        # params
        cols_to_keep = ['this_year', 'M/F', 'sum', 'median_age', 'thisyear_count', 'diff', 'shift', 'pct_change']
        # cols_to_keep = ['this_year', 'M/F', 'sum_scaled', 'median_age_scaled', 'thisyear_count_scaled']
        categorical_features = ['M/F']
        max_leaf_nodes = 16 # 16
        max_iter = 100 # 100
        loss = 'absolute_error' # abs better than default

        categorical_features = [True if f in categorical_features else False for f in cols_to_keep]
        # print(categorical_features)
        
        self.pipe = make_pipeline(
            ColumnTransformer(
                transformers=[
                    # ('category_encoder', LabelEncoder(), categorical_features),
                    ('cols_to_keep', 'passthrough', cols_to_keep),
                ], remainder='drop'),
            HistGradientBoostingRegressor(
                random_state=0,
                categorical_features=categorical_features,
                max_leaf_nodes=max_leaf_nodes,
                max_iter=max_iter,
                loss=loss
            )
        )

        self.gender_encoding = {'M': 0, 'F': 1}

    def preprocess(self, df, this_year, latest_known_year):

        # find median age of people with name, 
        # total born with that name,
        # and latest year's count

        df = df.copy()
        df = df.sort_values(by='year')
        df['cumsum'] = df.groupby(['state', 'name', 'M/F'])['count'].cumsum()
        df['sum'] = df.groupby(['state', 'name', 'M/F'])['count'].transform('sum')
        # display(df[(df['name'] == 'Millie') & (df['M/F'] == 'M')])

        df['diff'] = df.groupby(['state', 'name', 'M/F'])['count'].diff()
        df['shift'] = df.groupby(['state', 'name', 'M/F'])['count'].shift()
        df['pct_change'] = df.groupby(['state', 'name', 'M/F'])['count'].pct_change()

        medians = df[df['cumsum'] >= df['sum']/2]
        medians = medians.drop_duplicates(subset=['state', 'name', 'M/F'], keep='first')
        medians['median_age'] = latest_known_year - medians['year']
        # display(medians[medians['name'] == 'Madison'])

        medians = medians.drop(['count', 'cumsum', 'diff', 'shift', 'pct_change'], axis=1)

        thisyear = df[df['year'] == latest_known_year][['state', 'name', 'M/F', 'count', 'diff', 'shift', 'pct_change']].rename(columns={'count': 'thisyear_count'})
        df2 = medians.merge(thisyear, how='left', on=['state', 'name', 'M/F']).rename(columns={'year': 'median_year'})
        df2['thisyear_count'] = df2['thisyear_count'].fillna(0) # might want to shift this to 2 and fill in 2s for missing years? or maybe not
        # display(df2)
        # display(df2.groupby(['state','name','M/F']).ngroups)

        # change M/F to 0/1 so it works with various models
        # (even HistGradientBoostingRegressor, which accepts categorical values,
        # still needs those values to be numbers not strings)
        df2['M/F'] = df2['M/F'].map(self.gender_encoding)

        df2['this_year'] = this_year # adding this does seem to improve accuracy. we need to know how far along we are in time

        

        '''
        # SCALING
        scaler = preprocessing.MinMaxScaler((0, 1000000))
        columns_scaled = ['sum', 'median_age', 'thisyear_count']
        scaled = scaler.fit_transform(df2[columns_scaled])
        scaled = pd.DataFrame(scaled, columns=[f'{column}_scaled' for column in columns_scaled])
        df3 = pd.concat([df2, scaled], axis=1)
        '''

        return df2
    
    def fit(self, historical_data, first_year_to_predict, years_to_fit=1, weight_decay=0.9):
        # first things first, we don't want to know about future data
        historical_data = historical_data[historical_data['year'] < first_year_to_predict]
        # at this point the data we don't want to know should be inaccessible

        X_all = pd.DataFrame()
        y_all = pd.Series()

        # each year_to_fit is the year that's essentially our y for that loop
        for year_to_fit in range(first_year_to_predict - years_to_fit, first_year_to_predict):

            # now we "know" even less for X
            X = historical_data[historical_data['year'] < year_to_fit]
            y = historical_data[historical_data['year'] == year_to_fit]

            X = self.preprocess(X, this_year=year_to_fit, latest_known_year=year_to_fit - 1)
            y = y[['state', 'name', 'M/F', 'count']].rename(columns={'count': 'y'})
            y['M/F'] = y['M/F'].map(self.gender_encoding)

            data = X.merge(y, how='left', on=['state', 'name', 'M/F'])
            data['y'] = data['y'].fillna(0)
            # display(data)

            X = data.drop(columns=['y'])
            y = data['y']
            X['sample_weight'] = weight_decay ** (first_year_to_predict - year_to_fit)

            X_all = pd.concat([X_all, X], ignore_index=True)
            y_all = pd.concat([y_all, y], ignore_index=True)
        
        temp = X_all.copy()
        temp['y'] = y_all
        display(temp)

        sample_weights = X_all['sample_weight']
        X_all = X_all.drop(columns=['sample_weight'])

        self.pipe.fit(X_all, y_all, **{'histgradientboostingregressor__sample_weight': sample_weights})
        # this seems like a silly way to pass params to individual steps of the pipeline, but it's true. See: https://stackoverflow.com/questions/36205850/sklearn-pipeline-applying-sample-weights-after-applying-a-polynomial-feature-t

    def predict(self, historical_data, years_to_predict):

        # all_known_names = get_all_known_names(historical_data)

        predictions = []

        # display('historical_data in predict:')
        # display(historical_data)
        # display('years_to_predict:')
        # display(years_to_predict)

        for year_to_predict in years_to_predict:

            # display('historical_data in predict loop:')
            # display(historical_data)

            df = self.preprocess(historical_data, this_year=year_to_predict - 1, latest_known_year=year_to_predict - 1)
            # df = self.preprocess(historical_data, years_to_predict[0] - 1)

            df['y'] = self.pipe.predict(df)

            df['year'] = year_to_predict
            # display(df)

            # if we want to simply, do the following; 
            # but for now, might be useful to see all data displayed.
            # df = df[['state', 'year', 'name', 'M/F', 'y']]

            predictions.append(df)

            assumed_new_year_of_historical_data = df[['state', 'year', 'name', 'M/F', 'y']].rename(columns={'y': 'count'})
            assumed_new_year_of_historical_data['M/F'] = assumed_new_year_of_historical_data['M/F'].map({v: k for k, v in self.gender_encoding.items()})
            historical_data = pd.concat([historical_data, assumed_new_year_of_historical_data], ignore_index=True)

        predictions = pd.concat(predictions, ignore_index=True)

        # we have to reverse the mapping to send our predictions
        # (at least the way we currently have it set up)
        predictions['M/F'] = predictions['M/F'].map({v: k for k, v in self.gender_encoding.items()})

        predictions.loc[predictions['y'] < 4.5, 'y'] = 2

        display(predictions)
        # display(predictions[predictions['y'] < 4.5])

        return predictions

In [24]:
first_year_to_predict = 2003
cutoff = 100
my_predictor = MyPredictor()
data_to_fit = select_top_names(trainval, first_year_to_predict=first_year_to_predict, cutoff=cutoff)
data_to_eval = select_top_names(test, first_year_to_predict=first_year_to_predict, cutoff=cutoff)
my_predictor.fit(historical_data=data_to_fit, first_year_to_predict=first_year_to_predict, years_to_fit=30)
evaluate(predictor=my_predictor, data_held_out=data_to_eval, first_year_to_predict=first_year_to_predict, metric='rank')

Unnamed: 0,state,name,M/F,median_year,sum,median_age,thisyear_count,diff,shift,pct_change,this_year,sample_weight,y
0,US,Cloe,1,1940,42,32,0.0,,,,1973,0.042391,0.0
1,US,Harlie,1,1940,5,32,0.0,,,,1973,0.042391,0.0
2,US,Gracey,1,1941,12,31,0.0,,,,1973,0.042391,0.0
3,US,Rowan,1,1942,7,30,0.0,,,,1973,0.042391,0.0
4,US,Melvyn,0,1942,2811,30,7.0,-11.0,18.0,-0.611111,1973,0.042391,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
93592,US,Adamari,1,2001,240,0,130.0,85.0,45.0,1.888889,2002,0.900000,168.0
93593,US,Maximus,0,2001,1114,0,892.0,704.0,188.0,3.744681,2002,0.900000,912.0
93594,US,Jersey,1,2001,88,0,61.0,55.0,6.0,9.166667,2002,0.900000,110.0
93595,US,Dasani,1,2001,161,0,94.0,36.0,58.0,0.620690,2002,0.900000,105.0


Unnamed: 0,state,name,M/F,median_year,sum,median_age,thisyear_count,diff,shift,pct_change,this_year,y,year
0,US,Gertrude,F,1945,17407.000000,57,11.000000,1.000000,10.000000,0.100000,2002,9.416433,2003
1,US,Flossie,F,1946,2988.000000,56,0.000000,,,,2002,2.000000,2003
2,US,Eula,F,1946,9796.000000,56,0.000000,,,,2002,2.000000,2003
3,US,Olive,F,1946,3510.000000,56,43.000000,6.000000,37.000000,0.162162,2002,37.117296,2003
4,US,Marlys,F,1946,5565.000000,56,0.000000,,,,2002,2.000000,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16575,US,Anaiya,F,2012,4355.477779,9,231.088364,-0.531139,231.619503,-0.002293,2021,231.088364,2022
16576,US,Aydan,M,2012,4535.477636,9,231.192301,0.163864,231.028437,0.000709,2021,231.192301,2022
16577,US,Memphis,M,2012,4690.938477,9,231.192301,0.616767,230.575533,0.002675,2021,231.192301,2022
16578,US,Kadejah,F,2013,1715.148943,8,101.208892,0.267558,100.941334,0.002651,2021,101.208892,2022


Predictions for 2003:
Score: 0.031195550194343922


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,20201.0,Madison,18899.121368,Michael,27126.0,Michael,25070.863169
1,Olivia,16152.0,Olivia,14325.525129,Ethan,21361.0,Ethan,20995.656497
2,Sydney,8911.0,Sydney,9830.956977,Anthony,20149.0,Anthony,18906.346116
3,Hailey,8797.0,Hailey,7256.589896,David,18631.0,David,18656.883477
4,Jennifer,7566.0,Jennifer,7089.862353,Zachary,15540.0,Zachary,14765.665665


Predictions for 2004:
Score: 0.038954738553521794


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,20628.0,Madison,17818.006638,Michael,25465.0,Michael,25015.503262
1,Olivia,16107.0,Olivia,13456.105263,Ethan,22208.0,David,18910.586674
2,Hailey,8685.0,Sydney,10318.530338,Anthony,19612.0,Ethan,18685.061015
3,Sydney,8368.0,Savannah,7473.810534,David,18400.0,Anthony,18654.912574
4,Jennifer,6545.0,Hailey,6939.838116,Zachary,13711.0,Logan,14847.930132


Predictions for 2005:
Score: 0.04910024883014866


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,19577.0,Madison,18412.740033,Michael,23826.0,Michael,25291.640139
1,Olivia,15699.0,Olivia,13562.927868,Ethan,21317.0,David,19367.356998
2,Hailey,8592.0,Sydney,10548.301677,Anthony,19383.0,Anthony,18906.346116
3,Sydney,7376.0,Savannah,7808.659349,David,18133.0,Ethan,18105.206305
4,Savannah,6484.0,Hailey,7126.434839,Logan,13473.0,Logan,14480.371119


Predictions for 2006:
Score: 0.05492328222700069


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,18632.0,Madison,19384.313357,Michael,22647.0,Michael,25785.037708
1,Olivia,15505.0,Olivia,13886.960936,Ethan,20521.0,David,19372.591029
2,Hailey,8423.0,Sydney,10427.9858,Anthony,19411.0,Anthony,19365.607332
3,Savannah,7096.0,Savannah,7769.588252,David,17653.0,Ethan,18818.323649
4,Sydney,6516.0,Hailey,7592.913201,Logan,15150.0,Zachary,13631.444337


Predictions for 2007:
Score: 0.061400441717219395


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,17965.0,Madison,19918.641212,Michael,22014.0,Michael,25790.27174
1,Olivia,16587.0,Olivia,14001.42462,Ethan,21034.0,Ethan,19535.909076
2,Hailey,8437.0,Sydney,9738.000593,Anthony,19633.0,Anthony,19370.841363
3,Savannah,7050.0,Hailey,7809.507133,David,17556.0,David,18910.564517
4,Sydney,6030.0,Jennifer,7777.482901,Logan,15329.0,Zachary,14615.280529


Predictions for 2008:
Score: 0.0697394567694038


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17086.0,Madison,19384.313357,Michael,20642.0,Michael,25291.640139
1,Madison,17043.0,Olivia,13887.228494,Ethan,20220.0,Ethan,19533.183217
2,Hailey,7869.0,Sydney,9818.748017,Anthony,18409.0,Anthony,18908.845593
3,Savannah,6129.0,Jennifer,7756.326486,David,16338.0,David,18842.239133
4,Gabriella,6036.0,Hailey,7752.616164,Logan,13859.0,Zachary,14615.280529


Predictions for 2009:
Score: 0.07643514390774055


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17441.0,Madison,18666.212457,Ethan,19849.0,Michael,25291.640139
1,Madison,15249.0,Olivia,13456.372821,Michael,18956.0,Ethan,19033.651237
2,Hailey,7320.0,Sydney,10208.428887,Anthony,16321.0,David,18914.843147
3,Gabriella,6150.0,Jennifer,7216.015149,David,15450.0,Anthony,18842.948031
4,Savannah,5602.0,Hailey,7199.033231,Logan,14463.0,Logan,14153.544751


Predictions for 2010:
Score: 0.08276224774625152


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17033.0,Madison,18666.212457,Ethan,18011.0,Michael,25291.710353
1,Madison,13191.0,Olivia,13605.62785,Michael,17374.0,David,19047.726299
2,Hailey,7024.0,Sydney,10542.219861,Anthony,15503.0,Ethan,18962.83179
3,Gabriella,6182.0,Savannah,7320.890439,David,14215.0,Anthony,18908.837008
4,Evelyn,5840.0,Hailey,7141.866792,Logan,14027.0,Logan,14039.482427


Predictions for 2011:
Score: 0.09073850690255998


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17331.0,Madison,18884.647576,Michael,16815.0,Michael,25291.710353
1,Madison,12399.0,Olivia,13887.228494,Ethan,16693.0,David,19059.568965
2,Evelyn,6708.0,Sydney,10542.219861,Anthony,14283.0,Anthony,19046.007374
3,Zoey,6403.0,Savannah,7743.903897,David,13244.0,Ethan,19028.920932
4,Hailey,6281.0,Hailey,7199.033231,Logan,12996.0,Logan,13537.978284


Predictions for 2012:
Score: 0.09774450330122433


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17328.0,Madison,19033.182617,Ethan,17687.0,Michael,25291.710353
1,Madison,11410.0,Olivia,14001.42462,Michael,16202.0,Ethan,19100.489289
2,Zoey,7476.0,Sydney,9834.206636,Anthony,13215.0,Anthony,19056.979714
3,Evelyn,6877.0,Savannah,7801.537752,David,12553.0,David,18917.609855
4,Hailey,5914.0,Hailey,7463.756864,Logan,12485.0,Zachary,13906.456891


Predictions for 2013:
Score: 0.10673618759578796


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18446.0,Madison,19033.220551,Ethan,16254.0,Michael,25291.679612
1,Madison,10602.0,Olivia,13887.228494,Michael,15530.0,Ethan,19100.489289
2,Evelyn,7668.0,Sydney,9770.447352,David,12381.0,Anthony,18909.029609
3,Zoey,7250.0,Jennifer,7756.405203,Logan,12360.0,David,18905.637218
4,Savannah,5252.0,Hailey,7747.776754,Anthony,12287.0,Zachary,14025.485689


Predictions for 2014:
Score: 0.11380337173592536


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19829.0,Madison,18884.647576,Ethan,15737.0,Michael,25291.679612
1,Madison,10328.0,Olivia,13456.372821,Michael,15487.0,Ethan,18962.888654
2,Evelyn,8750.0,Sydney,9827.003898,Logan,13661.0,David,18911.426259
3,Zoey,7430.0,Jennifer,7778.216702,David,12243.0,Anthony,18906.346116
4,Savannah,5485.0,Hailey,7747.776754,Anthony,11597.0,Logan,14082.51461


Predictions for 2015:
Score: 0.11784759007709655


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19717.0,Madison,18877.312266,Ethan,15095.0,Michael,25291.679612
1,Madison,10099.0,Olivia,13605.62785,Michael,14474.0,Ethan,18955.667457
2,Evelyn,9370.0,Sydney,10197.724105,Logan,12934.0,David,18911.434844
3,Zoey,6964.0,Jennifer,7418.611527,David,11821.0,Anthony,18908.845593
4,Savannah,5465.0,Hailey,7206.956431,Anthony,10650.0,Logan,13968.719844


Predictions for 2016:
Score: 0.1242417090610304


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19397.0,Madison,18884.647576,Michael,14120.0,Michael,25291.679612
1,Evelyn,10115.0,Olivia,13887.228494,Ethan,13843.0,Ethan,18962.888654
2,Madison,9047.0,Sydney,10542.219861,Logan,11268.0,David,18910.283192
3,Zoey,6472.0,Savannah,7206.956431,David,11142.0,Anthony,18908.845593
4,Savannah,5383.0,Jennifer,7096.360938,Owen,10301.0,Zachary,13628.704917


Predictions for 2017:
Score: 0.1297733722603917


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18763.0,Madison,18884.647576,Logan,14042.0,Michael,25291.679612
1,Evelyn,10741.0,Olivia,14001.42462,Michael,12700.0,Ethan,18962.888654
2,Madison,7901.0,Sydney,10542.219861,Ethan,12471.0,David,18910.283192
3,Zoey,6069.0,Savannah,7563.129835,David,10222.0,Anthony,18908.845593
4,Savannah,5252.0,Hailey,7110.565042,Owen,9360.0,Zachary,13911.42555


Predictions for 2018:
Score: 0.13350582449024784


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18039.0,Madison,18884.647576,Logan,12427.0,Michael,25292.549939
1,Evelyn,10427.0,Olivia,13887.228494,Ethan,11918.0,Ethan,18962.888654
2,Madison,7084.0,Sydney,9834.206636,Michael,11729.0,David,18910.283192
3,Zoey,5937.0,Savannah,7769.588252,David,9795.0,Anthony,18908.845593
4,Ellie,5216.0,Hailey,7327.504303,Owen,9330.0,Zachary,14025.220316


Predictions for 2019:
Score: 0.13758063670215556


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18549.0,Madison,18884.647576,Ethan,11291.0,Michael,25292.549939
1,Evelyn,10433.0,Olivia,13456.372821,Michael,10589.0,Ethan,18962.888654
2,Madison,6290.0,Sydney,9770.447352,Logan,10551.0,David,18923.546698
3,Zoey,5676.0,Savannah,7743.903897,Owen,9341.0,Anthony,18908.845593
4,Ellie,5427.0,Hailey,7743.903897,Mateo,9011.0,Logan,14027.265099


Predictions for 2020:
Score: 0.1402321054993211


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17664.0,Madison,18884.647576,Michael,9805.0,Michael,25292.549939
1,Evelyn,9503.0,Olivia,13605.62785,Ethan,9533.0,David,18929.721709
2,Madison,6238.0,Sydney,9827.003898,Logan,9140.0,Anthony,18908.845593
3,Ellie,5592.0,Hailey,7801.537752,Mateo,8990.0,Ethan,18907.639144
4,Zoey,5373.0,Jennifer,7780.079003,Owen,8658.0,Logan,13902.497992


Predictions for 2021:
Score: 0.14484595257658667


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17798.0,Madison,18884.647576,Mateo,9159.0,Michael,25292.549939
1,Evelyn,9475.0,Olivia,13887.228494,Michael,9103.0,David,18923.546698
2,Madison,5952.0,Sydney,10197.724105,Ethan,8842.0,Anthony,18910.525044
3,Ellie,5850.0,Jennifer,7755.079493,Logan,8816.0,Ethan,18902.908838
4,Zoey,5197.0,Hailey,7463.756864,Owen,8755.0,Zachary,13635.794157


Predictions for 2022:
Score: 0.15177182217094107


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,16573.0,Madison,18884.647576,Mateo,10321.0,Michael,25292.549939
1,Evelyn,9289.0,Olivia,14001.42462,Michael,8829.0,David,18923.546698
2,Ellie,5761.0,Sydney,10542.219861,Owen,8546.0,Anthony,18910.525044
3,Madison,5479.0,Jennifer,7223.912691,Ethan,8271.0,Ethan,18907.639144
4,Stella,4830.0,Savannah,7110.565042,David,7741.0,Zachary,13912.120114
