In [5]:
import numpy as np
import pandas as pd
from data_loader import load_names_from_web, holdout_split
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from scipy.stats import kendalltau

In [6]:
# Load the national name counts, then peel off two nested held-out sets:
# first `holdout`, then `test` from what remains (leaving `trainval`).
dfraw = load_names_from_web(
    category='national',
    hide_pre_1937=True,
    use_existing_files=True,
)
traintestval, holdout = holdout_split(dfraw)
trainval, test = holdout_split(traintestval)

In [7]:
# Peek at the `test` split produced by the second holdout_split call above.
test

Unnamed: 0,state,year,name,M/F,count
0,US,2000,Madison,F,19968
1,US,2000,Olivia,F,12854
2,US,2000,Sydney,F,10244
3,US,2000,Jennifer,F,9389
4,US,2000,Amanda,F,8557
...,...,...,...,...,...
212633,US,2019,Zaheen,M,5
212634,US,2019,Zahi,M,5
212635,US,2019,Zaymar,M,5
212636,US,2019,Zeo,M,5


In [8]:
def get_all_known_names(data):
    '''
    Return one row per distinct (state, name, M/F) combination found in `data`.

    The result has exactly the columns ['state', 'name', 'M/F'], is ordered by
    those keys (groupby's default sort) and carries a fresh 0..n-1 index.
    '''
    key_cols = ['state', 'name', 'M/F']

    # size() is only used to collapse the groups; the counts are discarded
    group_sizes = data.groupby(key_cols).size()
    return group_sizes.reset_index()[key_cols]

In [9]:
def select_top_names(data, first_year_to_predict, cutoff=10):
    '''
    Keep only the names that reached at least `cutoff` in some year strictly
    before `first_year_to_predict`, and return every row of `data` for those
    names (including rows from `first_year_to_predict` onwards).

    Only years before `first_year_to_predict` are consulted when choosing the
    names, so the selection itself does not look at the years being predicted.
    An alternative worth trying: apply the cutoff to the most recent known
    year only.
    '''
    key_cols = ['state', 'name', 'M/F']

    seen_before_prediction = data['year'] < first_year_to_predict
    meets_cutoff = data['count'] >= cutoff

    qualifying_names = data.loc[seen_before_prediction & meets_cutoff, key_cols].drop_duplicates()

    # left join: one output row per (qualifying name, row of data for that name)
    return qualifying_names.merge(data, how='left', on=key_cols)

In [10]:
def evaluate(predictor, data_held_out, first_year_to_predict, metric='msle'):
    '''
    Score `predictor` year by year on held-out data and print the results.

    The predictor is only shown rows of `data_held_out` from before
    `first_year_to_predict`; it is then asked for predictions for every year
    from `first_year_to_predict` through the latest year in `data_held_out`.
    For each of those years the chosen metric is printed, followed by a table
    of the top five names (observed vs. predicted, F then M).

    Parameters:
        predictor: object with a `predict(historical_data, years_to_predict)`
            method returning a frame with 'state', 'name', 'M/F', 'year', 'y'.
        data_held_out: frame with 'state', 'name', 'M/F', 'year', 'count'.
        first_year_to_predict: first year to score.
        metric: one of 'msle', 'rank', 'rank_mae', 'kendalltau'.

    Raises:
        ValueError: if `metric` is not one of the supported names. This is
            checked before any prediction work is done.

    Returns nothing; all results are printed / displayed.
    '''

    supported_metrics = ('msle', 'rank', 'rank_mae', 'kendalltau')
    if metric not in supported_metrics:
        # previously an unknown metric silently printed no score at all
        raise ValueError(f'Unknown metric {metric!r}; expected one of {supported_metrics}')

    key_cols = ['state', 'name', 'M/F', 'year']

    most_recent_year = data_held_out['year'].max()
    years_to_predict = range(first_year_to_predict, most_recent_year + 1)

    # only allow the model to see data from before the year to predict
    historical_data = data_held_out[data_held_out['year'] < first_year_to_predict]

    # get our model's predictions
    predictions = predictor.predict(historical_data, years_to_predict)

    all_known_names = get_all_known_names(historical_data)

    for year_to_predict in years_to_predict:

        print(f'Predictions for {year_to_predict}:')

        names_to_predict = all_known_names.copy()
        names_to_predict['year'] = year_to_predict

        observed = names_to_predict.merge(data_held_out, how='left', on=key_cols)

        # for now, fill in missing values with 2, same as FiveThirtyEight did;
        # reasoning: missing values could be 0 to 4, so average is 2
        observed['y'] = observed['count'].fillna(2)

        # both frames carry a 'y' column, so the merge yields y_true / y_pred
        score_df = observed.merge(predictions, how='left', on=key_cols, suffixes=('_true', '_pred'))

        y_true = score_df['y_true']
        y_pred = score_df['y_pred']

        if metric == 'msle':
            loss = mean_squared_log_error(y_true, y_pred)
            print(f'Loss: {loss}')

        elif metric == 'rank':
            rank_true = y_true.rank()
            rank_pred = y_pred.rank()
            n_names = len(rank_true)
            # total absolute rank displacement, scaled by n * (n - 1)
            score = np.sum(np.abs(rank_true - rank_pred)) / (n_names * (n_names - 1))
            print(f'Score: {score}')

        elif metric == 'rank_mae':
            loss = mean_absolute_error(y_true.rank(), y_pred.rank())
            print(f'Loss: {loss}')

        elif metric == 'kendalltau':
            tau, _ = kendalltau(y_true.rank(), y_pred.rank())
            print(f'Tau: {tau}')

        # side-by-side top names: F observed, F predicted, M observed, M predicted
        top_tables = []
        for gender in ('F', 'M'):
            gender_rows = score_df[score_df['M/F'] == gender]
            for value_col in ('y_true', 'y_pred'):
                ranked = gender_rows[['name', value_col]].sort_values(by=value_col, ascending=False)
                top_tables.append(ranked.reset_index(drop=True))
        top = pd.concat(top_tables, axis=1, ignore_index=True)
        display(top.head(5))

In [11]:
class DummyPredictor():
    '''
    Baseline predictor that repeats information from the last known year.

    Strategies:
        'naive': every name is predicted to keep its count from the year just
                 before the first predicted year; names with no row that year
                 get 2 (the same fill value used for missing observations
                 when scoring).
        'mean':  every name is predicted to have the mean count of that year.

    The same prediction is repeated for every requested year.
    '''

    STRATEGIES = ('naive', 'mean')

    def __init__(self, strategy='naive'):
        '''
        Raises:
            ValueError: if `strategy` is not one of STRATEGIES. Without this
                check an unknown strategy produced predictions with no 'y'
                column and only failed later, far from the cause.
        '''
        if strategy not in self.STRATEGIES:
            raise ValueError(f'Unknown strategy {strategy!r}; expected one of {self.STRATEGIES}')
        self.strategy = strategy

    def predict(self, historical_data, years_to_predict):
        '''
        Build predictions for every known name and every requested year.

        Parameters:
            historical_data: frame with 'state', 'name', 'M/F', 'year', 'count'.
            years_to_predict: iterable of years; the first one determines
                which year of `historical_data` is treated as "previous".

        Returns:
            Frame with 'state', 'name', 'M/F', 'year', 'y' (plus the previous
            year's 'count' for the 'naive' strategy), one row per name per
            year. Empty (with those five columns) if no years are requested.
        '''
        key_cols = ['state', 'name', 'M/F']

        # accept any iterable (range, list, generator), not just indexables
        years = list(years_to_predict)
        if not years:
            return pd.DataFrame(columns=key_cols + ['year', 'y'])

        all_known_names = get_all_known_names(historical_data)
        previous_year_data = historical_data[historical_data['year'] == years[0] - 1].drop(columns=['year'])

        # the prediction does not depend on the target year, so build it once
        if self.strategy == 'naive':
            template = all_known_names.merge(previous_year_data, how='left', on=key_cols)
            template['y'] = template['count'].fillna(2)
        else:  # 'mean'
            template = all_known_names.copy()
            template['y'] = previous_year_data['count'].mean()

        predictions = []
        for year_to_predict in years:
            prediction = template.copy()
            # keep 'year' right after the key columns, as before
            prediction.insert(len(key_cols), 'year', year_to_predict)
            predictions.append(prediction)

        return pd.concat(predictions, ignore_index=True)

In [12]:
# Baseline: score the naive dummy predictor on the test split.
# DummyPredictor has no fit step, so only the evaluation data is prepared.
first_year_to_predict = 2003
cutoff = 100

data_to_eval = select_top_names(
    test,
    first_year_to_predict=first_year_to_predict,
    cutoff=cutoff,
)

evaluate(
    predictor=DummyPredictor(strategy='naive'),
    data_held_out=data_to_eval,
    first_year_to_predict=first_year_to_predict,
    metric='kendalltau',
)

Predictions for 2003:
Tau: 0.9110130503800179


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,20201.0,Madison,21773.0,Michael,27126.0,Michael,28256.0
1,Olivia,16152.0,Olivia,14630.0,Ethan,21361.0,Ethan,22112.0
2,Sydney,8911.0,Sydney,9034.0,Anthony,20149.0,Anthony,19590.0
3,Hailey,8797.0,Jennifer,8539.0,David,18631.0,David,18697.0
4,Jennifer,7566.0,Hailey,7992.0,Zachary,15540.0,Zachary,16622.0


Predictions for 2004:
Tau: 0.8862572816422383


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,20628.0,Madison,21773.0,Michael,25465.0,Michael,28256.0
1,Olivia,16107.0,Olivia,14630.0,Ethan,22208.0,Ethan,22112.0
2,Hailey,8685.0,Sydney,9034.0,Anthony,19612.0,Anthony,19590.0
3,Sydney,8368.0,Jennifer,8539.0,David,18400.0,David,18697.0
4,Jennifer,6545.0,Hailey,7992.0,Zachary,13711.0,Zachary,16622.0


Predictions for 2005:
Tau: 0.8552602546804069


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,19577.0,Madison,21773.0,Michael,23826.0,Michael,28256.0
1,Olivia,15699.0,Olivia,14630.0,Ethan,21317.0,Ethan,22112.0
2,Hailey,8592.0,Sydney,9034.0,Anthony,19383.0,Anthony,19590.0
3,Sydney,7376.0,Jennifer,8539.0,David,18133.0,David,18697.0
4,Savannah,6484.0,Hailey,7992.0,Logan,13473.0,Zachary,16622.0


Predictions for 2006:
Tau: 0.834800661334781


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,18632.0,Madison,21773.0,Michael,22647.0,Michael,28256.0
1,Olivia,15505.0,Olivia,14630.0,Ethan,20521.0,Ethan,22112.0
2,Hailey,8423.0,Sydney,9034.0,Anthony,19411.0,Anthony,19590.0
3,Savannah,7096.0,Jennifer,8539.0,David,17653.0,David,18697.0
4,Sydney,6516.0,Hailey,7992.0,Logan,15150.0,Zachary,16622.0


Predictions for 2007:
Tau: 0.8165064953668796


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,17965.0,Madison,21773.0,Michael,22014.0,Michael,28256.0
1,Olivia,16587.0,Olivia,14630.0,Ethan,21034.0,Ethan,22112.0
2,Hailey,8437.0,Sydney,9034.0,Anthony,19633.0,Anthony,19590.0
3,Savannah,7050.0,Jennifer,8539.0,David,17556.0,David,18697.0
4,Sydney,6030.0,Hailey,7992.0,Logan,15329.0,Zachary,16622.0


Predictions for 2008:
Tau: 0.7939492380894682


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17086.0,Madison,21773.0,Michael,20642.0,Michael,28256.0
1,Madison,17043.0,Olivia,14630.0,Ethan,20220.0,Ethan,22112.0
2,Hailey,7869.0,Sydney,9034.0,Anthony,18409.0,Anthony,19590.0
3,Savannah,6129.0,Jennifer,8539.0,David,16338.0,David,18697.0
4,Gabriella,6036.0,Hailey,7992.0,Logan,13859.0,Zachary,16622.0


Predictions for 2009:
Tau: 0.7729846890949784


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17441.0,Madison,21773.0,Ethan,19849.0,Michael,28256.0
1,Madison,15249.0,Olivia,14630.0,Michael,18956.0,Ethan,22112.0
2,Hailey,7320.0,Sydney,9034.0,Anthony,16321.0,Anthony,19590.0
3,Gabriella,6150.0,Jennifer,8539.0,David,15450.0,David,18697.0
4,Savannah,5602.0,Hailey,7992.0,Logan,14463.0,Zachary,16622.0


Predictions for 2010:
Tau: 0.752701311440414


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17033.0,Madison,21773.0,Ethan,18011.0,Michael,28256.0
1,Madison,13191.0,Olivia,14630.0,Michael,17374.0,Ethan,22112.0
2,Hailey,7024.0,Sydney,9034.0,Anthony,15503.0,Anthony,19590.0
3,Gabriella,6182.0,Jennifer,8539.0,David,14215.0,David,18697.0
4,Evelyn,5840.0,Hailey,7992.0,Logan,14027.0,Zachary,16622.0


Predictions for 2011:
Tau: 0.7315830878024379


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17331.0,Madison,21773.0,Michael,16815.0,Michael,28256.0
1,Madison,12399.0,Olivia,14630.0,Ethan,16693.0,Ethan,22112.0
2,Evelyn,6708.0,Sydney,9034.0,Anthony,14283.0,Anthony,19590.0
3,Zoey,6403.0,Jennifer,8539.0,David,13244.0,David,18697.0
4,Hailey,6281.0,Hailey,7992.0,Logan,12996.0,Zachary,16622.0


Predictions for 2012:
Tau: 0.7134263805171484


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17328.0,Madison,21773.0,Ethan,17687.0,Michael,28256.0
1,Madison,11410.0,Olivia,14630.0,Michael,16202.0,Ethan,22112.0
2,Zoey,7476.0,Sydney,9034.0,Anthony,13215.0,Anthony,19590.0
3,Evelyn,6877.0,Jennifer,8539.0,David,12553.0,David,18697.0
4,Hailey,5914.0,Hailey,7992.0,Logan,12485.0,Zachary,16622.0


Predictions for 2013:
Tau: 0.692521144434704


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18446.0,Madison,21773.0,Ethan,16254.0,Michael,28256.0
1,Madison,10602.0,Olivia,14630.0,Michael,15530.0,Ethan,22112.0
2,Evelyn,7668.0,Sydney,9034.0,David,12381.0,Anthony,19590.0
3,Zoey,7250.0,Jennifer,8539.0,Logan,12360.0,David,18697.0
4,Savannah,5252.0,Hailey,7992.0,Anthony,12287.0,Zachary,16622.0


Predictions for 2014:
Tau: 0.6716364916451607


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19829.0,Madison,21773.0,Ethan,15737.0,Michael,28256.0
1,Madison,10328.0,Olivia,14630.0,Michael,15487.0,Ethan,22112.0
2,Evelyn,8750.0,Sydney,9034.0,Logan,13661.0,Anthony,19590.0
3,Zoey,7430.0,Jennifer,8539.0,David,12243.0,David,18697.0
4,Savannah,5485.0,Hailey,7992.0,Anthony,11597.0,Zachary,16622.0


Predictions for 2015:
Tau: 0.6598438531667463


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19717.0,Madison,21773.0,Ethan,15095.0,Michael,28256.0
1,Madison,10099.0,Olivia,14630.0,Michael,14474.0,Ethan,22112.0
2,Evelyn,9370.0,Sydney,9034.0,Logan,12934.0,Anthony,19590.0
3,Zoey,6964.0,Jennifer,8539.0,David,11821.0,David,18697.0
4,Savannah,5465.0,Hailey,7992.0,Anthony,10650.0,Zachary,16622.0


Predictions for 2016:
Tau: 0.6439492555890445


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19397.0,Madison,21773.0,Michael,14120.0,Michael,28256.0
1,Evelyn,10115.0,Olivia,14630.0,Ethan,13843.0,Ethan,22112.0
2,Madison,9047.0,Sydney,9034.0,Logan,11268.0,Anthony,19590.0
3,Zoey,6472.0,Jennifer,8539.0,David,11142.0,David,18697.0
4,Savannah,5383.0,Hailey,7992.0,Owen,10301.0,Zachary,16622.0


Predictions for 2017:
Tau: 0.6273196260077823


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18763.0,Madison,21773.0,Logan,14042.0,Michael,28256.0
1,Evelyn,10741.0,Olivia,14630.0,Michael,12700.0,Ethan,22112.0
2,Madison,7901.0,Sydney,9034.0,Ethan,12471.0,Anthony,19590.0
3,Zoey,6069.0,Jennifer,8539.0,David,10222.0,David,18697.0
4,Savannah,5252.0,Hailey,7992.0,Owen,9360.0,Zachary,16622.0


Predictions for 2018:
Tau: 0.6193894008532494


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18039.0,Madison,21773.0,Logan,12427.0,Michael,28256.0
1,Evelyn,10427.0,Olivia,14630.0,Ethan,11918.0,Ethan,22112.0
2,Madison,7084.0,Sydney,9034.0,Michael,11729.0,Anthony,19590.0
3,Zoey,5937.0,Jennifer,8539.0,David,9795.0,David,18697.0
4,Ellie,5216.0,Hailey,7992.0,Owen,9330.0,Zachary,16622.0


Predictions for 2019:
Tau: 0.6028504516095443


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18549.0,Madison,21773.0,Ethan,11291.0,Michael,28256.0
1,Evelyn,10433.0,Olivia,14630.0,Michael,10589.0,Ethan,22112.0
2,Madison,6290.0,Sydney,9034.0,Logan,10551.0,Anthony,19590.0
3,Zoey,5676.0,Jennifer,8539.0,Owen,9341.0,David,18697.0
4,Ellie,5427.0,Hailey,7992.0,Mateo,9011.0,Zachary,16622.0


Predictions for 2020:
Tau: 0.59648447372637


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17664.0,Madison,21773.0,Michael,9805.0,Michael,28256.0
1,Evelyn,9503.0,Olivia,14630.0,Ethan,9533.0,Ethan,22112.0
2,Madison,6238.0,Sydney,9034.0,Logan,9140.0,Anthony,19590.0
3,Ellie,5592.0,Jennifer,8539.0,Mateo,8990.0,David,18697.0
4,Zoey,5373.0,Hailey,7992.0,Owen,8658.0,Zachary,16622.0


Predictions for 2021:
Tau: 0.5816960267068629


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17798.0,Madison,21773.0,Mateo,9159.0,Michael,28256.0
1,Evelyn,9475.0,Olivia,14630.0,Michael,9103.0,Ethan,22112.0
2,Madison,5952.0,Sydney,9034.0,Ethan,8842.0,Anthony,19590.0
3,Ellie,5850.0,Jennifer,8539.0,Logan,8816.0,David,18697.0
4,Zoey,5197.0,Hailey,7992.0,Owen,8755.0,Zachary,16622.0


Predictions for 2022:
Tau: 0.5628143503817918


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,16573.0,Madison,21773.0,Mateo,10321.0,Michael,28256.0
1,Evelyn,9289.0,Olivia,14630.0,Michael,8829.0,Ethan,22112.0
2,Ellie,5761.0,Sydney,9034.0,Owen,8546.0,Anthony,19590.0
3,Madison,5479.0,Jennifer,8539.0,Ethan,8271.0,David,18697.0
4,Stella,4830.0,Hailey,7992.0,David,7741.0,Zachary,16622.0


In [13]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor

class MyPredictor():
    '''
    One-year-ahead name-count predictor built on HistGradientBoostingRegressor.

    `fit` learns to map per-name features computed from all years up to some
    year Y-1 onto the observed count in year Y. `predict` applies the model
    one year at a time, feeding each year's predictions back in as if they
    were observed counts so that later years can be predicted too.

    Parameters:
        verbose: when True (default, the original behaviour) `fit` displays
            the assembled training frame and `predict` displays the final
            predictions via the notebook's `display`.
    '''

    # columns identifying one name series
    _KEY_COLS = ['state', 'name', 'M/F']
    # latest-year features, zero-filled when a name has no row in that year
    _LATEST_YEAR_COLS = ['thisyear_count', 'diff', 'shift', 'pct_change', 'accel', 'diff2']

    def __init__(self, verbose=True):

        # params
        cols_to_keep = ['this_year', 'M/F', 'sum', 'median_age', 'thisyear_count', 'diff'] # best
        # alternatives tried:
        #   + 'first_letter_1_pct_change' (helps in some years)
        #   scaled variants: 'sum_scaled', 'median_age_scaled', 'thisyear_count_scaled'
        categorical_names = ['M/F']
        max_leaf_nodes = 16
        max_iter = 100
        loss = 'absolute_error' # abs better than default

        # boolean mask over cols_to_keep, the form the regressor receives
        categorical_mask = [col in categorical_names for col in cols_to_keep]

        self.pipe = make_pipeline(
            ColumnTransformer(
                transformers=[
                    ('cols_to_keep', 'passthrough', cols_to_keep),
                ], remainder='drop'),
            HistGradientBoostingRegressor(
                random_state=0,
                categorical_features=categorical_mask,
                max_leaf_nodes=max_leaf_nodes,
                max_iter=max_iter,
                loss=loss
            )
        )

        self.gender_encoding = {'M': 0, 'F': 1}
        self.verbose = verbose

    def preprocess(self, df, latest_known_year):
        '''
        Collapse yearly rows into one feature row per (state, name, M/F).

        Parameters:
            df: frame with 'state', 'name', 'M/F', 'year', 'count'.
            latest_known_year: the year treated as "this year" for the
                latest-year features and for `median_age`.

        Returns:
            Frame with, per name: 'median_year' (first year at which the
            running total reaches half of 'sum'), 'sum' (total count over
            `df`), 'first_letter_1', 'median_age' (latest_known_year minus
            median_year), the latest-year features 'thisyear_count', 'diff',
            'shift', 'pct_change', 'accel', 'diff2' (0 when the name has no
            row in latest_known_year), 'this_year', and the first-letter
            share features 'first_letter_1_pct' / 'first_letter_1_pct_change'.
            'M/F' is returned numerically encoded via `self.gender_encoding`.
        '''
        key_cols = self._KEY_COLS
        latest_cols = self._LATEST_YEAR_COLS

        # work on a year-ordered copy so cumsum/diff/shift run forward in time
        df = df.copy().sort_values(by='year')

        counts_by_name = df.groupby(key_cols)['count']
        df['cumsum'] = counts_by_name.cumsum()
        df['sum'] = counts_by_name.transform('sum')

        # NOTE: these compare consecutive *rows* of a name, which are not
        # necessarily consecutive years if a year is missing for that name
        df['diff'] = counts_by_name.diff()
        df['diff2'] = counts_by_name.diff(2)
        df['shift'] = counts_by_name.shift()
        df['pct_change'] = counts_by_name.pct_change()
        df['accel'] = df.groupby(key_cols)['diff'].diff()

        df['first_letter_1'] = df['name'].str[0].str.lower()

        # share of each first letter among all counts of a (year, state, M/F),
        # and its change relative to that letter's previous available year
        total_names_per_year = df.groupby(['year', 'state', 'M/F'])['count'].sum()
        letter_names_per_year = df.groupby(['year', 'state', 'M/F', 'first_letter_1'])['count'].sum()
        percentage_of_total_per_year = (letter_names_per_year / total_names_per_year).rename('first_letter_1_pct')
        percentage_change_per_year = percentage_of_total_per_year.groupby(['state', 'M/F', 'first_letter_1']).pct_change().rename('first_letter_1_pct_change')

        # first (earliest) row per name whose running total reaches half the total
        medians = df[df['cumsum'] >= df['sum']/2]
        medians = medians.drop_duplicates(subset=key_cols, keep='first')
        medians = medians.assign(median_age=latest_known_year - medians['year'])
        medians = medians.drop(['count', 'cumsum', 'diff', 'shift', 'pct_change', 'accel', 'diff2'], axis=1)

        thisyear = df[df['year'] == latest_known_year][key_cols + ['count', 'diff', 'shift', 'pct_change', 'accel', 'diff2']].rename(columns={'count': 'thisyear_count'})

        df = medians.merge(thisyear, how='left', on=key_cols).rename(columns={'year': 'median_year'})

        # this is sort of a rough assumption that if a row didn't exist for this year, not only is the count 0, but so is the diff, shift and pct_change. not always true if the prior year had a count, but often true
        df[latest_cols] = df[latest_cols].fillna(0)

        df['this_year'] = latest_known_year # adding this does seem to improve accuracy. we need to know how far along we are in time

        # attach the first-letter shares for latest_known_year; the right-hand
        # keys are index levels of the two Series built above
        df = df.merge(percentage_of_total_per_year, how='left', left_on=['this_year', 'state', 'M/F', 'first_letter_1'], right_on=['year', 'state', 'M/F', 'first_letter_1'])
        df = df.merge(percentage_change_per_year, how='left', left_on=['this_year', 'state', 'M/F', 'first_letter_1'], right_on=['year', 'state', 'M/F', 'first_letter_1'])

        # change M/F to 0/1 so it works with various models
        # (even HistGradientBoostingRegressor, which accepts categorical values,
        # still needs those values to be numbers not strings)
        df['M/F'] = df['M/F'].map(self.gender_encoding)

        return df

    def fit(self, historical_data, first_year_to_predict, years_to_fit=1, weight_decay=0.9):
        '''
        Fit the pipeline on the last `years_to_fit` years before
        `first_year_to_predict`.

        For each target year Y in that window, features are computed from all
        rows with year < Y and paired with the observed count in Y (0 if the
        name has no row in Y). Rows are weighted by
        weight_decay ** (first_year_to_predict - Y), so more recent target
        years weigh more when weight_decay < 1.

        Raises:
            ValueError: if `years_to_fit` is smaller than 1.
        '''
        if years_to_fit < 1:
            raise ValueError(f'years_to_fit must be at least 1, got {years_to_fit}')

        key_cols = self._KEY_COLS

        # first things first, we don't want to know about future data
        historical_data = historical_data[historical_data['year'] < first_year_to_predict]
        # at this point the data we don't want to know should be inaccessible

        # collect one frame per target year and concatenate once at the end
        # (instead of growing empty DataFrame/Series accumulators in the loop)
        yearly_frames = []

        # each year_to_fit is the year that's essentially our y for that loop
        for year_to_fit in range(first_year_to_predict - years_to_fit, first_year_to_predict):

            # now we "know" even less for X
            known = historical_data[historical_data['year'] < year_to_fit]
            target = historical_data[historical_data['year'] == year_to_fit]

            features = self.preprocess(known, latest_known_year=year_to_fit - 1)
            target = target[key_cols + ['count']].rename(columns={'count': 'y'})
            target = target.assign(**{'M/F': target['M/F'].map(self.gender_encoding)})

            data = features.merge(target, how='left', on=key_cols)
            data['sample_weight'] = weight_decay ** (first_year_to_predict - year_to_fit)
            # names absent from the target year count as 0; keep y as last column
            data['y'] = data.pop('y').fillna(0)

            yearly_frames.append(data)

        training = pd.concat(yearly_frames, ignore_index=True)

        if self.verbose:
            display(training)

        y_all = training['y']
        sample_weights = training['sample_weight']
        X_all = training.drop(columns=['y', 'sample_weight'])

        self.pipe.fit(X_all, y_all, **{'histgradientboostingregressor__sample_weight': sample_weights})
        # this seems like a silly way to pass params to individual steps of the pipeline, but it's true. See: https://stackoverflow.com/questions/36205850/sklearn-pipeline-applying-sample-weights-after-applying-a-polynomial-feature-t

    def predict(self, historical_data, years_to_predict):
        '''
        Predict counts for each year in `years_to_predict`, in order.

        Each year is predicted from features whose latest known year is the
        year before it; the (unrounded) predictions are then appended to the
        history as that year's counts so the next year can be predicted.

        Returns:
            The concatenated per-year feature frames with 'y' (prediction)
            and 'year' added and 'M/F' decoded back to 'M'/'F'. Predictions
            below 4.5 are replaced by 2 in the returned frame.
        '''
        # we have to reverse the mapping to send our predictions
        # (at least the way we currently have it set up)
        gender_decoding = {code: label for label, code in self.gender_encoding.items()}

        predictions = []

        for year_to_predict in years_to_predict:

            df = self.preprocess(historical_data, latest_known_year=year_to_predict - 1)

            df['y'] = self.pipe.predict(df)
            df['year'] = year_to_predict

            # the full feature frame is kept (not just the key columns and y)
            # because it is useful to see all data displayed
            predictions.append(df)

            # treat this year's predictions as observed counts for the next year
            assumed_new_year_of_historical_data = df[['state', 'year', 'name', 'M/F', 'y']].rename(columns={'y': 'count'})
            assumed_new_year_of_historical_data['M/F'] = assumed_new_year_of_historical_data['M/F'].map(gender_decoding)
            historical_data = pd.concat([historical_data, assumed_new_year_of_historical_data], ignore_index=True)

        predictions = pd.concat(predictions, ignore_index=True)
        predictions['M/F'] = predictions['M/F'].map(gender_decoding)

        # predictions that would round below 5 are set to 2, the same value
        # used for missing observations when scoring
        predictions.loc[predictions['y'] < 4.5, 'y'] = 2

        if self.verbose:
            display(predictions)

        return predictions

In [14]:
# Fit MyPredictor on the train/val split and score it on the test split,
# using the same first year and cutoff for both.
first_year_to_predict = 2003
cutoff = 100

my_predictor = MyPredictor()

# Open question: are we harming the model by letting it "know" top names
# beyond the years it is fitting in a given loop cycle? Should this selection
# be redone for each loop cycle of the fitting process instead?
data_to_fit = select_top_names(
    trainval,
    first_year_to_predict=first_year_to_predict,
    cutoff=cutoff,
)
data_to_eval = select_top_names(
    test,
    first_year_to_predict=first_year_to_predict,
    cutoff=cutoff,
)

my_predictor.fit(
    historical_data=data_to_fit,
    first_year_to_predict=first_year_to_predict,
    years_to_fit=30,
)
evaluate(
    predictor=my_predictor,
    data_held_out=data_to_eval,
    first_year_to_predict=first_year_to_predict,
    metric='kendalltau',
)

Unnamed: 0,state,name,M/F,median_year,sum,first_letter_1,median_age,thisyear_count,diff,shift,pct_change,accel,diff2,this_year,first_letter_1_pct,first_letter_1_pct_change,sample_weight,y
0,US,Cloe,1,1940,42,c,32,0.0,0.0,0.0,0.000000,0.0,0.0,1972,0.088587,0.000420,0.042391,0.0
1,US,Harlie,1,1940,5,h,32,0.0,0.0,0.0,0.000000,0.0,0.0,1972,0.007803,0.093319,0.042391,0.0
2,US,Gracey,1,1941,12,g,31,0.0,0.0,0.0,0.000000,0.0,0.0,1972,0.014191,-0.064416,0.042391,0.0
3,US,Rowan,1,1942,7,r,30,0.0,0.0,0.0,0.000000,0.0,0.0,1972,0.041787,-0.016825,0.042391,0.0
4,US,Melvyn,0,1942,2811,m,30,7.0,-11.0,18.0,-0.611111,-17.0,-5.0,1972,0.069289,0.009619,0.042391,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93592,US,Adamari,1,2001,240,a,0,130.0,85.0,45.0,1.888889,77.0,93.0,2001,0.144378,0.049375,0.900000,168.0
93593,US,Maximus,0,2001,1114,m,0,892.0,704.0,188.0,3.744681,527.0,881.0,2001,0.065675,-0.000615,0.900000,912.0
93594,US,Jersey,1,2001,88,j,0,61.0,55.0,6.0,9.166667,55.0,55.0,2001,0.083400,0.042691,0.900000,110.0
93595,US,Dasani,1,2001,161,d,0,94.0,36.0,58.0,0.620690,-13.0,85.0,2001,0.041520,-0.030922,0.900000,105.0


Unnamed: 0,state,name,M/F,median_year,sum,first_letter_1,median_age,thisyear_count,diff,shift,pct_change,accel,diff2,this_year,first_letter_1_pct,first_letter_1_pct_change,y,year
0,US,Gertrude,F,1945,17407.000000,g,57,11.000000,1.000000,10.000000,0.100000,8.000000,-6.000000,2002,0.038293,-0.055275,9.641740,2003
1,US,Flossie,F,1946,2988.000000,f,56,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2002,0.002896,-0.027566,2.000000,2003
2,US,Eula,F,1946,9796.000000,e,56,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2002,0.029819,0.248995,2.000000,2003
3,US,Olive,F,1946,3510.000000,o,56,43.000000,6.000000,37.000000,0.162162,4.000000,8.000000,2002,0.061092,0.065870,35.014165,2003
4,US,Marlys,F,1946,5565.000000,m,56,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2002,0.160021,0.016487,2.000000,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16575,US,Citlally,F,2012,2456.723957,c,9,118.823727,-1.529666,120.353393,-0.012710,0.912365,-3.971698,2021,0.068665,0.003741,117.607421,2022
16576,US,Aydan,M,2012,4636.288076,a,9,232.340825,0.000000,232.340825,0.000000,0.000000,0.000000,2021,0.195576,-0.005794,232.340825,2022
16577,US,Anaiya,F,2012,3853.342905,a,9,193.438929,0.000000,193.438929,0.000000,0.364297,-0.364297,2021,0.133985,0.003008,193.278168,2022
16578,US,Memphis,M,2012,5328.282616,m,9,281.866752,0.000000,281.866752,0.000000,0.000000,0.000000,2021,0.119306,0.008241,281.866752,2022


Predictions for 2003:
Tau: 0.9121898080928765


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,20201.0,Madison,18459.23149,Michael,27126.0,Michael,24950.789524
1,Olivia,16152.0,Olivia,14012.649886,Ethan,21361.0,Ethan,20590.565838
2,Sydney,8911.0,Sydney,9599.901842,Anthony,20149.0,Anthony,18448.772783
3,Hailey,8797.0,Hailey,7269.450672,David,18631.0,David,18366.913893
4,Jennifer,7566.0,Savannah,7086.518435,Zachary,15540.0,Logan,14104.730529


Predictions for 2004:
Tau: 0.8884694095653874


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,20628.0,Madison,18325.545964,Michael,25465.0,Michael,24950.789524
1,Olivia,16107.0,Olivia,13308.075569,Ethan,22208.0,Ethan,18472.512998
2,Hailey,8685.0,Sydney,10457.227845,Anthony,19612.0,David,18449.016958
3,Sydney,8368.0,Savannah,7611.602723,David,18400.0,Anthony,18366.669718
4,Jennifer,6545.0,Hailey,7091.018045,Zachary,13711.0,Logan,14393.782741


Predictions for 2005:
Tau: 0.8586843653772223


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,19577.0,Madison,18416.819764,Michael,23826.0,Michael,24953.453334
1,Olivia,15699.0,Olivia,13308.075569,Ethan,21317.0,David,18859.16417
2,Hailey,8592.0,Sydney,10713.741074,Anthony,19383.0,Ethan,18472.512998
3,Sydney,7376.0,Savannah,7611.602723,David,18133.0,Anthony,18453.490844
4,Savannah,6484.0,Hailey,7183.072439,Logan,13473.0,Logan,14104.730529


Predictions for 2006:
Tau: 0.8417512784625854


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,18632.0,Madison,18804.448493,Michael,22647.0,Michael,24953.453334
1,Olivia,15505.0,Olivia,13431.971622,Ethan,20521.0,David,19236.069646
2,Hailey,8423.0,Sydney,10303.951675,Anthony,19411.0,Anthony,18858.919995
3,Savannah,7096.0,Hailey,7388.715775,David,17653.0,Ethan,18558.587002
4,Sydney,6516.0,Jennifer,7383.426705,Logan,15150.0,Zachary,14024.208995


Predictions for 2007:
Tau: 0.8232569771982563


Unnamed: 0,0,1,2,3,4,5,6,7
0,Madison,17965.0,Madison,19181.353969,Michael,22014.0,Michael,24953.453334
1,Olivia,16587.0,Olivia,13906.868954,Ethan,21034.0,David,19236.192992
2,Hailey,8437.0,Sydney,9587.654548,Anthony,19633.0,Anthony,19235.825471
3,Savannah,7050.0,Hailey,7428.987052,David,17556.0,Ethan,18960.933081
4,Sydney,6030.0,Jennifer,7423.697982,Logan,15329.0,Zachary,14062.89777


Predictions for 2008:
Tau: 0.8023914589007222


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17086.0,Madison,19181.016153,Michael,20642.0,Michael,24953.453334
1,Madison,17043.0,Olivia,13947.140231,Ethan,20220.0,Ethan,19337.500741
2,Hailey,7869.0,Sydney,9578.980354,Anthony,18409.0,Anthony,19235.825471
3,Savannah,6129.0,Jennifer,7348.325088,David,16338.0,David,18457.592046
4,Gabriella,6036.0,Hailey,7346.683569,Logan,13859.0,Zachary,14022.626493


Predictions for 2009:
Tau: 0.783383598404025


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17441.0,Madison,18401.764598,Ethan,19849.0,Michael,24953.453334
1,Madison,15249.0,Olivia,13906.868954,Michael,18956.0,Ethan,19337.500741
2,Hailey,7320.0,Sydney,9705.50261,Anthony,16321.0,Anthony,18456.573916
3,Gabriella,6150.0,Savannah,7329.154846,David,15450.0,David,18366.485615
4,Savannah,5602.0,Jennifer,7099.264826,Logan,14463.0,Logan,14063.353542


Predictions for 2010:
Tau: 0.7638450461963663


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17033.0,Madison,18311.418381,Ethan,18011.0,Michael,24953.453334
1,Madison,13191.0,Olivia,13430.580548,Michael,17374.0,Ethan,18558.249186
2,Hailey,7024.0,Sydney,10266.440741,Anthony,15503.0,David,18453.306741
3,Gabriella,6182.0,Savannah,7406.912076,David,14215.0,Anthony,18366.669718
4,Evelyn,5840.0,Jennifer,7085.698475,Logan,14027.0,Logan,14103.624819


Predictions for 2011:
Tau: 0.7434830361208389


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17331.0,Madison,18398.239508,Michael,16815.0,Michael,24953.453334
1,Madison,12399.0,Olivia,13316.749763,Ethan,16693.0,David,18861.03437
2,Evelyn,6708.0,Sydney,10459.650372,Anthony,14283.0,Ethan,18467.902969
3,Zoey,6403.0,Savannah,7366.640799,David,13244.0,Anthony,18453.490844
4,Hailey,6281.0,Jennifer,7228.749185,Logan,12996.0,Logan,14027.487466


Predictions for 2012:
Tau: 0.7242070979607882


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17328.0,Madison,18804.110677,Ethan,17687.0,Michael,24953.453334
1,Madison,11410.0,Olivia,13340.188931,Michael,16202.0,David,19237.939846
2,Zoey,7476.0,Sydney,10266.440741,Anthony,13215.0,Anthony,18858.919995
3,Evelyn,6877.0,Jennifer,7385.811041,David,12553.0,Ethan,18518.85802
4,Hailey,5914.0,Hailey,7366.640799,Logan,12485.0,Zachary,13947.401376


Predictions for 2013:
Tau: 0.7015504793585317


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18446.0,Madison,19181.016153,Ethan,16254.0,Michael,24954.22729
1,Madison,10602.0,Olivia,13831.9422,Michael,15530.0,David,19236.912752
2,Evelyn,7668.0,Sydney,9613.854059,David,12381.0,Anthony,19235.825471
3,Zoey,7250.0,Jennifer,7385.811041,Logan,12360.0,Ethan,18924.729189
4,Savannah,5252.0,Hailey,7366.640799,Anthony,12287.0,Zachary,14059.320936


Predictions for 2014:
Tau: 0.6833604938392841


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19829.0,Madison,19181.016153,Ethan,15737.0,Michael,24954.22729
1,Madison,10328.0,Olivia,13943.861761,Michael,15487.0,Ethan,19301.634665
2,Evelyn,8750.0,Sydney,9581.740697,Logan,13661.0,Anthony,19235.825471
3,Zoey,7430.0,Jennifer,7228.923354,David,12243.0,David,18458.311807
4,Savannah,5485.0,Hailey,7209.753112,Anthony,11597.0,Zachary,14019.16992


Predictions for 2015:
Tau: 0.6681561215868408


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19717.0,Madison,18401.764598,Ethan,15095.0,Michael,24954.22729
1,Madison,10099.0,Olivia,13903.590483,Michael,14474.0,Ethan,19301.634665
2,Evelyn,9370.0,Sydney,9707.535314,Logan,12934.0,Anthony,18456.573916
3,Zoey,6964.0,Savannah,7212.513455,David,11821.0,David,18365.458521
4,Savannah,5465.0,Jennifer,7088.016254,Anthony,10650.0,Logan,14027.487466


Predictions for 2016:
Tau: 0.6511287538344822


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,19397.0,Madison,18311.418381,Michael,14120.0,Michael,24954.22729
1,Evelyn,10115.0,Olivia,13427.302077,Ethan,13843.0,Ethan,18522.38311
2,Madison,9047.0,Sydney,10266.440741,Logan,11268.0,David,18453.239909
3,Zoey,6472.0,Savannah,7369.401142,David,11142.0,Anthony,18366.669718
4,Savannah,5383.0,Jennifer,7088.016254,Owen,10301.0,Logan,14064.480272


Predictions for 2017:
Tau: 0.6365565434761141


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18763.0,Madison,18398.239508,Logan,14042.0,Michael,24954.22729
1,Evelyn,10741.0,Olivia,13313.471292,Michael,12700.0,David,18861.560845
2,Madison,7901.0,Sydney,10459.650372,Ethan,12471.0,Anthony,18453.490844
3,Zoey,6069.0,Savannah,7369.401142,David,10222.0,Ethan,18432.036893
4,Savannah,5252.0,Jennifer,7231.241133,Owen,9360.0,Logan,13954.189207


Predictions for 2018:
Tau: 0.628480432867197


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18039.0,Madison,18804.110677,Logan,12427.0,Michael,24955.974144
1,Evelyn,10427.0,Olivia,13336.91046,Ethan,11918.0,David,19238.466321
2,Madison,7084.0,Sydney,10266.440741,Michael,11729.0,Anthony,18858.919995
3,Zoey,5937.0,Jennifer,7388.12882,David,9795.0,Ethan,18518.85802
4,Ellie,5216.0,Hailey,7369.401142,Owen,9330.0,Zachary,13948.028813


Predictions for 2019:
Tau: 0.6126304221382397


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,18549.0,Madison,19179.43365,Ethan,11291.0,Michael,24955.974144
1,Evelyn,10433.0,Olivia,13831.9422,Michael,10589.0,David,19239.086273
2,Madison,6290.0,Sydney,9613.854059,Logan,10551.0,Anthony,19235.825471
3,Zoey,5676.0,Jennifer,7388.12882,Owen,9341.0,Ethan,18924.729189
4,Ellie,5427.0,Hailey,7369.401142,Mateo,9011.0,Zachary,14059.948374


Predictions for 2020:
Tau: 0.6040943037949353


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17664.0,Madison,19179.43365,Michael,9805.0,Michael,24954.94705
1,Evelyn,9503.0,Olivia,13943.893172,Ethan,9533.0,Ethan,19300.052162
2,Madison,6238.0,Sydney,9581.740697,Logan,9140.0,Anthony,19235.825471
3,Ellie,5592.0,Jennifer,7231.241133,Mateo,8990.0,David,18462.224602
4,Zoey,5373.0,Hailey,7212.513455,Owen,8658.0,Zachary,14019.677096


Predictions for 2021:
Tau: 0.5925637798130988


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,17798.0,Madison,18400.182095,Mateo,9159.0,Michael,24954.94705
1,Evelyn,9475.0,Olivia,13903.621894,Michael,9103.0,Ethan,19300.052162
2,Madison,5952.0,Sydney,9707.535314,Ethan,8842.0,Anthony,18456.818091
3,Ellie,5850.0,Savannah,7212.544866,Logan,8816.0,David,18370.963007
4,Zoey,5197.0,Jennifer,7088.016254,Owen,8755.0,Logan,14024.208995


Predictions for 2022:
Tau: 0.5732889796517637


Unnamed: 0,0,1,2,3,4,5,6,7
0,Olivia,16573.0,Madison,18309.835878,Mateo,10321.0,Michael,24954.94705
1,Evelyn,9289.0,Olivia,13427.333488,Michael,8829.0,Ethan,18520.800607
2,Ellie,5761.0,Sydney,10266.472152,Owen,8546.0,David,18457.784133
3,Madison,5479.0,Savannah,7369.432553,Ethan,8271.0,Anthony,18366.913893
4,Stella,4830.0,Jennifer,7091.187395,David,7741.0,Logan,14064.480272
