In [3]:
import numpy as np
import pandas as pd
from data_loader import load_names_from_web, holdout_split
from sklearn.metrics import mean_squared_log_error

In [4]:
# Load the national-level name counts (dropping pre-1937 rows and reusing
# any files that are already present, per the keyword arguments below).
dfraw = load_names_from_web(
    category='national',
    hide_pre_1937=True,
    use_existing_files=True,
)

# Two successive splits: the first sets aside `holdout`, the second sets
# aside `test` from whatever is left, leaving `trainval` for model fitting.
traintestval, holdout = holdout_split(dfraw)
trainval, test = holdout_split(traintestval)

In [61]:
# Show the `test` split produced by the second holdout_split call as this cell's output.
test

Unnamed: 0,state,year,name,M/F,count
0,US,2000,Madison,F,19968
1,US,2000,Olivia,F,12854
2,US,2000,Sydney,F,10244
3,US,2000,Jennifer,F,9389
4,US,2000,Amanda,F,8557
...,...,...,...,...,...
212633,US,2019,Zaheen,M,5
212634,US,2019,Zahi,M,5
212635,US,2019,Zaymar,M,5
212636,US,2019,Zeo,M,5


In [6]:
def get_all_known_names(data):
    '''
    Return one row per distinct (state, name, M/F) combination found in `data`.

    The combinations come from a groupby, so the result is ordered by the
    group keys and carries a fresh 0..n-1 index. Only the three key columns
    are returned.
    '''
    key_cols = ['state', 'name', 'M/F']

    # size() gives one entry per group; we only need the group labels,
    # so the size column itself is discarded after reset_index().
    group_sizes = data.groupby(key_cols).size()
    return group_sizes.reset_index()[key_cols]

In [91]:
def select_top_names(data, first_year_to_predict, cutoff=10):
    '''
    Keep only the names that reached `cutoff` in at least one known year.

    A (state, name, M/F) combination qualifies when, in some year strictly
    before `first_year_to_predict`, its count was at or above `cutoff`.
    Every row of `data` belonging to a qualifying combination is returned,
    including rows from `first_year_to_predict` onwards; only the selection
    itself is restricted to earlier years so it does not leak future data.

    Possible alternative: apply the cutoff to the most recent known year only.
    '''
    key_cols = ['state', 'name', 'M/F']

    # Decide which names qualify using pre-prediction years only.
    known = data[data['year'] < first_year_to_predict]
    meets_cutoff = known['count'] >= cutoff
    qualifying = known.loc[meets_cutoff, key_cols].drop_duplicates()

    # Pull back the full history (all years) for the qualifying names.
    return qualifying.merge(data, how='left', on=key_cols)

In [92]:
def evaluate(predictor, data_held_out, first_year_to_predict, metric='msle'):
    '''
    Score a predictor year by year on held-out data and print the results.

    The predictor is only shown rows from before `first_year_to_predict`.
    It is asked to predict every year from `first_year_to_predict` through
    the latest year present in `data_held_out`, and each predicted year is
    scored separately against the observed counts.

    Parameters
    ----------
    predictor : object
        Must expose `predict(historical_data, years_to_predict)` returning a
        DataFrame with at least the columns 'state', 'name', 'M/F', 'year'
        and 'y' (the predicted count).
    data_held_out : DataFrame
        Rows with columns 'state', 'name', 'M/F', 'year' and 'count'.
    first_year_to_predict : int
        First year to score; earlier years form the visible history.
    metric : {'msle', 'rank'}
        'msle' prints sklearn's mean squared log error. 'rank' prints the sum
        of absolute rank differences divided by n * (n - 1).

    Raises
    ------
    ValueError
        If `metric` is not one of the supported names. This is checked before
        any prediction work so that a typo fails immediately instead of
        silently printing year headers with no scores.
    '''
    if metric not in ('msle', 'rank'):
        raise ValueError(f"Unknown metric {metric!r}; expected 'msle' or 'rank'")

    key_cols = ['state', 'name', 'M/F', 'year']

    most_recent_year = data_held_out['year'].max()
    years_to_predict = range(first_year_to_predict, most_recent_year+1)

    # only allow the model to see data from before the year to predict
    historical_data = data_held_out[data_held_out['year'] < first_year_to_predict]

    # get our model's predictions (all years at once)
    predictions = predictor.predict(historical_data, years_to_predict)

    # the set of names being scored is fixed by the visible history
    all_known_names = get_all_known_names(historical_data)

    for year_to_predict in years_to_predict:

        print(f'Predictions for {year_to_predict}:')

        names_to_predict = all_known_names.assign(year=year_to_predict)

        observed = names_to_predict.merge(data_held_out, how='left', on=key_cols)

        # for now, fill in missing values with 2, same as FiveThirtyEight did;
        # reasoning: missing values could be 0 to 4, so average is 2
        observed['y'] = observed['count'].fillna(2)

        # overlapping non-key columns (e.g. 'y') get the _true / _pred suffixes
        score_df = observed.merge(predictions, how='left', on=key_cols, suffixes=('_true', '_pred'))

        y_true = score_df['y_true']
        y_pred = score_df['y_pred']

        if metric == 'msle':
            loss = mean_squared_log_error(y_true, y_pred)
            print(f'Loss: {loss}')
        else:
            # metric == 'rank' (validated above)
            y_true = y_true.rank()
            y_pred = y_pred.rank()
            score = np.sum(np.abs(y_true-y_pred))/(len(y_true)*(len(y_true)-1))
            print(f'Score: {score}')

In [93]:
class DummyPredictor():
    '''
    Baseline predictor that reuses the last known year for every future year.

    Strategies
    ----------
    'naive' : each name is predicted to keep its count from the year just
              before the first predicted year; names with no row in that
              year are predicted as 2.
    'mean'  : every name is predicted as the mean count over the rows of
              that same last known year.
    '''

    # strategies understood by predict()
    STRATEGIES = ('naive', 'mean')

    def __init__(self, strategy='naive'):
        # Fail fast: an unknown strategy would otherwise produce predictions
        # without a 'y' column and only blow up later, far from the cause.
        if strategy not in self.STRATEGIES:
            raise ValueError(f'Unknown strategy {strategy!r}; expected one of {self.STRATEGIES}')
        self.strategy = strategy

    def predict(self, historical_data, years_to_predict):
        '''
        Return a DataFrame of predictions for each year in `years_to_predict`.

        `years_to_predict` must support indexing (e.g. a range or list); its
        first element determines which year of `historical_data` is treated
        as the last known year. The result has one row per known
        (state, name, M/F) per predicted year, with the prediction in 'y'.
        '''
        # Re-check in case the attribute was changed after construction.
        if self.strategy not in self.STRATEGIES:
            raise ValueError(f'Unknown strategy {self.strategy!r}; expected one of {self.STRATEGIES}')

        all_known_names = get_all_known_names(historical_data)

        predictions = []
        # 'year' is dropped so the merge below does not clash with the
        # predicted year column added to each prediction frame
        previous_year_data = historical_data[historical_data['year'] == years_to_predict[0] - 1].drop(columns=['year'])

        for year_to_predict in years_to_predict:

            prediction = all_known_names.copy()
            prediction['year'] = year_to_predict

            if self.strategy == 'naive':
                prediction = prediction.merge(previous_year_data, how='left', on=['state', 'name', 'M/F'])
                # names absent from the last known year get the placeholder 2
                prediction['y'] = prediction['count'].fillna(2)
            else:
                # 'mean' (validated above): same constant for every name
                prediction['y'] = previous_year_data['count'].mean()

            predictions.append(prediction)

        predictions = pd.concat(predictions, ignore_index=True)

        return predictions

In [104]:
# Baseline run: score the naive "repeat last year" predictor on the test split.
first_year_to_predict = 2003
cutoff = 100

# Not needed for the dummy baseline (it has no fit step); kept for reference:
# data_to_fit = select_top_names(trainval, first_year_to_predict=first_year_to_predict, cutoff=0)
data_to_eval = select_top_names(
    test,
    first_year_to_predict=first_year_to_predict,
    cutoff=cutoff,
)
evaluate(
    predictor=DummyPredictor(strategy='naive'),
    data_held_out=data_to_eval,
    first_year_to_predict=first_year_to_predict,
    metric='rank',
)

Predictions for 2003:
Score: 0.03204052376706701
Predictions for 2004:
Score: 0.040194518743844804
Predictions for 2005:
Score: 0.05132340343700285
Predictions for 2006:
Score: 0.058113785889524074
Predictions for 2007:
Score: 0.06422673263287938
Predictions for 2008:
Score: 0.07240549407644388
Predictions for 2009:
Score: 0.080627961049632
Predictions for 2010:
Score: 0.08712843015564996
Predictions for 2011:
Score: 0.09472299435324558
Predictions for 2012:
Score: 0.10091606790091082
Predictions for 2013:
Score: 0.10854122596924297
Predictions for 2014:
Score: 0.11577303427096262
Predictions for 2015:
Score: 0.1199877041776659
Predictions for 2016:
Score: 0.1251303881633771
Predictions for 2017:
Score: 0.13105540112935088
Predictions for 2018:
Score: 0.13403029084573115
Predictions for 2019:
Score: 0.13911033023898184
Predictions for 2020:
Score: 0.1412052819589401
Predictions for 2021:
Score: 0.1478150148890171
Predictions for 2022:
Score: 0.15419310851208895


In [102]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor

class MyPredictor():
    '''
    Gradient-boosted predictor of next-year name counts.

    Each (state, name, M/F) is summarised by a handful of features built from
    its known history (see `preprocess`), and a HistGradientBoostingRegressor
    is trained to map those features to the following year's count. Multiple
    future years are predicted recursively: each predicted year is appended
    to the history before predicting the next.
    '''

    def __init__(self):

        # params
        cols_to_keep = ['M/F', 'sum', 'median_age', 'thisyear_count']
        categorical_features = ['M/F']
        max_leaf_nodes = 16
        max_iter = 100
        loss = 'absolute_error' # abs better than default

        # boolean mask, positionally aligned with cols_to_keep, telling the
        # regressor which of the passed-through columns are categorical
        categorical_mask = [f in categorical_features for f in cols_to_keep]

        self.pipe = make_pipeline(
            # keep only the feature columns; everything else is dropped
            ColumnTransformer(
                transformers=[
                    ('cols_to_keep', 'passthrough', cols_to_keep),
                ], remainder='drop'),
            HistGradientBoostingRegressor(
                random_state=0,
                categorical_features=categorical_mask,
                max_leaf_nodes=max_leaf_nodes,
                max_iter=max_iter,
                loss=loss
            )
        )

        self.gender_encoding = {'M': 0, 'F': 1}

    def preprocess(self, df, latest_known_year):
        '''
        Build one feature row per (state, name, M/F) from the history in `df`.

        Output columns, in addition to the keys:
          median_year    - first year at which the running total of 'count'
                           reaches half of the name's overall total
          count, cumsum  - the count and running total at that median year
          sum            - the name's overall total count
          median_age     - latest_known_year - median_year
          thisyear_count - the count in `latest_known_year` (0 if no row)
        'M/F' is returned numerically encoded via `self.gender_encoding`.
        The input frame is not modified.
        '''
        keys = ['state', 'name', 'M/F']

        # sort_values returns a new frame, so the caller's data is untouched;
        # chronological order is what makes the running total meaningful
        df = df.sort_values(by='year')
        counts_by_name = df.groupby(keys)['count']
        df = df.assign(
            cumsum=counts_by_name.cumsum(),
            sum=counts_by_name.transform('sum'),
        )

        # first (earliest) year per name where the running total has reached
        # half of the total; .assign avoids writing into a filtered slice
        medians = (
            df[df['cumsum'] >= df['sum']/2]
            .drop_duplicates(subset=keys, keep='first')
            .assign(median_age=lambda m: latest_known_year - m['year'])
        )

        thisyear = (
            df.loc[df['year'] == latest_known_year, keys + ['count']]
            .rename(columns={'count': 'thisyear_count'})
        )
        features = medians.merge(thisyear, how='left', on=keys).rename(columns={'year': 'median_year'})
        # might want to shift this to 2 and fill in 2s for missing years? or maybe not
        features['thisyear_count'] = features['thisyear_count'].fillna(0)

        # change M/F to 0/1 so it works with various models
        # (even HistGradientBoostingRegressor, which accepts categorical values,
        # still needs those values to be numbers not strings)
        features['M/F'] = features['M/F'].map(self.gender_encoding)

        return features

    def fit(self, historical_data, first_year_to_predict, years_to_fit=1, weight_decay=0.9):
        '''
        Train the pipeline on the last `years_to_fit` years before
        `first_year_to_predict`.

        For each target year, features are built from strictly earlier years
        and paired with that year's observed count (0 when a name has no row
        that year). Each target year's rows are weighted by
        `weight_decay ** (first_year_to_predict - target_year)`, so older
        target years count for less.

        Raises
        ------
        ValueError
            If `years_to_fit` is below 1, or if no target year has any
            earlier history to build features from.
        '''
        if years_to_fit < 1:
            raise ValueError('years_to_fit must be at least 1')

        keys = ['state', 'name', 'M/F']

        # first things first, we don't want to know about future data
        historical_data = historical_data[historical_data['year'] < first_year_to_predict]
        # at this point the data we don't want to know should be inaccessible

        # Collect one frame per target year and concatenate once at the end,
        # rather than repeatedly growing an (initially empty, dtype-less)
        # DataFrame/Series inside the loop.
        yearly_frames = []

        # each year_to_fit is the year that's essentially our y for that loop
        for year_to_fit in range(first_year_to_predict - years_to_fit, first_year_to_predict):

            # now we "know" even less for X
            known = historical_data[historical_data['year'] < year_to_fit]
            if known.empty:
                # nothing to build features from for this target year
                continue

            features = self.preprocess(known, year_to_fit - 1)

            target = (
                historical_data.loc[historical_data['year'] == year_to_fit, keys + ['count']]
                .rename(columns={'count': 'y'})
            )
            # match the numeric M/F encoding that preprocess() applied
            target['M/F'] = target['M/F'].map(self.gender_encoding)

            weight = weight_decay ** (first_year_to_predict - year_to_fit)
            data = features.assign(sample_weight=weight).merge(target, how='left', on=keys)
            data['y'] = data['y'].fillna(0)

            yearly_frames.append(data)

        if not yearly_frames:
            raise ValueError('No training rows: no year to fit has any earlier history')

        training = pd.concat(yearly_frames, ignore_index=True)
        display(training)

        y_all = training['y']
        sample_weights = training['sample_weight']
        X_all = training.drop(columns=['y', 'sample_weight'])

        self.pipe.fit(X_all, y_all, **{'histgradientboostingregressor__sample_weight': sample_weights})
        # this seems like a silly way to pass params to individual steps of the pipeline, but it's true. See: https://stackoverflow.com/questions/36205850/sklearn-pipeline-applying-sample-weights-after-applying-a-polynomial-feature-t

    def predict(self, historical_data, years_to_predict):
        '''
        Predict counts for every year in `years_to_predict`, in order.

        Each year is predicted from features of the history known so far;
        the raw predictions are then appended to that history as if they were
        observed counts, so later years build on earlier predictions.

        Returns a DataFrame holding the feature columns plus 'y' (prediction)
        and 'year', with 'M/F' decoded back to its original labels.
        Predictions below 4.5 are replaced by 2 in the returned frame.
        '''
        decode_gender = {code: label for label, code in self.gender_encoding.items()}

        predictions = []

        for year_to_predict in years_to_predict:

            df = self.preprocess(historical_data, year_to_predict - 1)

            df['y'] = self.pipe.predict(df)

            df['year'] = year_to_predict

            # if we want to simplify, do the following;
            # but for now, might be useful to see all data displayed.
            # df = df[['state', 'year', 'name', 'M/F', 'y']]

            predictions.append(df)

            # feed this year's (unclipped) predictions back in as history,
            # with M/F restored to labels so preprocess() can re-encode it
            assumed_new_year_of_historical_data = (
                df[['state', 'year', 'name', 'M/F', 'y']]
                .rename(columns={'y': 'count'})
            )
            assumed_new_year_of_historical_data['M/F'] = assumed_new_year_of_historical_data['M/F'].map(decode_gender)
            historical_data = pd.concat([historical_data, assumed_new_year_of_historical_data], ignore_index=True)

        predictions = pd.concat(predictions, ignore_index=True)

        # we have to reverse the mapping to send our predictions
        # (at least the way we currently have it set up)
        predictions['M/F'] = predictions['M/F'].map(decode_gender)

        # snap small predictions to 2, the same placeholder evaluate() uses
        # for missing observed counts
        predictions.loc[predictions['y'] < 4.5, 'y'] = 2

        display(predictions)
        display(predictions[predictions['y'] < 4.5])

        return predictions

In [105]:
# Fit the gradient-boosted predictor on the train/validation names, then
# score it on the test names with the same settings as the baseline run.
first_year_to_predict = 2003
cutoff = 100

my_predictor = MyPredictor()

data_to_fit = select_top_names(
    trainval,
    first_year_to_predict=first_year_to_predict,
    cutoff=cutoff,
)
data_to_eval = select_top_names(
    test,
    first_year_to_predict=first_year_to_predict,
    cutoff=cutoff,
)

# train on the 30 years leading up to the first predicted year
my_predictor.fit(
    historical_data=data_to_fit,
    first_year_to_predict=first_year_to_predict,
    years_to_fit=30,
)
evaluate(
    predictor=my_predictor,
    data_held_out=data_to_eval,
    first_year_to_predict=first_year_to_predict,
    metric='rank',
)

Unnamed: 0,state,name,M/F,median_year,count,cumsum,sum,median_age,thisyear_count,sample_weight,y
0,US,Cloe,1,1940,8,26,42,32,0.0,0.042391,0.0
1,US,Harlie,1,1940,5,5,5,32,0.0,0.042391,0.0
2,US,Gracey,1,1941,7,7,12,31,0.0,0.042391,0.0
3,US,Rowan,1,1942,7,7,7,30,0.0,0.042391,0.0
4,US,Melvyn,0,1942,282,1613,2811,30,7.0,0.042391,14.0
...,...,...,...,...,...,...,...,...,...,...,...
93592,US,Adamari,1,2001,130,240,240,0,130.0,0.900000,168.0
93593,US,Maximus,0,2001,892,1114,1114,0,892.0,0.900000,912.0
93594,US,Jersey,1,2001,61,88,88,0,61.0,0.900000,110.0
93595,US,Dasani,1,2001,94,161,161,0,94.0,0.900000,105.0


Unnamed: 0,state,name,M/F,median_year,count,cumsum,sum,median_age,thisyear_count,y,year
0,US,Gertrude,F,1945,664.000000,8864.000000,17407.000000,57,11.000000,10.309556,2003
1,US,Flossie,F,1946,108.000000,1532.000000,2988.000000,56,0.000000,2.000000,2003
2,US,Eula,F,1946,380.000000,5131.000000,9796.000000,56,0.000000,2.000000,2003
3,US,Olive,F,1946,123.000000,1847.000000,3510.000000,56,43.000000,38.912649,2003
4,US,Marlys,F,1946,243.000000,2902.000000,5565.000000,56,0.000000,2.000000,2003
...,...,...,...,...,...,...,...,...,...,...,...
16575,US,Memphis,M,2011,151.726435,1695.266727,3187.517440,10,146.769696,145.490800,2022
16576,US,Charlize,F,2012,326.560429,3397.273694,6287.194733,9,317.798428,312.392126,2022
16577,US,Kadejah,F,2013,52.571746,663.468476,1273.727742,8,92.935179,94.783283,2022
16578,US,Diquan,M,2013,45.249426,612.373570,1199.111111,8,93.449667,96.434158,2022


Unnamed: 0,state,name,M/F,median_year,count,cumsum,sum,median_age,thisyear_count,y,year
1,US,Flossie,F,1946,108.0,1532.0,2988.000000,56,0.000000,2.0,2003
2,US,Eula,F,1946,380.0,5131.0,9796.000000,56,0.000000,2.0,2003
4,US,Marlys,F,1946,243.0,2902.0,5565.000000,56,0.000000,2.0,2003
8,US,Virgie,F,1947,188.0,2654.0,5216.000000,55,0.000000,2.0,2003
10,US,Willie,F,1947,1684.0,21064.0,39570.000000,55,0.000000,2.0,2003
...,...,...,...,...,...,...,...,...,...,...,...
14936,US,Gene,F,1949,95.0,1232.0,2443.006201,71,3.885710,2.0,2021
14979,US,Vonnie,F,1954,83.0,1288.0,2546.938881,66,3.823917,2.0,2021
15753,US,Eula,F,1946,380.0,5131.0,9876.892634,75,4.438106,2.0,2022
15770,US,Gene,F,1949,95.0,1232.0,2446.891910,72,3.885710,2.0,2022


Predictions for 2003:
Score: 0.031695250083040506
Predictions for 2004:
Score: 0.03898970297722068
Predictions for 2005:
Score: 0.04927215724666818
Predictions for 2006:
Score: 0.055989697149816726
Predictions for 2007:
Score: 0.06263585135458005
Predictions for 2008:
Score: 0.07140463744806326
Predictions for 2009:
Score: 0.07912303397959243
Predictions for 2010:
Score: 0.08568469082708344
Predictions for 2011:
Score: 0.09351817858662144
Predictions for 2012:
Score: 0.10028525142334341
Predictions for 2013:
Score: 0.10885007837858313
Predictions for 2014:
Score: 0.11635723151693152
Predictions for 2015:
Score: 0.12087346957803768
Predictions for 2016:
Score: 0.12723845154222246
Predictions for 2017:
Score: 0.13193825282774777
Predictions for 2018:
Score: 0.1347660005943952
Predictions for 2019:
Score: 0.1396202280845906
Predictions for 2020:
Score: 0.1422163365442329
Predictions for 2021:
Score: 0.146828726770511
Predictions for 2022:
Score: 0.15365553049771857
