In [91]:
import numpy as np
import pandas as pd
from data_loader import load_names_from_web, holdout_split
from sklearn.metrics import mean_squared_log_error

In [2]:
# Load the raw names data through the project loader, requesting the 'national' category.
# NOTE(review): the exact effect of `hide_pre_1937` and `use_existing_files` lives in
# data_loader and is not visible here — confirm there before changing them.
dfraw = load_names_from_web(category='national', hide_pre_1937=True, use_existing_files=True)
# Split twice: first carve `holdout` off the full data, then carve `test` off the remainder.
# Presumably `holdout` is reserved for a final evaluation — verify against holdout_split.
traintestval, holdout = holdout_split(dfraw)
trainval, test = holdout_split(traintestval)

In [39]:
# Inspect the train/validation split (the notebook shows an abbreviated view of the frame).
trainval

Unnamed: 0,state,year,name,M/F,count
0,US,2000,Emily,F,25957
1,US,2000,Hannah,F,23085
2,US,2000,Ashley,F,17998
3,US,2000,Sarah,F,17712
4,US,2000,Alexis,F,17631
...,...,...,...,...,...
866154,US,2019,Zygmunt,M,5
866155,US,2019,Zyheem,M,5
866156,US,2019,Zyking,M,5
866157,US,2019,Zyn,M,5


In [44]:
def get_all_known_names(data):
    """Return one row per distinct (state, name, M/F) combination found in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'state'``, ``'name'`` and ``'M/F'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['state', 'name', 'M/F']`` with a fresh integer index, one
        row per combination, in the sorted order produced by ``groupby``.
    """
    key_columns = ['state', 'name', 'M/F']
    # The group sizes are only a vehicle for enumerating the distinct keys;
    # the size column itself is discarded by the final column selection.
    group_sizes = data.groupby(key_columns).size()
    return group_sizes.reset_index()[key_columns]

In [105]:
def evaluate(predictor, data_held_out, first_year_to_predict, metric='msle'):
    """Score a predictor year by year on held-out data.

    The predictor is only shown rows dated before ``first_year_to_predict``
    and is asked to predict every year from ``first_year_to_predict`` up to
    the latest year present in ``data_held_out``. For each of those years the
    predictions are compared against the observed counts for every
    (state, name, M/F) combination that appears in the historical rows.

    Parameters
    ----------
    predictor : object
        Must expose ``predict(historical_data, years_to_predict)`` returning a
        DataFrame with columns ``'state'``, ``'name'``, ``'M/F'``, ``'year'``
        and ``'count'``.
    data_held_out : pandas.DataFrame
        Must contain columns ``'state'``, ``'year'``, ``'name'``, ``'M/F'``
        and ``'count'``.
    first_year_to_predict : int
        First year to score; earlier rows form the predictor's history.
    metric : str, default 'msle'
        Loss to compute. Only ``'msle'`` (mean squared log error) is supported.

    Returns
    -------
    dict
        Maps each predicted year to its loss. The loss for each year is also
        printed as it is computed.

    Raises
    ------
    ValueError
        If ``metric`` is not a supported metric name.
    """
    # Fail fast: previously an unknown metric silently produced no scores.
    if metric != 'msle':
        raise ValueError(f"Unsupported metric {metric!r}; supported metrics: 'msle'")

    merge_keys = ['state', 'name', 'M/F', 'year']

    most_recent_year = data_held_out['year'].max()
    years_to_predict = range(first_year_to_predict, most_recent_year + 1)

    # only allow the model to see data from before the year to predict
    historical_data = data_held_out[data_held_out['year'] < first_year_to_predict]

    # get our model's predictions
    predictions = predictor.predict(historical_data, years_to_predict)

    all_known_names = get_all_known_names(historical_data)

    losses = {}
    for year_to_predict in years_to_predict:

        print(f'Predictions for {year_to_predict}:')

        names_to_predict = all_known_names.assign(year=year_to_predict)

        observed = names_to_predict.merge(data_held_out, how='left', on=merge_keys)

        # for now, fill in missing values with 2, same as FiveThirtyEight did;
        # reasoning: missing values could be 0 to 4, so average is 2
        observed['count'] = observed['count'].fillna(2)

        # Left merge keeps every known name; a name the predictor skipped ends
        # up with a missing count_pred (NOTE(review): the metric is expected
        # to reject NaN input — confirm the predictor covers every name).
        score_df = observed.merge(predictions, how='left', on=merge_keys, suffixes=('_true', '_pred'))

        y_true = score_df['count_true']
        y_pred = score_df['count_pred']

        loss = mean_squared_log_error(y_true, y_pred)
        print(f'Loss: {loss}')
        losses[year_to_predict] = loss

    return losses

# NOTE: DummyPredictor is defined in a later cell of this notebook; on a fresh kernel
# that cell must be run before this call.
evaluate(DummyPredictor(strategy='naive'), test, 1993)

Predictions for 1993:
Loss: 0.2048210376167501
Predictions for 1994:
Loss: 0.23748961163833746
Predictions for 1995:
Loss: 0.2944425991076037
Predictions for 1996:
Loss: 0.345131451965153
Predictions for 1997:
Loss: 0.39172905985529827
Predictions for 1998:
Loss: 0.46885007263667744
Predictions for 1999:
Loss: 0.5491453016551922
Predictions for 2000:
Loss: 0.6217554907260967
Predictions for 2001:
Loss: 0.6990891031726914
Predictions for 2002:
Loss: 0.7676513819895272
Predictions for 2003:
Loss: 0.8523447530601621
Predictions for 2004:
Loss: 0.924022538364825
Predictions for 2005:
Loss: 0.9822846196943936
Predictions for 2006:
Loss: 1.091002004905991
Predictions for 2007:
Loss: 1.1744378337438273
Predictions for 2008:
Loss: 1.2724181760639353
Predictions for 2009:
Loss: 1.3272445805372455
Predictions for 2010:
Loss: 1.3913044503027057
Predictions for 2011:
Loss: 1.4703149252733005
Predictions for 2012:
Loss: 1.5317617595476543
Predictions for 2013:
Loss: 1.5798350289998204
Predictions f

In [99]:
class DummyPredictor():
    """Baseline predictor that carries the last observed year forward unchanged.

    Every requested year receives the same prediction, derived from the year
    immediately before the first year to predict.

    Strategies
    ----------
    'naive'
        Each known (state, name, M/F) combination is predicted to repeat its
        count from that previous year; combinations with no row in that year
        are predicted as 2.
    'mean'
        Every known combination is predicted as the mean count of that
        previous year.
    """

    # Columns that identify one name series.
    _KEY_COLUMNS = ('state', 'name', 'M/F')
    _STRATEGIES = ('naive', 'mean')

    def __init__(self, strategy='naive'):
        """Store the strategy name.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the supported strategies.
        """
        if strategy not in self._STRATEGIES:
            raise ValueError(
                f'Unknown strategy {strategy!r}; expected one of {self._STRATEGIES}'
            )
        self.strategy = strategy

    def predict(self, historical_data, years_to_predict):
        """Predict counts for every known name in each requested year.

        Parameters
        ----------
        historical_data : pandas.DataFrame
            Must contain columns ``'state'``, ``'year'``, ``'name'``, ``'M/F'``
            and ``'count'``.
        years_to_predict : iterable of int
            Years to predict, in order; the year before the first one is used
            as the reference year.

        Returns
        -------
        pandas.DataFrame
            One row per known (state, name, M/F) combination per requested
            year, with at least the columns ``'state'``, ``'name'``, ``'M/F'``,
            ``'year'`` and ``'count'``. Empty (with those columns) when no
            years are requested.

        Raises
        ------
        ValueError
            If ``self.strategy`` was changed to an unsupported value.
        """
        keys = list(self._KEY_COLUMNS)

        # Materialise once so any iterable (range, list, generator) works and
        # an empty request does not fail on ``[0]`` or on ``pd.concat([])``.
        years = list(years_to_predict)
        if not years:
            return pd.DataFrame(columns=keys + ['year', 'count'])

        all_known_names = get_all_known_names(historical_data)
        previous_year_data = historical_data[historical_data['year'] == years[0] - 1].drop(columns=['year'])

        # The prediction is identical for every year, so build it once.
        if self.strategy == 'naive':
            template = all_known_names.merge(previous_year_data, how='left', on=keys)
            # Names with no row in the reference year get the same placeholder
            # value of 2 that is used for missing observations.
            template['count'] = template['count'].fillna(2)
        elif self.strategy == 'mean':
            template = all_known_names.copy()
            template['count'] = previous_year_data['count'].mean()
        else:
            raise ValueError(
                f'Unknown strategy {self.strategy!r}; expected one of {self._STRATEGIES}'
            )

        predictions = []
        for year_to_predict in years:
            prediction = template.copy()
            # Place 'year' right after the key columns, matching the layout
            # of state, name, M/F, year, <remaining columns>.
            prediction.insert(len(keys), 'year', year_to_predict)
            predictions.append(prediction)

        return pd.concat(predictions, ignore_index=True)