In [3]:
%matplotlib inline

# data manipulation and modeling
import numpy as np
import pandas as pd
import statsmodels.api as sm

# graphix
import matplotlib.pyplot as plt
#import prettyplotlib as pplt
import seaborn as sns
import statsmodels.graphics.tsaplots as tsaplots

# utility
import os

# notebook parameters
pd.set_option('display.max_columns', 40) # number of columns in training set
plt.rcParams['figure.figsize'] = (14.0, 8.0)

In [5]:
training_data = pd.read_csv("../data/TrainingSet.csv", index_col=0)
submission_labels = pd.read_csv("../data/SubmissionRows.csv", index_col=0)

In [6]:
training_data.head()

Unnamed: 0,1972 [YR1972],1973 [YR1973],1974 [YR1974],1975 [YR1975],1976 [YR1976],1977 [YR1977],1978 [YR1978],1979 [YR1979],1980 [YR1980],1981 [YR1981],1982 [YR1982],1983 [YR1983],1984 [YR1984],1985 [YR1985],1986 [YR1986],1987 [YR1987],1988 [YR1988],1989 [YR1989],1990 [YR1990],1991 [YR1991],1992 [YR1992],1993 [YR1993],1994 [YR1994],1995 [YR1995],1996 [YR1996],1997 [YR1997],1998 [YR1998],1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007],Country Name,Series Code,Series Name
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.769214,Afghanistan,allsi.bi_q1,(%) Benefits held by 1st 20% population - All ...
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.027746,Afghanistan,allsp.bi_q1,(%) Benefits held by 1st 20% population - All ...
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.244887,Afghanistan,allsa.bi_q1,(%) Benefits held by 1st 20% population - All ...
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12.933105,Afghanistan,allsi.gen_pop,(%) Generosity of All Social Insurance
5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,18.996814,Afghanistan,allsp.gen_pop,(%) Generosity of All Social Protection


In [7]:
submission_labels.head()

Unnamed: 0,2008 [YR2008],2012 [YR2012]
559,,
618,,
753,,
1030,,
1896,,


In [8]:
training_data.loc[559]

1972 [YR1972]                                    NaN
1973 [YR1973]                                    NaN
1974 [YR1974]                                    NaN
1975 [YR1975]                                    NaN
1976 [YR1976]                                    NaN
1977 [YR1977]                                    NaN
1978 [YR1978]                                    NaN
1979 [YR1979]                                    NaN
1980 [YR1980]                                    NaN
1981 [YR1981]                                    NaN
1982 [YR1982]                                    NaN
1983 [YR1983]                                    NaN
1984 [YR1984]                                    NaN
1985 [YR1985]                                    NaN
1986 [YR1986]                                    NaN
1987 [YR1987]                                    NaN
1988 [YR1988]                                    NaN
1989 [YR1989]                                    NaN
1990 [YR1990]                                 

In [12]:
def generate_year_list(start, stop=None):
    """ 
    make a list of column names for specific years
    in the format they appear in the data frame start/stop inclusive
    """
    
    if isinstance(start, list):
        data_range = start
    elif stop:
        data_range = range(start, stop+1)
    else:
        data_range = [start]
    
    yrs = []
    
    for yr in data_range:
        yrs.append("{0} [YR{0}]".format(yr))
        
    return yrs

# ========== TEST CASES =======
# one year
print(generate_year_list(2008))

# start and stop (inclusive)
print(generate_year_list(1985, 1990))

# custom year list
print(generate_year_list([1985, 1990]))

['2008 [YR2008]']
['1985 [YR1985]', '1986 [YR1986]', '1987 [YR1987]', '1988 [YR1988]', '1989 [YR1989]', '1990 [YR1990]']
['1985 [YR1985]', '1990 [YR1990]']


In [13]:

prediction_rows = training_data.loc[submission_labels.index]
prediction_rows = prediction_rows[generate_year_list(1972, 2007)]
prediction_rows.head()

Unnamed: 0,1972 [YR1972],1973 [YR1973],1974 [YR1974],1975 [YR1975],1976 [YR1976],1977 [YR1977],1978 [YR1978],1979 [YR1979],1980 [YR1980],1981 [YR1981],1982 [YR1982],1983 [YR1983],1984 [YR1984],1985 [YR1985],1986 [YR1986],1987 [YR1987],1988 [YR1988],1989 [YR1989],1990 [YR1990],1991 [YR1991],1992 [YR1992],1993 [YR1993],1994 [YR1994],1995 [YR1995],1996 [YR1996],1997 [YR1997],1998 [YR1998],1999 [YR1999],2000 [YR2000],2001 [YR2001],2002 [YR2002],2003 [YR2003],2004 [YR2004],2005 [YR2005],2006 [YR2006],2007 [YR2007]
559,,,,,,,,,,,,,,,,,,,,0.048,0.049,0.049,0.049,0.049,0.084,0.118,0.152,0.187,0.221,0.256,0.291,0.325,0.36,0.395,0.43,0.465
618,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,4.7e-05,4.6e-05,0.000879,0.001058,0.012241,0.021071,0.019
753,0.296,0.2909,0.2852,0.2798,0.2742,0.2683,0.2624,0.2565,0.2503,0.2439,0.2374,0.2304,0.2229,0.2151,0.2071,0.1993,0.1914,0.1836,0.1762,0.1693,0.1627,0.1571,0.1521,0.1479,0.1446,0.1417,0.1391,0.1366,0.1339,0.131,0.1277,0.1244,0.121,0.1177,0.1145,0.1115
1030,,,,,,,,,,,,,,,,,,,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
1896,,,,,,,,,,,,,,,,,,,,,,,0.964,0.964,0.965,0.965,0.965,0.965,0.965,0.964,0.964,0.963,0.963,0.962,0.962,0.961


In [14]:
# grab a random sample of 10 of the timeseries
np.random.seed(896)
rand_rows = np.random.choice(prediction_rows.index.values, size=10)

def plot_rows(data, ids=None, linestyle="-", legend=True):
    # get some colors for the lines
    bmap = pplt.brewer2mpl.get_map('Set3','Qualitative', 10)
    colors = bmap.mpl_colors
    
    if not None == ids:
        get_rows = lambda: enumerate(ids)
    else:
        get_rows = lambda: enumerate(data.index.values)
    
    for i, r in get_rows():
        # get the time series values
        time_data = data.loc[r]

        # create an x axis to plot along
        just_years = [y[:4] for y in data.columns]
        X = pd.DatetimeIndex(just_years)

        # get time series info for labeling
        country, descrip = training_data[["Country Name", "Series Name"]].loc[r]

        # plot the series
        plt.plot(X, time_data, c=colors[i],
                 label="{} - {}".format(country, descrip), ls=linestyle)
        plt.scatter(X, time_data, alpha=0.8, c=colors[i])

    if legend:
        plt.legend(loc=0)
    plt.title("Progress Towards Subset of MDGs")

plot_rows(prediction_rows, ids=rand_rows)
plt.show()

NameError: name 'pplt' is not defined

In [16]:
def simple_model(series):
    point_2007 = series.iloc[-1]
    point_2006 = series.iloc[-2]
    
    # if just one point, status quo
    if np.isnan(point_2006):
        predictions = np.array([point_2007, point_2007])
    else:
        slope = point_2007 - point_2006
        
        # one year
        pred_2008 = point_2007 + slope
        
        # five years
        pred_2012 = point_2007 + 5*slope
        
        predictions = np.array([pred_2008, pred_2012])

    ix = pd.Index(generate_year_list([2008, 2012]))
    return pd.Series(data=predictions, index=ix)
        
# let's try just these predictions on the first five rows
test_data = prediction_rows.head()
test_predictions = test_data.apply(simple_model, axis=1)

# combine the data and the predictions
test_predictions = test_data.join(test_predictions)

# let's take a look at 2006, 2007, and our predictions

In [17]:

# make the predictions
predictions = prediction_rows.loc[rand_rows].apply(simple_model, axis=1)

# plot the data
plot_rows(prediction_rows, ids=rand_rows)

# plot the predictions
plot_rows(predictions, linestyle="--", legend=False)

plt.show()

NameError: name 'pplt' is not defined

In [19]:
def write_submission_file(preds, filename):
    # load the submission labels
    file_format = pd.read_csv(os.path.join("../data", "SubmissionRows.csv"), index_col=0)
    expected_row_count = file_format.shape[0]

    if isinstance(preds, pd.DataFrame):
        # check indices
        assert(preds.index == file_format.index).all(), \
            "DataFrame: Prediction indices must match submission format."
        
        # check columns
        assert (preds.columns == file_format.columns).all(), \
            "DataFrame: Column names must match submission format."
        
        final_predictions = preds
        
    elif isinstance(preds, np.ndarray):
        rows, cols = preds.shape
        
        if cols == 3:
            assert (preds[:,0] == file_format.index.values).all(), \
                "Numpy Array: First column must be indices."
            
            # now we know the indices are cool, ditch them
            preds = preds[:,1:]
        
        assert rows == expected_row_count, \
            "Numpy Array: The predictions must have the right number of rows."
        
        # put the predictions into the dataframe
        final_predictions = file_format.copy()
        final_predictions[generate_year_list([2008, 2012])] = preds
            
    elif isinstance(preds, list):
        assert len(preds) == 2, \
            "list: Predictions must be a list containing two lists"
        assert len(preds[0]) == expected_row_count, \
            "list: There must be the right number of predictions in the first list."
        assert len(preds[1]) == expected_row_count, \
            "list: There must be the right number of predictions in the second list."
    
        # write the predictions
        final_predictions = file_format.copy()
        final_predictions[generate_year_list(2008)] = np.array(preds[0], dtype=np.float64).reshape(-1, 1)
        final_predictions[generate_year_list(2012)] = np.array(preds[1], dtype=np.float64).reshape(-1, 1)
        
    elif isinstance(preds, dict):
        assert preds.keys() == generate_year_list([2008, 2012]), \
            "dict: keys must be properly formatted"
        assert len(preds[generate_year_list(2008)[0]]) == expected_row_count, \
            "dict: length of value for 2008 must match the number of predictions"
        assert len(preds[generate_year_list(2012)[0]]) == expected_row_count, \
            "dict: length of value for 2012 must match the number of predictions"
        
        # create dataframe from dictionary
        final_predictions = pd.DataFrame(preds, index=file_format.index)

    final_predictions.to_csv(filename)
    
simple_predictions = prediction_rows.apply(simple_model, axis=1)
write_submission_file(simple_predictions, "Getting Started Benchmark.csv")