In [1]:
import numpy as np
import pandas as pd
import datetime

hp = pd.read_csv("house_price_changes.csv")
hp.head()

Unnamed: 0.1,Unnamed: 0,Case_Shiller_HPI,Case_Shiller_HPI_chg,AHE,NFP_TOT,u_rate,new_home_sales_1qchg,housing_starts_1qchg,30yr_mtge_rate,deliquency,...,NFP_TOT_1yrchg,u_rate_1yrchg,new_home_sales_1yrchg,housing_starts_1yrchg,30yr_mtge_rate_1yrchg,deliquency_1yrchg,GDP_1yrchg,NAHB_1yrchg,Philly_fed_1yrchg,yield_curve_1yrchg
0,6/30/1988,70.439,0.016721,9.296667,0.007337,5.7,0.005489,0.058269,10.11,-0.04,...,0.031517,-0.9,0.023074,-0.066894,0.995385,-0.08,0.072157,-0.166061,0.046667,0.046667
1,9/30/1988,71.805,0.019207,9.396667,0.008163,5.466667,0.000912,0.254106,10.359231,-0.01,...,0.031889,-0.8,0.025856,-0.066062,0.016154,-0.02,0.077063,-0.092373,0.146667,0.146667
2,12/31/1988,73.068,0.017436,9.473333,0.006985,5.466667,-0.003653,-0.124513,10.504615,-0.02,...,0.031751,-0.533333,0.008268,-0.064193,0.023077,-0.05,0.078271,0.012346,-0.22,-0.22
3,3/31/1989,74.375,0.017729,9.58,0.008081,5.333333,-0.003666,-0.098631,10.41,0.06,...,0.030566,-0.5,-0.000918,0.089231,-0.43,-0.01,0.074789,0.061748,-0.043333,-0.043333
4,6/30/1989,75.293,0.012267,9.676667,0.00752,5.2,0.025387,-0.062914,10.82,0.13,...,0.030749,-0.5,0.018979,-0.031952,0.71,0.16,0.08231,0.0,-0.07,-0.07


In [2]:
hp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 32 columns):
Unnamed: 0               119 non-null object
Case_Shiller_HPI         119 non-null float64
Case_Shiller_HPI_chg     119 non-null float64
AHE                      119 non-null float64
NFP_TOT                  119 non-null float64
u_rate                   119 non-null float64
new_home_sales_1qchg     119 non-null float64
housing_starts_1qchg     119 non-null float64
30yr_mtge_rate           119 non-null float64
deliquency               119 non-null float64
GDP                      119 non-null float64
NAHB                     119 non-null float64
Philly_fed               119 non-null float64
yield_curve              119 non-null float64
AHE_1qchg                119 non-null float64
u_rate_1qchg             119 non-null float64
30yr_mtge_rate_1qchg     119 non-null float64
deliquency_1qchg         119 non-null float64
NAHB_1qchg               119 non-null float64
Philly_fed_1qchg      

In [3]:
hp.rename(columns={'Unnamed: 0': 'date'}, inplace=True)
hp['date'] = pd.to_datetime(hp['date'])
hp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 32 columns):
date                     119 non-null datetime64[ns]
Case_Shiller_HPI         119 non-null float64
Case_Shiller_HPI_chg     119 non-null float64
AHE                      119 non-null float64
NFP_TOT                  119 non-null float64
u_rate                   119 non-null float64
new_home_sales_1qchg     119 non-null float64
housing_starts_1qchg     119 non-null float64
30yr_mtge_rate           119 non-null float64
deliquency               119 non-null float64
GDP                      119 non-null float64
NAHB                     119 non-null float64
Philly_fed               119 non-null float64
yield_curve              119 non-null float64
AHE_1qchg                119 non-null float64
u_rate_1qchg             119 non-null float64
30yr_mtge_rate_1qchg     119 non-null float64
deliquency_1qchg         119 non-null float64
NAHB_1qchg               119 non-null float64
Philly_fed_1qc

In [4]:
mask = hp['date'] < datetime.date(2013,1,1)
hp_train = hp[mask]
hp_test = hp[-mask]

### Steps

1. Split to train and test
2. Drop date if it is not a feature in the model
3. Split both to X and y
4. Get list of colnames from X
5. Convert the two X and y dataframes to numpy arrays

In [5]:
def train_test_split_timeseries(input_dataframe, target, timecolumn, year, month, day, dropdates = True):
    """
    The function splits a dataframe containing a time series into non-random train and test subsets. 
    The last observation in the train data is the latest datetime value in the data which precedes 
    the breakpoint given by the (year, month, day) value. The first observation in the test data is the 
    breakpoint given by the (year, month, day) value or the first observation afterwards.
    
    Parameters:
        input_dataframe (Pandas dataframe): The data file with the time series data.
        target (string): Name of the target variable in the input dataframe.
        timecolumn (string): The name of the time colummn for splitting the dataframe (usually a date column).
        year, month, day (int): The year, month, day components of the breakpoint.
        dropdates (boolean): Whether or not to drop the date column to produce the train/test data. Defaults to True. 
    
    Returns: 
        X_train (array): A numpy array of training input data.
        y_train (array): A numpy array of training target data.
        X_test (array): A numpy array of test input data.
        y_test (array): A numpy array of test target data.
        feature_names (array): A list of feature names used in the input matrix.
    """
    
    # Split to train and test periods.
    model_df = input_dataframe
    target = target
    timecolumn = timecolumn
    mask = model_df[timecolumn] < datetime.date(year,month,day)
    model_df_train = model_df[mask]
    model_df_test = model_df[-mask]
    
    # Drop date column if dropdates = True
    if dropdates:
        model_df_train = model_df_train.drop(['date'], axis=1)
        model_df_test = model_df_test.drop(['date'], axis=1)
    
    # Split both train and test to X (input) and y (target)
    X_train = model_df_train.drop([target], axis=1)
    y_train = model_df_train[target]
    
    X_test = model_df_test.drop([target], axis=1)
    y_test = model_df_test[target]
    
    # Get column names for variable importance
    feature_names = list(X_train)
    
    # Convert X_train, X_test, y_train, y_test to numpy arrays
    X_train = X_train.as_matrix()
    X_test = X_test.as_matrix()
    y_train = y_train.as_matrix()
    y_test = y_test.as_matrix()
    
    
    return X_train, y_train, X_test, y_test, feature_names



In [6]:
X_train, y_train, X_test, y_test, feature_names = train_test_split_timeseries(hp, 'Case_Shiller_HPI_chg', 'date', 2013, 1,1)

In [7]:
print('Feature_names')
print(feature_names)
print()
print("X_train type:", type(X_train), ", shape:", X_train.shape)
print("y_train type:", type(y_train), ", shape:", y_train.shape)
print("X_test type:", type(X_test), ", shape:", X_test.shape)
print("y_test type:", type(y_test), ", shape:", y_test.shape)

Feature_names
['Case_Shiller_HPI', 'AHE', 'NFP_TOT', 'u_rate', 'new_home_sales_1qchg', 'housing_starts_1qchg', '30yr_mtge_rate', 'deliquency', 'GDP', 'NAHB', 'Philly_fed', 'yield_curve', 'AHE_1qchg', 'u_rate_1qchg', '30yr_mtge_rate_1qchg', 'deliquency_1qchg', 'NAHB_1qchg', 'Philly_fed_1qchg', 'yield_curve_1qchg', 'AHE_1yrchg', 'NFP_TOT_1yrchg', 'u_rate_1yrchg', 'new_home_sales_1yrchg', 'housing_starts_1yrchg', '30yr_mtge_rate_1yrchg', 'deliquency_1yrchg', 'GDP_1yrchg', 'NAHB_1yrchg', 'Philly_fed_1yrchg', 'yield_curve_1yrchg']

X_train type: <class 'numpy.ndarray'> , shape: (99, 30)
y_train type: <class 'numpy.ndarray'> , shape: (99,)
X_test type: <class 'numpy.ndarray'> , shape: (20, 30)
y_test type: <class 'numpy.ndarray'> , shape: (20,)


In [8]:
help(train_test_split_timeseries)

Help on function train_test_split_timeseries in module __main__:

train_test_split_timeseries(input_dataframe, target, timecolumn, year, month, day, dropdates=True)
    The function splits a dataframe containing a time series into non-random train and test subsets. 
    The last observation in the train data is the latest datetime value in the data which precedes 
    the breakpoint given by the (year, month, day) value. The first observation in the test data is the 
    breakpoint given by the (year, month, day) value or the first observation afterwards.
    
    Parameters:
        input_dataframe (Pandas dataframe): The data file with the time series data.
        target (string): Name of the target variable in the input dataframe.
        timecolumn (string): The name of the time colummn for splitting the dataframe (usually a date column).
        year, month, day (int): The year, month, day components of the breakpoint.
        dropdates (boolean): Whether or not to drop the date 