In [8]:
### import & init 
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, max_error
import random
import numpy as np


mbd_file = 'mbd_train.csv'
mbd = pd.read_csv(mbd_file)
mbd['date_order'] = mbd.groupby(['cfips']).rank()['first_day_of_month'] - 1
mbd = mbd.rename(columns={'microbusiness_density':'mbd'})

In [9]:
### pivot the data so that we can create the lag 

# get the cfips
cfips = pd.unique(mbd['cfips'])
# use 20 months of training to find the next months data
train_number_months = 20
# get a count of how many records we will create per cfip
iterations_per_cfip = max(mbd['date_order']) - train_number_months
# create dataframe to populate
mbd_exploded = pd.DataFrame(columns=['cfips','iter'] + list(range(0,train_number_months)))
# pivot the entire thing
mbd_pivotted = mbd.pivot(index='cfips',columns='date_order',values='mbd')


In [10]:
### This is to create the lag


# get the cfips
cfips = pd.unique(mbd['cfips'])
# use 20 months of training to find the next months data
train_number_months = 20
# get a count of how many records we will create per cfip
iterations_per_cfip = max(mbd['date_order']) - train_number_months
# create dataframe to populate
mbd_exploded = pd.DataFrame(columns=['cfips','iter'] + list(range(0,train_number_months)))
# pivot the entire thing
mbd_pivotted = mbd.pivot(index='cfips',columns='date_order',values='mbd')
# get the counties to loop thru
cfips_to_loop = cfips
# this is when we limit the counties randomly
cfips_to_loop = random.sample(list(cfips), 1000)
# just a list of possible starting months 
iters = list(range(0, int(iterations_per_cfip)))
# how many samples per county - max = len(iters)
samples_per_cfip = 10

# init an empty array
mbd_exploded_np = np.empty((len(cfips_to_loop)*samples_per_cfip, train_number_months))
# this is the row iterator for the numpy array
j = 0
# loop thru each cfip
for cfip in cfips_to_loop:
    # for each cfip, get a random set of consecutive months to train on
    random_i = random.sample(iters, samples_per_cfip)
    
    for i in random_i:
        # get the columns we need
        columns_to_grab = list(range(i,i+train_number_months))
        # grab those columns from the pivotted df
        to_append = mbd_pivotted.loc[[cfip],columns_to_grab]
        # send it to a scalar array
        to_append_np = to_append.to_numpy()
        # chagne the big array's row to that row in question
        mbd_exploded_np[j,:] = to_append_np

        # iterate plus one
        j += 1

In [11]:
### test / train split

# get rid of cfips & iter
X = mbd_exploded_np
y = X[:,-1]
X = X[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [12]:
### linear model

regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.8f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.8f" % r2_score(y_test, y_pred))

Coefficients: 
 [-2.13253568e-02  1.41516036e-02  2.83462453e-02  3.38695647e-02
 -4.91402101e-02 -3.77355593e-02 -2.61166404e-02  8.63414273e-02
 -5.93073206e-02 -2.10525595e-02  1.79635296e-02  7.80450054e-02
 -6.46830550e-02 -2.80130243e-02  1.61080481e-02 -4.08334179e-02
  6.56258029e-02  5.37557335e-04  1.00944595e+00]
Mean squared error: 0.11094970
Coefficient of determination: 0.99864035


In [13]:
### ridge model 

clf = Ridge(alpha=0.5)
clf.fit(X_train, y_train)
y_pred_ridge = clf.predict(X_test)

print("Coefficients: \n", clf.coef_)
# The mean squared error
print("Mean squared error: %.8f" % mean_squared_error(y_test, y_pred_ridge))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.8f" % r2_score(y_test, y_pred_ridge))

Coefficients: 
 [-2.13108805e-02  1.41432056e-02  2.83155205e-02  3.37987748e-02
 -4.90379668e-02 -3.77403751e-02 -2.60639975e-02  8.62842441e-02
 -5.92836063e-02 -2.10591289e-02  1.79958951e-02  7.79086689e-02
 -6.45830293e-02 -2.79983746e-02  1.59985148e-02 -4.07471962e-02
  6.55344177e-02  6.49733443e-04  1.00942374e+00]
Mean squared error: 0.11083245
Coefficient of determination: 0.99864179


In [14]:
### kernel model

kern = KernelRidge(alpha=0.5)
kern.fit(X_train, y_train)
y_pred_kernel = kern.predict(X_test)


# The mean squared error
print("Mean squared error: %.8f" % mean_squared_error(y_test, y_pred_kernel))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.8f" % r2_score(y_test, y_pred_kernel))

Coefficients: 
 [-2.13253568e-02  1.41516036e-02  2.83462453e-02  3.38695647e-02
 -4.91402101e-02 -3.77355593e-02 -2.61166404e-02  8.63414273e-02
 -5.93073206e-02 -2.10525595e-02  1.79635296e-02  7.80450054e-02
 -6.46830550e-02 -2.80130243e-02  1.61080481e-02 -4.08334179e-02
  6.56258029e-02  5.37557335e-04  1.00944595e+00]
Mean squared error: 0.11052272
Coefficient of determination: 0.99864558


In [16]:
### model comparisons

mse_linear = mean_squared_error(y_test, y_pred)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mse_kernel = mean_squared_error(y_test, y_pred_kernel)

r2_linear = r2_score(y_test, y_pred)
r2_ridge = r2_score(y_test, y_pred_ridge)
r2_kernel = r2_score(y_test, y_pred_kernel)

maxE_linear = max_error(y_test, y_pred)
maxE_ridge = max_error(y_test, y_pred_ridge)
maxE_kernel = max_error(y_test, y_pred_kernel)



# Create a DataFrame to store the results
model_comp = pd.DataFrame(index=['OLS', 'Ridge', 'Kernel'], columns=['MSE', 'R-squared', 'Max Error'])

# Store the results in the DataFrame
model_comp.loc['OLS'] = [mse_linear, r2_linear, maxE_linear]
model_comp.loc['Ridge'] = [mse_ridge, r2_ridge, maxE_ridge]
model_comp.loc['Kernel'] = [mse_kernel, r2_kernel, maxE_kernel]
model_comp

Unnamed: 0,MSE,R-squared,Max Error
OLS,0.11095,0.99864,12.17397
Ridge,0.110832,0.998642,12.161082
Kernel,0.110523,0.998646,12.150024
