In [4]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


mbd_file = 'mbd_train.csv'
mbd = pd.read_csv(mbd_file)
mbd['date_order'] = mbd.groupby(['cfips']).rank()['first_day_of_month'] - 1
mbd = mbd.rename(columns={'microbusiness_density':'mbd'})

In [5]:
# get the cfips
cfips = pd.unique(mbd['cfips'])
# use 20 months of training to find the next months data
train_number_months = 20
# get a count of how many records we will create per cfip
iterations_per_cfip = max(mbd['date_order']) - train_number_months
# create dataframe to populate
mbd_exploded = pd.DataFrame(columns=['cfips','iter'] + list(range(0,train_number_months)))
# pivot the entire thing
mbd_pivotted = mbd.pivot(index='cfips',columns='date_order',values='mbd')
# loop thru each cfip
for cfip in cfips[:100]:
    # for unique iteration of months to use as variable 1-20
    for i in range(0, int(iterations_per_cfip)):
        # get the columns we need
        columns_to_grab = list(range(i,i+train_number_months))
        # grab those columns from the pivotted df
        to_append = mbd_pivotted.loc[[cfip],columns_to_grab]
        # add a row number for that iteration of the cfip for record keeping sake
        to_append['iter'] = i
        # crazy way of just renaming the columns to grab to a list of range(0-2o)
        to_append = to_append.rename(columns=dict(list(zip(columns_to_grab, list(range(0,train_number_months))))))
        # reset index to get 'cfips' into the dataframe
        to_append = to_append.reset_index(drop=False)
        # mbd_exploded = mbd_exploded.append(to_append, ignore_index=True)
        mbd_exploded = pd.concat([mbd_exploded, to_append], axis=0)


In [10]:

# get rid of cfips & iter
X = mbd_exploded.iloc[:,2:].to_numpy()
y = X[:,-1]
X = X[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.8f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.8f" % r2_score(y_test, y_pred))


Coefficients: 
 [-1.50104441e-01  1.54662417e-01 -3.38808310e-02  2.58216848e-02
  1.97360516e-03  1.12163534e-01 -1.41158702e-01  2.27010400e-01
 -2.66645988e-01  4.72083849e-02  1.26818616e-01 -2.78760104e-01
  1.22793098e-01  2.09942975e-01 -1.05562102e-01 -8.14811455e-04
 -2.86079859e-02  6.50990003e-02  9.06912209e-01]
Mean squared error: 0.02418465
Coefficient of determination: 0.99510087


In [11]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=0.5)
clf.fit(X_train, y_train)
y_pred_ridge = clf.predict(X_test)

print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.8f" % mean_squared_error(y_test, y_pred_ridge))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.8f" % r2_score(y_test, y_pred_ridge))

Coefficients: 
 [-1.50104441e-01  1.54662417e-01 -3.38808310e-02  2.58216848e-02
  1.97360516e-03  1.12163534e-01 -1.41158702e-01  2.27010400e-01
 -2.66645988e-01  4.72083849e-02  1.26818616e-01 -2.78760104e-01
  1.22793098e-01  2.09942975e-01 -1.05562102e-01 -8.14811455e-04
 -2.86079859e-02  6.50990003e-02  9.06912209e-01]
Mean squared error: 0.02402910
Coefficient of determination: 0.99513239


In [12]:

from sklearn.kernel_ridge import KernelRidge
kern = KernelRidge(alpha=0.5)
kern.fit(X_train, y_train)
y_pred_kernel = kern.predict(X_test)

print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.8f" % mean_squared_error(y_test, y_pred_kernel))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.8f" % r2_score(y_test, y_pred_kernel))

Coefficients: 
 [-1.50104441e-01  1.54662417e-01 -3.38808310e-02  2.58216848e-02
  1.97360516e-03  1.12163534e-01 -1.41158702e-01  2.27010400e-01
 -2.66645988e-01  4.72083849e-02  1.26818616e-01 -2.78760104e-01
  1.22793098e-01  2.09942975e-01 -1.05562102e-01 -8.14811455e-04
 -2.86079859e-02  6.50990003e-02  9.06912209e-01]
Mean squared error: 0.02436339
Coefficient of determination: 0.99506467
