In [4]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


mbd_file = 'mbd_train.csv'
mbd = pd.read_csv(mbd_file)
mbd['date_order'] = mbd.groupby(['cfips']).rank()['first_day_of_month'] - 1
mbd = mbd.rename(columns={'microbusiness_density':'mbd'})

In [5]:
# get the cfips
cfips = pd.unique(mbd['cfips'])
# use 20 months of training to find the next months data
train_number_months = 20
# get a count of how many records we will create per cfip
iterations_per_cfip = max(mbd['date_order']) - train_number_months
# create dataframe to populate
mbd_exploded = pd.DataFrame(columns=['cfips','iter'] + list(range(0,train_number_months)))
# pivot the entire thing
mbd_pivotted = mbd.pivot(index='cfips',columns='date_order',values='mbd')
# loop thru each cfip
for cfip in cfips[:100]:
    # for unique iteration of months to use as variable 1-20
    for i in range(0, int(iterations_per_cfip)):
        # get the columns we need
        columns_to_grab = list(range(i,i+train_number_months))
        # grab those columns from the pivotted df
        to_append = mbd_pivotted.loc[[cfip],columns_to_grab]
        # add a row number for that iteration of the cfip for record keeping sake
        to_append['iter'] = i
        # crazy way of just renaming the columns to grab to a list of range(0-2o)
        to_append = to_append.rename(columns=dict(list(zip(columns_to_grab, list(range(0,train_number_months))))))
        # reset index to get 'cfips' into the dataframe
        to_append = to_append.reset_index(drop=False)
        # mbd_exploded = mbd_exploded.append(to_append, ignore_index=True)
        mbd_exploded = pd.concat([mbd_exploded, to_append], axis=0)


In [6]:

# get rid of cfips & iter
X = mbd_exploded.iloc[:,2:].to_numpy()
y = X[:,-1]
X = X[:,:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))


Coefficients: 
 [-0.21583413  0.25004917 -0.03252091  0.03475388 -0.03576828  0.13933484
 -0.12709394  0.07155426 -0.1375314   0.03031207  0.13147216 -0.23807329
  0.05671739  0.10503617  0.00273261 -0.0393279  -0.02305061  0.05950023
  0.95955231]
Mean squared error: 0.02
Coefficient of determination: 1.00
