In [1]:
# Import our dependencies
import pandas as pd
from datetime import datetime as dt
import numpy as np

# Machine Learning includes
#from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
#from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
#from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:

# Read in our dataset
candfraw = pd.read_csv("second_segment/County Product Trend_Full Data_data.csv")

cannabis_df = candfraw.copy(deep=True)
cannabis_df

Unnamed: 0,Date,Product Type,County,Month Name,Month Year,Tooltip Date,Market Share County,Sales,Sales Detail
0,10/1/2016,Usable MJ,Baker,October,10/1/2016,October 2016,100.00,58637.51,58637.51
1,10/1/2016,Usable MJ,Baker,October,10/1/2016,October 2016,100.00,441.00,441.00
2,10/1/2016,Concentrate/Extract,Clackamas,October,10/1/2016,October 2016,15.14,803.20,803.20
3,10/1/2016,Edible/Tincture,Clackamas,October,10/1/2016,October 2016,9.31,359.68,359.68
4,10/1/2016,Other,Clackamas,October,10/1/2016,October 2016,1.13,30.00,30.00
...,...,...,...,...,...,...,...,...,...
32416,4/1/2022,Usable MJ,Wasco,April,4/1/2022,April 2022,48.13,3980.86,3980.86
32417,4/1/2022,Usable MJ,Washington,April,4/1/2022,April 2022,47.34,27466.81,27466.81
32418,4/1/2022,Usable MJ,Washington,April,4/1/2022,April 2022,47.34,172242.21,172242.21
32419,4/1/2022,Usable MJ,Yamhill,April,4/1/2022,April 2022,45.68,5089.02,5089.02


In [3]:
#Create a new column with formated Date
cannabis_df['SalesMonthDate'] = pd.to_datetime(cannabis_df["Month Year"])

#Drop columns that are repetative
cannabis_df.drop(columns=["Month Name", "Month Year", "Tooltip Date", "Date", "Sales Detail" ],axis=1,inplace=True)


cannabis_df

Unnamed: 0,Product Type,County,Market Share County,Sales,SalesMonthDate
0,Usable MJ,Baker,100.00,58637.51,2016-10-01
1,Usable MJ,Baker,100.00,441.00,2016-10-01
2,Concentrate/Extract,Clackamas,15.14,803.20,2016-10-01
3,Edible/Tincture,Clackamas,9.31,359.68,2016-10-01
4,Other,Clackamas,1.13,30.00,2016-10-01
...,...,...,...,...,...
32416,Usable MJ,Wasco,48.13,3980.86,2022-04-01
32417,Usable MJ,Washington,47.34,27466.81,2022-04-01
32418,Usable MJ,Washington,47.34,172242.21,2022-04-01
32419,Usable MJ,Yamhill,45.68,5089.02,2022-04-01


In [4]:
# There are rows where the sales are broken out into 2 disctinct values, but the "Market Share County" has them aggregated
# We can show this below
cannabis_df.groupby(['SalesMonthDate', 'Product Type', 'County']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Market Share County,Sales
SalesMonthDate,Product Type,County,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-01,Concentrate/Extract,Clackamas,2,2
2016-10-01,Concentrate/Extract,Clatsop,2,2
2016-10-01,Concentrate/Extract,Deschutes,2,2
2016-10-01,Concentrate/Extract,Hood River,2,2
2016-10-01,Concentrate/Extract,Jackson,2,2
...,...,...,...,...
2022-04-01,Usable MJ,Umatilla,4,4
2022-04-01,Usable MJ,Wallowa,4,4
2022-04-01,Usable MJ,Wasco,4,4
2022-04-01,Usable MJ,Washington,6,6


In [5]:
#Whole dataset 
testdf_CntyNMonth = cannabis_df[(cannabis_df["County"]=="Clackamas")&(cannabis_df["SalesMonthDate"]=="2016-11-01")]
testdf_CntyNMonth

Unnamed: 0,Product Type,County,Market Share County,Sales,SalesMonthDate
101,Concentrate/Extract,Clackamas,13.58,2315.28,2016-11-01
102,Edible/Tincture,Clackamas,4.98,338.96,2016-11-01
103,Other,Clackamas,2.35,486.23,2016-11-01
104,Usable MJ,Clackamas,79.09,17086.67,2016-11-01
105,Concentrate/Extract,Clackamas,13.58,981.5,2016-11-01
106,Edible/Tincture,Clackamas,4.98,871.4,2016-11-01
107,Other,Clackamas,2.35,85.0,2016-11-01
108,Usable MJ,Clackamas,79.09,2116.15,2016-11-01


In [6]:
#We can see the double values here specifically
testdf_CntyNMonth_ProdType = cannabis_df[(cannabis_df["County"]=="Clackamas")&(cannabis_df["SalesMonthDate"]=="2016-11-01")
               &(cannabis_df["Product Type"]=="Concentrate/Extract")]

testdf_CntyNMonth_ProdType

Unnamed: 0,Product Type,County,Market Share County,Sales,SalesMonthDate
101,Concentrate/Extract,Clackamas,13.58,2315.28,2016-11-01
105,Concentrate/Extract,Clackamas,13.58,981.5,2016-11-01


In [7]:
#We should see about 13.58
#we do so this has been fixed
print((testdf_CntyNMonth_ProdType["Sales"].sum()/testdf_CntyNMonth["Sales"].sum())*100)

13.577505880066013


In [8]:
#so we know we can add the two rows together
#and we can Drop "Market Share County" column
cannabis_df.drop(columns=["Market Share County" ],axis=1,inplace=True)
#rename column too
cannabis_df = cannabis_df.rename(columns={'Product Type': 'ProductType'})

candfNodup_df = cannabis_df.groupby(['SalesMonthDate', 'ProductType', 'County']).sum().reset_index()

In [9]:
#Single Value is good
candfNodup_df[(candfNodup_df["County"]=="Clackamas")&(candfNodup_df["SalesMonthDate"]=="2016-11-01")
               &(candfNodup_df["ProductType"]=="Concentrate/Extract")]

Unnamed: 0,SalesMonthDate,ProductType,County,Sales
49,2016-11-01,Concentrate/Extract,Clackamas,3296.78


In [10]:
# Now we convert the Date (Month and Year)

# Old Code to convert SalesMonthDate to numeric
#candfNodup_df["SalesMonthDate"] = candfNodup_df["SalesMonthDate"].dt.year*100+candfNodup_df["SalesMonthDate"].dt.month

candfNodup_df['SalesMonth'] = candfNodup_df['SalesMonthDate']
candfNodup_df['SalesYear'] = candfNodup_df['SalesMonthDate']

candfNodup_df['SalesMonth'] = pd.DatetimeIndex(candfNodup_df['SalesMonth']).month.astype(object)
candfNodup_df['SalesYear'] = pd.DatetimeIndex(candfNodup_df['SalesYear']).year.astype(object)

#Drop Date column now that we have them separated and ready to encode
candfNodup_df.drop(columns=["SalesMonthDate" ],axis=1,inplace=True)

In [11]:
#Take a look at data
candfNodup_df["ProductType"].unique()

array(['Concentrate/Extract', 'Edible/Tincture', 'Other', 'Usable MJ',
       'Industrial Hemp Commodity/Product', 'Industrial Hemp',
       'Inhalable Product with Non-Cannabis Additives'], dtype=object)

In [12]:
candfNodup_df["County"].unique()

array(['Clackamas', 'Clatsop', 'Deschutes', 'Hood River', 'Jackson',
       'Lane', 'Lincoln', 'Marion', 'Multnomah', 'Tillamook',
       'Washington', 'Yamhill', 'Baker', 'Columbia', 'Douglas', 'Curry',
       'Polk', 'Wasco', 'Coos', 'Benton', 'Jefferson', 'Josephine',
       'Linn', 'Umatilla', 'Harney', 'Lake', 'Klamath', 'Grant', 'Union',
       'Wallowa', 'Malheur'], dtype=object)

In [13]:
candfNodup_df["SalesMonth"].unique()

array([10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=object)

In [15]:
candfNodup_df["SalesYear"].unique()

array([2016, 2017, 2018, 2019, 2020, 2021, 2022], dtype=object)

In [16]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)


In [17]:
# Generate our categorical variable lists
cannabis_cat = candfNodup_df.dtypes[candfNodup_df.dtypes == "object"].index.tolist()
# Check the number of unique values in each column
candfNodup_df[cannabis_cat].nunique()

ProductType     7
County         31
SalesMonth     12
SalesYear       7
dtype: int64

In [18]:
candfNodup_df


Unnamed: 0,ProductType,County,Sales,SalesMonth,SalesYear
0,Concentrate/Extract,Clackamas,947.20,10,2016
1,Concentrate/Extract,Clatsop,13057.00,10,2016
2,Concentrate/Extract,Deschutes,9763.45,10,2016
3,Concentrate/Extract,Hood River,23438.44,10,2016
4,Concentrate/Extract,Jackson,95346.45,10,2016
...,...,...,...,...,...
9509,Usable MJ,Umatilla,646336.81,4,2022
9510,Usable MJ,Wallowa,48779.53,4,2022
9511,Usable MJ,Wasco,178133.02,4,2022
9512,Usable MJ,Washington,4298767.33,4,2022


In [19]:
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(candfNodup_df[cannabis_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(cannabis_cat)
encode_df.head()

Unnamed: 0,ProductType_Concentrate/Extract,ProductType_Edible/Tincture,ProductType_Industrial Hemp,ProductType_Industrial Hemp Commodity/Product,ProductType_Inhalable Product with Non-Cannabis Additives,ProductType_Other,ProductType_Usable MJ,County_Baker,County_Benton,County_Clackamas,...,SalesMonth_10,SalesMonth_11,SalesMonth_12,SalesYear_2016,SalesYear_2017,SalesYear_2018,SalesYear_2019,SalesYear_2020,SalesYear_2021,SalesYear_2022
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,,,,,
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
candfNodup_dfWencode = candfNodup_df.merge(encode_df, left_index=True, right_index=True)
candfNodup_dfWencode = candfNodup_dfWencode.drop(columns=cannabis_cat, axis=1)
#candfNodup_dfWencode

In [21]:
#candfNodup_dfWencode.columns.values.tolist()
#candfNodup_dfWencode.dtypes
candfNodup_dfWencode.info()
#candfNodup_dfWencode.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9514 entries, 0 to 9513
Data columns (total 58 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   Sales                                                      9514 non-null   float64
 1   ProductType_Concentrate/Extract                            9514 non-null   float64
 2   ProductType_Edible/Tincture                                9514 non-null   float64
 3   ProductType_Industrial Hemp                                9514 non-null   float64
 4   ProductType_Industrial Hemp Commodity/Product              9514 non-null   float64
 5   ProductType_Inhalable Product with Non-Cannabis Additives  9514 non-null   float64
 6   ProductType_Other                                          9514 non-null   float64
 7   ProductType_Usable MJ                                      9514 non-null   float64
 8   County_B

In [22]:
# Set y equal to the target, Sales
y = candfNodup_dfWencode["Sales"].values
y

array([9.47200000e+02, 1.30570000e+04, 9.76345000e+03, ...,
       1.78133020e+05, 4.29876733e+06, 7.77254090e+05])

In [23]:
# Set X to the df minus the "Sales" column
X = candfNodup_dfWencode.drop(columns=["Sales"],axis=1)
X

Unnamed: 0,ProductType_Concentrate/Extract,ProductType_Edible/Tincture,ProductType_Industrial Hemp,ProductType_Industrial Hemp Commodity/Product,ProductType_Inhalable Product with Non-Cannabis Additives,ProductType_Other,ProductType_Usable MJ,County_Baker,County_Benton,County_Clackamas,...,SalesMonth_10,SalesMonth_11,SalesMonth_12,SalesYear_2016,SalesYear_2017,SalesYear_2018,SalesYear_2019,SalesYear_2020,SalesYear_2021,SalesYear_2022
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9509,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9510,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9511,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9512,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
# Split the preprocessed data into a training and testing dataset
#9514rows
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [25]:
#7135 X_train
#2379 X_test
#7135 y_train
#2379 y_test



#X_train
#X_test
#len(y_train)
#len(y_test)

In [26]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [27]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns.values)
X_train_scaled

Unnamed: 0,ProductType_Concentrate/Extract,ProductType_Edible/Tincture,ProductType_Industrial Hemp,ProductType_Industrial Hemp Commodity/Product,ProductType_Inhalable Product with Non-Cannabis Additives,ProductType_Other,ProductType_Usable MJ,County_Baker,County_Benton,County_Clackamas,...,SalesMonth_10,SalesMonth_11,SalesMonth_12,SalesYear_2016,SalesYear_2017,SalesYear_2018,SalesYear_2019,SalesYear_2020,SalesYear_2021,SalesYear_2022
0,2.039381,-0.494739,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,3.253120,-0.299687,-0.307124,7.173788,-0.369715,-0.443561,-0.475337,-0.508963,-0.545227,-0.289004
1,-0.490345,2.021270,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,3.256014,-0.139396,-0.369715,-0.443561,2.103772,-0.508963,-0.545227,-0.289004
2,-0.490345,-0.494739,-0.158103,2.363933,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,-0.307124,-0.139396,-0.369715,-0.443561,-0.475337,-0.508963,-0.545227,3.460163
3,-0.490345,-0.494739,-0.158103,2.363933,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,-0.307124,-0.139396,-0.369715,-0.443561,-0.475337,-0.508963,1.834098,-0.289004
4,-0.490345,-0.494739,-0.158103,-0.423024,-0.204719,-0.495836,2.026665,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,-0.307124,-0.139396,-0.369715,-0.443561,-0.475337,1.964778,-0.545227,-0.289004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7130,-0.490345,-0.494739,-0.158103,-0.423024,-0.204719,2.016797,-0.493421,-0.187772,-0.18857,-0.186569,...,3.253120,-0.299687,-0.307124,-0.139396,-0.369715,-0.443561,-0.475337,1.964778,-0.545227,-0.289004
7131,-0.490345,-0.494739,-0.158103,2.363933,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,5.359960,...,-0.307397,-0.299687,-0.307124,-0.139396,-0.369715,-0.443561,2.103772,-0.508963,-0.545227,-0.289004
7132,-0.490345,-0.494739,-0.158103,2.363933,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,-0.307124,-0.139396,-0.369715,-0.443561,-0.475337,1.964778,-0.545227,-0.289004
7133,2.039381,-0.494739,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,-0.307124,-0.139396,2.704783,-0.443561,-0.475337,-0.508963,-0.545227,-0.289004


In [28]:
X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_train.columns.values)
X_test_scaled

Unnamed: 0,ProductType_Concentrate/Extract,ProductType_Edible/Tincture,ProductType_Industrial Hemp,ProductType_Industrial Hemp Commodity/Product,ProductType_Inhalable Product with Non-Cannabis Additives,ProductType_Other,ProductType_Usable MJ,County_Baker,County_Benton,County_Clackamas,...,SalesMonth_10,SalesMonth_11,SalesMonth_12,SalesYear_2016,SalesYear_2017,SalesYear_2018,SalesYear_2019,SalesYear_2020,SalesYear_2021,SalesYear_2022
0,-0.490345,-0.494739,-0.158103,-0.423024,-0.204719,2.016797,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,-0.307124,-0.139396,-0.369715,-0.443561,-0.475337,-0.508963,1.834098,-0.289004
1,2.039381,-0.494739,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,-0.307124,-0.139396,-0.369715,2.254483,-0.475337,-0.508963,-0.545227,-0.289004
2,-0.490345,-0.494739,6.325010,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,-0.307124,-0.139396,-0.369715,-0.443561,-0.475337,-0.508963,1.834098,-0.289004
3,-0.490345,2.021270,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,-0.307124,-0.139396,2.704783,-0.443561,-0.475337,-0.508963,-0.545227,-0.289004
4,-0.490345,-0.494739,-0.158103,-0.423024,-0.204719,2.016797,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,3.256014,7.173788,-0.369715,-0.443561,-0.475337,-0.508963,-0.545227,-0.289004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2374,-0.490345,2.021270,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,3.256014,-0.139396,-0.369715,2.254483,-0.475337,-0.508963,-0.545227,-0.289004
2375,-0.490345,-0.494739,-0.158103,2.363933,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,-0.307124,-0.139396,-0.369715,2.254483,-0.475337,-0.508963,-0.545227,-0.289004
2376,-0.490345,-0.494739,-0.158103,-0.423024,4.884734,-0.495836,-0.493421,-0.187772,-0.18857,-0.186569,...,-0.307397,-0.299687,3.256014,-0.139396,-0.369715,-0.443561,-0.475337,-0.508963,1.834098,-0.289004
2377,-0.490345,-0.494739,-0.158103,-0.423024,-0.204719,2.016797,-0.493421,-0.187772,5.30306,-0.186569,...,3.253120,-0.299687,-0.307124,-0.139396,-0.369715,-0.443561,-0.475337,1.964778,-0.545227,-0.289004


In [29]:
# Creating the Linear Regression instance.
model = LinearRegression()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [30]:
#
# research included article from 
# https://ijeremiah.com/portfolio/cars/
# this is using code/inspiration/edu from Cell~55
#
intercept = model.intercept_.round(4)
coeff = model.coef_.round(4)

weights = pd.DataFrame({'Parameter': np.append('const', X.columns.values), 'Coef': np.append(intercept, coeff)})
weights

Unnamed: 0,Parameter,Coef
0,const,478189.4
1,ProductType_Concentrate/Extract,-1.793045e+18
2,ProductType_Edible/Tincture,-1.80282e+18
3,ProductType_Industrial Hemp,-6.996502e+17
4,ProductType_Industrial Hemp Commodity/Product,-1.62755e+18
5,ProductType_Inhalable Product with Non-Cannabi...,-8.912373e+17
6,ProductType_Other,-1.805242e+18
7,ProductType_Usable MJ,-1.799903e+18
8,County_Baker,-3.044236e+18
9,County_Benton,-3.056294e+18


In [31]:
# Making predictions using the testing data.
predictionstest = model.predict(X_test_scaled)
predictionstrain = model.predict(X_train_scaled)


In [32]:
#predictions = pd.DataFrame(predictions, columns = ["Sales"])
#2379

#2nd Try is 7135
predictionstrain.size

7135

In [33]:
#y_train = pd.DataFrame(y_train, columns = ["Sales"])
#7135


y_train.size
#y_test.size

7135

In [34]:
# Again inspiration from 
# https://ijeremiah.com/portfolio/cars/
# ~Cell 59

eval_df_train = pd.DataFrame({'Predicted': predictionstrain, 
                              'Observed': y_train, 
                              'Residual': predictionstrain - y_train,
                              'Abs. Error': np.abs(predictionstrain - y_train)
                             }
)


In [35]:

eval_df_train

Unnamed: 0,Predicted,Observed,Residual,Abs. Error
0,-3.425466e+05,4466.93,-3.470136e+05,3.470136e+05
1,-1.049786e+05,52860.20,-1.578388e+05,1.578388e+05
2,-1.853626e+05,8200.34,-1.935630e+05,1.935630e+05
3,3.896134e+05,61582.11,3.280312e+05,3.280312e+05
4,9.855814e+05,67508.89,9.180725e+05,9.180725e+05
...,...,...,...,...
7130,-2.544826e+05,4749.86,-2.592325e+05,2.592325e+05
7131,2.022214e+05,72875.04,1.293463e+05,1.293463e+05
7132,-7.323464e+04,14183.20,-8.741784e+04,8.741784e+04
7133,2.544454e+05,118070.87,1.363745e+05,1.363745e+05


In [None]:
# Pretty Sure this is not the results we wanted
# We had tried using the Date Column as a Numeric but that also had similar mal-aigned results.
# Our next step is to use a dataset that also has population as an additional column
# this is week 3 of 4.

#Our analysis with the Population will be present for week 4/4 below.








