In [1]:
# Import our dependencies
import pandas as pd
from datetime import datetime as dt
import numpy as np

# Machine Learning includes
#from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
#from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
#from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:

# Read in our dataset
candfraw = pd.read_csv("second_segment/County Product Trend_Full Data_data.csv")

cannabis_df = candfraw.copy(deep=True)
cannabis_df

Unnamed: 0,Date,Product Type,County,Month Name,Month Year,Tooltip Date,Market Share County,Sales,Sales Detail
0,10/1/2016,Usable MJ,Baker,October,10/1/2016,October 2016,100.00,58637.51,58637.51
1,10/1/2016,Usable MJ,Baker,October,10/1/2016,October 2016,100.00,441.00,441.00
2,10/1/2016,Concentrate/Extract,Clackamas,October,10/1/2016,October 2016,15.14,803.20,803.20
3,10/1/2016,Edible/Tincture,Clackamas,October,10/1/2016,October 2016,9.31,359.68,359.68
4,10/1/2016,Other,Clackamas,October,10/1/2016,October 2016,1.13,30.00,30.00
...,...,...,...,...,...,...,...,...,...
32416,4/1/2022,Usable MJ,Wasco,April,4/1/2022,April 2022,48.13,3980.86,3980.86
32417,4/1/2022,Usable MJ,Washington,April,4/1/2022,April 2022,47.34,27466.81,27466.81
32418,4/1/2022,Usable MJ,Washington,April,4/1/2022,April 2022,47.34,172242.21,172242.21
32419,4/1/2022,Usable MJ,Yamhill,April,4/1/2022,April 2022,45.68,5089.02,5089.02


In [3]:
#Create a new column with formated Date
cannabis_df['SalesMonthDate'] = pd.to_datetime(cannabis_df["Month Year"])



#Drop columns that are repetative
cannabis_df.drop(columns=["Month Name", "Month Year", "Tooltip Date", "Date", "Sales Detail" ],axis=1,inplace=True)


cannabis_df

Unnamed: 0,Product Type,County,Market Share County,Sales,SalesMonthDate
0,Usable MJ,Baker,100.00,58637.51,2016-10-01
1,Usable MJ,Baker,100.00,441.00,2016-10-01
2,Concentrate/Extract,Clackamas,15.14,803.20,2016-10-01
3,Edible/Tincture,Clackamas,9.31,359.68,2016-10-01
4,Other,Clackamas,1.13,30.00,2016-10-01
...,...,...,...,...,...
32416,Usable MJ,Wasco,48.13,3980.86,2022-04-01
32417,Usable MJ,Washington,47.34,27466.81,2022-04-01
32418,Usable MJ,Washington,47.34,172242.21,2022-04-01
32419,Usable MJ,Yamhill,45.68,5089.02,2022-04-01


In [4]:
# There are rows where the sales are broken out into 2 disctinct values, but the "Market Share County" has them aggregated
# We can show this below
cannabis_df.groupby(['SalesMonthDate', 'Product Type', 'County']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Market Share County,Sales
SalesMonthDate,Product Type,County,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-01,Concentrate/Extract,Clackamas,2,2
2016-10-01,Concentrate/Extract,Clatsop,2,2
2016-10-01,Concentrate/Extract,Deschutes,2,2
2016-10-01,Concentrate/Extract,Hood River,2,2
2016-10-01,Concentrate/Extract,Jackson,2,2
...,...,...,...,...
2022-04-01,Usable MJ,Umatilla,4,4
2022-04-01,Usable MJ,Wallowa,4,4
2022-04-01,Usable MJ,Wasco,4,4
2022-04-01,Usable MJ,Washington,6,6


In [5]:
#Whole dataset 
testdf_CntyNMonth = cannabis_df[(cannabis_df["County"]=="Clackamas")&(cannabis_df["SalesMonthDate"]=="2016-11-01")]
testdf_CntyNMonth

Unnamed: 0,Product Type,County,Market Share County,Sales,SalesMonthDate
101,Concentrate/Extract,Clackamas,13.58,2315.28,2016-11-01
102,Edible/Tincture,Clackamas,4.98,338.96,2016-11-01
103,Other,Clackamas,2.35,486.23,2016-11-01
104,Usable MJ,Clackamas,79.09,17086.67,2016-11-01
105,Concentrate/Extract,Clackamas,13.58,981.5,2016-11-01
106,Edible/Tincture,Clackamas,4.98,871.4,2016-11-01
107,Other,Clackamas,2.35,85.0,2016-11-01
108,Usable MJ,Clackamas,79.09,2116.15,2016-11-01


In [6]:
#We can see the double values here specifically
testdf_CntyNMonth_ProdType = cannabis_df[(cannabis_df["County"]=="Clackamas")&(cannabis_df["SalesMonthDate"]=="2016-11-01")
               &(cannabis_df["Product Type"]=="Concentrate/Extract")]

testdf_CntyNMonth_ProdType

Unnamed: 0,Product Type,County,Market Share County,Sales,SalesMonthDate
101,Concentrate/Extract,Clackamas,13.58,2315.28,2016-11-01
105,Concentrate/Extract,Clackamas,13.58,981.5,2016-11-01


In [7]:
#We should see about 13.58
#we do so this has been fixed
print((testdf_CntyNMonth_ProdType["Sales"].sum()/testdf_CntyNMonth["Sales"].sum())*100)

13.577505880066013


In [8]:
#so we know we can add the two rows together
#and we can Drop "Market Share County" column
cannabis_df.drop(columns=["Market Share County" ],axis=1,inplace=True)
#rename column too
cannabis_df = cannabis_df.rename(columns={'Product Type': 'ProductType'})

candfNodup_df = cannabis_df.groupby(['SalesMonthDate', 'ProductType', 'County']).sum().reset_index()

In [9]:
#Single Value is good
candfNodup_df[(candfNodup_df["County"]=="Clackamas")&(candfNodup_df["SalesMonthDate"]=="2016-11-01")
               &(candfNodup_df["ProductType"]=="Concentrate/Extract")]

Unnamed: 0,SalesMonthDate,ProductType,County,Sales
49,2016-11-01,Concentrate/Extract,Clackamas,3296.78


In [10]:
candfNodup_df["SalesMonthDate"] = candfNodup_df["SalesMonthDate"].dt.year*100+candfNodup_df["SalesMonthDate"].dt.month



In [11]:
#Take a look at data
# Look at Product Types 
candfNodup_df["ProductType"].unique()

array(['Concentrate/Extract', 'Edible/Tincture', 'Other', 'Usable MJ',
       'Industrial Hemp Commodity/Product', 'Industrial Hemp',
       'Inhalable Product with Non-Cannabis Additives'], dtype=object)

In [12]:
candfNodup_df["County"].unique()

array(['Clackamas', 'Clatsop', 'Deschutes', 'Hood River', 'Jackson',
       'Lane', 'Lincoln', 'Marion', 'Multnomah', 'Tillamook',
       'Washington', 'Yamhill', 'Baker', 'Columbia', 'Douglas', 'Curry',
       'Polk', 'Wasco', 'Coos', 'Benton', 'Jefferson', 'Josephine',
       'Linn', 'Umatilla', 'Harney', 'Lake', 'Klamath', 'Grant', 'Union',
       'Wallowa', 'Malheur'], dtype=object)

In [13]:
candfNodup_df["SalesMonthDate"].unique()

array([201610, 201611, 201612, 201701, 201702, 201703, 201704, 201705,
       201706, 201707, 201708, 201709, 201710, 201711, 201712, 201801,
       201802, 201803, 201804, 201805, 201806, 201807, 201808, 201809,
       201810, 201811, 201812, 201901, 201902, 201903, 201904, 201905,
       201906, 201907, 201908, 201909, 201910, 201911, 201912, 202001,
       202002, 202003, 202004, 202005, 202006, 202007, 202008, 202009,
       202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105,
       202106, 202107, 202108, 202109, 202110, 202111, 202112, 202201,
       202202, 202203, 202204], dtype=int64)

In [14]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)


In [15]:
# Generate our categorical variable lists
cannabis_cat = candfNodup_df.dtypes[candfNodup_df.dtypes == "object"].index.tolist()
# Check the number of unique values in each column
candfNodup_df[cannabis_cat].nunique()

ProductType     7
County         31
dtype: int64

In [16]:
candfNodup_df


Unnamed: 0,SalesMonthDate,ProductType,County,Sales
0,201610,Concentrate/Extract,Clackamas,947.20
1,201610,Concentrate/Extract,Clatsop,13057.00
2,201610,Concentrate/Extract,Deschutes,9763.45
3,201610,Concentrate/Extract,Hood River,23438.44
4,201610,Concentrate/Extract,Jackson,95346.45
...,...,...,...,...
9509,202204,Usable MJ,Umatilla,646336.81
9510,202204,Usable MJ,Wallowa,48779.53
9511,202204,Usable MJ,Wasco,178133.02
9512,202204,Usable MJ,Washington,4298767.33


In [17]:
# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(candfNodup_df[cannabis_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(cannabis_cat)
encode_df.head()

Unnamed: 0,ProductType_Concentrate/Extract,ProductType_Edible/Tincture,ProductType_Industrial Hemp,ProductType_Industrial Hemp Commodity/Product,ProductType_Inhalable Product with Non-Cannabis Additives,ProductType_Other,ProductType_Usable MJ,County_Baker,County_Benton,County_Clackamas,...,County_Marion,County_Multnomah,County_Polk,County_Tillamook,County_Umatilla,County_Union,County_Wallowa,County_Wasco,County_Washington,County_Yamhill
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
candfNodup_dfWencode = candfNodup_df.merge(encode_df, left_index=True, right_index=True)
candfNodup_dfWencode = candfNodup_dfWencode.drop(columns=cannabis_cat, axis=1)
#candfNodup_dfWencode

In [19]:
#candfNodup_dfWencode.columns.values.tolist()
#candfNodup_dfWencode.dtypes
#candfNodup_dfWencode.info()
candfNodup_dfWencode.head()


Unnamed: 0,SalesMonthDate,Sales,ProductType_Concentrate/Extract,ProductType_Edible/Tincture,ProductType_Industrial Hemp,ProductType_Industrial Hemp Commodity/Product,ProductType_Inhalable Product with Non-Cannabis Additives,ProductType_Other,ProductType_Usable MJ,County_Baker,...,County_Marion,County_Multnomah,County_Polk,County_Tillamook,County_Umatilla,County_Union,County_Wallowa,County_Wasco,County_Washington,County_Yamhill
0,201610,947.2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,201610,13057.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,201610,9763.45,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,201610,23438.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,201610,95346.45,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Set y equal to the target, Sales
y = candfNodup_dfWencode["Sales"].values
y

array([9.47200000e+02, 1.30570000e+04, 9.76345000e+03, ...,
       1.78133020e+05, 4.29876733e+06, 7.77254090e+05])

In [32]:
# Set X to the df minus the "Sales" column
X = candfNodup_dfWencode.drop(columns=["Sales"],axis=1)
X

Unnamed: 0,SalesMonthDate,ProductType_Concentrate/Extract,ProductType_Edible/Tincture,ProductType_Industrial Hemp,ProductType_Industrial Hemp Commodity/Product,ProductType_Inhalable Product with Non-Cannabis Additives,ProductType_Other,ProductType_Usable MJ,County_Baker,County_Benton,...,County_Marion,County_Multnomah,County_Polk,County_Tillamook,County_Umatilla,County_Union,County_Wallowa,County_Wasco,County_Washington,County_Yamhill
0,201610,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,201610,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,201610,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,201610,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,201610,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9509,202204,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9510,202204,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9511,202204,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9512,202204,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [34]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [35]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns.values)
X_train_scaled

Unnamed: 0,SalesMonthDate,ProductType_Concentrate/Extract,ProductType_Edible/Tincture,ProductType_Industrial Hemp,ProductType_Industrial Hemp Commodity/Product,ProductType_Inhalable Product with Non-Cannabis Additives,ProductType_Other,ProductType_Usable MJ,County_Baker,County_Benton,...,County_Marion,County_Multnomah,County_Polk,County_Tillamook,County_Umatilla,County_Union,County_Wallowa,County_Wasco,County_Washington,County_Yamhill
0,-2.182178,2.039381,-0.494739,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.202480,-0.190554,-0.189366,-0.186569,-0.07695,-0.143010,-0.187372,-0.194081,-0.193302
1,-0.242350,-0.490345,2.021270,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.202480,-0.190554,-0.189366,-0.186569,-0.07695,-0.143010,-0.187372,-0.194081,-0.193302
2,1.620399,-0.490345,-0.494739,-0.158103,2.363933,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.202480,-0.190554,-0.189366,5.359960,-0.07695,-0.143010,-0.187372,-0.194081,-0.193302
3,0.984495,-0.490345,-0.494739,-0.158103,2.363933,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.202480,-0.190554,-0.189366,-0.186569,-0.07695,-0.143010,-0.187372,-0.194081,-0.193302
4,0.355015,-0.490345,-0.494739,-0.158103,-0.423024,-0.204719,-0.495836,2.026665,-0.187772,-0.18857,...,-0.196401,-0.202480,-0.190554,-0.189366,-0.186569,-0.07695,6.992503,-0.187372,-0.194081,-0.193302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7130,0.387131,-0.490345,-0.494739,-0.158103,-0.423024,-0.204719,2.016797,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.202480,-0.190554,-0.189366,-0.186569,-0.07695,6.992503,-0.187372,-0.194081,-0.193302
7131,-0.261619,-0.490345,-0.494739,-0.158103,2.363933,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.202480,-0.190554,-0.189366,-0.186569,-0.07695,-0.143010,-0.187372,-0.194081,-0.193302
7132,0.380708,-0.490345,-0.494739,-0.158103,2.363933,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.202480,-0.190554,-0.189366,-0.186569,-0.07695,-0.143010,-0.187372,-0.194081,-0.193302
7133,-1.578390,2.039381,-0.494739,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.202480,-0.190554,-0.189366,-0.186569,-0.07695,-0.143010,-0.187372,-0.194081,-0.193302


In [36]:
X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_train.columns.values)
X_test_scaled

Unnamed: 0,SalesMonthDate,ProductType_Concentrate/Extract,ProductType_Edible/Tincture,ProductType_Industrial Hemp,ProductType_Industrial Hemp Commodity/Product,ProductType_Inhalable Product with Non-Cannabis Additives,ProductType_Other,ProductType_Usable MJ,County_Baker,County_Benton,...,County_Marion,County_Multnomah,County_Polk,County_Tillamook,County_Umatilla,County_Union,County_Wallowa,County_Wasco,County_Washington,County_Yamhill
0,0.997342,-0.490345,-0.494739,-0.158103,-0.423024,-0.204719,2.016797,-0.493421,-0.187772,-0.18857,...,5.091614,-0.20248,-0.190554,-0.189366,-0.186569,-0.07695,-0.14301,-0.187372,-0.194081,-0.193302
1,-0.916793,2.039381,-0.494739,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.20248,-0.190554,-0.189366,-0.186569,-0.07695,-0.14301,-0.187372,-0.194081,5.173263
2,1.010188,-0.490345,-0.494739,6.325010,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.20248,-0.190554,-0.189366,-0.186569,-0.07695,-0.14301,-0.187372,-0.194081,5.173263
3,-1.578390,-0.490345,2.021270,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.20248,-0.190554,-0.189366,-0.186569,-0.07695,-0.14301,-0.187372,-0.194081,-0.193302
4,-2.169331,-0.490345,-0.494739,-0.158103,-0.423024,-0.204719,2.016797,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.20248,-0.190554,-0.189366,-0.186569,-0.07695,-0.14301,-0.187372,-0.194081,-0.193302
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2374,-0.884677,-0.490345,2.021270,-0.158103,-0.423024,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.20248,-0.190554,-0.189366,-0.186569,-0.07695,-0.14301,-0.187372,-0.194081,-0.193302
2375,-0.910370,-0.490345,-0.494739,-0.158103,2.363933,-0.204719,-0.495836,-0.493421,-0.187772,-0.18857,...,5.091614,-0.20248,-0.190554,-0.189366,-0.186569,-0.07695,-0.14301,-0.187372,-0.194081,-0.193302
2376,1.042305,-0.490345,-0.494739,-0.158103,-0.423024,4.884734,-0.495836,-0.493421,-0.187772,-0.18857,...,-0.196401,-0.20248,-0.190554,-0.189366,-0.186569,-0.07695,-0.14301,-0.187372,-0.194081,-0.193302
2377,0.387131,-0.490345,-0.494739,-0.158103,-0.423024,-0.204719,2.016797,-0.493421,-0.187772,5.30306,...,-0.196401,-0.20248,-0.190554,-0.189366,-0.186569,-0.07695,-0.14301,-0.187372,-0.194081,-0.193302


In [37]:
# Creating the Linear Regression instance.
model = LinearRegression()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [38]:
#
# research included article from 
# https://ijeremiah.com/portfolio/cars/
# this is using code/inspiration/edu from Cell~55
#
intercept = model.intercept_.round(4)
coeff = model.coef_.round(4)

weights = pd.DataFrame({'Parameter': np.append('const', X.columns.values), 'Coef': np.append(intercept, coeff)})
weights.head()

Unnamed: 0,Parameter,Coef
0,const,475947.8
1,SalesMonthDate,139133.0
2,ProductType_Concentrate/Extract,3.180671e+18
3,ProductType_Edible/Tincture,3.198012e+18
4,ProductType_Industrial Hemp,1.241105e+18


In [39]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [44]:
#predictions = pd.DataFrame(predictions, columns = ["Sales"])
#2379
predictions.size

2379

In [45]:
#y_train = pd.DataFrame(y_train, columns = ["Sales"])
#7135
y_train.size

7135

In [42]:
# Again inspiration from 
# https://ijeremiah.com/portfolio/cars/
# ~Cell 59


#predictions.merge(y_train, axis=1)

#eval_df = pd.concat([predictions,y_train],axis=1)





eval_df_train = pd.DataFrame({'Predicted': predictions, 
                              'Observed': y_train, 
                              'Residual': predictions - y_train,
                              'Abs. Error': np.abs(predictions - y_train)
                             }
)



ValueError: operands could not be broadcast together with shapes (2379,) (7135,) 

In [None]:
eval_df_train.head(8)