In [1]:
# Dependencies
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Location of the combined dataset, relative to this notebook
file_path = Path('../Resources/refactored_data/combineddata.csv')

# Columns kept for the analysis; every other column in the CSV is ignored
model_columns = [
    'weeknumber', 'monthvalue', 'maxtempf', 'mintempf',
    'precipitationinch', 'watertempf', 'stlheadcount',
]
df_stlhead_week_month = pd.read_csv(file_path)[model_columns]

# Remove every row that has a NaN in any kept column, then store the count
# column as int64 (the cast would fail if NaN were still present)
df = (
    df_stlhead_week_month
    .dropna(axis=0)
    .astype({'stlheadcount': 'int64'})
)

# Average each remaining column per weeknumber (one row per week).
# monthvalue is removed before aggregating: its per-week mean is not used,
# so there is no reason to compute it and then throw it away.
week_df = (
    df
    .drop(columns=['monthvalue'])
    .groupby(['weeknumber'])
    .mean()
    .reset_index()
)
week_df

Unnamed: 0,weeknumber,maxtempf,mintempf,precipitationinch,watertempf,stlheadcount
0,1,41.833333,32.185185,0.366481,40.86,17.185185
1,2,42.043956,32.934066,0.410659,40.365055,17.395604
2,3,41.988764,33.550562,0.320674,39.600449,16.258427
3,4,44.470588,35.082353,0.316706,38.816706,14.552941
4,5,47.13253,37.060241,0.461205,39.098072,16.879518
5,6,46.101266,34.860759,0.289114,39.026835,14.443038
6,7,48.488889,35.755556,0.259444,39.184,18.411111
7,8,49.344086,35.967742,0.333226,39.563871,19.860215
8,9,48.868132,35.516484,0.25044,40.062418,22.505495
9,10,51.89899,36.10101,0.370808,40.765455,32.484848


In [3]:
# Single source of truth for the column being predicted
target = ['stlheadcount']

# Target: the stlheadcount column of the weekly frame, as a Series
y = week_df[target[0]]

# Features: every other column of the weekly frame
X = week_df.drop(columns=target)

In [4]:
# Degree-2 polynomial expansion of the feature columns: the original columns
# plus their squares and pairwise products. include_bias=False leaves out the
# constant column, since LinearRegression fits its own intercept.
poly = PolynomialFeatures(degree=2, include_bias=False)

In [5]:
# Expand the weekly feature columns into their polynomial terms
poly_features = poly.fit(X).transform(X)

# Hold out part of the rows for evaluation. No test_size is passed, so
# scikit-learn's default proportion applies; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    poly_features,
    y,
    random_state=1,
)

In [6]:
# Fit a linear regression on the polynomial training features.
# fit() returns the estimator itself, so construction and fitting chain.
model = LinearRegression().fit(X_train, y_train)
model

LinearRegression()

In [7]:
# Predicted values for the held-out rows, shown as the cell result
predict = model.predict(X=X_test)
predict

array([ 3268.23793913,    88.97420261, -2432.84865854,  3463.16843855,
        3543.65809232,  -744.07804323,  3273.19747662,  1532.85340095,
       -1750.63485258,   -89.45591142,  -118.85032611,  -620.54480332,
        -950.55130001,  1660.80691277])

In [8]:
# Re-predict on the held-out rows so this cell does not rely on an earlier one
predict = model.predict(X_test)

# Compute both test-set metrics once, then report everything together
test_mse = mean_squared_error(y_test, predict)
test_r2 = r2_score(y_test, predict)

print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % test_mse)
print("Coefficient of determination: %.2f" % test_r2)

[ 3268.23793913    88.97420261 -2432.84865854  3463.16843855
  3543.65809232  -744.07804323  3273.19747662  1532.85340095
 -1750.63485258   -89.45591142  -118.85032611  -620.54480332
  -950.55130001  1660.80691277]
Coefficients: 
 [ 8.81354449e+01  2.18739432e+03 -5.43828354e+03 -1.49068602e+04
  5.28161335e+02  1.15392154e+00  2.59168916e+01 -6.87865098e+01
  4.01717934e+02  2.12786640e+01 -4.94672177e+01  1.77633249e+02
 -2.16103906e+03 -7.43015889e+01 -2.13592580e+02  6.04914307e+03
  2.48575612e+02 -2.57300570e+03 -2.40439294e+03 -6.20210113e+01]
Mean squared error: 1075326.52
Coefficient of determination: 0.58


In [9]:
# Model score on the rows it was fitted on
train_r2 = model.score(X_train, y_train)
train_r2

0.9676781220006541

In [10]:
# Model score on the held-out rows
test_r2 = model.score(X_test, y_test)
test_r2

0.5800296809424293

In [11]:
# Report the model score for each split side by side
for split_label, split_X, split_y in (
    ("Train: ", X_train, y_train),
    ("Test: ", X_test, y_test),
):
    print(split_label, model.score(split_X, split_y))

Train:  0.9676781220006541
Test:  0.5800296809424293
