In [1]:
# Dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# Load data into dataframe
file_path = Path('../Resources/refactored_data/combineddata.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,countid,yearvalue,monthvalue,weeknumber,locationname,daterecorded,maxtempf,mintempf,precipitationinch,watertempf,stlheadcount
0,19900101,1990,1,1,Bonneville,1990-01-01,49.0,35.0,0.89,,
1,19900102,1990,1,1,Bonneville,1990-01-02,45.0,36.0,0.85,,
2,19900103,1990,1,1,Bonneville,1990-01-03,46.0,39.0,0.02,,
3,19900104,1990,1,1,Bonneville,1990-01-04,49.0,42.0,0.04,,
4,19900105,1990,1,1,Bonneville,1990-01-05,53.0,45.0,0.47,,


In [3]:
# Pull desired columns
df_stlhead_week_month = df[['weeknumber','monthvalue','maxtempf','mintempf','precipitationinch','watertempf','stlheadcount']]
df_stlhead_week_month.head(3)

Unnamed: 0,weeknumber,monthvalue,maxtempf,mintempf,precipitationinch,watertempf,stlheadcount
0,1,1,49.0,35.0,0.89,,
1,1,1,45.0,36.0,0.85,,
2,1,1,46.0,39.0,0.02,,


In [4]:
# Drop NaN values
df = df_stlhead_week_month.dropna(axis=0)
df

Unnamed: 0,weeknumber,monthvalue,maxtempf,mintempf,precipitationinch,watertempf,stlheadcount
73,11,3,53.0,38.0,0.12,44.06,27.0
74,11,3,69.0,38.0,0.00,44.96,44.0
75,11,3,61.0,44.0,0.00,44.06,36.0
76,12,3,59.0,44.0,0.02,44.96,60.0
77,12,3,67.0,44.0,0.00,46.04,46.0
...,...,...,...,...,...,...,...
11583,38,9,76.0,52.0,0.93,66.56,590.0
11584,39,9,60.0,51.0,1.50,66.56,702.0
11585,39,9,64.0,50.0,0.58,66.74,698.0
11586,39,9,75.0,50.0,0.00,66.38,1004.0


In [5]:
df.dtypes

weeknumber             int64
monthvalue             int64
maxtempf             float64
mintempf             float64
precipitationinch    float64
watertempf           float64
stlheadcount         float64
dtype: object

In [6]:
df = df.astype({'stlheadcount': 'int64'})

## Day ML

In [7]:
day_df = df.drop(columns = ['weeknumber', 'monthvalue']).reset_index()
day_df

Unnamed: 0,index,maxtempf,mintempf,precipitationinch,watertempf,stlheadcount
0,73,53.0,38.0,0.12,44.06,27
1,74,69.0,38.0,0.00,44.96,44
2,75,61.0,44.0,0.00,44.06,36
3,76,59.0,44.0,0.02,44.96,60
4,77,67.0,44.0,0.00,46.04,46
...,...,...,...,...,...,...
8523,11583,76.0,52.0,0.93,66.56,590
8524,11584,60.0,51.0,1.50,66.56,702
8525,11585,64.0,50.0,0.58,66.74,698
8526,11586,75.0,50.0,0.00,66.38,1004


In [8]:
target = ['stlheadcount']

# Creating features
X = day_df.drop('stlheadcount', axis=1)
# X = pd.get_dummies(X)

# Creating target
y = day_df['stlheadcount']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

model = LinearRegression()
model.fit(X_train, y_train)

day_train = model.score(X_train, y_train)
day_test = model.score(X_test, y_test)

## Week ML

In [9]:
# Make a df grouped by average count on monthvalue
week_df = df.groupby(['weeknumber']).mean().reset_index()
week_df = week_df.drop(columns = ['monthvalue'])
week_df

Unnamed: 0,weeknumber,maxtempf,mintempf,precipitationinch,watertempf,stlheadcount
0,1,41.833333,32.185185,0.366481,40.86,17.185185
1,2,42.043956,32.934066,0.410659,40.365055,17.395604
2,3,41.988764,33.550562,0.320674,39.600449,16.258427
3,4,44.470588,35.082353,0.316706,38.816706,14.552941
4,5,47.13253,37.060241,0.461205,39.098072,16.879518
5,6,46.101266,34.860759,0.289114,39.026835,14.443038
6,7,48.488889,35.755556,0.259444,39.184,18.411111
7,8,49.344086,35.967742,0.333226,39.563871,19.860215
8,9,48.868132,35.516484,0.25044,40.062418,22.505495
9,10,51.89899,36.10101,0.370808,40.765455,32.484848


In [10]:
target = ['stlheadcount']

# Creating features
X = week_df.drop('stlheadcount', axis=1)

# Creating target
y = week_df['stlheadcount']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

model = LinearRegression()
model.fit(X_train, y_train)

week_train = model.score(X_train, y_train)
week_test = model.score(X_test, y_test)

## Month ML

In [11]:
# Make a df grouped by average count on monthvalue
month_df = df.groupby(['monthvalue']).mean().reset_index()
month_df = month_df.drop(columns = ['weeknumber'])
month_df

Unnamed: 0,monthvalue,maxtempf,mintempf,precipitationinch,watertempf,stlheadcount
0,1,43.449871,34.18509,0.370746,39.715013,16.40874
1,2,48.228814,35.528249,0.290763,39.438475,18.274011
2,3,55.110632,38.623563,0.2925,43.468534,43.793103
3,4,60.340684,42.410143,0.216417,49.07559,47.721058
4,5,67.453871,47.808059,0.135673,55.61913,62.323436
5,6,72.699459,52.744865,0.109665,61.194832,294.208649
6,7,79.954119,57.387904,0.018749,67.647132,2088.758081
7,8,80.6091,57.706308,0.029535,70.561138,3575.729059
8,9,75.565318,53.587283,0.083549,67.692231,2426.313295
9,10,63.381895,46.81471,0.225516,61.376662,351.817539


In [12]:
target = ['stlheadcount']

# Creating features
X = month_df.drop('stlheadcount', axis=1)

# Creating target
y = month_df['stlheadcount']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

model = LinearRegression()
model.fit(X_train, y_train)

month_train = model.score(X_train, y_train)
month_test = model.score(X_test, y_test)

## Results

In [13]:
index = ['Day', 'Week', 'Month']

data = {'Train': pd.Series([day_train, week_train, month_train], index = index), 
       'Test': pd.Series([day_test, week_test, month_test], index = index)}

results_df = pd.DataFrame(data)
results_df

Unnamed: 0,Train,Test
Day,0.307095,0.344436
Week,0.571202,0.566548
Month,0.767907,-14936.649123


## Scalers

In [14]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score


In [15]:
week_df

target = ['stlheadcount']

# Creating features
X = week_df.drop('stlheadcount', axis=1)

# Creating target
y = week_df['stlheadcount']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

model = LinearRegression()
model.fit(X_train, y_train)

week_train = model.score(X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[1765.85611124 -162.93165798 -393.461689   1857.36781502 1847.22835814
  319.56143099 1747.9763185  1392.02839435  478.23906166  749.9452529
 -249.24742856 -275.25535772  948.23388608  -91.60388768]
Coefficients: 
 [   -7.48966196   -23.93553988    10.50886596 -1014.36961168
    83.64077857]
Mean squared error: 1109846.44
Coefficient of determination: 0.57
Train:  0.5712020593329324
Test:  0.5665478749973416


In [16]:
# Standard Scaler
ss_X_train = StandardScaler().fit_transform(X_train)

model = LinearRegression()
model.fit(ss_X_train, y_train)

week_train = model.score(ss_X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[39744.28735604 25823.22872146 22703.52459418 40749.34591654
 40661.90488878 29597.45646128 39992.61309052 37651.99531194
 30760.03950775 31451.03442876 23560.69878727 24397.87955648
 33112.17380226 24960.80256893]
Coefficients: 
 [-111.75395188 -301.34849271   86.90664146 -145.2152967   892.38413108]
Mean squared error: 969480738.62
Coefficient of determination: -377.63
Train:  -937.6736557945143
Test:  -377.63209663240264


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [17]:
# Min Max Scaler
mm_X_train = MinMaxScaler().fit_transform(X_train)

model = LinearRegression()
model.fit(mm_X_train, y_train)

week_train = model.score(mm_X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[110298.38926777  72652.35509948  60562.75956494 113234.12100548
 113018.17702607  80882.45327091 110902.18190592 104133.29563208
  84226.76323701  87065.65682137  64767.6706072   68247.67249108
  91795.87036617  67337.46860969]
Coefficients: 
 [-389.4624218  -952.49687636  273.40548419 -588.10806629 2644.26323624]
Mean squared error: 7817877719.35
Coefficient of determination: -3052.28
Train:  -7397.672008291921
Test:  -3052.283385814416


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [18]:
# Max ABS Scaler
mabs_X_train = MaxAbsScaler().fit_transform(X_train)

model = LinearRegression()
model.fit(mabs_X_train, y_train)

week_train = model.score(mabs_X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[273886.61557952 169132.41707189 165308.34468748 281177.34358604
 280275.75026688 209841.1761136  276705.67399523 262263.01569999
 217474.86688539 214056.27407595 157780.3966057  160226.30462943
 225956.74617816 179516.2130944 ]
Coefficients: 
 [ -396.95208376 -1941.19424387   611.63528121  -602.85830331
  5908.49813795]
Mean squared error: 49735598055.61
Coefficient of determination: -19423.31
Train:  -46083.291874492075
Test:  -19423.30934303284


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [19]:
# Robust Scaler
rs_X_train = RobustScaler(quantile_range=(25, 75)).fit_transform(X_train)

model = LinearRegression()
model.fit(rs_X_train, y_train)

week_train = model.score(rs_X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[75105.75223115 48205.19526304 43452.39496539 77034.57945895
 76849.86923843 56298.51435732 75643.18994457 71327.79277089
 58476.2870058  59240.78078114 44188.98102289 45570.27179838
 62419.88912012 47622.90018664]
Coefficients: 
 [-187.24154894 -563.65445612  170.18035009 -209.29390192 1671.85035275]
Mean squared error: 3605400990.95
Coefficient of determination: -1407.09
Train:  -3417.4089761319765
Test:  -1407.0945417709522


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [20]:
# Power Transformer
pt_X_train = PowerTransformer(method="yeo-johnson").fit_transform(X_train)

model = LinearRegression()
model.fit(pt_X_train, y_train)

week_train = model.score(pt_X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[42798.54796547 22361.99080597 22780.91492349 42830.95953196
 42725.43512568 30293.54663212 44009.73716361 40702.35920462
 31027.96243294 33479.52159736 25343.74552904 21154.06298155
 33783.68006667 26535.35877257]
Coefficients: 
 [ -28.40521879  409.55284581 -957.72849807 -426.16232889  932.20780992]
Mean squared error: 1053917434.94
Coefficient of determination: -410.61
Train:  -1022.2325430513084
Test:  -410.6089698045421


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [21]:
# Power Transformer
pt_X_train = PowerTransformer(method="box-cox").fit_transform(X_train)

model = LinearRegression()
model.fit(pt_X_train, y_train)

week_train = model.score(pt_X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[28391.6370929  14876.98763557 16380.20592972 28704.18843763
 28534.13896916 22098.07643912 29965.73432396 28445.7544228
 22249.1550194  21619.01737402 15564.81523959 13354.47865421
 21697.16757052 19771.46081608]
Coefficients: 
 [  -5.70632462  197.191002   -858.83530056 -592.3947055   884.20571814]
Mean squared error: 466248596.95
Coefficient of determination: -181.09
Train:  -451.65118141964706
Test:  -181.09406002723432


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [22]:
# QuantileTransformer
qt_X_train = QuantileTransformer(output_distribution="uniform").fit_transform(X_train)

model = LinearRegression()
model.fit(qt_X_train, y_train)

week_train = model.score(qt_X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[268928.22998615 147930.99959682 124183.6245419  267491.14841721
 267825.29576186 169400.09902413 269173.47748348 241943.22413072
 177469.2366391  215634.88802016 169301.98528126 144686.35957927
 219445.99928665 141418.55956055]
Coefficients: 
 [ -655.53491579  2502.78176062 -3597.95010266   148.47792136
  4196.10737586]
Mean squared error: 42862273263.90
Coefficient of determination: -16738.92
Train:  -41112.10530083097
Test:  -16738.922461426864


  "n_samples." % (self.n_quantiles, n_samples)
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [23]:
# QuantileTransformer
qt_X_train = QuantileTransformer(output_distribution="normal").fit_transform(X_train)

model = LinearRegression()
model.fit(qt_X_train, y_train)

week_train = model.score(qt_X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[34477.55887279 17441.1513626  22560.75596811 34693.95158992
 34601.3943357  26013.57033428 34672.2811977  32456.59795014
 26939.91093127 27233.2343144  20981.72913192 17604.25155682
 28286.42265474 23384.1902281 ]
Coefficients: 
 [  91.19578244  190.01107885 -104.40577606 -209.13704321  306.95212555]
Mean squared error: 705395910.02
Coefficient of determination: -274.49
Train:  -675.707558354736
Test:  -274.493387055042


  "n_samples." % (self.n_quantiles, n_samples)
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [24]:
# Normalizer
n_X_train = Normalizer().fit_transform(X_train)

model = LinearRegression()
model.fit(n_X_train, y_train)

week_train = model.score(n_X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[ -71067.1853626  -110130.76265739 -162037.53294402  -58060.58082509
  -59795.94836423 -144109.07621318  -81723.17933421  -96889.47495147
 -134561.41349629 -110572.82066209 -176393.45796763 -124042.24712663
  -92286.89574731 -122845.93252539]
Coefficients: 
 [   -695.19618169   -8622.44210705    6430.62339005 -399378.36087638
    4149.46695625]
Mean squared error: 13564897328.19
Coefficient of determination: -5296.79
Train:  -16854.820781938095
Test:  -5296.790158564198


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [25]:
# Normalize X before train test split

week_df

target = ['stlheadcount']

# Creating features
X = week_df.drop('stlheadcount', axis=1)
X = preprocessing.normalize(X)

# Creating target
y = week_df['stlheadcount']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

X_test = preprocessing.normalize(X_test)

model = LinearRegression()
model.fit(X_train, y_train)

week_train = model.score(X_train, y_train)
week_test = model.score(X_test, y_test)

predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, predict))
print("Coefficient of determination: %.2f" % r2_score(y_test, predict))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

[1422.61295602  321.16431418   87.4650989  1526.81784448 1512.8305283
  445.17314244 1331.13785689 1147.52923701  601.78941304  890.32931544
 -225.84291869  166.62735054 1110.7621798   541.91790034]
Coefficients: 
 [   -695.19618169   -8622.44210705    6430.62339005 -399378.36087638
    4149.46695625]
Mean squared error: 1551644.85
Coefficient of determination: 0.39
Train:  0.4360359096298755
Test:  0.39400287164913894


In [26]:
from sklearn.linear_model import SGDRegressor
from sklearn.datasets import load_boston
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [27]:
# SGD Regressor

week_df

target = ['stlheadcount']

# Creating features
X = week_df.drop('stlheadcount', axis=1)
X = preprocessing.normalize(X)

# Creating target
y = week_df['stlheadcount']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Train the Logistic Regression model using the resampled data

sgdr = SGDRegressor()

sgdr.fit(X_train, y_train)

score = sgdr.score(X_train, y_train)
print("R-squared: ", score)

cv_score = cross_val_score(sgdr, X, y, cv = 10)
print("CV mean score: ", cv_score.mean())

ypred = sgdr.predict(X_test)

mse = mean_squared_error(y_test, ypred)
print("MSE: ", mse)
print("RMSE: ", mse**(1/2.0)) 
predict = model.predict(X_test)
print(predict)
print("Coefficients: \n", model.coef_)
print("Mean squared error: %.2f" % mean_squared_error(y_test, ypred))
print("Coefficient of determination: %.2f" % r2_score(y_test, ypred))

print("Train: ", model.score(X_train, y_train))
print("Test: ", model.score(X_test, y_test))

R-squared:  0.006300534763253762
CV mean score:  -55082.77444029553
MSE:  2817156.694575654
RMSE:  1678.438767002137
[1422.61295602  321.16431418   87.4650989  1526.81784448 1512.8305283
  445.17314244 1331.13785689 1147.52923701  601.78941304  890.32931544
 -225.84291869  166.62735054 1110.7621798   541.91790034]
Coefficients: 
 [   -695.19618169   -8622.44210705    6430.62339005 -399378.36087638
    4149.46695625]
Mean squared error: 2817156.69
Coefficient of determination: -0.10
Train:  0.4360359096298755
Test:  0.39400287164913894
