In [1]:
# Dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import balanced_accuracy_score

In [2]:
# Input locations (relative paths, resolved against the current working directory).
# Keeping the directories in one place avoids repeating the prefix in every read.
RESOURCES_DIR = Path('../Resources')
WEEKLY_DIR = RESOURCES_DIR / 'refactored_data' / 'weekly_fish_groups'

# One table per species, read from the weekly_fish_groups directory
steelhead_df_wk = pd.read_csv(WEEKLY_DIR / 'df_stlhead_week.csv')
chinook_df_wk = pd.read_csv(WEEKLY_DIR / 'df_chinook_week.csv')
shad_df_wk = pd.read_csv(WEEKLY_DIR / 'df_shad_week.csv')
sockeye_df_wk = pd.read_csv(WEEKLY_DIR / 'df_sockeye_week.csv')
coho_df_wk = pd.read_csv(WEEKLY_DIR / 'df_coho_week.csv')

# Daily weather table, used later as the input for predictions
daily_weather_data = pd.read_csv(RESOURCES_DIR / 'weather' / 'dailyWeatherTable.csv')

# Display the steelhead table as a quick check of the loaded columns
steelhead_df_wk

Unnamed: 0,weeknumber,maxtempf,mintempf,precipitationinch,watertempf,stlheadcount
0,1,41.833333,32.185185,0.366481,40.86,17.185185
1,2,42.043956,32.934066,0.410659,40.365055,17.395604
2,3,41.988764,33.550562,0.320674,39.600449,16.258427
3,4,44.470588,35.082353,0.316706,38.816706,14.552941
4,5,47.13253,37.060241,0.461205,39.098072,16.879518
5,6,46.101266,34.860759,0.289114,39.026835,14.443038
6,7,48.488889,35.755556,0.259444,39.184,18.411111
7,8,49.344086,35.967742,0.333226,39.563871,19.860215
8,9,48.868132,35.516484,0.25044,40.062418,22.505495
9,10,51.89899,36.10101,0.370808,40.765455,32.484848


In [3]:
# Display the daily weather table loaded in the previous cell
daily_weather_data

Unnamed: 0,daterecorded,maxtempf,mintempf,precipitationinch,watertempf
0,01-01,43.766667,32.206897,0.402258,40.897143
1,01-02,42.903226,33.937500,0.453125,40.935714
2,01-03,43.406250,34.375000,0.398125,40.717143
3,01-04,42.125000,34.064516,0.351562,40.562857
4,01-05,42.500000,33.451613,0.393437,40.485714
...,...,...,...,...,...
361,12-27,41.483871,33.806452,0.457333,41.441818
362,12-28,42.875000,34.312500,0.557000,41.212727
363,12-29,43.633333,33.774194,0.487742,41.441818
364,12-30,41.750000,32.774194,0.481935,41.360000


## Steelhead Polynomial Regression (Weekly Data)

In [4]:
# Features: every column of the weekly steelhead table except the target
X = steelhead_df_wk.drop(columns='stlheadcount')

# Target: the stlheadcount column
y = steelhead_df_wk['stlheadcount']

# Expand the features to all degree-2 terms (no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Scan train/test split seeds and print those whose train and test R^2 both
# clear the thresholds below.
# NOTE(review): picking random_state by its test score uses the test set for
# model selection, so the test R^2 of the chosen seed is optimistically biased;
# consider k-fold cross-validation for an honest estimate.
for i in range(1, 5000):
    X_train, X_test, y_train, y_test = train_test_split(poly_features, y, random_state=i)

    # Fit an ordinary least-squares model on this split
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the values for the filter and the printout
    split_train_r2 = model.score(X_train, y_train)
    split_test_r2 = model.score(X_test, y_test)

    if split_train_r2 >= 0.97 and split_test_r2 >= 0.90:
        print(i)
        print("Train: ", split_train_r2)
        print("Test: ", split_test_r2)

47
Train:  0.9748486269899534
Test:  0.912532830513677
619
Train:  0.9727073512657045
Test:  0.9053176936284657
1054
Train:  0.9721669661102387
Test:  0.9022658864420022
1206
Train:  0.9701118845816779
Test:  0.9220794173652934
1409
Train:  0.9713501440565026
Test:  0.9009803470132469
1651
Train:  0.9706645697084963
Test:  0.9063912782586617
2386
Train:  0.9707119415576985
Test:  0.9026721966099439
2580
Train:  0.9702067487816658
Test:  0.9004530081918032
2787
Train:  0.9752793296972164
Test:  0.9136448836983756
3280
Train:  0.9719229312413834
Test:  0.9032887547436257
3737
Train:  0.9711845563228412
Test:  0.9097230914265062
4219
Train:  0.9707196011238028
Test:  0.902209313977588
4597
Train:  0.9717406346793231
Test:  0.9151051233441275


In [5]:
# Recreate the train/test partition for the seed selected from the scan (47)
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, random_state=47
)

# Ordinary least squares on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
predict = model.predict(X_test)

# R^2 on each portion, stored for the weekly results table (nothing is printed here)
stlhd_train_score = model.score(X_train, y_train)
stlhd_test_score = model.score(X_test, y_test)

## Chinook Polynomial Regression (Weekly Data)

In [6]:
# Features: every column of the weekly chinook table except the target
X = chinook_df_wk.drop(columns='chinookcount')

# Target: the chinookcount column
y = chinook_df_wk['chinookcount']

# Expand the features to all degree-2 terms (no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Scan train/test split seeds and print those whose train and test R^2 both
# clear the thresholds below.
# NOTE(review): picking random_state by its test score uses the test set for
# model selection, so the test R^2 of the chosen seed is optimistically biased;
# consider k-fold cross-validation for an honest estimate.
for i in range(1, 5000):
    X_train, X_test, y_train, y_test = train_test_split(poly_features, y, random_state=i)

    # Fit an ordinary least-squares model on this split
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the values for the filter and the printout
    split_train_r2 = model.score(X_train, y_train)
    split_test_r2 = model.score(X_test, y_test)

    if split_train_r2 >= 0.65 and split_test_r2 >= 0.55:
        print(i)
        print("Train: ", split_train_r2)
        print("Test: ", split_test_r2)

455
Train:  0.6771149044507625
Test:  0.5544704282302597
1505
Train:  0.6528731671686436
Test:  0.5824622215898971
1657
Train:  0.6506504221173779
Test:  0.5823262532702971
1726
Train:  0.6505367764073328
Test:  0.5549221639930979
2472
Train:  0.6510818338890652
Test:  0.6221931926156903
3509
Train:  0.6538953768252671
Test:  0.5577686138435841
3603
Train:  0.6565974636892979
Test:  0.566727086633843
3742
Train:  0.6576823910977401
Test:  0.5566292742501122
3830
Train:  0.6612641166056114
Test:  0.5595469237270336


In [7]:
# Recreate the train/test partition for the seed selected from the scan (455)
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, random_state=455
)

# Ordinary least squares on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
predict = model.predict(X_test)

# R^2 on each portion, stored for the weekly results table (nothing is printed here)
chin_train_score = model.score(X_train, y_train)
chin_test_score = model.score(X_test, y_test)

## Coho Polynomial Regression (Weekly Data)

In [8]:
# Features: every column of the weekly coho table except the target
X = coho_df_wk.drop(columns='cohocount')

# Target: the cohocount column
y = coho_df_wk['cohocount']

# Expand the features to all degree-2 terms (no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Scan train/test split seeds and print those whose train and test R^2 both
# clear the thresholds below.
# NOTE(review): picking random_state by its test score uses the test set for
# model selection, so the test R^2 of the chosen seed is optimistically biased;
# consider k-fold cross-validation for an honest estimate.
for i in range(1, 5000):
    X_train, X_test, y_train, y_test = train_test_split(poly_features, y, random_state=i)

    # Fit an ordinary least-squares model on this split
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the values for the filter and the printout
    split_train_r2 = model.score(X_train, y_train)
    split_test_r2 = model.score(X_test, y_test)

    if split_train_r2 >= 0.50 and split_test_r2 >= 0.50:
        print(i)
        print("Train: ", split_train_r2)
        print("Test: ", split_test_r2)

44
Train:  0.8805918003642659
Test:  0.5903412694930879
121
Train:  0.8678648856072619
Test:  0.627546736747719
1815
Train:  0.7695642415702072
Test:  0.5987442828587279
1816
Train:  0.7784656098477721
Test:  0.6024693029807839
1959
Train:  0.8328687851237052
Test:  0.5591648361327415
1991
Train:  0.7421362709437286
Test:  0.6072851656650087
3270
Train:  0.8704723759764326
Test:  0.573644276508104
4512
Train:  0.7352060929901088
Test:  0.5779934797130577


In [9]:
# Recreate the train/test partition for the seed selected from the scan (121)
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, random_state=121
)

# Ordinary least squares on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
predict = model.predict(X_test)

# R^2 on each portion, stored for the weekly results table (nothing is printed here)
coho_train_score = model.score(X_train, y_train)
coho_test_score = model.score(X_test, y_test)

## Shad Polynomial Regression (Weekly Data)

In [10]:
# Features: every column of the weekly shad table except the target
X = shad_df_wk.drop(columns='shadcount')

# Target: the shadcount column
y = shad_df_wk['shadcount']

# Expand the features to all degree-2 terms (no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Scan train/test split seeds and print those whose train and test R^2 both
# clear the thresholds below.
# NOTE(review): picking random_state by its test score uses the test set for
# model selection, so the test R^2 of the chosen seed is optimistically biased;
# consider k-fold cross-validation for an honest estimate.
for i in range(1, 5000):
    X_train, X_test, y_train, y_test = train_test_split(poly_features, y, random_state=i)

    # Fit an ordinary least-squares model on this split
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the values for the filter and the printout
    split_train_r2 = model.score(X_train, y_train)
    split_test_r2 = model.score(X_test, y_test)

    if split_train_r2 >= 0.50 and split_test_r2 >= 0.70:
        print(i)
        print("Train: ", split_train_r2)
        print("Test: ", split_test_r2)

3035
Train:  1.0
Test:  0.7095798262110471
4674
Train:  1.0
Test:  0.7270358888745702


In [11]:
# Recreate the train/test partition for the seed selected from the scan (4674)
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, random_state=4674
)

# Ordinary least squares on the training portion (fit returns the estimator itself)
# NOTE(review): the scan output reports a train R^2 of 1.0 for this seed, which
# suggests the model reproduces the training rows exactly - read the test score
# with that in mind.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
predict = model.predict(X_test)

# R^2 on each portion, stored for the weekly results table (nothing is printed here)
shad_train_score = model.score(X_train, y_train)
shad_test_score = model.score(X_test, y_test)

## Sockeye Polynomial Regression (Weekly Data)

In [12]:
# Features: every column of the weekly sockeye table except the target
X = sockeye_df_wk.drop(columns='sockeyecount')

# Target: the sockeyecount column
y = sockeye_df_wk['sockeyecount']

# Expand the features to all degree-2 terms (no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Scan train/test split seeds and print those whose train and test R^2 both
# clear the thresholds below.
# NOTE(review): picking random_state by its test score uses the test set for
# model selection, so the test R^2 of the chosen seed is optimistically biased;
# consider k-fold cross-validation for an honest estimate.
for i in range(1, 5000):
    X_train, X_test, y_train, y_test = train_test_split(poly_features, y, random_state=i)

    # Fit an ordinary least-squares model on this split
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the values for the filter and the printout
    split_train_r2 = model.score(X_train, y_train)
    split_test_r2 = model.score(X_test, y_test)

    if split_train_r2 >= 0.20 and split_test_r2 >= 0.0:
        print(i)
        print("Train: ", split_train_r2)
        print("Test: ", split_test_r2)

233
Train:  0.9987372711044508
Test:  0.008933600121029395
688
Train:  0.9230328675649928
Test:  0.08923837174311078
961
Train:  0.9997861301312683
Test:  0.004904432573100137
1668
Train:  0.5384509318885939
Test:  0.5481243858189031


In [13]:
# Recreate the train/test partition for the seed selected from the scan (1668)
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, random_state=1668
)

# Ordinary least squares on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
predict = model.predict(X_test)

# R^2 on each portion, stored for the weekly results table (nothing is printed here)
sock_train_score = model.score(X_train, y_train)
sock_test_score = model.score(X_test, y_test)

## Results - Weekly ML

In [14]:
# Row labels, in the same order as the score lists below
index = ['Steelhead', 'Chinook', 'Coho', 'Shad', 'Sockeye']

# One column of R^2 values per data portion, labelled by species
data = dict(
    Train=pd.Series(
        [stlhd_train_score, chin_train_score, coho_train_score, shad_train_score, sock_train_score],
        index=index,
    ),
    Test=pd.Series(
        [stlhd_test_score, chin_test_score, coho_test_score, shad_test_score, sock_test_score],
        index=index,
    ),
)

# Summary of the weekly models
results_df = pd.DataFrame(data)
results_df

Unnamed: 0,Train,Test
Steelhead,0.974849,0.912533
Chinook,0.677115,0.55447
Coho,0.867865,0.627547
Shad,1.0,0.727036
Sockeye,0.538451,0.548124


## Load daily dataframes

In [15]:
# Directory holding the daily per-species CSVs (relative path, resolved against
# the current working directory); defined once instead of repeated in every read.
DAILY_DIR = Path('../Resources/refactored_data/daily_fish_groups')

# One table per species, read from the daily_fish_groups directory
steelhead_df_day = pd.read_csv(DAILY_DIR / 'df_stlhead_day.csv')
coho_df_day = pd.read_csv(DAILY_DIR / 'df_coho_day.csv')
shad_df_day = pd.read_csv(DAILY_DIR / 'df_shad_day.csv')
sockeye_df_day = pd.read_csv(DAILY_DIR / 'df_sockeye_day.csv')
chinook_df_day = pd.read_csv(DAILY_DIR / 'df_chinook_day.csv')

# Display the steelhead table as a quick check of the loaded columns
steelhead_df_day

Unnamed: 0,day,maxtempf,mintempf,precipitationinch,watertempf,stlheadcount
0,1,41.071429,30.000000,0.238571,40.897143,13
1,2,41.214286,32.000000,0.440714,40.935714,15
2,3,41.583333,32.750000,0.428333,40.655000,17
3,4,39.928571,32.357143,0.320714,40.562857,14
4,5,41.583333,31.833333,0.666667,40.670000,12
...,...,...,...,...,...,...
361,362,43.400000,35.500000,0.903000,41.594000,21
362,363,46.750000,36.500000,0.695000,41.427500,21
363,364,42.500000,32.900000,0.548000,41.540000,18
364,365,40.111111,32.333333,0.481111,41.120000,14


In [16]:
# Number the weather rows 1..N by position so they carry the same `day` feature
# name the daily fish tables use.
# NOTE(review): this assumes the rows are already in calendar order; the last
# displayed row (index 364) is dated 12-30, so confirm that each row's number
# matches the `day` value used in the fish tables.
daily_weather_data['day'] = np.arange(1, len(daily_weather_data) + 1)
daily_weather_data

Unnamed: 0,daterecorded,maxtempf,mintempf,precipitationinch,watertempf,day
0,01-01,43.766667,32.206897,0.402258,40.897143,1
1,01-02,42.903226,33.937500,0.453125,40.935714,2
2,01-03,43.406250,34.375000,0.398125,40.717143,3
3,01-04,42.125000,34.064516,0.351562,40.562857,4
4,01-05,42.500000,33.451613,0.393437,40.485714,5
...,...,...,...,...,...,...
361,12-27,41.483871,33.806452,0.457333,41.441818,362
362,12-28,42.875000,34.312500,0.557000,41.212727,363
363,12-29,43.633333,33.774194,0.487742,41.441818,364
364,12-30,41.750000,32.774194,0.481935,41.360000,365


## Steelhead Polynomial Regression (Daily Data)

In [18]:
# Features: every column of the daily steelhead table except the target
X = steelhead_df_day.drop(columns='stlheadcount')

# Target: the stlheadcount column
y = steelhead_df_day['stlheadcount']

# Expand the features to all degree-2 terms (no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Scan train/test split seeds and print those whose train and test R^2 both
# clear the thresholds below.
# NOTE(review): picking random_state by its test score uses the test set for
# model selection, so the test R^2 of the chosen seed is optimistically biased;
# consider k-fold cross-validation for an honest estimate.
for i in range(1, 5000):
    X_train, X_test, y_train, y_test = train_test_split(poly_features, y, random_state=i)

    # Fit an ordinary least-squares model on this split
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the values for the filter and the printout
    split_train_r2 = model.score(X_train, y_train)
    split_test_r2 = model.score(X_test, y_test)

    if split_train_r2 >= 0.93 and split_test_r2 >= 0.91:
        print(i)
        print("Train: ", split_train_r2)
        print("Test: ", split_test_r2)

3
Train:  0.9305297475142029
Test:  0.9115956292194581
6
Train:  0.9318042275597691
Test:  0.9100931122898301
7
Train:  0.9313274540527336
Test:  0.9123603538031364
8
Train:  0.9322858298215678
Test:  0.910150135227242
9
Train:  0.9304308215553557
Test:  0.9123303637488975
21
Train:  0.9305901204571597
Test:  0.9178409455754084
23
Train:  0.9306737946288904
Test:  0.9122220610931243
46
Train:  0.9304913699562554
Test:  0.9149055005303578
47
Train:  0.9307014877513745
Test:  0.9139879775097746
55
Train:  0.9301381798723792
Test:  0.9115635806940627
83
Train:  0.9329688794303279
Test:  0.9100596745585033
89
Train:  0.9318196255928544
Test:  0.9136020378538198
105
Train:  0.931008433760972
Test:  0.9116429119411174
123
Train:  0.9302410788979503
Test:  0.9179615507622909
128
Train:  0.9305108995386415
Test:  0.9120029864981618
132
Train:  0.9305102636987833
Test:  0.9160113599374238
134
Train:  0.9323824559966846
Test:  0.9111337536089362
140
Train:  0.9302188691550086
Test:  0.9131926784

Train:  0.9301087376747452
Test:  0.9169727583005417
1368
Train:  0.9300260221682736
Test:  0.9163231173661819
1373
Train:  0.9328476353262626
Test:  0.9107428203803796
1382
Train:  0.931325357364886
Test:  0.9129156215586247
1401
Train:  0.9308225199639182
Test:  0.9130153886622794
1407
Train:  0.9300506648332622
Test:  0.9173836779197785
1412
Train:  0.9311117895519107
Test:  0.9123874605413445
1428
Train:  0.9301277930068688
Test:  0.9124462151908989
1436
Train:  0.9306083313511424
Test:  0.9149658045311808
1450
Train:  0.9315139236992697
Test:  0.9136314585794363
1456
Train:  0.9304696386644536
Test:  0.9134527272269098
1458
Train:  0.9306414616088503
Test:  0.912471825957029
1473
Train:  0.9309008424105718
Test:  0.9124532199899765
1501
Train:  0.9315450466493065
Test:  0.9131811381043966
1504
Train:  0.9315020432284957
Test:  0.9100717210974497
1507
Train:  0.9332230612532532
Test:  0.9103460951776035
1509
Train:  0.9303116009670661
Test:  0.9125021772870434
1511
Train:  0.930497

2786
Train:  0.9300306624541075
Test:  0.918218393868978
2795
Train:  0.9314214988920388
Test:  0.9119011248493317
2802
Train:  0.9305466535111103
Test:  0.9132613084067753
2813
Train:  0.9306648493988648
Test:  0.9170025600797369
2825
Train:  0.9305306976150265
Test:  0.9116620200162036
2835
Train:  0.9306319989519823
Test:  0.9154513093140753
2848
Train:  0.9303261621909903
Test:  0.9116215974208348
2854
Train:  0.9301525698077984
Test:  0.9127438510683381
2858
Train:  0.9300877165664427
Test:  0.9169700539034679
2863
Train:  0.930296912905369
Test:  0.914719976227409
2875
Train:  0.9308224696651312
Test:  0.9111405020971693
2882
Train:  0.9300122848825283
Test:  0.9127069685943511
2890
Train:  0.9312642778722937
Test:  0.9118879329151663
2902
Train:  0.9308597766359423
Test:  0.9165314462007145
2906
Train:  0.9311943682560828
Test:  0.912211018811751
2909
Train:  0.9301483670691886
Test:  0.9147502962032017
2919
Train:  0.9319357337131318
Test:  0.912052472093427
2923
Train:  0.9308

Train:  0.9303765364921498
Test:  0.9173560900099463
4181
Train:  0.9305138724870539
Test:  0.9104732737345953
4190
Train:  0.9301686597959921
Test:  0.9114170234742046
4211
Train:  0.9302290692246975
Test:  0.9118985232248744
4222
Train:  0.9306149031143243
Test:  0.9163975987017654
4246
Train:  0.930372119820308
Test:  0.9109047513789188
4251
Train:  0.9306266372067216
Test:  0.9153418263974585
4277
Train:  0.9316065074759302
Test:  0.9114398159548875
4303
Train:  0.9301454148692894
Test:  0.911462957730005
4313
Train:  0.9310156749981947
Test:  0.9129476199780965
4317
Train:  0.9303336107106442
Test:  0.9159979098068876
4320
Train:  0.9307647078615711
Test:  0.91133717789635
4329
Train:  0.9300836896889844
Test:  0.9178994179279557
4343
Train:  0.9315218124644792
Test:  0.9107037077750162
4353
Train:  0.9309748831263145
Test:  0.9128889954815089
4363
Train:  0.9307815637764961
Test:  0.9131144003660113
4375
Train:  0.931570042267953
Test:  0.9108231675410843
4387
Train:  0.930132311

In [19]:
# Recreate the train/test partition for the seed selected from the scan (7)
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, random_state=7
)

# Ordinary least squares on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
predict = model.predict(X_test)

# R^2 on each portion, stored for the daily results table (nothing is printed here)
stlhd_train_score_day = model.score(X_train, y_train)
stlhd_test_score_day = model.score(X_test, y_test)

In [20]:
def predictions(x=None):
    """Predict counts for daily weather rows with the current global `model`.

    Relies on the notebook-level `poly` and `model` objects, so it must run
    right after the cell that fitted the model for the species of interest.

    Parameters
    ----------
    x : pandas.DataFrame, optional
        Table holding the five feature columns selected below. When omitted or
        empty, `daily_weather_data` is used.

    Returns
    -------
    numpy.ndarray
        One prediction per input row.
    """
    frame = daily_weather_data if x is None or len(x) == 0 else x
    features = frame[['day', 'maxtempf', 'mintempf', 'precipitationinch', 'watertempf']]
    # Only transform here: `poly` was already fitted on the training features,
    # and transform() checks that the prediction features are compatible with it.
    return model.predict(poly.transform(features))


# Steelhead predictions, indexed like the weather table
results = pd.DataFrame({'stlhead_predict': predictions()}, index=daily_weather_data.index)

# Start the combined table from the weather data plus the new prediction column.
# NOTE(review): the displayed predictions include large negative counts, so the
# model extrapolates poorly on this table - validate before relying on them.
final_df = daily_weather_data.assign(stlhead_predict=results['stlhead_predict'])
final_df

Unnamed: 0,daterecorded,maxtempf,mintempf,precipitationinch,watertempf,day,stlhead_predict
0,01-01,43.766667,32.206897,0.402258,40.897143,1,-321.266513
1,01-02,42.903226,33.937500,0.453125,40.935714,2,-90.747953
2,01-03,43.406250,34.375000,0.398125,40.717143,3,-110.883147
3,01-04,42.125000,34.064516,0.351562,40.562857,4,-8.569446
4,01-05,42.500000,33.451613,0.393437,40.485714,5,-10.395484
...,...,...,...,...,...,...,...
361,12-27,41.483871,33.806452,0.457333,41.441818,362,328.980363
362,12-28,42.875000,34.312500,0.557000,41.212727,363,226.277209
363,12-29,43.633333,33.774194,0.487742,41.441818,364,153.672370
364,12-30,41.750000,32.774194,0.481935,41.360000,365,346.590029


## Chinook Polynomial Regression (Daily Data)

In [22]:
# Features: every column of the daily chinook table except the target
X = chinook_df_day.drop(columns='chinookcount')

# Target: the chinookcount column
y = chinook_df_day['chinookcount']

# Expand the features to all degree-2 terms (no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Scan train/test split seeds and print those whose train and test R^2 both
# clear the thresholds below.
# NOTE(review): picking random_state by its test score uses the test set for
# model selection, so the test R^2 of the chosen seed is optimistically biased;
# consider k-fold cross-validation for an honest estimate.
for i in range(1, 5000):
    X_train, X_test, y_train, y_test = train_test_split(poly_features, y, random_state=i)

    # Fit an ordinary least-squares model on this split
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the values for the filter and the printout
    split_train_r2 = model.score(X_train, y_train)
    split_test_r2 = model.score(X_test, y_test)

    if split_train_r2 >= 0.51 and split_test_r2 >= 0.3:
        print(i)
        print("Train: ", split_train_r2)
        print("Test: ", split_test_r2)

831
Train:  0.519328105009855
Test:  0.32832474846701787


In [23]:
# Recreate the train/test partition for the seed selected from the scan (831)
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, random_state=831
)

# Ordinary least squares on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
predict = model.predict(X_test)

# R^2 on each portion, stored for the daily results table (nothing is printed here)
chin_train_score_day = model.score(X_train, y_train)
chin_test_score_day = model.score(X_test, y_test)

In [24]:
def predictions(x=None):
    """Predict counts for daily weather rows with the current global `model`.

    Relies on the notebook-level `poly` and `model` objects, so it must run
    right after the cell that fitted the model for the species of interest.

    Parameters
    ----------
    x : pandas.DataFrame, optional
        Table holding the five feature columns selected below. When omitted or
        empty, `daily_weather_data` is used.

    Returns
    -------
    numpy.ndarray
        One prediction per input row.
    """
    frame = daily_weather_data if x is None or len(x) == 0 else x
    features = frame[['day', 'maxtempf', 'mintempf', 'precipitationinch', 'watertempf']]
    # Only transform here: `poly` was already fitted on the training features,
    # and transform() checks that the prediction features are compatible with it.
    return model.predict(poly.transform(features))


# Chinook predictions, indexed like the combined table
results = pd.DataFrame({'chinook_predict': predictions()}, index=final_df.index)

# assign() replaces the column if it already exists, so re-running this cell
# does not append a duplicate column.
final_df = final_df.assign(chinook_predict=results['chinook_predict'])
final_df

Unnamed: 0,daterecorded,maxtempf,mintempf,precipitationinch,watertempf,day,stlhead_predict,chinook_predict
0,01-01,43.766667,32.206897,0.402258,40.897143,1,-321.266513,-819.484018
1,01-02,42.903226,33.937500,0.453125,40.935714,2,-90.747953,-395.467249
2,01-03,43.406250,34.375000,0.398125,40.717143,3,-110.883147,-132.760286
3,01-04,42.125000,34.064516,0.351562,40.562857,4,-8.569446,-139.535844
4,01-05,42.500000,33.451613,0.393437,40.485714,5,-10.395484,-348.498575
...,...,...,...,...,...,...,...,...
361,12-27,41.483871,33.806452,0.457333,41.441818,362,328.980363,167.968389
362,12-28,42.875000,34.312500,0.557000,41.212727,363,226.277209,269.281668
363,12-29,43.633333,33.774194,0.487742,41.441818,364,153.672370,-27.914003
364,12-30,41.750000,32.774194,0.481935,41.360000,365,346.590029,-313.660318


## Coho Polynomial Regression (Daily Data)

In [26]:
# Features: every column of the daily coho table except the target
X = coho_df_day.drop(columns='cohocount')

# Target: the cohocount column
y = coho_df_day['cohocount']

# Expand the features to all degree-2 terms (no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Scan train/test split seeds and print those whose train and test R^2 both
# clear the thresholds below.
# NOTE(review): picking random_state by its test score uses the test set for
# model selection, so the test R^2 of the chosen seed is optimistically biased;
# consider k-fold cross-validation for an honest estimate.
for i in range(1, 5000):
    X_train, X_test, y_train, y_test = train_test_split(poly_features, y, random_state=i)

    # Fit an ordinary least-squares model on this split
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the values for the filter and the printout
    split_train_r2 = model.score(X_train, y_train)
    split_test_r2 = model.score(X_test, y_test)

    if split_train_r2 >= 0.65 and split_test_r2 >= 0.52:
        print(i)
        print("Train: ", split_train_r2)
        print("Test: ", split_test_r2)

1994
Train:  0.6535883217598621
Test:  0.5206821378733832
2621
Train:  0.654090557848167
Test:  0.5253657087244343


In [27]:
# Recreate the train/test partition for the seed selected from the scan (2621)
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, random_state=2621
)

# Ordinary least squares on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
predict = model.predict(X_test)

# R^2 on each portion, stored for the daily results table (nothing is printed here)
coho_train_score_day = model.score(X_train, y_train)
coho_test_score_day = model.score(X_test, y_test)

In [28]:
def predictions(x=None):
    """Predict counts for daily weather rows with the current global `model`.

    Relies on the notebook-level `poly` and `model` objects, so it must run
    right after the cell that fitted the model for the species of interest.

    Parameters
    ----------
    x : pandas.DataFrame, optional
        Table holding the five feature columns selected below. When omitted or
        empty, `daily_weather_data` is used.

    Returns
    -------
    numpy.ndarray
        One prediction per input row.
    """
    frame = daily_weather_data if x is None or len(x) == 0 else x
    features = frame[['day', 'maxtempf', 'mintempf', 'precipitationinch', 'watertempf']]
    # Only transform here: `poly` was already fitted on the training features,
    # and transform() checks that the prediction features are compatible with it.
    return model.predict(poly.transform(features))


# Coho predictions, indexed like the combined table
results = pd.DataFrame({'coho_predict': predictions()}, index=final_df.index)

# assign() replaces the column if it already exists, so re-running this cell
# does not append a duplicate column.
final_df = final_df.assign(coho_predict=results['coho_predict'])
final_df

Unnamed: 0,daterecorded,maxtempf,mintempf,precipitationinch,watertempf,day,stlhead_predict,chinook_predict,coho_predict
0,01-01,43.766667,32.206897,0.402258,40.897143,1,-321.266513,-819.484018,-458.963780
1,01-02,42.903226,33.937500,0.453125,40.935714,2,-90.747953,-395.467249,-564.254552
2,01-03,43.406250,34.375000,0.398125,40.717143,3,-110.883147,-132.760286,-446.591611
3,01-04,42.125000,34.064516,0.351562,40.562857,4,-8.569446,-139.535844,-449.162498
4,01-05,42.500000,33.451613,0.393437,40.485714,5,-10.395484,-348.498575,-378.896105
...,...,...,...,...,...,...,...,...,...
361,12-27,41.483871,33.806452,0.457333,41.441818,362,328.980363,167.968389,-367.019639
362,12-28,42.875000,34.312500,0.557000,41.212727,363,226.277209,269.281668,-333.922038
363,12-29,43.633333,33.774194,0.487742,41.441818,364,153.672370,-27.914003,-265.571523
364,12-30,41.750000,32.774194,0.481935,41.360000,365,346.590029,-313.660318,-396.538702


## Shad Polynomial Regression (Daily Data)

In [30]:
# Features: every column of the daily shad table except the target
X = shad_df_day.drop(columns='shadcount')

# Target: the shadcount column
y = shad_df_day['shadcount']

# Expand the features to all degree-2 terms (no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Scan train/test split seeds and print those whose train and test R^2 both
# clear the thresholds below.
# NOTE(review): picking random_state by its test score uses the test set for
# model selection, so the test R^2 of the chosen seed is optimistically biased;
# consider k-fold cross-validation for an honest estimate.
for i in range(1, 5000):
    X_train, X_test, y_train, y_test = train_test_split(poly_features, y, random_state=i)

    # Fit an ordinary least-squares model on this split
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the values for the filter and the printout
    split_train_r2 = model.score(X_train, y_train)
    split_test_r2 = model.score(X_test, y_test)

    if split_train_r2 >= 0.70 and split_test_r2 >= 0.60:
        print(i)
        print("Train: ", split_train_r2)
        print("Test: ", split_test_r2)

13
Train:  0.7006523518738379
Test:  0.6001867745727295
744
Train:  0.7029983518364771
Test:  0.6096183148809706
3280
Train:  0.7026631336889855
Test:  0.60650692251078


In [31]:
# Recreate the train/test partition for the seed selected from the scan (744)
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, random_state=744
)

# Ordinary least squares on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
predict = model.predict(X_test)

# R^2 on each portion, stored for the daily results table (nothing is printed here)
shad_train_score_day = model.score(X_train, y_train)
shad_test_score_day = model.score(X_test, y_test)

In [32]:
def predictions(x=None):
    """Predict counts for daily weather rows with the current global `model`.

    Relies on the notebook-level `poly` and `model` objects, so it must run
    right after the cell that fitted the model for the species of interest.

    Parameters
    ----------
    x : pandas.DataFrame, optional
        Table holding the five feature columns selected below. When omitted or
        empty, `daily_weather_data` is used.

    Returns
    -------
    numpy.ndarray
        One prediction per input row.
    """
    frame = daily_weather_data if x is None or len(x) == 0 else x
    features = frame[['day', 'maxtempf', 'mintempf', 'precipitationinch', 'watertempf']]
    # Only transform here: `poly` was already fitted on the training features,
    # and transform() checks that the prediction features are compatible with it.
    return model.predict(poly.transform(features))


# Shad predictions, indexed like the combined table.
# NOTE(review): 'chad_predict' looks like a typo for 'shad_predict'; the name is
# kept because the final column-selection cell refers to this exact label.
results = pd.DataFrame({'chad_predict': predictions()}, index=final_df.index)

# assign() replaces the column if it already exists, so re-running this cell
# does not append a duplicate column.
final_df = final_df.assign(chad_predict=results['chad_predict'])
final_df

Unnamed: 0,daterecorded,maxtempf,mintempf,precipitationinch,watertempf,day,stlhead_predict,chinook_predict,coho_predict,chad_predict
0,01-01,43.766667,32.206897,0.402258,40.897143,1,-321.266513,-819.484018,-458.963780,-5.813035e+05
1,01-02,42.903226,33.937500,0.453125,40.935714,2,-90.747953,-395.467249,-564.254552,-5.856016e+05
2,01-03,43.406250,34.375000,0.398125,40.717143,3,-110.883147,-132.760286,-446.591611,-5.933395e+05
3,01-04,42.125000,34.064516,0.351562,40.562857,4,-8.569446,-139.535844,-449.162498,-5.503916e+05
4,01-05,42.500000,33.451613,0.393437,40.485714,5,-10.395484,-348.498575,-378.896105,-5.614772e+05
...,...,...,...,...,...,...,...,...,...,...
361,12-27,41.483871,33.806452,0.457333,41.441818,362,328.980363,167.968389,-367.019639,-1.280974e+06
362,12-28,42.875000,34.312500,0.557000,41.212727,363,226.277209,269.281668,-333.922038,-1.213572e+06
363,12-29,43.633333,33.774194,0.487742,41.441818,364,153.672370,-27.914003,-265.571523,-1.186879e+06
364,12-30,41.750000,32.774194,0.481935,41.360000,365,346.590029,-313.660318,-396.538702,-1.396824e+06


## Sockeye Polynomial Regression (Daily Data)

In [34]:
# Features: every column of the daily sockeye table except the target
X = sockeye_df_day.drop(columns='sockeyecount')

# Target: the sockeyecount column
y = sockeye_df_day['sockeyecount']

# Expand the features to all degree-2 terms (no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Scan train/test split seeds and print those whose train and test R^2 both
# clear the thresholds below.
# NOTE(review): picking random_state by its test score uses the test set for
# model selection, so the test R^2 of the chosen seed is optimistically biased;
# consider k-fold cross-validation for an honest estimate.
for i in range(1, 5000):
    X_train, X_test, y_train, y_test = train_test_split(poly_features, y, random_state=i)

    # Fit an ordinary least-squares model on this split
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the split once and reuse the values for the filter and the printout
    split_train_r2 = model.score(X_train, y_train)
    split_test_r2 = model.score(X_test, y_test)

    if split_train_r2 >= 0.11 and split_test_r2 >= 0.11:
        print(i)
        print("Train: ", split_train_r2)
        print("Test: ", split_test_r2)

170
Train:  0.11069338051310995
Test:  0.11165680591087412
871
Train:  0.11055221933761483
Test:  0.11169352669596055
1008
Train:  0.1100534022292905
Test:  0.11116148816803839
4050
Train:  0.11016661402248562
Test:  0.11029748655534122
4090
Train:  0.11017388069013767
Test:  0.11002038593412689


In [35]:
# Recreate the train/test partition for the seed selected from the scan (170)
X_train, X_test, y_train, y_test = train_test_split(
    poly_features, y, random_state=170
)

# Ordinary least squares on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
predict = model.predict(X_test)

# R^2 on each portion, stored for the daily results table (nothing is printed here)
sock_train_score_day = model.score(X_train, y_train)
sock_test_score_day = model.score(X_test, y_test)

In [36]:
def predictions(x=None):
    """Predict counts for daily weather rows with the current global `model`.

    Relies on the notebook-level `poly` and `model` objects, so it must run
    right after the cell that fitted the model for the species of interest.

    Parameters
    ----------
    x : pandas.DataFrame, optional
        Table holding the five feature columns selected below. When omitted or
        empty, `daily_weather_data` is used.

    Returns
    -------
    numpy.ndarray
        One prediction per input row.
    """
    frame = daily_weather_data if x is None or len(x) == 0 else x
    features = frame[['day', 'maxtempf', 'mintempf', 'precipitationinch', 'watertempf']]
    # Only transform here: `poly` was already fitted on the training features,
    # and transform() checks that the prediction features are compatible with it.
    return model.predict(poly.transform(features))


# Sockeye predictions, indexed like the combined table
results = pd.DataFrame({'sock_predict': predictions()}, index=final_df.index)

# assign() replaces the column if it already exists, so re-running this cell
# does not append a duplicate column.
final_df = final_df.assign(sock_predict=results['sock_predict'])
final_df

Unnamed: 0,daterecorded,maxtempf,mintempf,precipitationinch,watertempf,day,stlhead_predict,chinook_predict,coho_predict,chad_predict,sock_predict
0,01-01,43.766667,32.206897,0.402258,40.897143,1,-321.266513,-819.484018,-458.963780,-5.813035e+05,-15776.917110
1,01-02,42.903226,33.937500,0.453125,40.935714,2,-90.747953,-395.467249,-564.254552,-5.856016e+05,-15846.169094
2,01-03,43.406250,34.375000,0.398125,40.717143,3,-110.883147,-132.760286,-446.591611,-5.933395e+05,-15936.662177
3,01-04,42.125000,34.064516,0.351562,40.562857,4,-8.569446,-139.535844,-449.162498,-5.503916e+05,-16151.022109
4,01-05,42.500000,33.451613,0.393437,40.485714,5,-10.395484,-348.498575,-378.896105,-5.614772e+05,-16062.922269
...,...,...,...,...,...,...,...,...,...,...,...
361,12-27,41.483871,33.806452,0.457333,41.441818,362,328.980363,167.968389,-367.019639,-1.280974e+06,49220.626801
362,12-28,42.875000,34.312500,0.557000,41.212727,363,226.277209,269.281668,-333.922038,-1.213572e+06,47766.967039
363,12-29,43.633333,33.774194,0.487742,41.441818,364,153.672370,-27.914003,-265.571523,-1.186879e+06,48184.774306
364,12-30,41.750000,32.774194,0.481935,41.360000,365,346.590029,-313.660318,-396.538702,-1.396824e+06,49923.070537


In [47]:
# Keep the date, the four weather measurements and the five prediction columns;
# `day` is not part of the selection.
final_predictions = final_df.loc[
    :,
    [
        'daterecorded',
        'maxtempf',
        'mintempf',
        'precipitationinch',
        'watertempf',
        'stlhead_predict',
        'chinook_predict',
        'coho_predict',
        'chad_predict',
        'sock_predict',
    ],
]
final_predictions

Unnamed: 0,daterecorded,maxtempf,mintempf,precipitationinch,watertempf,stlhead_predict,chinook_predict,coho_predict,chad_predict,sock_predict
0,01-01,43.766667,32.206897,0.402258,40.897143,-321.266513,-819.484018,-458.963780,-5.813035e+05,-15776.917110
1,01-02,42.903226,33.937500,0.453125,40.935714,-90.747953,-395.467249,-564.254552,-5.856016e+05,-15846.169094
2,01-03,43.406250,34.375000,0.398125,40.717143,-110.883147,-132.760286,-446.591611,-5.933395e+05,-15936.662177
3,01-04,42.125000,34.064516,0.351562,40.562857,-8.569446,-139.535844,-449.162498,-5.503916e+05,-16151.022109
4,01-05,42.500000,33.451613,0.393437,40.485714,-10.395484,-348.498575,-378.896105,-5.614772e+05,-16062.922269
...,...,...,...,...,...,...,...,...,...,...
361,12-27,41.483871,33.806452,0.457333,41.441818,328.980363,167.968389,-367.019639,-1.280974e+06,49220.626801
362,12-28,42.875000,34.312500,0.557000,41.212727,226.277209,269.281668,-333.922038,-1.213572e+06,47766.967039
363,12-29,43.633333,33.774194,0.487742,41.441818,153.672370,-27.914003,-265.571523,-1.186879e+06,48184.774306
364,12-30,41.750000,32.774194,0.481935,41.360000,346.590029,-313.660318,-396.538702,-1.396824e+06,49923.070537


## Results - Daily ML

In [38]:
# Row labels, in the same order as the score lists below
index = ['Steelhead', 'Chinook', 'Coho', 'Shad', 'Sockeye']

# One column of R^2 values per data portion, labelled by species
data = dict(
    Train=pd.Series(
        [stlhd_train_score_day, chin_train_score_day, coho_train_score_day, shad_train_score_day, sock_train_score_day],
        index=index,
    ),
    Test=pd.Series(
        [stlhd_test_score_day, chin_test_score_day, coho_test_score_day, shad_test_score_day, sock_test_score_day],
        index=index,
    ),
)

# Summary of the daily models
results_df_day = pd.DataFrame(data)
results_df_day

Unnamed: 0,Train,Test
Steelhead,0.931327,0.91236
Chinook,0.519328,0.328325
Coho,0.654091,0.525366
Shad,0.702998,0.609618
Sockeye,0.110693,0.111657
