# Logistic Regression Model

## Data Import And Cleaning

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score, confusion_matrix
from sklearn.metrics import roc_curve, auc
import scripts.logistic_regression_functions as lrf
import scripts.helper_functions as hf
from functools import partial

#reloads the module with the most recent updates
%load_ext autoreload
%aimport scripts.logistic_regression_functions, scripts.helper_functions
%autoreload 2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_files = ['../../data/cleaned/utah_2017_vineyard.csv', 
               '../../data/cleaned/utah_2017_marina.csv']

test_files = ['../../data/cleaned/utah_2018_marina.csv',
               '../../data/cleaned/utah_2018_bird.csv']

drop_columns = ['Chlorophyll ug/L', 'Chlorophyll RFU', 'Wiper Pos V',
                'Cable Pwr V', 'Battery V', 'Chlorophyll (ug/L)', 'FDOM RFU']

x_columns = ['Temp C', 'Sp Cond (uS/cm)', 'pH (mV)', 'pH', 'Turbidity (NTU)',
            'ODOSat%', 'ODO (mg/L)']

target_column = 'BGA-Phycocyanin RFU'

# TODO Create average off of minor bloom
RFU_THRESHOLD = 1.2
train_index = 0
test_index = 0
#the operation to be used for math operations during training.
math_operation = partial(np.square)

In [3]:
# Load the data
train_dfs = lrf.import_df_data(train_files, drop_columns)
test_dfs = lrf.import_df_data(test_files, drop_columns)

train_dfs[train_index].head()

Unnamed: 0,Date (mm.dd.yyyy),Time 24hr,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU
0,5/5/2017,0:00,15.02,1848,-100.1,8.36,16.84,90.2,9.04,0.4
1,5/5/2017,0:15,14.99,1847,-100.1,8.36,16.76,90.2,9.04,0.4
2,5/5/2017,0:30,14.96,1847,-100.1,8.36,16.82,90.1,9.04,0.4
3,5/5/2017,0:45,14.95,1848,-100.1,8.36,17.19,90.0,9.03,0.4
4,5/5/2017,1:00,14.92,1848,-100.0,8.36,16.85,89.8,9.02,0.4


In [4]:
test_dfs[test_index].head()

Unnamed: 0,Time America/Boise UTC-06:00,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU
0,4/11/2018 10:30,11.88,1782,-146.1,8.2,42.49,89.1,9.58,0.6
1,4/11/2018 10:45,11.78,1783,-143.7,8.15,42.89,87.4,9.42,0.6
2,4/11/2018 11:00,11.79,1784,-144.3,8.17,41.24,89.0,9.58,0.7
3,4/11/2018 11:15,11.66,1783,-143.1,8.14,42.65,87.1,9.41,0.7
4,4/11/2018 11:30,11.68,1784,-143.5,8.15,42.06,87.5,9.45,0.7


In [5]:
train_dfs[train_index].describe()

Unnamed: 0,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU
count,18947.0,18947.0,18947.0,18947.0,18947.0,18947.0,18947.0,18947.0
mean,18.488235,1933.564575,-116.806951,8.633415,58.129348,87.05698,8.179008,0.979511
std,6.134464,151.800551,8.97383,0.147231,54.622278,13.793755,1.278416,0.731423
min,5.63,3.0,-153.7,7.93,0.66,62.8,5.17,0.0
25%,12.7,1848.0,-120.8,8.53,29.85,80.0,7.26,0.5
50%,20.32,1939.0,-115.7,8.65,42.14,83.2,8.08,0.9
75%,23.63,2070.0,-112.1,8.7,65.795,88.9,9.25,1.2
max,28.57,2164.0,-76.5,9.23,636.7,209.7,16.17,16.1


In [6]:
train_dfs[train_index][[target_column]].describe()

Unnamed: 0,BGA-Phycocyanin RFU
count,18947.0
mean,0.979511
std,0.731423
min,0.0
25%,0.5
50%,0.9
75%,1.2
max,16.1


In [7]:
test_dfs[test_index].describe()

Unnamed: 0,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU
count,19487.0,19487.0,19487.0,19487.0,19487.0,19487.0,19487.0,19487.0
mean,19.131184,1990.294607,-196.624329,8.922357,63.929067,90.057202,8.309566,1.459773
std,5.045367,230.743946,27.769024,0.357733,70.885989,17.627283,1.486747,1.31196
min,0.0,0.0,-284.0,0.0,-0.88,0.0,0.0,-0.4
25%,15.48,1869.0,-216.95,8.73,33.75,79.1,7.28,0.6
50%,20.19,1958.0,-192.0,8.92,48.38,85.3,8.01,1.0
75%,23.07,2149.0,-179.5,9.1,68.275,95.3,9.24,1.7
max,28.6,2417.0,0.0,10.1,1248.92,245.5,19.59,21.1


## Use Datetime Index

In [8]:
print(test_dfs[test_index].dtypes)
print(train_dfs[train_index].dtypes)

Time America/Boise UTC-06:00     object
Temp C                          float64
Sp Cond (uS/cm)                   int64
pH (mV)                         float64
pH                              float64
Turbidity (NTU)                 float64
ODOSat%                         float64
ODO (mg/L)                      float64
BGA-Phycocyanin RFU             float64
dtype: object
Date (mm.dd.yyyy)       object
Time 24hr               object
Temp C                 float64
Sp Cond (uS/cm)          int64
pH (mV)                float64
pH                     float64
Turbidity (NTU)        float64
ODOSat%                float64
ODO (mg/L)             float64
BGA-Phycocyanin RFU    float64
dtype: object


In [9]:
# We will need to engineer a datetime column for our 2017 data

# Create a single datetime column from the strings provided in our csv's
for i in range(0, len(train_dfs)):
    timestamp = train_dfs[i]['Date (mm.dd.yyyy)'] + ' '+ train_dfs[i]['Time 24hr']
    timestamp = pd.to_datetime(timestamp)
    train_dfs[i]['timestamp'] = timestamp
    train_dfs[i]['datetime'] = timestamp
    train_dfs[i] = train_dfs[i].set_index('timestamp')
    train_dfs[i] = train_dfs[i].drop(columns=['Date (mm.dd.yyyy)', 'Time 24hr'])
train_dfs[train_index].head()

Unnamed: 0_level_0,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-05-05 00:00:00,15.02,1848,-100.1,8.36,16.84,90.2,9.04,0.4,2017-05-05 00:00:00
2017-05-05 00:15:00,14.99,1847,-100.1,8.36,16.76,90.2,9.04,0.4,2017-05-05 00:15:00
2017-05-05 00:30:00,14.96,1847,-100.1,8.36,16.82,90.1,9.04,0.4,2017-05-05 00:30:00
2017-05-05 00:45:00,14.95,1848,-100.1,8.36,17.19,90.0,9.03,0.4,2017-05-05 00:45:00
2017-05-05 01:00:00,14.92,1848,-100.0,8.36,16.85,89.8,9.02,0.4,2017-05-05 01:00:00


In [10]:
# Now to datetime index our 2018 data
for i in range(0, len(test_dfs)):
    timestamp = pd.to_datetime(test_dfs[i]['Time America/Boise UTC-06:00'])
    test_dfs[i]['timestamp'] = timestamp
    test_dfs[i]['datetime'] = timestamp
    test_dfs[i] = test_dfs[i].set_index('timestamp')
    test_dfs[i] = test_dfs[i].drop(columns=['Time America/Boise UTC-06:00'])
test_dfs[test_index].head()

Unnamed: 0_level_0,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-04-11 10:30:00,11.88,1782,-146.1,8.2,42.49,89.1,9.58,0.6,2018-04-11 10:30:00
2018-04-11 10:45:00,11.78,1783,-143.7,8.15,42.89,87.4,9.42,0.6,2018-04-11 10:45:00
2018-04-11 11:00:00,11.79,1784,-144.3,8.17,41.24,89.0,9.58,0.7,2018-04-11 11:00:00
2018-04-11 11:15:00,11.66,1783,-143.1,8.14,42.65,87.1,9.41,0.7,2018-04-11 11:15:00
2018-04-11 11:30:00,11.68,1784,-143.5,8.15,42.06,87.5,9.45,0.7,2018-04-11 11:30:00


## Fill missing values


In [11]:
# Fill all missing values with the mean
for df in test_dfs + train_dfs:
    for column in df.columns:
        number_null = np.sum(df[column].isnull())
        column_length = len(df[column])
        print('{} percent null: {:.1f}%'.format(
            column, (number_null / column_length) * 100))
        if number_null > 0:
            print("Filling {} with mean: {}\n".format(column, df[column].mean()))
            df[column] = df[column].fillna(df[column].mean()) 

Temp C percent null: 0.0%
Sp Cond (uS/cm) percent null: 0.0%
pH (mV) percent null: 0.0%
pH percent null: 0.0%
Turbidity (NTU) percent null: 0.0%
ODOSat% percent null: 0.0%
ODO (mg/L) percent null: 0.0%
BGA-Phycocyanin RFU percent null: 0.0%
datetime percent null: 0.0%
Temp C percent null: 0.0%
Sp Cond (uS/cm) percent null: 0.0%
pH (mV) percent null: 0.0%
pH percent null: 0.0%
Turbidity (NTU) percent null: 0.0%
ODOSat% percent null: 0.0%
ODO (mg/L) percent null: 0.0%
BGA-Phycocyanin RFU percent null: 0.0%
datetime percent null: 0.0%
Temp C percent null: 0.0%
Sp Cond (uS/cm) percent null: 0.0%
pH (mV) percent null: 0.0%
pH percent null: 0.0%
Turbidity (NTU) percent null: 0.0%
ODOSat% percent null: 0.0%
ODO (mg/L) percent null: 0.0%
BGA-Phycocyanin RFU percent null: 0.0%
datetime percent null: 0.0%
Temp C percent null: 0.0%
Sp Cond (uS/cm) percent null: 0.0%
pH (mV) percent null: 0.0%
pH percent null: 0.0%
Turbidity (NTU) percent null: 3.2%
Filling Turbidity (NTU) with mean: 64.9625066543

## Add Weather Data

In [12]:
## Import And Clean Weather Data
weather = pd.read_csv('../../data/cleaned/daily_weather_metric_2017_2018.csv')

# Find out how much of the data is missing for each column.
drop_columns = []
missing_threshold = .05
for column in weather.columns:
    number_null = np.sum(weather[column].isnull())
    column_length = len(weather[column])
    should_drop_column = (number_null / column_length) > missing_threshold
    print('{} percent null: {:.1f}%'.format(column, (number_null / column_length) * 100))
    if should_drop_column:
        drop_columns.append(column)
print('dropping columns: {}'.format(drop_columns))
weather = weather.drop(drop_columns, axis=1)

# Drop the columns we will not be using.
weather = weather.drop(columns=['STATION','NAME','LATITUDE','LONGITUDE','ELEVATION', 'TOBS'])

weather['DATE'] = pd.to_datetime(weather['DATE'])
# Set a datetime index for convenience
weather['timestamp'] = weather['DATE']
weather = weather.set_index('timestamp')
print(weather.dtypes)

weather['DATE'] = weather['DATE'].apply(lambda x: x.date())

STATION percent null: 0.0%
NAME percent null: 0.0%
LATITUDE percent null: 0.0%
LONGITUDE percent null: 0.0%
ELEVATION percent null: 0.0%
DATE percent null: 0.0%
DAPR percent null: 99.7%
EVAP percent null: 50.6%
MDPR percent null: 99.7%
MNPN percent null: 50.9%
MXPN percent null: 50.9%
PRCP percent null: 0.9%
SNOW percent null: 0.5%
SNWD percent null: 0.5%
TAVG percent null: 100.0%
TMAX percent null: 0.2%
TMIN percent null: 0.0%
TOBS percent null: 0.5%
WESD percent null: 100.0%
WESF percent null: 100.0%
WT01 percent null: 100.0%
WT03 percent null: 99.5%
WT04 percent null: 100.0%
WT05 percent null: 100.0%
WT11 percent null: 100.0%
dropping columns: ['DAPR', 'EVAP', 'MDPR', 'MNPN', 'MXPN', 'TAVG', 'WESD', 'WESF', 'WT01', 'WT03', 'WT04', 'WT05', 'WT11']
DATE    datetime64[ns]
PRCP           float64
SNOW           float64
SNWD           float64
TMAX           float64
TMIN           float64
dtype: object


In [13]:
# Add the weather data to our training/testing sets
count = 0
total = len(train_dfs + test_dfs)
for df in train_dfs + test_dfs:
    count += 1
    print("starting dataframe {}/{}".format(count, total))
    for column in weather.drop(columns='DATE').columns:
        print("adding series for column:", column)
        values = []
        old_datetime = None
        value = None
        for d in df['datetime']:
            d = d.date()
            if d != old_datetime:
                old_datetime = d
                value = weather[weather['DATE'] == d][column]
            values.append(value.values[0] if len(value.values) > 0 else np.nan)
        df[column] = values

starting dataframe 1/4
adding series for column: PRCP
adding series for column: SNOW
adding series for column: SNWD
adding series for column: TMAX
adding series for column: TMIN
starting dataframe 2/4
adding series for column: PRCP
adding series for column: SNOW
adding series for column: SNWD
adding series for column: TMAX
adding series for column: TMIN
starting dataframe 3/4
adding series for column: PRCP
adding series for column: SNOW
adding series for column: SNWD
adding series for column: TMAX
adding series for column: TMIN
starting dataframe 4/4
adding series for column: PRCP
adding series for column: SNOW
adding series for column: SNWD
adding series for column: TMAX
adding series for column: TMIN


### Adding the Wind to weather


In [14]:
weather_files = ['../../data/raw_data/weather/provo_airport_2017', '../../data/raw_data/weather/provo_airport_2018']
# the array to store the readings in
contents = []

# Extract the weather data in the ISD format
for file in weather_files:
    contents.append(hf.extract_weather_data(file))

In [15]:
# the dfs containing the wind information for the readings.
wind_frame_2017 = pd.DataFrame(columns=["angle","speed"])
wind_frame_2018 = pd.DataFrame(columns=["angle","speed"])


wind_frames = [wind_frame_2017, wind_frame_2018]
for i in range(len(contents)):
    wind_dict = {}
    for reading in contents[i]:
        reading.GPO_timestamp = hf.round_time(reading.GPO_timestamp)

        # no key value made yet
        if reading.GPO_timestamp not in wind_dict.keys():
            # check to see if the readings are valid
            # Sets them to 'invalid' so that when we inpute them it can be handled
            if reading.WO_wind_angle == '999':
                reading.WO_wind_angle = np.nan
            else:
                reading.WO_wind_angle = int(reading.WO_wind_angle)
            if reading.WO_wind_speed == '9999':
                reading.WO_wind_speed = np.nan
            else:
                reading.WO_wind_speed = int(reading.WO_wind_speed)
            wind_dict[reading.GPO_timestamp] = [reading.WO_wind_angle, reading.WO_wind_speed]
    wind_frames[i] = pd.DataFrame.from_dict(wind_dict, orient='index')
    wind_frames[i] = wind_frames[i].rename(columns={0:'Wind Angle', 1:'Wind Speed'})

In [16]:
#we need to join the wind data now to the existing dataframes, but the dont care about

for i in range(len(train_dfs)):
    # train dfs is the 2017 data set so we need to set the wind frame to that 
    # dataset
    train_dfs[i] = train_dfs[i].join(wind_frames[0])
for i in range(len(test_dfs)):
    # test dfs is the 2018 data set so we need to set the wind frame to that
    # dataset
    test_dfs[i] = test_dfs[i].join(wind_frames[1])

Need to deal with the NaNs in the dataframe for the WInd Angle and Wind Speed. There is a couple of ways that we can do this. One way is to assume that the wind doesn't change direction from the last valid responce (or the next in the case where the first entry is a NaN) or to assume that weather acts predictibly and will change direction and angle easily from entry to entry. 

I.E first entry 120 angle and 36 m/s speed, second (valid) entry 160 angle and 24 m/s speed slowing pan the angle inbetween and decrese the speed throughout so the entrries would look something like this:

From:

|Time Stamp|angle | speed |
|--|--|--|
|2018-04-11 12:00:00|120|36|
|2018-04-11 12:15:00|NaN|NaN|
|2018-04-11 12:30:00|NaN|NaN|
|2018-04-11 12:45:00|NaN|NaN|
|2018-04-11 13:00:00|160|24|

To:

|Time Stamp|angle | speed |
|--|--|--|
|2018-04-11 12:00:00|120|36|
|2018-04-11 12:15:00|130|33 |
|2018-04-11 12:30:00|140|30 |
|2018-04-11 12:45:00|150|27 |
|2018-04-11 13:00:00|160|24 |

In [17]:
for i in range(len(train_dfs)):
    train_dfs[i]['Wind Angle'] = train_dfs[i]['Wind Angle'].interpolate(limit_direction='both')
    train_dfs[i]['Wind Speed'] = train_dfs[i]['Wind Speed'].interpolate(limit_direction='both')
for i in range(len(test_dfs)):
    #test dfs is the 2018 data set so we need to set the wind frame to that dataset
    test_dfs[i]['Wind Angle'] = test_dfs[i]['Wind Angle'].interpolate(limit_direction='both')
    test_dfs[i]['Wind Speed'] = test_dfs[i]['Wind Speed'].interpolate(limit_direction='both')

## Extract Windows


In [18]:
x_window = pd.Timedelta('28 days')
# Segment each data frame
for i in range(0, len(train_dfs)):
    print("Windowizing 2017 data set:", i)
    train_dfs[i] = hf.data_window_reduction(
        train_dfs[i], 'datetime', target_column, x_win_size=x_window)
    print("Windowizing 2018 data set:", i)
    test_dfs[i] = hf.data_window_reduction(
        test_dfs[i], 'datetime', target_column, x_win_size=x_window)
    print()

Windowizing 2017 data set: 0
Segmenting...
Extracting feature windows...
Extracting target windows...
Combining extractions...
Windowizing 2018 data set: 0
Segmenting...
Extracting feature windows...
Extracting target windows...
Combining extractions...

Windowizing 2017 data set: 1
Segmenting...
Extracting feature windows...
Extracting target windows...
Combining extractions...
Windowizing 2018 data set: 1
Segmenting...
Extracting feature windows...
Extracting target windows...
Combining extractions...



## Add Weather Categories

In [19]:
# Add a rainy category
for df in test_dfs + train_dfs:
    df['rained'] = df['PRCP'].apply(
        lambda x: 1 if x > 0 else 0).astype('category')
test_dfs[test_index].columns

# add the weather columns to our x_columns
x_columns = list(set(x_columns 
                     + ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN', 'rained', 'Wind Speed', 'Wind Angle']))

## Logistic Regression Model

In [20]:
lrf.add_target_column(train_dfs + test_dfs, 
                      threshold=RFU_THRESHOLD)

### Turbidity investigation

In [21]:
# We are missing some Turbidity data in the 2017 marina set which may be effecting the model's performance since it occurs 
# about 3-4 weeks prior to the first major bloom in 2017, well within the 28 day window size. 
# The missing Turbidity data has been Imputed with the mean value

max_iter = 25000
loss = "log"
columns = ['Turbidity (NTU)']

marina_accuracies = []
vineyard_accuracies = []
# Cross train turbidity only at each site
for i in range(len(train_dfs)):
    train = train_dfs[i].copy()
    for j in range(len(test_dfs)):
        test = test_dfs[j].copy()
        print("Training on:", train_files[i])
        print("Testing on:", test_files[j])
        model = SGDClassifier(max_iter=max_iter, loss=loss)
        accuracy, recall, precision, cm, _, _, _ = lrf.train_model(
            model, train, test, columns, 'bloom')
        print("Accuracy", accuracy)
        print("Recall:", recall)
        print("Precision", precision)
        print("Confusion Matrix:\n", cm)
        if i == 0:
            vineyard_accuracies.append(accuracy)
        else:
            marina_accuracies.append(accuracy)
            
        # Cross train
        print("Training on:", test_files[j])
        print("Testing on:", train_files[i])
        model = SGDClassifier(max_iter=max_iter, loss=loss)
        accuracy, recall, precision, cm, _, _, _ = lrf.train_model(
            model, test, train, columns, 'bloom')
        print("Accuracy", accuracy)
        print("Recall:", recall)
        print("Precision", precision)
        print("Confusion Matrix:\n", cm)
        if i == 0:
            vineyard_accuracies.append(accuracy)
        else:
            marina_accuracies.append(accuracy)
            
print("Vineyard Accuracies:", vineyard_accuracies)
print("Marina Accuracies:", marina_accuracies)
print("Vineyard mean accuracy:", np.mean(vineyard_accuracies))
print("Marina mean accuracy:", np.mean(marina_accuracies))

Training on: ../../data/cleaned/utah_2017_vineyard.csv
Testing on: ../../data/cleaned/utah_2018_marina.csv
Accuracy 0.45484949832775917
Recall: 0.5775401069518716
Precision 0.5625
Confusion Matrix:
 [[ 28  84]
 [ 79 108]]
Training on: ../../data/cleaned/utah_2018_marina.csv
Testing on: ../../data/cleaned/utah_2017_vineyard.csv
Accuracy 0.43252595155709345
Recall: 0.9615384615384616
Precision 0.44014084507042256
Confusion Matrix:
 [[  0 159]
 [  5 125]]
Training on: ../../data/cleaned/utah_2017_vineyard.csv
Testing on: ../../data/cleaned/utah_2018_bird.csv
Accuracy 0.49498327759197325
Recall: 0.4263157894736842
Precision 0.6585365853658537
Confusion Matrix:
 [[ 67  42]
 [109  81]]
Training on: ../../data/cleaned/utah_2018_bird.csv
Testing on: ../../data/cleaned/utah_2017_vineyard.csv
Accuracy 0.4740484429065744
Recall: 0.9538461538461539
Precision 0.45925925925925926
Confusion Matrix:
 [[ 13 146]
 [  6 124]]
Training on: ../../data/cleaned/utah_2017_marina.csv
Testing on: ../../data/cle

We see more varience but not much difference in the average.
#### Greedy algorithm on each site

In [22]:
max_iter = 25000
loss = "log"

marina_accuracies = []
vineyard_accuracies = []
# Cross train turbidity only at each site
for i in range(len(train_dfs)):
    train = train_dfs[i].copy()
    for j in range(len(test_dfs)):
        test = test_dfs[j].copy()
        print("Training on:", train_files[i])
        print("Testing on:", test_files[j])
        sorted_columns = lrf.sort_columns_by_metric(model, train, 
                                            test,
                                            x_columns,
                                            'bloom', verbose=0)
        model = SGDClassifier(max_iter=max_iter, loss=loss)
        accuracy, recall, precision, cm, _, _, _ = lrf.greedy_model(
            model, train, test, x_columns, 'bloom', sorted_columns)
        print("Accuracy", accuracy)
        print("Recall:", recall)
        print("Precision", precision)
        print("Confusion Matrix:\n", cm)
        if i == 0:
            vineyard_accuracies.append(accuracy)
        else:
            marina_accuracies.append(accuracy)
            
        # Cross train
        print("Training on:", test_files[j])
        print("Testing on:", train_files[i])
        sorted_columns = lrf.sort_columns_by_metric(model, train, 
                                    test,
                                    x_columns,
                                    'bloom', verbose=0)
        model = SGDClassifier(max_iter=max_iter, loss=loss)
        accuracy, recall, precision, cm, _, _, _ = lrf.greedy_model(
            model, test, train, x_columns, 'bloom', sorted_columns)
        print("Accuracy", accuracy)
        print("Recall:", recall)
        print("Precision", precision)
        print("Confusion Matrix:\n", cm)
        if i == 0:
            vineyard_accuracies.append(accuracy)
        else:
            marina_accuracies.append(accuracy)
            
print("Vineyard Accuracies:", vineyard_accuracies)
print("Marina Accuracies:", marina_accuracies)
print("Vineyard mean accuracy:", np.mean(vineyard_accuracies))
print("Marina mean accuracy:", np.mean(marina_accuracies))

Training on: ../../data/cleaned/utah_2017_vineyard.csv
Testing on: ../../data/cleaned/utah_2018_marina.csv


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'TMIN', 'ODO (mg/L)', 'pH', 'Wind Speed', 'Temp C', 'TMAX', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'TMIN', 'ODO (mg/L)', 'pH', 'Wind Speed', 'Temp C', 'TMAX', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.8361204013377926
Test model recall: 0.7914438502673797
Test model precision: 0.9367088607594937

Updating greedy model

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.8561872909698997
Test model recall: 0.8235294117647058
Test model precision: 0.9390243902439024

Updating greedy model

Training model with: ['ODOSat%', 'TMIN', 'ODO (mg/L)']
Test model accuracy: 0.8361204013377926
Test model recall: 0.8074866310160428
Test model precision: 0.9207317073170732

Training model with: ['ODOSat%', 'TMIN', 'pH']
Test model accuracy: 0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'TMIN', 'ODO (mg/L)', 'pH', 'Wind Speed', 'Temp C', 'TMAX', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'TMIN', 'ODO (mg/L)', 'pH', 'Wind Speed', 'Temp C', 'TMAX', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.6643598615916955
Test model recall: 0.7384615384615385
Test model precision: 0.6037735849056604

Updating greedy model

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.6643598615916955
Test model recall: 0.7384615384615385
Test model precision: 0.6037735849056604

Training model with: ['ODOSat%', 'ODO (mg/L)']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7307692307692307
Test model precision: 0.6012658227848101

Training model with: ['ODOSat%', 'pH']
Test model accuracy: 0.5467128027681661
Test model recall: 0.

  'precision', 'predicted', average, warn_for)


Test model accuracy: 0.5501730103806228
Test model recall: 0.0
Test model precision: 0.0

Training model with: ['ODOSat%', 'PRCP']
Test model accuracy: 0.657439446366782
Test model recall: 0.7538461538461538
Test model precision: 0.593939393939394

Training model with: ['ODOSat%', 'Turbidity (NTU)']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7538461538461538
Test model precision: 0.5975609756097561

Training model with: ['ODOSat%', 'rained']
Test model accuracy: 0.657439446366782
Test model recall: 0.7461538461538462
Test model precision: 0.5950920245398773

Training model with: ['ODOSat%', 'Wind Angle']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7538461538461538
Test model precision: 0.5975609756097561

Training model with: ['ODOSat%', 'SNOW']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7538461538461538
Test model precision: 0.5975609756097561

Training model with: ['ODOSat%', 'SNWD']
Test model accuracy: 0.6608996539792388
Test mod

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['TMIN', 'Temp C', 'TMAX', 'pH', 'Wind Speed', 'pH (mV)', 'ODOSat%', 'PRCP', 'Turbidity (NTU)', 'ODO (mg/L)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['TMIN', 'Temp C', 'TMAX', 'pH', 'Wind Speed', 'pH (mV)', 'ODOSat%', 'PRCP', 'Turbidity (NTU)', 'ODO (mg/L)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
Training model with: ['TMIN']
Test model accuracy: 0.8127090301003345
Test model recall: 0.8
Test model precision: 0.8941176470588236

Updating greedy model

Training model with: ['TMIN', 'Temp C']
Test model accuracy: 0.8762541806020067
Test model recall: 0.9473684210526315
Test model precision: 0.8695652173913043

Updating greedy model

Training model with: ['TMIN', 'Temp C', 'TMAX']
Test model accuracy: 0.8795986622073578
Test model recall: 0.9473684210526315
Test model precision: 0.8737864077669902

Updating greedy model

Training model with: ['TMIN', 'Temp C', 'TMAX', 'pH']
Test model accurac

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['TMIN', 'Temp C', 'TMAX', 'pH', 'Wind Speed', 'pH (mV)', 'ODOSat%', 'PRCP', 'Turbidity (NTU)', 'ODO (mg/L)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['TMIN', 'Temp C', 'TMAX', 'pH', 'Wind Speed', 'pH (mV)', 'ODOSat%', 'PRCP', 'Turbidity (NTU)', 'ODO (mg/L)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
Training model with: ['TMIN']
Test model accuracy: 0.629757785467128
Test model recall: 0.823076923076923
Test model precision: 0.5602094240837696

Updating greedy model

Training model with: ['TMIN', 'Temp C']
Test model accuracy: 0.629757785467128
Test model recall: 0.823076923076923
Test model precision: 0.5602094240837696

Training model with: ['TMIN', 'TMAX']
Test model accuracy: 0.6643598615916955
Test model recall: 0.5769230769230769
Test model precision: 0.6410256410256411

Updating greedy model

Training model with: ['TMIN', 'TMAX', 'pH']
Test model accuracy: 0.6678200692041523
Test model

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'TMAX', 'Temp C', 'Wind Speed', 'pH', 'PRCP', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'TMAX', 'Temp C', 'Wind Speed', 'pH', 'PRCP', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.8127090301003345
Test model recall: 0.7379679144385026
Test model precision: 0.9517241379310345

Updating greedy model

Training model with: ['ODOSat%', 'ODO (mg/L)']
Test model accuracy: 0.7190635451505016
Test model recall: 0.6096256684491979
Test model precision: 0.912

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.8193979933110368
Test model recall: 0.7647058823529411
Test model precision: 0.934640522875817

Updating greedy model

Training model with: ['ODOSat%', 'TMIN', 'TMAX']
Test model accuracy: 0.7190635451505016
Te

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'TMAX', 'Temp C', 'Wind Speed', 'pH', 'PRCP', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'TMAX', 'Temp C', 'Wind Speed', 'pH', 'PRCP', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.7185185185185186
Test model recall: 0.853448275862069
Test model precision: 0.6265822784810127

Updating greedy model

Training model with: ['ODOSat%', 'ODO (mg/L)']
Test model accuracy: 0.7185185185185186
Test model recall: 0.853448275862069
Test model precision: 0.6265822784810127

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.737037037037037
Test model recall: 0.8793103448275862
Test model precision: 0.6415094339622641

Updating greedy model

Training model with: ['ODOSat%', 'TMIN', 'TMAX']
Test model accuracy: 0.72962962

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['TMIN', 'TMAX', 'Temp C', 'pH', 'ODOSat%', 'Wind Speed', 'PRCP', 'ODO (mg/L)', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['TMIN', 'TMAX', 'Temp C', 'pH', 'ODOSat%', 'Wind Speed', 'PRCP', 'ODO (mg/L)', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
Training model with: ['TMIN']
Test model accuracy: 0.8127090301003345
Test model recall: 0.8
Test model precision: 0.8941176470588236

Updating greedy model

Training model with: ['TMIN', 'TMAX']
Test model accuracy: 0.7290969899665551
Test model recall: 0.631578947368421
Test model precision: 0.916030534351145

Training model with: ['TMIN', 'Temp C']
Test model accuracy: 0.6488294314381271
Test model recall: 0.4473684210526316
Test model precision: 1.0

Training model with: ['TMIN', 'pH']
Test model accuracy: 0.882943143812709
Test model recall: 0.8631578947368421
Test model precision: 0.94797687

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['TMIN', 'TMAX', 'Temp C', 'pH', 'ODOSat%', 'Wind Speed', 'PRCP', 'ODO (mg/L)', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['TMIN', 'TMAX', 'Temp C', 'pH', 'ODOSat%', 'Wind Speed', 'PRCP', 'ODO (mg/L)', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
Training model with: ['TMIN']
Test model accuracy: 0.7
Test model recall: 0.9741379310344828
Test model precision: 0.5916230366492147

Updating greedy model

Training model with: ['TMIN', 'TMAX']
Test model accuracy: 0.6925925925925925
Test model recall: 0.646551724137931
Test model precision: 0.6410256410256411

Training model with: ['TMIN', 'Temp C']
Test model accuracy: 0.7
Test model recall: 0.9741379310344828
Test model precision: 0.5916230366492147

Training model with: ['TMIN', 'pH']
Test model accuracy: 0.7703703703703704
Test model recall: 0.8879310344827587
Test model precision: 0.677631

Marina did slightly better on average yet again
#### Combining the 2018 data sets

In [24]:
max_iter = 25000
loss = "log"

marina_accuracies = []
vineyard_accuracies = []
# Cross train turbidity only at each site
for i in range(len(train_dfs)):
    train = train_dfs[i].copy()
    test = test_dfs[0].copy()
    test.append(test_dfs[1].copy())
    print("Training on:", train_files[i])
    print("Testing on combined 2018")
    sorted_columns = lrf.sort_columns_by_metric(model, train, 
                                        test,
                                        x_columns,
                                        'bloom', verbose=0)
    model = SGDClassifier(max_iter=max_iter, loss=loss)
    accuracy, recall, precision, cm, _, _, _ = lrf.greedy_model(
        model, train, test, x_columns, 'bloom', sorted_columns)
    print("Accuracy", accuracy)
    print("Recall:", recall)
    print("Precision", precision)
    print("Confusion Matrix:\n", cm)
    if i == 0:
        vineyard_accuracies.append(accuracy)
    else:
        marina_accuracies.append(accuracy)

    # Cross train
    print("Training on combined 2018")
    print("Testing on:", train_files[i])
    sorted_columns = lrf.sort_columns_by_metric(model, train, 
                                test,
                                x_columns,
                                'bloom', verbose=0)
    model = SGDClassifier(max_iter=max_iter, loss=loss)
    accuracy, recall, precision, cm, _, _, _ = lrf.greedy_model(
        model, test, train, x_columns, 'bloom', sorted_columns)
    print("Accuracy", accuracy)
    print("Recall:", recall)
    print("Precision", precision)
    print("Confusion Matrix:\n", cm)
    if i == 0:
        vineyard_accuracies.append(accuracy)
    else:
        marina_accuracies.append(accuracy)
        
print("Vineyard Accuracies:", vineyard_accuracies)
print("Marina Accuracies:", marina_accuracies)
print("Vineyard mean accuracy:", np.mean(vineyard_accuracies))
print("Marina mean accuracy:", np.mean(marina_accuracies))

Training on: ../../data/cleaned/utah_2017_vineyard.csv
Testing on combined 2018


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'TMIN', 'ODO (mg/L)', 'pH', 'Wind Speed', 'Temp C', 'TMAX', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'TMIN', 'ODO (mg/L)', 'pH', 'Wind Speed', 'Temp C', 'TMAX', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.8361204013377926
Test model recall: 0.7914438502673797
Test model precision: 0.9367088607594937

Updating greedy model

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.8561872909698997
Test model recall: 0.8235294117647058
Test model precision: 0.9390243902439024

Updating greedy model

Training model with: ['ODOSat%', 'TMIN', 'ODO (mg/L)']
Test model accuracy: 0.8361204013377926
Test model recall: 0.8074866310160428
Test model precision: 0.9207317073170732

Training model with: ['ODOSat%', 'TMIN', 'pH']
Test model accuracy: 0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'TMIN', 'ODO (mg/L)', 'pH', 'Wind Speed', 'Temp C', 'TMAX', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'TMIN', 'ODO (mg/L)', 'pH', 'Wind Speed', 'Temp C', 'TMAX', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.6643598615916955
Test model recall: 0.7384615384615385
Test model precision: 0.6037735849056604

Updating greedy model

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.6643598615916955
Test model recall: 0.7384615384615385
Test model precision: 0.6037735849056604

Training model with: ['ODOSat%', 'ODO (mg/L)']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7307692307692307
Test model precision: 0.6012658227848101

Training model with: ['ODOSat%', 'pH']
Test model accuracy: 0.5467128027681661
Test model recall: 0.

  'precision', 'predicted', average, warn_for)


Test model accuracy: 0.5501730103806228
Test model recall: 0.0
Test model precision: 0.0

Training model with: ['ODOSat%', 'PRCP']
Test model accuracy: 0.657439446366782
Test model recall: 0.7538461538461538
Test model precision: 0.593939393939394

Training model with: ['ODOSat%', 'Turbidity (NTU)']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7538461538461538
Test model precision: 0.5975609756097561

Training model with: ['ODOSat%', 'rained']
Test model accuracy: 0.657439446366782
Test model recall: 0.7461538461538462
Test model precision: 0.5950920245398773

Training model with: ['ODOSat%', 'Wind Angle']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7538461538461538
Test model precision: 0.5975609756097561

Training model with: ['ODOSat%', 'SNOW']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7538461538461538
Test model precision: 0.5975609756097561

Training model with: ['ODOSat%', 'SNWD']
Test model accuracy: 0.6608996539792388
Test mod

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'TMAX', 'Temp C', 'Wind Speed', 'pH', 'PRCP', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'TMAX', 'Temp C', 'Wind Speed', 'pH', 'PRCP', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.8127090301003345
Test model recall: 0.7379679144385026
Test model precision: 0.9517241379310345

Updating greedy model

Training model with: ['ODOSat%', 'ODO (mg/L)']
Test model accuracy: 0.7190635451505016
Test model recall: 0.6096256684491979
Test model precision: 0.912

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.8193979933110368
Test model recall: 0.7647058823529411
Test model precision: 0.934640522875817

Updating greedy model

Training model with: ['ODOSat%', 'TMIN', 'TMAX']
Test model accuracy: 0.7190635451505016
Te

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'TMAX', 'Temp C', 'Wind Speed', 'pH', 'PRCP', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'TMAX', 'Temp C', 'Wind Speed', 'pH', 'PRCP', 'rained', 'Turbidity (NTU)', 'SNOW', 'pH (mV)', 'SNWD', 'Wind Angle', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.7185185185185186
Test model recall: 0.853448275862069
Test model precision: 0.6265822784810127

Updating greedy model

Training model with: ['ODOSat%', 'ODO (mg/L)']
Test model accuracy: 0.7185185185185186
Test model recall: 0.853448275862069
Test model precision: 0.6265822784810127

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.737037037037037
Test model recall: 0.8793103448275862
Test model precision: 0.6415094339622641

Updating greedy model

Training model with: ['ODOSat%', 'TMIN', 'TMAX']
Test model accuracy: 0.72962962

Marina performed better on average
#### Combining the 2017 data sets

In [25]:
max_iter = 25000
loss = "log"

marina_accuracies = []
bird_accuracies = []
# Cross train turbidity only at each site
for i in range(len(test_dfs)):
    train = test_dfs[i].copy()
    test = train_dfs[0].copy()
    test.append(train_dfs[1].copy())
    print("Training on:", test_files[i])
    print("Testing on combined 2017")
    sorted_columns = lrf.sort_columns_by_metric(model, train, 
                                        test,
                                        x_columns,
                                        'bloom', verbose=0)
    model = SGDClassifier(max_iter=max_iter, loss=loss)
    accuracy, recall, precision, cm, _, _, _ = lrf.greedy_model(
        model, train, test, x_columns, 'bloom', sorted_columns)
    print("Accuracy", accuracy)
    print("Recall:", recall)
    print("Precision", precision)
    print("Confusion Matrix:\n", cm)
    if i == 1:
        bird_accuracies.append(accuracy)
    else:
        marina_accuracies.append(accuracy)

    # Cross train
    print("Training on combined 2017")
    print("Testing on:", test_files[i])
    sorted_columns = lrf.sort_columns_by_metric(model, train, 
                                test,
                                x_columns,
                                'bloom', verbose=0)
    model = SGDClassifier(max_iter=max_iter, loss=loss)
    accuracy, recall, precision, cm, _, _, _ = lrf.greedy_model(
        model, test, train, x_columns, 'bloom', sorted_columns)
    print("Accuracy", accuracy)
    print("Recall:", recall)
    print("Precision", precision)
    print("Confusion Matrix:\n", cm)
    if i == 1:
        bird_accuracies.append(accuracy)
    else:
        marina_accuracies.append(accuracy)
        
print("Bird Accuracies:", bird_accuracies)
print("Marina Accuracies:", marina_accuracies)
print("bird mean accuracy:", np.mean(bird_accuracies))
print("Marina mean accuracy:", np.mean(marina_accuracies))

Training on: ../../data/cleaned/utah_2018_marina.csv
Testing on combined 2017


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'Wind Speed', 'TMIN', 'TMAX', 'Temp C', 'ODO (mg/L)', 'pH', 'pH (mV)', 'PRCP', 'Wind Angle', 'rained', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)', 'Turbidity (NTU)']
adjusted sorted_columns: ['ODOSat%', 'Wind Speed', 'TMIN', 'TMAX', 'Temp C', 'ODO (mg/L)', 'pH', 'pH (mV)', 'PRCP', 'Wind Angle', 'rained', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)', 'Turbidity (NTU)']
Training model with: ['ODOSat%']
Test model accuracy: 0.6643598615916955
Test model recall: 0.7384615384615385
Test model precision: 0.6037735849056604

Updating greedy model

Training model with: ['ODOSat%', 'Wind Speed']
Test model accuracy: 0.657439446366782
Test model recall: 0.7076923076923077
Test model precision: 0.6013071895424836

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.6643598615916955
Test model recall: 0.7384615384615385
Test model precision: 0.6037735849056604

Training model with: ['ODOSat%', 'TMAX']
Test model accuracy: 0.657439446366782
Test model recall: 0.

  'precision', 'predicted', average, warn_for)


Test model accuracy: 0.5501730103806228
Test model recall: 0.0
Test model precision: 0.0

Training model with: ['ODOSat%', 'PRCP']
Test model accuracy: 0.657439446366782
Test model recall: 0.7538461538461538
Test model precision: 0.593939393939394

Training model with: ['ODOSat%', 'Wind Angle']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7538461538461538
Test model precision: 0.5975609756097561

Training model with: ['ODOSat%', 'rained']
Test model accuracy: 0.657439446366782
Test model recall: 0.7461538461538462
Test model precision: 0.5950920245398773

Training model with: ['ODOSat%', 'SNOW']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7538461538461538
Test model precision: 0.5975609756097561

Training model with: ['ODOSat%', 'SNWD']
Test model accuracy: 0.6608996539792388
Test model recall: 0.7538461538461538
Test model precision: 0.5975609756097561

Training model with: ['ODOSat%', 'Sp Cond (uS/cm)']
Test model accuracy: 0.657439446366782
Test mode

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'Wind Speed', 'TMIN', 'TMAX', 'Temp C', 'ODO (mg/L)', 'pH', 'pH (mV)', 'PRCP', 'Wind Angle', 'rained', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)', 'Turbidity (NTU)']
adjusted sorted_columns: ['ODOSat%', 'Wind Speed', 'TMIN', 'TMAX', 'Temp C', 'ODO (mg/L)', 'pH', 'pH (mV)', 'PRCP', 'Wind Angle', 'rained', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)', 'Turbidity (NTU)']
Training model with: ['ODOSat%']
Test model accuracy: 0.8361204013377926
Test model recall: 0.7914438502673797
Test model precision: 0.9367088607594937

Updating greedy model

Training model with: ['ODOSat%', 'Wind Speed']
Test model accuracy: 0.7759197324414716
Test model recall: 0.7165775401069518
Test model precision: 0.9054054054054054

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.8561872909698997
Test model recall: 0.8235294117647058
Test model precision: 0.9390243902439024

Updating greedy model

Training model with: ['ODOSat%', 'TMIN', 'TMAX']
Test model accuracy: 0.84615

  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'Wind Speed', 'TMIN', 'TMAX', 'pH', 'Temp C', 'pH (mV)', 'PRCP', 'ODO (mg/L)', 'Turbidity (NTU)', 'Wind Angle', 'rained', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'Wind Speed', 'TMIN', 'TMAX', 'pH', 'Temp C', 'pH (mV)', 'PRCP', 'ODO (mg/L)', 'Turbidity (NTU)', 'Wind Angle', 'rained', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.6782006920415224
Test model recall: 0.8076923076923077
Test model precision: 0.6069364161849711

Updating greedy model

Training model with: ['ODOSat%', 'Wind Speed']
Test model accuracy: 0.6678200692041523
Test model recall: 0.7769230769230769
Test model precision: 0.6011904761904762

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.6228373702422145
Test model recall: 0.8076923076923077
Test model precision: 0.5555555555555556

Training model with: ['ODOSat%', 'TMAX']
Test model accuracy: 0.671280276816609
Test model recall: 0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'Wind Speed', 'TMIN', 'TMAX', 'pH', 'Temp C', 'pH (mV)', 'PRCP', 'ODO (mg/L)', 'Turbidity (NTU)', 'Wind Angle', 'rained', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'Wind Speed', 'TMIN', 'TMAX', 'pH', 'Temp C', 'pH (mV)', 'PRCP', 'ODO (mg/L)', 'Turbidity (NTU)', 'Wind Angle', 'rained', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.6287625418060201
Test model recall: 0.41578947368421054
Test model precision: 1.0

Updating greedy model

Training model with: ['ODOSat%', 'Wind Speed']
Test model accuracy: 0.68561872909699
Test model recall: 0.5105263157894737
Test model precision: 0.9897959183673469

Updating greedy model

Training model with: ['ODOSat%', 'Wind Speed', 'TMIN']
Test model accuracy: 0.8193979933110368
Test model recall: 0.8157894736842105
Test model precision: 0.8908045977011494

Updating greedy model

Training model with: ['ODOSat%', 'Wind Speed', 'TMIN', '

Training on the combined 2017 is better than testing on combined 2017
#### Combining both datasets

In [29]:
max_iter = 25000
loss = "log"


# Cross train turbidity only at each site
train = train_dfs[0].copy().append(train_dfs[1])
test = test_dfs[0].copy()
test.append(test_dfs[1].copy())
print("Training on combined 2017")
print("Testing on combined 2018")
sorted_columns = lrf.sort_columns_by_metric(model, train, 
                                    test,
                                    x_columns,
                                    'bloom', verbose=0)
model = SGDClassifier(max_iter=max_iter, loss=loss)
accuracy, recall, precision, cm, _, _, _ = lrf.greedy_model(
    model, train, test, x_columns, 'bloom', sorted_columns)
print("Accuracy", accuracy)
print("Recall:", recall)
print("Precision", precision)
print("Confusion Matrix:\n", cm)

mean_accuracy = accuracy

# Cross train
print("Training on combined 2018")
print("Testing on combined 2017")
sorted_columns = lrf.sort_columns_by_metric(model, train, 
                            test,
                            x_columns,
                            'bloom', verbose=0)
model = SGDClassifier(max_iter=max_iter, loss=loss)
accuracy, recall, precision, cm, _, _, _ = lrf.greedy_model(
    model, test, train, x_columns, 'bloom', sorted_columns)
print("Accuracy", accuracy)
print("Recall:", recall)
print("Precision", precision)
print("Confusion Matrix:\n", cm)

mean_accuracy = (mean_accuracy + accuracy) / 2

print("Mean accuracy:", mean_accuracy)


Training on combined 2017
Testing on combined 2018


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'pH', 'Temp C', 'TMAX', 'Wind Speed', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'pH', 'Temp C', 'TMAX', 'Wind Speed', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.8327759197324415
Test model recall: 0.7754010695187166
Test model precision: 0.9477124183006536

Updating greedy model

Training model with: ['ODOSat%', 'ODO (mg/L)']
Test model accuracy: 0.8294314381270903
Test model recall: 0.7807486631016043
Test model precision: 0.9358974358974359

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.8327759197324415
Test model recall: 0.786096256684492
Test model precision: 0.9363057324840764

Training model with: ['ODOSat%', 'pH']
Test model accuracy: 0.822742474916388
Test model recall: 0.75

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


base_columns: []
sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'pH', 'Temp C', 'TMAX', 'Wind Speed', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
adjusted sorted_columns: ['ODOSat%', 'ODO (mg/L)', 'TMIN', 'pH', 'Temp C', 'TMAX', 'Wind Speed', 'pH (mV)', 'PRCP', 'Turbidity (NTU)', 'rained', 'Wind Angle', 'SNOW', 'SNWD', 'Sp Cond (uS/cm)']
Training model with: ['ODOSat%']
Test model accuracy: 0.6905187835420393
Test model recall: 0.7926829268292683
Test model precision: 0.6151419558359621

Updating greedy model

Training model with: ['ODOSat%', 'ODO (mg/L)']
Test model accuracy: 0.6887298747763864
Test model recall: 0.7886178861788617
Test model precision: 0.6139240506329114

Training model with: ['ODOSat%', 'TMIN']
Test model accuracy: 0.6994633273703041
Test model recall: 0.8048780487804879
Test model precision: 0.6226415094339622

Updating greedy model

Training model with: ['ODOSat%', 'TMIN', 'pH']
Test model accuracy: 0.6422182

The imputed Turbidity in Marina does not seem to effect the model, however training on 2018 almost always performs worse than training on 2017 data.