In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, precision_score, confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [20]:
# Helper functions

def create_targets(data_frames, target_column, new_target_name, threshold):
    """Adds a binary target column to each dataframe (modified in place).

    data_frames: iterable of dataframes; each one is mutated in place.
    target_column: name of the existing column to compare against threshold.
    new_target_name: name of the column to create (or overwrite).
    threshold: rows whose target_column value is strictly greater than this
        get 1; every other row, including missing values, gets 0.

    returns the same data_frames object that was passed in.
    """
    for df in data_frames:
        # Vectorized comparison instead of a per-row lambda. NaN > threshold
        # evaluates to False, so missing values still map to 0.
        df[new_target_name] = (df[target_column] > threshold).astype(int)
    return data_frames

def import_df_data(files, drop_columns=None):
    """Reads each CSV into a dataframe, optionally dropping columns.

    files: iterable of paths (or anything pd.read_csv accepts) to load.
    drop_columns: optional iterable of column names to remove from every
        loaded dataframe. Names that are absent from a given file are
        silently skipped. Defaults to dropping nothing.

    returns a list of dataframes, one per entry in files, in the same order.
    """
    # Materialize once: avoids a shared mutable default argument and lets a
    # one-shot iterable (e.g. a generator) be reused for every file.
    columns_to_drop = list(drop_columns) if drop_columns is not None else []

    data_frames = []
    for file in files:
        df = pd.read_csv(file)
        if columns_to_drop:
            # errors="ignore" skips names this particular file does not have.
            df = df.drop(columns=columns_to_drop, errors="ignore")
        data_frames.append(df)
    return data_frames

def train_model(training_df, testing_df, x_columns, y_column, max_iter=25000,
                random_state=None):
    """Trains a SGD Classifier model.

    training_df: dataframe of training data.
    testing_df: dataframe of testing data.
    x_columns: list of numerical feature column names.
    y_column: target string name.
    max_iter: maximum number of passes over the training data, forwarded to
        SGDClassifier.
    random_state: optional seed forwarded to SGDClassifier so that runs can
        be reproduced. The default (None) leaves the fit unseeded.

    returns a tuple of:
    The trained model, predicted values, predicted probabilities, model accuracy
    (predictions, probabilities and accuracy are all computed on testing_df).
    """

    # Create training and testing matrices
    x_train = training_df[x_columns].values
    x_test = testing_df[x_columns].values
    y_train = training_df[y_column].values
    y_test = testing_df[y_column].values

    # Scale the data. The scaler is fit on the training set only and then
    # applied to the test set, so no test statistics leak into training.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Train the model. The logistic loss is what makes predict_proba available.
    # NOTE(review): newer scikit-learn releases spell this loss "log_loss";
    # "log" is kept here to match the scikit-learn version this notebook's
    # imports target. Update it when upgrading.
    model = SGDClassifier(max_iter=max_iter, loss="log",
                          random_state=random_state)
    model.fit(x_train, y_train)

    # Evaluate on the (scaled) test set
    predictions = model.predict(x_test)
    predictions_prob = model.predict_proba(x_test)
    accuracy = accuracy_score(y_test, predictions)
    return (model, predictions, predictions_prob, accuracy)

def roc_plot(actual, predictions):
    """Plots a ROC curve on the current matplotlib figure.

    actual: true binary labels.
    predictions: scores for the positive class; passed to roc_curve as the
        score argument.

    The area under the curve is shown in the plot legend. Returns nothing.
    """
    # roc_curve also returns the decision thresholds, which are not needed here.
    fpr, tpr, _ = roc_curve(actual, predictions)
    roc_auc = auc(fpr, tpr)

    plt.title("ROC")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    # Surface the AUC instead of computing and discarding it.
    plt.plot(fpr, tpr, label="AUC = {:.3f}".format(roc_auc))
    plt.legend(loc="lower right")

In [21]:
# Load the single cleaned CSV; import_df_data returns a one-element list here.
(df,) = import_df_data(['../../data/cleaned/utah_2017_vineyard.csv'])
# Add the binary 'bloom' column: 1 where 'BGA-Phycocyanin RFU' is above 3, else 0.
(df,) = create_targets([df], 'BGA-Phycocyanin RFU', 'bloom', 3)
df.head()

Unnamed: 0,Date (mm.dd.yyyy),Time 24hr,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),Chlorophyll (ug/L),Chlorophyll RFU,ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,bloom
0,5/5/2017,0:00,15.02,1848,-100.1,8.36,16.84,4.4,1.3,90.2,9.04,0.4,0
1,5/5/2017,0:15,14.99,1847,-100.1,8.36,16.76,4.2,1.2,90.2,9.04,0.4,0
2,5/5/2017,0:30,14.96,1847,-100.1,8.36,16.82,4.3,1.3,90.1,9.04,0.4,0
3,5/5/2017,0:45,14.95,1848,-100.1,8.36,17.19,4.5,1.3,90.0,9.03,0.4,0
4,5/5/2017,1:00,14.92,1848,-100.0,8.36,16.85,4.5,1.3,89.8,9.02,0.4,0
