In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import random
from sklearn import preprocessing

# Load the cleaned water-quality table. `error_bad_lines=False` was deprecated
# in pandas 1.3 and removed in 2.0; `on_bad_lines='warn'` is its replacement
# and keeps the old behaviour: malformed rows are skipped, with a warning for
# each one so dropped data does not go unnoticed.
river = pd.read_csv('wdm_cleaned_Y_1.csv', on_bad_lines='warn')
river.head()

Unnamed: 0,Site_ID,Date_Collected,Lab_DO_mg/L,Lab_pH,Nitrate_PPM,Ortho_Phosphate_PPM,Split_#,Stream_T(C),Time_Collected,Turbidity,Health_Score
0,FC01,5/20/2016,9.324009,7.978506,0.877,0.120927,1.0,16.4,11:49,8.05,4.814592
1,FC02,5/20/2016,9.324009,7.978506,2.13,0.120927,1.0,14.4,12:19,3.95,1.967592
2,FC03,5/20/2016,9.324009,7.978506,2.45,0.120927,1.0,17.1,12:59,16.11,14.447592
3,FC05,5/20/2016,9.324009,7.978506,0.892,0.120927,1.0,16.9,13:29,10.33,7.109592
4,FC06,5/20/2016,9.324009,7.978506,3.01,0.120927,1.0,17.0,13:43,17.14,16.037592


In [2]:
# Check correlation of every numeric column with the target.
# The frame also holds string columns (Site_ID, Date_Collected, Time_Collected);
# recent pandas raises when `corr()` meets them instead of silently dropping
# them, so restrict the frame to numeric dtypes explicitly first.
corr_matrix = river.select_dtypes(include="number").corr()
corr_matrix["Health_Score"].sort_values(ascending=False)

Health_Score           1.000000
Turbidity              0.987417
Nitrate_PPM            0.201372
Ortho_Phosphate_PPM    0.126462
Split_#                0.004459
Lab_DO_mg/L           -0.011185
Lab_pH                -0.019337
Stream_T(C)           -0.022269
Name: Health_Score, dtype: float64

In [3]:
# Split X and y
from sklearn.model_selection import train_test_split

# Target vector: the health score column.
y = river.loc[:, "Health_Score"]
# Feature frame: everything except the target and the date/time columns.
X = river.drop(labels=['Health_Score', 'Date_Collected', 'Time_Collected'], axis="columns")

In [4]:
# Numeric-only view of the features: the feature frame minus the Site_ID
# column. Its column list is what the numeric pipeline branch selects later.
river_num = X.drop(labels=["Site_ID"], axis="columns")

In [5]:
# Build pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

class MyLabelBinarizer(TransformerMixin):
    """Thin transformer wrapper around ``LabelBinarizer``.

    ``fit`` and ``transform`` accept an optional ``y`` that is ignored and
    forward only ``x`` to the wrapped encoder. Presumably this exists so the
    binarizer can sit inside a ``Pipeline``, which passes both X and y to its
    steps -- confirm against the scikit-learn version in use.
    """

    def __init__(self, *args, **kwargs):
        """Build the wrapped ``LabelBinarizer``, forwarding all arguments."""
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x``; ``y`` is unused. Returns ``self``."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return the wrapped encoder's transform of ``x``; ``y`` is unused."""
        binarized = self.encoder.transform(x)
        return binarized
    
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts a fixed set of columns.

    Parameters
    ----------
    attribute_names : column label(s) used to index the incoming object in
        ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Index ``X`` by the configured names and return its ``.values``."""
        selected = X[self.attribute_names]
        return selected.values

# Column groups routed to each branch of the preprocessing pipeline.
cat_attribs = ["Site_ID"]
num_attribs = river_num.columns.tolist()

# Numeric branch: select the numeric columns, then apply StandardScaler.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select Site_ID, then binarize its labels.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', MyLabelBinarizer()),
])

# Combine both branches into a single feature matrix.
full_pipeline = FeatureUnion(
    transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]
)

In [6]:
# Fit the preprocessing pipeline on the full feature frame, then transform it.
# NOTE(review): the pipeline is fitted on all rows before the train/test split
# in the next cell, so the scaler has seen the test rows -- consider fitting
# on the training rows only.
full_pipeline.fit(X)
X_f = full_pipeline.transform(X)

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_f,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Implement SGD regression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

# SGD shuffles the training data using its own random state; without a seed the
# fitted coefficients (and every metric reported below) change from run to run.
# Seed it with the same value used for the train/test split so a
# Restart & Run All reproduces the printed numbers.
lr = SGDRegressor(random_state=42)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)



In [9]:
# Measure the prediction
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the held-out set.
lr_mae = mean_absolute_error(y_test, y_pred)

# Root mean squared error: square root of the held-out MSE.
mse_lr = mean_squared_error(y_test, y_pred)
lr_rmse = np.sqrt(mse_lr)

print("RMSE:", lr_rmse)
print("MAE:", lr_mae)

RMSE: 0.09355868557582094
MAE: 0.04503149145608454


In [14]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection`, the same module this notebook already
# imports `train_test_split` from.
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split. No `scoring` is passed, so
# each value is the estimator's default score (R^2 for regressors per the
# scikit-learn docs), not an error measure like the RMSE/MAE printed earlier.
print(cross_val_score(lr, X_train, y_train, cv=5))


[0.99997241 0.99998015 0.99998554 0.99999191 0.99998803]


