## Baseline Model Notebook

Goals of this notebook:
* Given y_true, y_pred, calculate the RMSE
* Implement a basic evaluation function
* Assuming we are given X_train, y_train, fit a basic model and evaluate it
* Additionally, implement cross-validation scoring

In [1]:
import seaborn as sns

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

from sklearn import set_config
set_config(transform_output="pandas")

# Random seed for reproducibility; pass it as `random_state` wherever randomness is involved.
RSEED = 42

# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings raised by libraries -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")



In [7]:
# Load the wrangled dataset (path is relative to the notebook's working directory).
# NOTE(review): the loaded frame contains 'Unnamed: 0.1' and 'Unnamed: 0' columns --
# presumably leftover index columns from earlier CSV writes; confirm and drop them
# before they are used as model features.
final_data = pd.read_csv("data/wrangled_data.csv")

In [8]:
# TODO: Solve the date format issue with date_caught instead of dropping the column.

# Use drop(..., errors="ignore") rather than `del` so the cell is idempotent:
# re-running it after the column is already gone does not raise a KeyError.
final_data = final_data.drop(columns=["date_caught"], errors="ignore")

In [9]:
# Print the column names, non-null counts and dtypes of the loaded frame.
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062 entries, 0 to 18061
Data columns (total 41 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Unnamed: 0.1                                    18062 non-null  int64  
 1   Unnamed: 0                                      18062 non-null  int64  
 2   capture_site                                    18062 non-null  int64  
 3   tag_2                                           18062 non-null  int64  
 4   ccl_cm                                          18062 non-null  float64
 5   ccw_cm                                          18062 non-null  float64
 6   weight_kg                                       18062 non-null  float64
 7   status                                          18062 non-null  int64  
 8   release_site                                    18062 non-null  int64  
 9   capture_method_beached                 

In [10]:
# Select X and y features: every column except the target is used as a feature.
X = final_data.drop(columns=['capture_number'])
y = final_data['capture_number']


# Splitting the dataset (70 % train / 30 % test).
# Use the notebook-wide RSEED constant instead of a second hard-coded seed so
# there is a single place to change it.
# NOTE(review): stratify=y treats every distinct capture_number value as a class;
# confirm this is intended for a target that is later modelled with regression.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RSEED, shuffle=True, stratify=y
)

# Check the shape of the data sets
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

X_train: (12643, 40)
y_train: (12643,)
X_test: (5419, 40)
y_test: (5419,)


In [11]:
# Imports
import numpy as np
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression


In [12]:
# Implement a basic evaluation function
def evaluate_rmse(y_true, y_pred, ndigits=3):
    """Print and return the RMSE (root mean squared error) of y_pred in relation to y_true.

    The value is computed directly with NumPy rather than via
    ``mean_squared_error(..., squared=False)``: the ``squared`` argument was
    deprecated in scikit-learn 1.4 and removed in 1.6, so the old call fails
    on current versions.

    Args:
        y_true: Array-like of ground-truth target values, 1-D or
            (n_samples, n_outputs).
        y_pred: Array-like of predicted values with the same number of
            samples (and outputs) as ``y_true``.
        ndigits: Number of decimals used when printing the RMSE. The returned
            value is not rounded.

    Returns:
        The RMSE as a float. For 2-D targets the per-output RMSE values are
        averaged uniformly.

    Raises:
        ValueError: If the inputs are empty or their shapes do not match.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    # Treat 1-D targets as a single output column so that (n,) and (n, 1)
    # inputs can be compared with each other.
    if true_arr.ndim == 1:
        true_arr = true_arr.reshape(-1, 1)
    if pred_arr.ndim == 1:
        pred_arr = pred_arr.reshape(-1, 1)

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred have inconsistent shapes: {true_arr.shape} vs {pred_arr.shape}"
        )
    if true_arr.size == 0:
        raise ValueError("Cannot compute the RMSE of empty input")

    # RMSE per output column, then uniform average over the outputs.
    per_output_rmse = np.sqrt(np.mean((true_arr - pred_arr) ** 2, axis=0))
    rmse = float(np.mean(per_output_rmse))

    print("Number of predictions: ", len(pred_arr))
    print("RMSE: ", round(rmse, ndigits))
    return rmse

In [13]:
# Sanity-check evaluate_rmse on a small example that can be verified by hand:
# the squared errors are 0.25, 0.25, 0 and 1, so RMSE = sqrt(1.5 / 4), approx. 0.612.
y_true_testing = [3, -0.5, 2, 7]
y_pred_testing = [2.5, 0.0, 2, 8]
assert np.isclose(evaluate_rmse(y_true_testing, y_pred_testing), 0.612, rtol=0, atol=0.001)

Number of predictions:  4
RMSE:  0.612


In [14]:
# Baseline: fit a plain linear regression on the training split and score it
# on the held-out test split.
lin_reg = LinearRegression().fit(X_train, y_train)

# Predict the test set and report the RMSE of those predictions
y_predicted = lin_reg.predict(X_test)
error = evaluate_rmse(y_test, y_predicted)


Number of predictions:  5419
RMSE:  3.657


In [16]:
# Additionally, implement cross-validation scoring

def _rmse(y_true, y_pred):
    """Return the root mean squared error of y_pred in relation to y_true."""
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


# Wrap an explicit RMSE function instead of forwarding `squared=False` through
# make_scorer: that argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
# greater_is_better is left at its default so the reported scores stay positive
# RMSE values (lower is better). Do not pass this scorer to a model-selection
# search; use scoring="neg_root_mean_squared_error" there instead.
scorer_rmse = make_scorer(_rmse)

lr = LinearRegression()

# 5-fold cross-validation on the training split only; the test split stays untouched.
print("CV RMSE scores: ", cross_val_score(lr, X_train, y_train, cv=5, scoring=scorer_rmse, verbose=5))


[CV] END ................................ score: (test=3.597) total time=   0.0s
[CV] END ................................ score: (test=3.570) total time=   0.0s
[CV] END ................................ score: (test=3.544) total time=   0.0s
[CV] END ................................ score: (test=3.715) total time=   0.0s
[CV] END ................................ score: (test=3.649) total time=   0.0s
CV RMSE scores:  [3.59745626 3.57041746 3.54429514 3.7153757  3.648583  ]


Plotting could be done now

Error analysis