In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Data Processing

In [2]:
# Show every column whenever a frame is displayed.
pd.set_option("display.max_columns", None)

# Load the raw bottle measurements and preview the first rows.
df = pd.read_csv('../datasets/ocean_bottle/bottle.csv')
df.head()

  df = pd.read_csv('../datasets/ocean_bottle/bottle.csv')


Unnamed: 0,Cst_Cnt,Btl_Cnt,Sta_ID,Depth_ID,Depthm,T_degC,Salnty,O2ml_L,STheta,O2Sat,Oxy_µmol/Kg,BtlNum,RecInd,T_prec,T_qual,S_prec,S_qual,P_qual,O_qual,SThtaq,O2Satq,ChlorA,Chlqua,Phaeop,Phaqua,PO4uM,PO4q,SiO3uM,SiO3qu,NO2uM,NO2q,NO3uM,NO3q,NH3uM,NH3q,C14As1,C14A1p,C14A1q,C14As2,C14A2p,C14A2q,DarkAs,DarkAp,DarkAq,MeanAs,MeanAp,MeanAq,IncTim,LightP,R_Depth,R_TEMP,R_POTEMP,R_SALINITY,R_SIGMA,R_SVA,R_DYNHT,R_O2,R_O2Sat,R_SIO3,R_PO4,R_NO3,R_NO2,R_NH4,R_CHLA,R_PHAEO,R_PRES,R_SAMP,DIC1,DIC2,TA1,TA2,pH2,pH1,DIC Quality Comment
0,1,1,054.0 056.0,19-4903CR-HY-060-0930-05400560-0000A-3,0,10.5,33.44,,25.649,,,,3,1.0,,2.0,,9.0,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,,9.0,,,9.0,,,9.0,,,9.0,,,0.0,10.5,10.5,33.44,25.64,233.0,0.0,,,,,,,,,,0,,,,,,,,
1,1,2,054.0 056.0,19-4903CR-HY-060-0930-05400560-0008A-3,8,10.46,33.44,,25.656,,,,3,2.0,,2.0,,9.0,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,,9.0,,,9.0,,,9.0,,,9.0,,,8.0,10.46,10.46,33.44,25.65,232.5,0.01,,,,,,,,,,8,,,,,,,,
2,1,3,054.0 056.0,19-4903CR-HY-060-0930-05400560-0010A-7,10,10.46,33.437,,25.654,,,,7,2.0,,3.0,,9.0,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,,9.0,,,9.0,,,9.0,,,9.0,,,10.0,10.46,10.46,33.437,25.65,232.8,0.02,,,,,,,,,,10,,,,,,,,
3,1,4,054.0 056.0,19-4903CR-HY-060-0930-05400560-0019A-3,19,10.45,33.42,,25.643,,,,3,2.0,,2.0,,9.0,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,,9.0,,,9.0,,,9.0,,,9.0,,,19.0,10.45,10.45,33.42,25.64,234.1,0.04,,,,,,,,,,19,,,,,,,,
4,1,5,054.0 056.0,19-4903CR-HY-060-0930-05400560-0020A-7,20,10.45,33.421,,25.643,,,,7,2.0,,3.0,,9.0,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,9.0,,,9.0,,,9.0,,,9.0,,,9.0,,,20.0,10.45,10.45,33.421,25.64,234.0,0.04,,,,,,,,,,20,,,,,,,,


In [14]:
# (rows, columns) of the raw frame: 864,863 rows by 74 columns
df.shape

(864863, 74)

In [15]:
# count the missing values in every column of the raw frame
df.isnull().sum()

Cst_Cnt                     0
Btl_Cnt                     0
Sta_ID                      0
Depth_ID                    0
Depthm                      0
                        ...  
TA1                    862779
TA2                    864629
pH2                    864853
pH1                    864779
DIC Quality Comment    864808
Length: 74, dtype: int64

We see that our data has many columns with substantial amounts of missing values. Since we have a large number of columns, let's keep only the columns with at most 7% missing data.

In [16]:
# Fraction of missing values per column, computed in one pass;
# keep the columns whose missing fraction is at most 7%.
missing_fraction = df.isna().mean()
columns = missing_fraction[missing_fraction <= 0.07].index.tolist()
columns, len(columns)

(['Cst_Cnt',
  'Btl_Cnt',
  'Sta_ID',
  'Depth_ID',
  'Depthm',
  'T_degC',
  'Salnty',
  'STheta',
  'RecInd',
  'T_prec',
  'S_prec',
  'NH3q',
  'C14A1q',
  'C14A2q',
  'DarkAq',
  'MeanAq',
  'R_Depth',
  'R_TEMP',
  'R_POTEMP',
  'R_SALINITY',
  'R_SIGMA',
  'R_SVA',
  'R_DYNHT',
  'R_PRES'],
 24)

We're left with 24 of the 74 columns. For our first pass, and to keep it simple, we're going to drop the 50 columns that have high amounts of missing (NaN) values. This will give us a quick and easy baseline. If our model performance is good, we don't need to add more columns back in; if it's bad, we can consider doing more data processing.

In [17]:
# keep only the low-missingness columns selected above (this drops the other 50)
df = df.loc[:, columns]
df.head()

Unnamed: 0,Cst_Cnt,Btl_Cnt,Sta_ID,Depth_ID,Depthm,T_degC,Salnty,STheta,RecInd,T_prec,S_prec,NH3q,C14A1q,C14A2q,DarkAq,MeanAq,R_Depth,R_TEMP,R_POTEMP,R_SALINITY,R_SIGMA,R_SVA,R_DYNHT,R_PRES
0,1,1,054.0 056.0,19-4903CR-HY-060-0930-05400560-0000A-3,0,10.5,33.44,25.649,3,1.0,2.0,9.0,9.0,9.0,9.0,9.0,0.0,10.5,10.5,33.44,25.64,233.0,0.0,0
1,1,2,054.0 056.0,19-4903CR-HY-060-0930-05400560-0008A-3,8,10.46,33.44,25.656,3,2.0,2.0,9.0,9.0,9.0,9.0,9.0,8.0,10.46,10.46,33.44,25.65,232.5,0.01,8
2,1,3,054.0 056.0,19-4903CR-HY-060-0930-05400560-0010A-7,10,10.46,33.437,25.654,7,2.0,3.0,9.0,9.0,9.0,9.0,9.0,10.0,10.46,10.46,33.437,25.65,232.8,0.02,10
3,1,4,054.0 056.0,19-4903CR-HY-060-0930-05400560-0019A-3,19,10.45,33.42,25.643,3,2.0,2.0,9.0,9.0,9.0,9.0,9.0,19.0,10.45,10.45,33.42,25.64,234.1,0.04,19
4,1,5,054.0 056.0,19-4903CR-HY-060-0930-05400560-0020A-7,20,10.45,33.421,25.643,7,2.0,3.0,9.0,9.0,9.0,9.0,9.0,20.0,10.45,10.45,33.421,25.64,234.0,0.04,20


In [18]:
# missing values still present in each of the kept columns
df.isnull().sum()

Cst_Cnt           0
Btl_Cnt           0
Sta_ID            0
Depth_ID          0
Depthm            0
T_degC        10963
Salnty        47354
STheta        52689
RecInd            0
T_prec        10963
S_prec        47354
NH3q          56564
C14A1q        16258
C14A2q        16240
DarkAq        24423
MeanAq        24424
R_Depth           0
R_TEMP        10963
R_POTEMP      46047
R_SALINITY    47354
R_SIGMA       52856
R_SVA         52771
R_DYNHT       46657
R_PRES            0
dtype: int64

We have a large amount of data, so we're going to take a quick approach and simply drop the rows with missing data. This isn't ideal, but for a first iteration of our model it's a good way to get a baseline. From earlier, we know that none of the remaining columns has more than 10% missing data, so dropping the rows with missing data should still keep most of the data and preserve the columns.

In [19]:
# discard every row that has a missing value in any of the kept columns
df = df.dropna(axis=0, how="any")

In [20]:
# confirm that no missing values remain after dropping rows
df.isnull().sum()

Cst_Cnt       0
Btl_Cnt       0
Sta_ID        0
Depth_ID      0
Depthm        0
T_degC        0
Salnty        0
STheta        0
RecInd        0
T_prec        0
S_prec        0
NH3q          0
C14A1q        0
C14A2q        0
DarkAq        0
MeanAq        0
R_Depth       0
R_TEMP        0
R_POTEMP      0
R_SALINITY    0
R_SIGMA       0
R_SVA         0
R_DYNHT       0
R_PRES        0
dtype: int64

Ahh, beautiful, no missing data!

### Inspecting Column Data Types

In [21]:
# list every column's dtype; the "object" columns are the non-numeric
# (string) ones we are looking for
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 733418 entries, 0 to 864860
Data columns (total 24 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Cst_Cnt     733418 non-null  int64  
 1   Btl_Cnt     733418 non-null  int64  
 2   Sta_ID      733418 non-null  object 
 3   Depth_ID    733418 non-null  object 
 4   Depthm      733418 non-null  int64  
 5   T_degC      733418 non-null  float64
 6   Salnty      733418 non-null  float64
 7   STheta      733418 non-null  float64
 8   RecInd      733418 non-null  int64  
 9   T_prec      733418 non-null  float64
 10  S_prec      733418 non-null  float64
 11  NH3q        733418 non-null  float64
 12  C14A1q      733418 non-null  float64
 13  C14A2q      733418 non-null  float64
 14  DarkAq      733418 non-null  float64
 15  MeanAq      733418 non-null  float64
 16  R_Depth     733418 non-null  float64
 17  R_TEMP      733418 non-null  float64
 18  R_POTEMP    733418 non-null  float64
 19  R_SALIN

In [22]:
# peek at the columns stored as strings (object dtype)
df.select_dtypes(include='object').head(3)

Unnamed: 0,Sta_ID,Depth_ID
0,054.0 056.0,19-4903CR-HY-060-0930-05400560-0000A-3
1,054.0 056.0,19-4903CR-HY-060-0930-05400560-0008A-3
2,054.0 056.0,19-4903CR-HY-060-0930-05400560-0010A-7


Since these columns appear to be just ID columns and not very informative, we are going to drop them. If our model performance is poor, we can look to add them back in.

In [23]:
# remove the two string ID columns; rebinding df avoids mutating the frame in place
df = df.drop(columns=['Sta_ID', 'Depth_ID'])

In [24]:
# preview the frame now that the ID columns are gone
df.head()

Unnamed: 0,Cst_Cnt,Btl_Cnt,Depthm,T_degC,Salnty,STheta,RecInd,T_prec,S_prec,NH3q,C14A1q,C14A2q,DarkAq,MeanAq,R_Depth,R_TEMP,R_POTEMP,R_SALINITY,R_SIGMA,R_SVA,R_DYNHT,R_PRES
0,1,1,0,10.5,33.44,25.649,3,1.0,2.0,9.0,9.0,9.0,9.0,9.0,0.0,10.5,10.5,33.44,25.64,233.0,0.0,0
1,1,2,8,10.46,33.44,25.656,3,2.0,2.0,9.0,9.0,9.0,9.0,9.0,8.0,10.46,10.46,33.44,25.65,232.5,0.01,8
2,1,3,10,10.46,33.437,25.654,7,2.0,3.0,9.0,9.0,9.0,9.0,9.0,10.0,10.46,10.46,33.437,25.65,232.8,0.02,10
3,1,4,19,10.45,33.42,25.643,3,2.0,2.0,9.0,9.0,9.0,9.0,9.0,19.0,10.45,10.45,33.42,25.64,234.1,0.04,19
4,1,5,20,10.45,33.421,25.643,7,2.0,3.0,9.0,9.0,9.0,9.0,9.0,20.0,10.45,10.45,33.421,25.64,234.0,0.04,20


## Inspecting Stats of the Numerical Features

We're going to check the standard deviation and scale of the data we'll feed into our model. We want all of our features to be on the same scale so that certain columns aren't given more importance just because their values are numerically larger.

In [25]:
# Split the columns by dtype so each group's statistics can be inspected.
df_float = df.select_dtypes(include="float64")
df_int = df.select_dtypes(include="int64")

# summary statistics of the integer-typed columns
df_int.describe()

Unnamed: 0,Cst_Cnt,Btl_Cnt,Depthm,RecInd,R_PRES
count,733418.0,733418.0,733418.0,733418.0,733418.0
mean,16421.194724,414217.1851,228.06258,4.723822,229.580553
std,9482.694845,230069.296792,309.103372,1.865705,312.359267
min,1.0,1.0,0.0,3.0,0.0
25%,8492.0,222666.25,50.0,3.0,50.0
50%,15633.0,412554.5,125.0,3.0,126.0
75%,24876.0,608509.75,300.0,7.0,302.0
max,34404.0,864861.0,5351.0,7.0,5458.0


### Inspecting RecInd Column

Here we see that all of our columns look like continuous values except 'RecInd', which looks like an encoded categorical column.

In [26]:
# how often does each distinct RecInd code occur?
df_int['RecInd'].value_counts()

RecInd
3    376861
7    275121
5     80506
6       928
4         2
Name: count, dtype: int64

Since the values (3 to 7) look like category codes rather than quantities, we one-hot encode the column instead of treating it as a number; simply remapping 3–7 onto 0–4 would still imply an ordering between the codes. Unfortunately no metadata was provided for the dataset, so I'm not sure what this column represents.

In [27]:
# One-hot encode RecInd into one float indicator column per code
# (RecInd_3 ... RecInd_7). An earlier draft remapped the codes 3..7 onto 0..4
# instead; that keeps a single column but imposes an ordering on the codes.
df = pd.get_dummies(df, columns=["RecInd"],dtype=float)

In [28]:
# check that the RecInd_* indicator columns were appended
df.head(2)

Unnamed: 0,Cst_Cnt,Btl_Cnt,Depthm,T_degC,Salnty,STheta,T_prec,S_prec,NH3q,C14A1q,C14A2q,DarkAq,MeanAq,R_Depth,R_TEMP,R_POTEMP,R_SALINITY,R_SIGMA,R_SVA,R_DYNHT,R_PRES,RecInd_3,RecInd_4,RecInd_5,RecInd_6,RecInd_7
0,1,1,0,10.5,33.44,25.649,1.0,2.0,9.0,9.0,9.0,9.0,9.0,0.0,10.5,10.5,33.44,25.64,233.0,0.0,0,1.0,0.0,0.0,0.0,0.0
1,1,2,8,10.46,33.44,25.656,2.0,2.0,9.0,9.0,9.0,9.0,9.0,8.0,10.46,10.46,33.44,25.65,232.5,0.01,8,1.0,0.0,0.0,0.0,0.0


In [29]:
# Apply log(1 + x) to the integer-typed numerical columns and to R_Depth.
# RecInd is skipped because it was one-hot encoded above and no longer exists in df;
# R_Depth is a float column, so it is not part of df_int and is added explicitly.
log_int_cols = [name for name in df_int.columns if name != "RecInd"]
df = df.assign(
    **{name: np.log1p(df[name]) for name in log_int_cols + ["R_Depth"]}
)


# Preparing for Training

In [30]:
from sklearn.model_selection import train_test_split

# Target: the T_degC column. Features: every other remaining column.
# NOTE(review): R_TEMP and R_POTEMP are still among the features, and in the
# rows previewed earlier they are identical to T_degC -- this looks like target
# leakage; confirm and consider dropping them from X.
y = df['T_degC'].to_numpy()
X = df.drop('T_degC', axis=1).to_numpy()

In [31]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
# The t_ prefix marks the target arrays that belong to each feature split.
split_parts = train_test_split(X, y, test_size=0.1, random_state=17)
X_train, X_val, t_train, t_val = split_parts

## Importance of Normalization and Standardization in Linear Regression


### Normalization
Normalization is the process of scaling data to fit within a specific range, typically [0, 1]. This is crucial for linear regression because it ensures that all features contribute equally to the model. Without normalization, features with larger ranges could dominate the regression model and skew the results. Min-max scaling is a common normalization technique.

### Standardization
Standardization scales data to have a mean of 0 and a standard deviation of 1. This process is essential for linear regression because it ensures that the model's weights are on a comparable scale, which can lead to more stable and faster convergence during training. Standardization is typically performed using z-score scaling.

In [33]:
# names of all numeric columns (not referenced by any later cell shown here)
numerical_columns = df.select_dtypes(include="number").columns

In [34]:
from sklearn.preprocessing import MinMaxScaler

# Normalizing numerical variables
def normalize_transform(X_train, X_val):
    """Min-max scale both splits using statistics from the training split only.

    The scaler is fitted on ``X_train`` and then applied unchanged to
    ``X_val``, so no information from the validation rows influences the
    scaling parameters.

    Args:
        X_train: training feature matrix.
        X_val: validation feature matrix with the same columns as ``X_train``.

    Returns:
        A ``(X_train_scaled, X_val_scaled)`` tuple.
    """
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    return X_train_scaled, X_val_scaled

X_train, X_val = normalize_transform(X_train, X_val)
# Report success only after the scaling has actually run.
print("Numerical variables normalized.")

Numerical variables normalized.


In [35]:
# Standardize the (already min-max scaled) features as well.
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only,
# then apply that same fitted scaler to both the training and validation data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)


Converting our training and validation data to pytorch tensors

In [36]:
import torch

# Features become float32 tensors; targets become float32 column vectors of
# shape (n, 1) so they line up with the model's single output unit.
X_train, X_val = (
    torch.tensor(part, dtype=torch.float32) for part in (X_train, X_val)
)
t_train, t_val = (
    torch.tensor(part, dtype=torch.float32).unsqueeze(1) for part in (t_train, t_val)
)

# Defining Our Model

In [37]:
import torch.nn as nn
import torch.optim as optim

class TwoLayerRegressionModel(nn.Module):
    """Fully connected regressor: Linear -> ReLU -> Linear with a single output.

    Args:
        input_dim: number of features per sample.
        hidden_dim: number of units in the hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.output = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a batch of feature rows to predictions with last dimension 1."""
        return self.output(self.relu(self.hidden(x)))

# Size the input layer from the training data (25 features at the moment)
# instead of hard-coding it, so the model keeps matching X_train if the
# feature set changes.
input_dim = X_train.shape[1]
hidden_dim = 10
model = TwoLayerRegressionModel(input_dim, hidden_dim)

# Define loss function and optimizer:
# mean-squared error, minimised with plain SGD.
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)


# Training Loop

In [38]:
# Full-batch training: every epoch runs one forward/backward pass over the
# whole training set and takes a single optimizer step.
num_epochs = 200
log_every = 10

# Nothing inside the loop switches the mode, so setting it once is enough.
model.train()
for epoch in range(num_epochs):
    # Clear the previous step's gradients before computing new ones.
    optimizer.zero_grad()
    loss = criterion(model(X_train), t_train)
    loss.backward()
    optimizer.step()

    # Report the loss on every 10th epoch (1-based).
    if not (epoch + 1) % log_every:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [10/200], Loss: 28.4063
Epoch [20/200], Loss: 12.8444
Epoch [30/200], Loss: 5.7659
Epoch [40/200], Loss: 2.1046
Epoch [50/200], Loss: 0.8519
Epoch [60/200], Loss: 0.5351
Epoch [70/200], Loss: 0.4217
Epoch [80/200], Loss: 0.3593
Epoch [90/200], Loss: 0.3154
Epoch [100/200], Loss: 0.2804
Epoch [110/200], Loss: 0.2511
Epoch [120/200], Loss: 0.2263
Epoch [130/200], Loss: 0.2052
Epoch [140/200], Loss: 0.1871
Epoch [150/200], Loss: 0.1718
Epoch [160/200], Loss: 0.1586
Epoch [170/200], Loss: 0.1473
Epoch [180/200], Loss: 0.1374
Epoch [190/200], Loss: 0.1287
Epoch [200/200], Loss: 0.1209


# Evaluating Model

In [39]:
# Score the trained model on the held-out validation split.
model.eval()
with torch.no_grad():  # no gradients are needed for scoring
    predictions = model(X_val)
    test_loss = criterion(predictions, t_val)
print(f'Test Loss: {test_loss.item():.4f}')

Test Loss: 0.1192


Our validation loss (0.1192) came out slightly lower than our final training loss (0.1209), which means our model isn't overfitting yet and that we could continue to train it to squeeze even more performance out of it.

## Storing the Model in the System

In [40]:
import pickle

# save the model to disk
# The context manager closes the file even if pickling fails.
# NOTE: pickling the whole module ties the file to this exact class definition;
# PyTorch's recommended alternative is torch.save(model.state_dict(), ...).
model_filename = "model.pkl"
with open(model_filename, "wb") as model_file:
    pickle.dump(model, model_file)

## Loading Model Back 

In [41]:
# NOTE(review): pickle.load can execute arbitrary code -- only load files you
# created yourself. The context manager makes sure the file handle is closed.
with open("model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [47]:
# doing an example prediction with the model that was loaded back from disk,
# so the save/load round trip is actually exercised

# switch to evaluation mode
loaded_model.eval()

# Generate Predictions:
with torch.no_grad():
    prediction = loaded_model(X_val[2])
print("True value:", t_val[2])
print("Predicted value:", prediction)

True value: tensor([5.3800])
Predicted value: tensor([5.2377])


# Next Steps to Improve Performance!
- Increase training epochs to overfit model --> proves problem has a solution!
- Consider adding dropout layers to reel the model back in.
- Add in other columns with missing data to increase the amount of data the model has to look at
- Change model capacity and parameters!
- Fine tune model with Grid search or random search algorithms to find better hyperparameters.
- Try out other models ex: Scikit learn Linear Regression model