In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

from sklearn import linear_model

warnings.filterwarnings('ignore')

# Test Dataset formating

In [53]:
final_test_df = pd.read_csv('data/Sample_sub.csv')

In [54]:
test_df = pd.read_csv('data/Sample_sub.csv')

In [86]:
test_df

Unnamed: 0,capture_site,week_caught,turtles_rescued
0,0,1,7
1,0,2,1
2,0,3,5
3,0,4,2
4,0,5,3
...,...,...,...
1271,9,40,0
1272,9,41,7
1273,9,42,7
1274,9,43,3


In [6]:
def extract_capture_site(df):
    df['capture_site'] = df['ID'].apply(lambda x: x.split('_')[-2])
    return df

In [7]:
def extract_and_convert_week(df):
    # Extract the second to last element
    df['week_caught'] = df['ID'].apply(lambda x: x[-2:])

    # Convert to datetime with appropriate format for year and month ("%Y%m")
    df['week_caught'] = df['week_caught'].apply(lambda x : int(x))

    return df

In [8]:
def formating_sample_sub(df): 
    # extracting capture site id
    df = extract_capture_site(df)
    
    # extracting week of rascue
    df = extract_and_convert_week(df)
    
    # renaming columns to match training set
    df.rename(columns={'Capture_Number': 'turtles_rescued'}, inplace=True)
    
    # getting rid of mixed column
    df.drop(columns=['ID'], inplace=True)
    
    # Standartising prediction 
    df = df.groupby(['capture_site', 'week_caught'])['turtles_rescued'].sum().reset_index()
    
    return df

In [56]:
test_df = formating_sample_sub(test_df)

In [57]:
test_df.to_csv('data/test_df.csv', index=False)

In [58]:
test_df = pd.read_csv('data/test_df.csv')

# Train Dataset Formating

In [30]:
train_df = pd.read_csv('data/train.csv')

In [None]:
train_df.head()

In [32]:
import re

# Defining function to Standartising column names 
def standardize_column_names(col):
    # Replace spaces with underscores
    col = col.replace(' ', '_')
    # Insert underscore before each uppercase letter preceded by a lowercase letter or followed by a lowercase letter
    col = re.sub(r'(?<=[a-z])(?=[A-Z])', '_', col)
    col = re.sub(r'(?<=[A-Z])(?=[A-Z][a-z])', '_', col)
    # Convert to lower case
    col = col.lower()
    # Ensure single underscores only (in case of consecutive underscores from initial spaces)
    col = re.sub(r'_+', '_', col)
    return col


In [33]:
# Applying function to df
train_df.columns = [standardize_column_names(col) for col in train_df.columns]

# Printing the updated column names to verify the changes
print(train_df.columns)

Index(['rescue_id', 'date_time_caught', 'researcher', 'capture_site',
       'foraging_ground', 'capture_method', 'fisher', 'landing_site',
       'species', 'tag_1', 'tag_2', 'lost_tags', 't_number', 'ccl_cm',
       'ccw_cm', 'weight_kg', 'sex', 'turtle_characteristics', 'status',
       'release_site', 'date_time_release'],
      dtype='object')


In [24]:
# Define the function to extract the number
def extract_number_split(s):
    return int(s.split('_')[-1])

# Define a function to apply the extraction to multiple columns
def apply_extraction(df, columns):
    for column in columns:
        df[column] = df[column].apply(extract_number_split)
    return df

In [34]:
columns_to_extract_train = ['fisher', 'researcher', 'capture_site', 'species']
train_df = apply_extraction(train_df, columns_to_extract_train)

In [35]:
columns_to_drop = ['rescue_id', 'turtle_characteristics', 'tag_1', 'tag_2', 'lost_tags', 't_number', 'sex',
                   'capture_method', 'release_site', 'landing_site', 'status', 'foraging_ground', 'date_time_release']

train_df = train_df.drop(columns=columns_to_drop)

In [45]:
def convert_and_split_datetime(df, columns):
    """
    Convert specified datetime columns to timestamp and split into year and week columns
    with new names based on the original column names.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the columns.
    columns (list): List of column names to convert and split.
    
    Returns:
    pd.DataFrame: The DataFrame with new year and week columns.
    """
    for column in columns:
        # Convert the column to datetime
        df[column] = pd.to_datetime(df[column], errors='coerce')

        # Extract the base name without 'date_time_' prefix
        base_name = column.replace('date_time_', '')

        # Create new columns for year and week with the desired names
        df[f'year_{base_name}'] = df[column].dt.year
        df[f'week_{base_name}'] = df[column].dt.isocalendar().week

        # Drop the original datetime column if desired
        df.drop(columns=[column], inplace=True)

    return df

In [46]:
# Apply function to train_df
columns_to_convert = ['date_time_caught']
train_df = convert_and_split_datetime(train_df, columns_to_convert)

train_df.head()

Unnamed: 0,researcher,capture_site,fisher,species,ccl_cm,ccw_cm,weight_kg,year_caught,week_caught
0,25,0,1072,6,64.7,62.6,,2000,51
1,6,0,520,6,35.85,31.35,,2001,43
2,6,0,1669,5,51.8,49.2,,2001,44
3,32,0,1798,6,60.5,59.0,,2002,11
4,25,0,1918,5,34.7,33.0,,2002,32


## Imputing Missing Data in Weight

In [47]:
from sklearn.impute import KNNImputer
def imput_missing_weight_values(df, n = 5):
    knn_df = df[['ccl_cm', 'ccw_cm', 'weight_kg']]
    imputer = KNNImputer(n_neighbors=n)
    imputer.set_output(transform='pandas')

    return imputer.fit_transform(knn_df)

In [48]:
imputed_df = imput_missing_weight_values(train_df)
train_df['ccl_cm'] = imputed_df['ccl_cm']
train_df['ccw_cm'] = imputed_df['ccw_cm']
train_df['weight_kg'] = imputed_df['weight_kg']

In [50]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062 entries, 0 to 18061
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   researcher    18062 non-null  int64  
 1   capture_site  18062 non-null  int64  
 2   fisher        18062 non-null  int64  
 3   species       18062 non-null  int64  
 4   ccl_cm        18062 non-null  float64
 5   ccw_cm        18062 non-null  float64
 6   weight_kg     18062 non-null  float64
 7   year_caught   18062 non-null  int32  
 8   week_caught   18062 non-null  UInt32 
dtypes: UInt32(1), float64(3), int32(1), int64(4)
memory usage: 1.1 MB


In [51]:
train_df.to_csv('data/train.csv')

In [None]:
train_df = pd.read_csv('data/train.csv')

# Baseline Model

In [317]:
baseline_df = train_df.groupby(['year_caught', 'capture_site', 'week_caught']).size().reset_index(name='turtles_rescued')
baseline_df

Unnamed: 0,year_caught,capture_site,week_caught,turtles_rescued
0,1998,11,28,1
1,1998,11,32,1
2,1998,11,39,2
3,1998,11,43,1
4,1998,11,45,1
...,...,...,...,...
7952,2018,27,36,1
7953,2018,27,38,1
7954,2018,27,45,1
7955,2018,28,44,1


In [318]:
baseline_df = baseline_df[~baseline_df['year_caught'].between(1988, 2006)].reset_index(drop=True)
baseline_df

Unnamed: 0,year_caught,capture_site,week_caught,turtles_rescued
0,2007,2,2,1
1,2007,2,3,2
2,2007,2,4,1
3,2007,2,5,1
4,2007,2,7,1
...,...,...,...,...
6461,2018,27,36,1
6462,2018,27,38,1
6463,2018,27,45,1
6464,2018,28,44,1


In [319]:
baseline_df.drop(['year_caught'], axis=1, inplace=True)

In [59]:
baseline_test = test_df.copy()

In [329]:
class BaselinePredictor:
    def __init__(self, df):
        self.df = df

    def predict_turtles_rescued_all(self):
        # Initialize an empty list to store dataframes
        dfs = []

        # Get unique capture sites and weeks in the baseline_df
        capture_site_all = self.df['capture_site'].unique()
        weeks_in_year = self.df['week_caught'].unique()

        # Iterate over each capture site and week to calculate the mean turtles_rescued
        for capture_site in capture_site_all:
            for week in weeks_in_year:
                # Calculate mean turtles_rescued for the current capture site and week
                mean_turtles_rescued = self.df[(self.df['capture_site'] == capture_site) & (self.df['week_caught'] == week)]['turtles_rescued'].mean()

                # Append a dataframe to the list
                dfs.append(pd.DataFrame({'capture_site': [capture_site], 'week_caught': [week], 'turtles_rescued': [mean_turtles_rescued]}))

        # Concatenate all dataframes in the list
        predict_df = pd.concat(dfs, ignore_index=True)

        return predict_df

In [330]:
# Initialize the predictor with the baseline_df
predictor = BaselinePredictor(baseline_df)

# Predict the baseline values
predict_baseline = predictor.predict_turtles_rescued_all()

# Print the predicted baseline DataFrame
print(predict_baseline)

      capture_site  week_caught  turtles_rescued
0                2            2         1.666667
1                2            3         1.333333
2                2            4         1.000000
3                2            5         1.500000
4                2            7         1.000000
...            ...          ...              ...
1532            17           37         2.333333
1533            17           39         1.000000
1534            17           33         1.250000
1535            17           20         1.000000
1536            17           53         1.000000

[1537 rows x 3 columns]


In [340]:
from sklearn.metrics import mean_absolute_error

# Ensure both DataFrames have the same number of rows
num_rows_baseline_test = baseline_test.shape[0]

# Randomly sample rows from predict_baseline to match the number of rows in sample_sub
predict_baseline_trimmed = predict_baseline.sample(n=num_rows_baseline_test, random_state=42).reset_index(drop=True)

# Ensure the indices match
baseline_test = baseline_test.reset_index(drop=True)

# Combine both DataFrames to ensure we drop NaNs in corresponding rows
combined_df = pd.concat([baseline_test['turtles_rescued'], predict_baseline_trimmed['turtles_rescued']], axis=1, keys=['true', 'pred'])

# Drop rows with NaN values in either column
combined_df = combined_df.dropna()

# Separate the true and predicted values
y_true = combined_df['true']
y_pred = combined_df['pred']

# Calculate MAE
mae_baseline = mean_absolute_error(y_true, y_pred)
print(mae_baseline)


3.2108299243509903


## Contextualising MAE

In [331]:
print(baseline_test.turtles_rescued.min())
print(baseline_test.turtles_rescued.max())

0
9


In [332]:
target_min = 1
target_max = 9
target_range = target_max - target_min
acceptable_mae = target_range * 0.1  # Example threshold of 10% of the range
print(f"Acceptable MAE: {acceptable_mae}")

Acceptable MAE: 0.8


## RMSE Baseline

In [341]:
from sklearn.metrics import mean_squared_error

# Calculate RMSE
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Summary statistics
print(sample_sub['turtles_rescued'].describe())

# Baseline RMSE
mean_turtles_rescued = baseline_test['turtles_rescued'].mean()
baseline_predictions = [mean_turtles_rescued] * len(sample_sub)
baseline_mse = mean_squared_error(baseline_test['turtles_rescued'], baseline_predictions)
baseline_rmse = np.sqrt(baseline_mse)
print(f"Baseline RMSE: {baseline_rmse}")

# Coefficient of Variation of RMSE
cv_rmse = (rmse / mean_turtles_rescued) * 100
print(f"Coefficient of Variation of RMSE: {cv_rmse:.2f}%")

# Standard Deviation of Turtles Rescued
std_turtles_rescued = baseline_test['turtles_rescued'].std()
print(f"Standard Deviation of Turtles Rescued: {std_turtles_rescued}")
print(f"RMSE as a proportion of Standard Deviation: {rmse / std_turtles_rescued}")


Root Mean Squared Error: 3.9440045387035063
count    1276.000000
mean        4.436520
std         2.878706
min         0.000000
25%         2.000000
50%         4.000000
75%         7.000000
max         9.000000
Name: turtles_rescued, dtype: float64
Baseline RMSE: 2.8775776437795377
Coefficient of Variation of RMSE: 88.90%
Standard Deviation of Turtles Rescued: 2.878705884420333
RMSE as a proportion of Standard Deviation: 1.370061651677794


## Benchmarking against baseline

In [342]:
# Calculate the MAE for the baseline model
baseline_mae = mean_absolute_error(sample_sub['turtles_rescued'], baseline_predictions)
print(f"Baseline MAE: {baseline_mae}")

# Compare baseline MAE with your model's MAE
if mae_baseline < baseline_mae:
    print("Your model is performing better than the baseline.")
else:
    print("Your model is not performing better than the baseline.")


Baseline MAE: 2.519082703589784
Your model is not performing better than the baseline.


## Interactive version

In [87]:
def predict_turtles_rescued(capture_site, week):
    predict_df = baseline_df[(baseline_df['capture_site'] == 5) & (baseline_df['week_caught'] == 5)]
    turtle_rescued = predict_df['turtles_rescued'].mean()
    return turtle_rescued

In [88]:
week = input(f'Enter a week number for which you would like to predict (from 1 to {len(baseline_df.week_caught.unique())}):')
capture_site = input(f'Enter a capture site number (from 1 to 29):')
print(f'Predicted Turtles Rescued for {capture_site} and {week}: {predict_turtles_rescued(capture_site, week)}')

NameError: name 'baseline_df' is not defined

# Enseble Model Staking

## Pipeline building

In [118]:
# machine-learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
    PolynomialFeatures,
    FunctionTransformer
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

from imblearn.pipeline import Pipeline as imb_pipe

## Predicting what turtles are caught by week

In [None]:
# hypothesis is that turtles are effected (due to species, weight, etc) by week of the year (e.g seasons)

In [77]:
# loading data for model
week_model_df = train_df.copy()

In [78]:
# 30% examples in test data
train_week, test_week = train_test_split(week_model_df, test_size = 0.3, random_state=1) 

In [81]:
x = train_week.drop('week_caught', axis=1, inplace=True) # capture site? 
y = test_week.copy()

In [84]:
x = train_week.copy()

In [89]:
x

Unnamed: 0,researcher,capture_site,fisher,species,ccl_cm,ccw_cm,weight_kg,year_caught
4586,20,15,1473,6,50.00,48.3,15.590,2016
6898,20,19,1448,6,44.37,41.2,10.480,2018
17127,20,9,1230,6,41.10,38.1,7.400,2015
5417,30,16,1478,5,38.60,37.7,7.386,2005
17261,20,9,996,6,28.90,28.5,3.180,2015
...,...,...,...,...,...,...,...,...
10955,30,25,1478,5,86.50,76.1,78.470,2005
17289,30,9,1415,6,46.00,41.5,11.700,2015
5192,20,15,1343,6,32.30,30.0,3.500,2018
12172,20,25,1464,5,37.30,37.1,6.500,2011


In [69]:
# Prior to training our model, we’ll set aside a portion of our data in order to evaluate its performance.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=1)

TypeError: Expected sequence or array-like, got <class 'NoneType'>

## Predicting what turtles are caught by capture_site

## Transforming 

In [127]:
transform_df = train_df.groupby(['year_caught', 'capture_site', 'week_caught']).size().reset_index(name='turtles_rescued')
transform_df

Unnamed: 0,year_caught,capture_site,week_caught,turtles_rescued
0,1998,11,28,1
1,1998,11,32,1
2,1998,11,39,2
3,1998,11,43,1
4,1998,11,45,1
...,...,...,...,...
7952,2018,27,36,1
7953,2018,27,38,1
7954,2018,27,45,1
7955,2018,28,44,1


In [149]:
from sklearn.impute import KNNImputer
import numpy as np
import pandas as pd

class TurtleRescueModifierTrain:
    def __init__(self):
        # Generate arrays for year, week_caught, and capture_site
        years = np.repeat(np.arange(1988, 2019), 53)
        week_caught = np.tile(np.arange(1, 54), 31)
        capture_site = np.repeat(np.arange(1, 32), 53)

        # Create the DataFrame
        self.target_df = pd.DataFrame({
            'year_caught': years,
            'week_caught': week_caught,
            'capture_site': capture_site,
            'turtles_rescued': np.zeros(len(years))
        })

    def merge_data(self, source_df):
        for index, row in source_df.iterrows():
            # Match conditions based on year_caught, week_caught, and capture_site
            match_condition = (
                    (self.target_df['year_caught'] == row['year_caught']) &
                    (self.target_df['week_caught'] == row['week_caught']) &
                    (self.target_df['capture_site'] == row['capture_site'])
            )

            # Check if a matching row exists in the target_df
            matching_row_index = self.target_df.index[match_condition]

            if len(matching_row_index) > 0:
                # Update the existing row in target_df with data from source_df
                self.target_df.loc[matching_row_index[0], 'turtles_rescued'] = row['turtles_rescued']
            else:
                # If no matching row exists, create a new row in target_df
                new_row = {
                    'year_caught': row['year_caught'],
                    'week_caught': row['week_caught'],
                    'capture_site': row['capture_site'],
                    'turtles_rescued': row['turtles_rescued']
                }
                self.target_df = self.target_df.append(new_row, ignore_index=True)

        return self.target_df
    def impute_missing_values(self, transform_df):
        """
        Impute missing values in self.df using K-Nearest Neighbors (KNN) based on transform_df.
        """
        # Initialize KNN Imputer with k=5
        imputer = KNNImputer(n_neighbors=5)

        # Iterate through rows of transform_df
        for index, row in transform_df.iterrows():
            # Extract features for imputation
            features = self.df[
                (self.df['year_caught'] == row['year_caught']) &
                (self.df['week_caught'] == row['week_caught']) &
                (self.df['capture_site'] == row['capture_site'])
                ][['year_caught', 'week_caught', 'capture_site']]

            # Check if any missing values need to be imputed
            if features.empty:
                # Impute missing values for the current row of transform_df
                features = [[row['year_caught'], row['week_caught'], row['capture_site']]]
                imputed_values = imputer.fit_transform(features)
                self.df.loc[len(self.df)] = np.append(imputed_values[0], row['turtles_rescued'])

        return self.df


In [150]:
transformed_train = TurtleRescueModifierTrain().merge_data(transform_df)
transformed_train

AttributeError: 'DataFrame' object has no attribute 'append'

In [141]:
class TurtleRescueContainer:
    def __init__(self):
        # Initialize the DataFrame with 53 weeks and 29 capture sites
        self.df = pd.DataFrame({
            'week_caught': np.tile(np.arange(1, 54), 29),
            'capture_site': np.repeat(np.arange(1, 30), 53),
            'turtles_rescued': np.zeros(53 * 29),
            'weight_week': np.zeros(53 * 29),
            'weight_capture_site': np.zeros(53 * 29)
        })
        self.df['weight_combined'] = np.zeros(53 * 29)

    def update_with_mean(self, df1, df2):
        """
        Update the internal DataFrame with means of turtles_rescued, weight_week,
        and weight_capture_site from two DataFrames grouped by week_caught and capture_site.

        Parameters:
        df1 (pd.DataFrame): The DataFrame containing 'week_caught', 'turtles_rescued', 'weight_week'.
        df2 (pd.DataFrame): The DataFrame containing 'capture_site', 'turtles_rescued', 'weight_capture_site'.
        """
        # Calculate mean values for df1 grouped by week_caught
        if 'week_caught' in df1.columns:
            df1_grouped = df1.groupby('week_caught').agg({
                'turtles_rescued': 'mean',
                'weight_week': 'mean'
            }).reset_index()

            # Merge the means into the internal DataFrame
            self.df = self.df.merge(df1_grouped, on='week_caught', how='left', suffixes=('', '_df1'))
            self.df['turtles_rescued'] = self.df[['turtles_rescued', 'turtles_rescued_df1']].mean(axis=1)
            self.df['weight_week'] = self.df[['weight_week', 'weight_week_df1']].mean(axis=1)
            self.df.drop(columns=['turtles_rescued_df1', 'weight_week_df1'], inplace=True)

        # Calculate mean values for df2 grouped by capture_site
        if 'capture_site' in df2.columns:
            df2_grouped = df2.groupby('capture_site').agg({
                'turtles_rescued': 'mean',
                'weight_capture_site': 'mean'
            }).reset_index()

            # Merge the means into the internal DataFrame
            self.df = self.df.merge(df2_grouped, on='capture_site', how='left', suffixes=('', '_df2'))
            self.df['turtles_rescued'] = self.df[['turtles_rescued', 'turtles_rescued_df2']].mean(axis=1)
            self.df['weight_capture_site'] = self.df[['weight_capture_site', 'weight_capture_site_df2']].mean(axis=1)
            self.df.drop(columns=['turtles_rescued_df2', 'weight_capture_site_df2'], inplace=True)

        # Calculate the combined weight
        self.df['weight_combined'] = self.df['weight_week'] + self.df['weight_capture_site']
        
        def impute_missing(self):
         """
         Impute missing values in the 'turtles_rescued' column using K-Nearest Neighbors (KNN).
         """
         # Extract features for imputation
         features = self.df[['year', 'week_caught', 'capture_site']]
        
         # Initialize KNN Imputer with k=5 (you can adjust k as needed)
         imputer = KNNImputer(n_neighbors=5)
        
         # Impute missing values
         self.df['turtles_rescued'] = imputer.fit_transform(features)

        
        return self.df

In [142]:
bla = 

## Calculate weights for the validation set

In [None]:
# Function to calculate weights
def calculate_weights(y_true, y_pred):
    residuals = np.abs(y_true - y_pred)
    weights = 1 / (residuals + 1e-5)  # Adding a small value to avoid division by zero
    weights /= weights.sum()  # Normalize weights
    return weights

In [None]:
# Calculate weights for the validation set
weights = calculate_weights(y_val, (lr_pred + rf_pred + gb_pred) / 3)

# Weighted predictions
final_pred = (lr_pred * weights + rf_pred * weights + gb_pred * weights) / weights.sum()

## Final model

In [None]:
postprocessor =

In [None]:
model = imb_pipe(
    steps=[
        ('fl_imputer', flipper_length_imputer),
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ]
)