In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

from sklearn import linear_model

warnings.filterwarnings('ignore')

# Test Dataset formating

In [None]:
final_test_df = pd.read_csv('data/Sample_sub.csv')

In [None]:
test_df = pd.read_csv('data/Sample_sub.csv')

In [None]:
test_df

In [None]:
def extract_capture_site(df):
    df['capture_site'] = df['ID'].apply(lambda x: x.split('_')[-2])
    return df

In [None]:
def extract_and_convert_week(df):
    # Extract the second to last element
    df['week_caught'] = df['ID'].apply(lambda x: x[-2:])

    # Convert to datetime with appropriate format for year and month ("%Y%m")
    df['week_caught'] = df['week_caught'].apply(lambda x : int(x))

    return df

In [None]:
def formating_sample_sub(df): 
    # extracting capture site id
    df = extract_capture_site(df)
    
    # extracting week of rascue
    df = extract_and_convert_week(df)
    
    # renaming columns to match training set
    df.rename(columns={'Capture_Number': 'turtles_rescued'}, inplace=True)
    
    # getting rid of mixed column
    df.drop(columns=['ID'], inplace=True)
    
    # Standartising prediction 
    df = df.groupby(['capture_site', 'week_caught'])['turtles_rescued'].sum().reset_index()
    
    return df

In [None]:
test_df = formating_sample_sub(test_df)

In [None]:
test_df.to_csv('data/test_df.csv', index=False)

In [None]:
test_df = pd.read_csv('data/test_df.csv')

# Train Dataset Formating

In [None]:
train_df = pd.read_csv('data/train.csv')

In [None]:
train_df.head()

In [None]:
import re

# Defining function to Standartising column names 
def standardize_column_names(col):
    # Replace spaces with underscores
    col = col.replace(' ', '_')
    # Insert underscore before each uppercase letter preceded by a lowercase letter or followed by a lowercase letter
    col = re.sub(r'(?<=[a-z])(?=[A-Z])', '_', col)
    col = re.sub(r'(?<=[A-Z])(?=[A-Z][a-z])', '_', col)
    # Convert to lower case
    col = col.lower()
    # Ensure single underscores only (in case of consecutive underscores from initial spaces)
    col = re.sub(r'_+', '_', col)
    return col


In [None]:
# Applying function to df
train_df.columns = [standardize_column_names(col) for col in train_df.columns]

# Printing the updated column names to verify the changes
print(train_df.columns)

In [None]:
# Define the function to extract the number
def extract_number_split(s):
    return int(s.split('_')[-1])

# Define a function to apply the extraction to multiple columns
def apply_extraction(df, columns):
    for column in columns:
        # Convert column to string type if it's not already
        if df[column].dtype != 'object':
            df[column] = df[column].astype(str)

        # Apply the extraction function
        df[column] = df[column].apply(extract_number_split)
    return df

In [None]:
columns_to_extract_train = ['fisher', 'researcher', 'capture_site', 'species']
train_df = apply_extraction(train_df, columns_to_extract_train)

In [None]:
columns_to_drop = ['rescue_id', 'turtle_characteristics', 'tag_1', 'tag_2', 'lost_tags', 't_number', 'sex',
                   'capture_method', 'release_site', 'landing_site', 'status', 'foraging_ground', 'date_time_release']

train_df = train_df.drop(columns=columns_to_drop)

In [None]:
train_df

In [None]:
def convert_and_split_datetime(df, columns):
    """
    Convert specified datetime columns to timestamp and split into year and week columns
    with new names based on the original column names.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the columns.
    columns (list): List of column names to convert and split.
    
    Returns:
    pd.DataFrame: The DataFrame with new year and week columns.
    """
    for column in columns:
        # Convert the column to datetime
        df[column] = pd.to_datetime(df[column], errors='coerce')

        # Extract the base name without 'date_time_' prefix
        base_name = column.replace('date_time_', '')

        # Create new columns for year and week with the desired names
        df[f'year_{base_name}'] = df[column].dt.year
        df[f'week_{base_name}'] = df[column].dt.isocalendar().week

        # Drop the original datetime column if desired
        df.drop(columns=[column], inplace=True)

    return df

In [None]:
# Apply function to train_df
columns_to_convert = ['date_time_caught']
train_df = convert_and_split_datetime(train_df, columns_to_convert)

train_df.head()

## Imputing Missing Data in Weight

In [None]:
from sklearn.impute import KNNImputer
def imput_missing_weight_values(df, n = 5):
    knn_df = df[['ccl_cm', 'ccw_cm', 'weight_kg']]
    imputer = KNNImputer(n_neighbors=n)
    imputer.set_output(transform='pandas')

    return imputer.fit_transform(knn_df)

In [None]:
imputed_df = imput_missing_weight_values(train_df)
train_df['ccl_cm'] = imputed_df['ccl_cm']
train_df['ccw_cm'] = imputed_df['ccw_cm']
train_df['weight_kg'] = imputed_df['weight_kg']

In [None]:
train_df.info()

In [None]:
train_df.to_csv('data/train.csv')

In [None]:
train_df = pd.read_csv('data/train.csv')

# Baseline Model

In [None]:
baseline_df = train_df.groupby(['year_caught', 'capture_site', 'week_caught']).size().reset_index(name='turtles_rescued')
baseline_df

In [None]:
baseline_df = baseline_df[~baseline_df['year_caught'].between(1988, 2006)].reset_index(drop=True)
baseline_df

In [None]:
baseline_df.drop(['year_caught'], axis=1, inplace=True)

In [None]:
baseline_test = test_df.copy()

In [None]:
class BaselinePredictor:
    def __init__(self, df):
        self.df = df

    def predict_turtles_rescued_all(self):
        # Initialize an empty list to store dataframes
        dfs = []

        # Get unique capture sites and weeks in the baseline_df
        capture_site_all = self.df['capture_site'].unique()
        weeks_in_year = self.df['week_caught'].unique()

        # Iterate over each capture site and week to calculate the mean turtles_rescued
        for capture_site in capture_site_all:
            for week in weeks_in_year:
                # Calculate mean turtles_rescued for the current capture site and week
                mean_turtles_rescued = self.df[(self.df['capture_site'] == capture_site) & (self.df['week_caught'] == week)]['turtles_rescued'].mean()

                # Append a dataframe to the list
                dfs.append(pd.DataFrame({'capture_site': [capture_site], 'week_caught': [week], 'turtles_rescued': [mean_turtles_rescued]}))

        # Concatenate all dataframes in the list
        predict_df = pd.concat(dfs, ignore_index=True)

        return predict_df

In [None]:
# Initialize the predictor with the baseline_df
predictor = BaselinePredictor(baseline_df)

# Predict the baseline values
predict_baseline = predictor.predict_turtles_rescued_all()

# Print the predicted baseline DataFrame
print(predict_baseline)

In [None]:
from sklearn.metrics import mean_absolute_error

# Ensure both DataFrames have the same number of rows
num_rows_baseline_test = baseline_test.shape[0]

# Randomly sample rows from predict_baseline to match the number of rows in sample_sub
predict_baseline_trimmed = predict_baseline.sample(n=num_rows_baseline_test, random_state=42).reset_index(drop=True)

# Ensure the indices match
baseline_test = baseline_test.reset_index(drop=True)

# Combine both DataFrames to ensure we drop NaNs in corresponding rows
combined_df = pd.concat([baseline_test['turtles_rescued'], predict_baseline_trimmed['turtles_rescued']], axis=1, keys=['true', 'pred'])

# Drop rows with NaN values in either column
combined_df = combined_df.dropna()

# Separate the true and predicted values
y_true = combined_df['true']
y_pred = combined_df['pred']

# Calculate MAE
mae_baseline = mean_absolute_error(y_true, y_pred)
print(mae_baseline)


## Contextualising MAE

In [None]:
print(baseline_test.turtles_rescued.min())
print(baseline_test.turtles_rescued.max())

In [None]:
target_min = 1
target_max = 9
target_range = target_max - target_min
acceptable_mae = target_range * 0.1  # Example threshold of 10% of the range
print(f"Acceptable MAE: {acceptable_mae}")

## RMSE Baseline

In [None]:
from sklearn.metrics import mean_squared_error

# Calculate RMSE
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Summary statistics
print(test_df['turtles_rescued'].describe())

# Baseline RMSE
mean_turtles_rescued = baseline_test['turtles_rescued'].mean()
baseline_predictions = [mean_turtles_rescued] * len(test_df)
baseline_mse = mean_squared_error(baseline_test['turtles_rescued'], baseline_predictions)
baseline_rmse = np.sqrt(baseline_mse)
print(f"Baseline RMSE: {baseline_rmse}")

# Coefficient of Variation of RMSE
cv_rmse = (rmse / mean_turtles_rescued) * 100
print(f"Coefficient of Variation of RMSE: {cv_rmse:.2f}%")

# Standard Deviation of Turtles Rescued
std_turtles_rescued = baseline_test['turtles_rescued'].std()
print(f"Standard Deviation of Turtles Rescued: {std_turtles_rescued}")
print(f"RMSE as a proportion of Standard Deviation: {rmse / std_turtles_rescued}")


## Benchmarking against baseline

In [None]:
# Calculate the MAE for the baseline model
baseline_mae = mean_absolute_error(test_df['turtles_rescued'], baseline_predictions)
print(f"Baseline MAE: {baseline_mae}")

# Compare baseline MAE with your model's MAE
if mae_baseline < baseline_mae:
    print("Your model is performing better than the baseline.")
else:
    print("Your model is not performing better than the baseline.")


## Interactive version

In [None]:
def predict_turtles_rescued(capture_site, week):
    predict_df = baseline_df[(baseline_df['capture_site'] == 5) & (baseline_df['week_caught'] == 5)]
    turtle_rescued = predict_df['turtles_rescued'].mean()
    return turtle_rescued

In [None]:
week = input(f'Enter a week number for which you would like to predict (from 1 to {len(baseline_df.week_caught.unique())}):')
capture_site = input(f'Enter a capture site number (from 1 to 29):')
print(f'Predicted Turtles Rescued for {capture_site} and {week}: {predict_turtles_rescued(capture_site, week)}')

# Enseble Model Staking

## Transformin data for round 1

In [None]:
from sklearn.impute import KNNImputer
import numpy as np
import pandas as pd

In [None]:
to_transform_train = train_df.groupby(['year_caught', 'capture_site', 'week_caught']).size().reset_index(name='turtles_rescued')
to_transform_test = test_df.copy()

In [None]:
class TurtleRescueModifierTrain:
    def __init__(self):
        # Generate arrays for year, week_caught, and capture_site
        years = np.repeat(np.arange(1988, 2019), 53 * 29)
        week_caught = np.tile(np.arange(1, 54), 31 * 29)
        capture_site = np.repeat(np.arange(1, 30), 53 * 31)

        # Create the DataFrame
        self.df = pd.DataFrame({
            'year_caught': years,
            'week_caught': week_caught,
            'capture_site': capture_site,
            'turtles_rescued': np.zeros(len(years))
        })

    def merge_data(self, source_df):
        for index, row in source_df.iterrows():
            # Match conditions based on year_caught, week_caught, and capture_site
            match_condition = (
                    (self.df['year_caught'] == row['year_caught']) &
                    (self.df['week_caught'] == row['week_caught']) &
                    (self.df['capture_site'] == row['capture_site'])
            )

            # Check if a matching row exists in the target_df
            matching_row_index = self.df.index[match_condition]

            if len(matching_row_index) > 0:
                # Update the existing row in target_df with data from source_df
                self.df.loc[matching_row_index[0], 'turtles_rescued'] = row['turtles_rescued']
            else:
                # If no matching row exists, create a new row in target_df
                new_row = {
                    'year_caught': row['year_caught'],
                    'week_caught': row['week_caught'],
                    'capture_site': row['capture_site'],
                    'turtles_rescued': row['turtles_rescued']
                }
                self.df = pd.concat([self.df, pd.DataFrame([new_row])], ignore_index=True)

        return self

    def impute_missing_values(self):
        """
        Impute missing values in the 'turtles_rescued' column using K-Nearest Neighbors (KNN).
        """
        # Ensure some 'turtles_rescued' values are NaN for imputation demonstration
        self.df.loc[self.df.sample(frac=0.1).index, 'turtles_rescued'] = np.nan

        # Select columns for imputation
        features = self.df[['year_caught', 'week_caught', 'capture_site']]
        targets = self.df[['turtles_rescued']]

        # Combine features and targets for imputation
        combined = pd.concat([features, targets], axis=1)

        # Initialize KNN Imputer with k=5
        imputer = KNNImputer(n_neighbors=5)

        # Impute missing values
        imputed_data = imputer.fit_transform(combined)
        self.df['turtles_rescued'] = imputed_data[:, -1]
        
        # Convert 'turtles_rescued' to integer type
        self.df['turtles_rescued'] = self.df['turtles_rescued'].astype(int)
        return self.df


In [None]:
merged_train = TurtleRescueModifierTrain()
merged_train.merge_data(to_transform_train)
imputed_train = TurtleRescueModifierTrain.impute_missing_values(merged_train)

In [None]:
class TurtleRescueModifierTest:
    def __init__(self, year):
        """
        Initialize the TurtleRescueModifierTrain with data for a specific year.
        
        :param year: The specific year for which to initialize the data.
        """
        # Generate arrays for the specified year, week_caught, and capture_site
        self.year = year
        weeks_per_year = 53
        capture_sites = 29

        year = np.repeat(year, weeks_per_year * capture_sites)
        week_caught = np.tile(np.arange(1, weeks_per_year + 1), capture_sites)
        capture_site = np.repeat(np.arange(1, capture_sites + 1), weeks_per_year)

        # Create the DataFrame
        self.df = pd.DataFrame({
            'year_caught': year,
            'week_caught': week_caught,
            'capture_site': capture_site,
            'turtles_rescued': np.zeros(len(year))
        })

    def merge_data(self, source_df):
        # Add 'year_caught' column to source_df if it doesn't exist
        if 'year_caught' not in source_df.columns:
            source_df['year_caught'] = self.year
        
        for index, row in source_df.iterrows():
            # Match conditions based on year_caught, week_caught, and capture_site
            match_condition = (
                (self.df['year_caught'] == row['year_caught']) &
                (self.df['week_caught'] == row['week_caught']) &
                (self.df['capture_site'] == row['capture_site'])
            )

            # Check if a matching row exists in the target_df
            matching_row_index = self.df.index[match_condition]

            if len(matching_row_index) > 0:
                # Update the existing row in target_df with data from source_df
                self.df.loc[matching_row_index[0], 'turtles_rescued'] = row['turtles_rescued']
            else:
                # If no matching row exists, create a new row in target_df
                new_row = {
                    'year_caught': row['year_caught'],
                    'week_caught': row['week_caught'],
                    'capture_site': row['capture_site'],
                    'turtles_rescued': row['turtles_rescued']
                }
                self.df = pd.concat([self.df, pd.DataFrame([new_row])], ignore_index=True)

        return self



    def impute_missing_values(self):
        """
        Impute missing values in the 'turtles_rescued' column using K-Nearest Neighbors (KNN).
        """
        # Introduce NaN values into the 'turtles_rescued' column to demonstrate imputation
        self.df.loc[self.df.sample(frac=0.1).index, 'turtles_rescued'] = np.nan

        # Initialize KNN Imputer with k=5
        imputer = KNNImputer(n_neighbors=5)

        # Select columns for imputation
        features = self.df[['year_caught', 'week_caught', 'capture_site']]
        targets = self.df[['turtles_rescued']]

        # Combine features and targets for imputation
        combined = pd.concat([features, targets], axis=1)

        # Impute missing values
        imputed_data = imputer.fit_transform(combined)
        self.df['turtles_rescued'] = imputed_data[:, -1]

        # Convert 'turtles_rescued' to integer type
        self.df['turtles_rescued'] = self.df['turtles_rescued'].astype(int)

        return self.df


In [None]:
merged_test = TurtleRescueModifierTest(year = 2019)
merged_test.merge_data(to_transform_test)
imputed_test = merged_test.impute_missing_values()

In [None]:
class TurtleRescueContainer:
    def __init__(self):
        # Initialize the DataFrame with 53 weeks and 29 capture sites
        self.df = pd.DataFrame({
            'week_caught': np.tile(np.arange(1, 54), 29),
            'capture_site': np.repeat(np.arange(1, 30), 53),
            'turtles_rescued': np.zeros(53 * 29),
            'weight_week': np.zeros(53 * 29),
            'weight_capture_site': np.zeros(53 * 29)
        })
        self.df['weight_combined'] = np.zeros(53 * 29)

    def update_with_mean(self, df1, df2):
        """
        Update the internal DataFrame with means of turtles_rescued, weight_week,
        and weight_capture_site from two DataFrames grouped by week_caught and capture_site.

        Parameters:
        df1 (pd.DataFrame): The DataFrame containing 'week_caught', 'turtles_rescued', 'weight_week'.
        df2 (pd.DataFrame): The DataFrame containing 'capture_site', 'turtles_rescued', 'weight_capture_site'.
        """
        # Calculate mean values for df1 grouped by week_caught
        if 'week_caught' in df1.columns:
            df1_grouped = df1.groupby('week_caught').agg({
                'turtles_rescued': 'mean',
                'weight_week': 'mean'
            }).reset_index()

            # Merge the means into the internal DataFrame
            self.df = self.df.merge(df1_grouped, on='week_caught', how='left', suffixes=('', '_df1'))
            self.df['turtles_rescued'] = self.df[['turtles_rescued', 'turtles_rescued_df1']].mean(axis=1)
            self.df['weight_week'] = self.df[['weight_week', 'weight_week_df1']].mean(axis=1)
            self.df.drop(columns=['turtles_rescued_df1', 'weight_week_df1'], inplace=True)

        # Calculate mean values for df2 grouped by capture_site
        if 'capture_site' in df2.columns:
            df2_grouped = df2.groupby('capture_site').agg({
                'turtles_rescued': 'mean',
                'weight_capture_site': 'mean'
            }).reset_index()

            # Merge the means into the internal DataFrame
            self.df = self.df.merge(df2_grouped, on='capture_site', how='left', suffixes=('', '_df2'))
            self.df['turtles_rescued'] = self.df[['turtles_rescued', 'turtles_rescued_df2']].mean(axis=1)
            self.df['weight_capture_site'] = self.df[['weight_capture_site', 'weight_capture_site_df2']].mean(axis=1)
            self.df.drop(columns=['turtles_rescued_df2', 'weight_capture_site_df2'], inplace=True)

        # Calculate the combined weight
        self.df['weight_combined'] = self.df['weight_week'] + self.df['weight_capture_site']
        
        def impute_missing(self):
         """
         Impute missing values in the 'turtles_rescued' column using K-Nearest Neighbors (KNN).
         """
         # Extract features for imputation
         features = self.df[['year', 'week_caught', 'capture_site']]
        
         # Initialize KNN Imputer with k=5 (you can adjust k as needed)
         imputer = KNNImputer(n_neighbors=5)
        
         # Impute missing values
         self.df['turtles_rescued'] = imputer.fit_transform(features)

        
        return self.df

## Simple ML model

In [171]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# loading data for model
train_simple = imputed_train.copy()
test_simple = imputed_test.copy()

# Extract features and target from train_week DataFrame
X_train_simple = train_simple[['year_caught', 'week_caught', 'capture_site']]
y_train_simple = train_simple['turtles_rescued']

X_test_simple = test_simple[['year_caught', 'week_caught', 'capture_site']]
y_test_simple = ['turtles_rescued']

# Define a list of regression models
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    SVR()
]

# Define a list to store the trained models
trained_models_simple = []

# Iterate over each model and fit it to the training data
for model in models:
    model.fit(X_train_simple, y_train_simple)
    trained_models_simple.append(model)

# Perform cross-validation for each model and evaluate their performance
for model in models:
    scores_week_caught = cross_val_score(model, X_train_simple, y_train_simple, scoring='neg_mean_squared_error', cv=5)
    rmse_scores_week_caught = np.sqrt(-scores_week_caught)
    print(f"{model.__class__.__name__}: Mean RMSE: {rmse_scores_week_caught.mean()}, Std RMSE: {rmse_scores_week_caught.std()}")

LinearRegression: Mean RMSE: 0.7755586975741101, Std RMSE: 0.9123569095216355
Ridge: Mean RMSE: 0.77555857499529, Std RMSE: 0.9123565437055333
Lasso: Mean RMSE: 0.8568197695076567, Std RMSE: 0.8799947970367469
DecisionTreeRegressor: Mean RMSE: 1.7478028821827514, Std RMSE: 0.7129092440924288
RandomForestRegressor: Mean RMSE: 1.5572672141203443, Std RMSE: 0.6232480063598077
SVR: Mean RMSE: 0.6744766983019547, Std RMSE: 0.9261452350434871


## Predicting turtles_rescued by week

In [None]:
# loading data for model
train_week_caught = imputed_train.copy()
test_week_caught = imputed_test.copy()

# Extract features and target from train_week DataFrame
X_train_week_caught = train_week_caught[['year_caught', 'week_caught', 'capture_site']]
y_train_week_caught = train_week_caught['turtles_rescued']

X_test_week_caught = test_week_caught[['year_caught', 'week_caught', 'capture_site']]
y_test_week_caught = test_week_caught['turtles_rescued']


# Define a list to store the trained models
trained_models_week_caught = []

# Iterate over each model and fit it to the training data
for model in models:
    model.fit(X_train_week_caught, y_train_week_caught)
    trained_models_week_caught.append(model)

# Perform cross-validation for each model and evaluate their performance
for model in models:
    scores_week_caught = cross_val_score(model, X_train_week_caught, y_train_week_caught, scoring='neg_mean_squared_error', cv=5)
    rmse_scores_week_caught = np.sqrt(-scores_week_caught)
    print(f"{model.__class__.__name__}: Mean RMSE: {rmse_scores_week_caught.mean()}, Std RMSE: {rmse_scores_week_caught.std()}")


## Predicting what turtles are caught by capture_site

In [None]:
# loading data for model
train_capture_site = imputed_train.copy()
test_capture_site = imputed_test.copy()

X_train_capture_site = train_capture_site[['year_caught', 'week_caught', 'capture_site']]
y_train_capture_site = train_capture_site['turtles_rescued']

X_test_capture_site = test_capture_site[['year_caught', 'week_caught', 'capture_site']]
y_test_capture_site = test_capture_site['turtles_rescued']

# Define a list to store the trained models
trained_models_capture_site = []

# Iterate over each model and fit it to the training data
for model in models:
    model.fit(X_train_capture_site, y_train_capture_site)
    trained_models_capture_site.append(model)

# Perform cross-validation for each model and evaluate their performance
for model in models:
    scores_capture_site = cross_val_score(model, X_train_capture_site, y_train_capture_site,
                                          scoring='neg_mean_squared_error', cv=5)
    rmse_scores_capture_site = np.sqrt(-scores_capture_site)
    print(
        f"{model.__class__.__name__}: Mean RMSE: {rmse_scores_capture_site.mean()}, Std RMSE: {rmse_scores_capture_site.std()}")

## Calculate weights for the validation set

# Function to calculate weights
def calculate_weights(y_true, y_pred):
    residuals = np.abs(y_true - y_pred)
    weights = 1 / (residuals + 1e-5)  # Adding a small value to avoid division by zero
    weights /= weights.sum()  # Normalize weights
    return weights

# Calculate weights for the validation set
weights = calculate_weights(y_val, (lr_pred + rf_pred + gb_pred) / 3)

# Weighted predictions
final_pred = (lr_pred * weights + rf_pred * weights + gb_pred * weights) / weights.sum()

## Final model

In [167]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error

X_train = imputed_train[['year_caught', 'week_caught', 'capture_site']]
y_train = imputed_train['turtles_rescued']

X_test = imputed_test[['year_caught', 'week_caught', 'capture_site']]
y_test = imputed_test.turtles_rescued.copy()

# Generate predictions using trained models for capture_site
predictions_capture_site_train = np.column_stack([model.predict(X_train_capture_site) for model in trained_models_capture_site])

# Generate predictions using trained models for week_caught
predictions_week_caught_train = np.column_stack([model.predict(X_train_week_caught) for model in trained_models_week_caught])

# Generate predictions using trained models for capture_site
predictions_capture_site_test = np.column_stack([model.predict(X_test_capture_site) for model in trained_models_capture_site])

# Generate predictions using trained models for week_caught
predictions_week_caught_test = np.column_stack([model.predict(X_test_week_caught) for model in trained_models_week_caught])

In [169]:
# Combine the predictions into a single feature set
combined_predictions_train = np.concatenate((predictions_capture_site_train, predictions_week_caught_train), axis=1)
combined_predictions_test = np.concatenate((predictions_capture_site_test, predictions_week_caught_test), axis=1)

In [172]:
# Train a new model (e.g., Random Forest) on the combined predictions
combined_model = SVR()
combined_model.fit(combined_predictions_train, y_train)  # Assuming y_test is the target variable

# Use the combined model to make predictions
combined_predictions_train_final = combined_model.predict(combined_predictions_train)
combined_predictions_test_final = combined_model.predict(combined_predictions_test)

# Calculate RMSE for predictions made by the combined model on the test data
rmse_combined_test = np.sqrt(mean_squared_error(y_test, combined_predictions_test_final))
print(f"Combined Model RMSE on Test Data: {rmse_combined_test}")

# Perform cross-validation for the combined model on the training data
cv_scores_combined = cross_val_score(combined_model, combined_predictions_train, y_train,
                                     scoring='neg_mean_squared_error', cv=5)
rmse_cv_combined = np.sqrt(-cv_scores_combined)
print(f"Combined Model Cross-Validation RMSE: Mean={rmse_cv_combined.mean()}, Std={rmse_cv_combined.std()}")


Combined Model RMSE on Test Data: 3.884347895389471
Combined Model Cross-Validation RMSE: Mean=0.666241684750007, Std=0.9121731389277432
