In [66]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

from sklearn import linear_model

warnings.filterwarnings('ignore')

# Test Dataset formating

In [67]:
final_test_df = pd.read_csv('data/Sample_sub.csv')

In [68]:
test_df = pd.read_csv('data/Sample_sub.csv')

In [69]:
test_df

Unnamed: 0,ID,Capture_Number
0,CaptureSite_0_201901,7
1,CaptureSite_0_201902,1
2,CaptureSite_0_201903,5
3,CaptureSite_0_201904,2
4,CaptureSite_0_201905,3
...,...,...
1271,CaptureSite_9_201940,0
1272,CaptureSite_9_201941,7
1273,CaptureSite_9_201942,7
1274,CaptureSite_9_201943,3


In [70]:
def extract_capture_site(df):
    df['capture_site'] = df['ID'].apply(lambda x: x.split('_')[-2])
    return df

In [71]:
def extract_and_convert_week(df):
    # Extract the second to last element
    df['week_caught'] = df['ID'].apply(lambda x: x[-2:])

    # Convert to datetime with appropriate format for year and month ("%Y%m")
    df['week_caught'] = df['week_caught'].apply(lambda x : int(x))

    return df

In [72]:
def formating_sample_sub(df): 
    # extracting capture site id
    df = extract_capture_site(df)
    
    # extracting week of rascue
    df = extract_and_convert_week(df)
    
    # renaming columns to match training set
    df.rename(columns={'Capture_Number': 'turtles_rescued'}, inplace=True)
    
    # getting rid of mixed column
    df.drop(columns=['ID'], inplace=True)
    
    # Standartising prediction 
    df = df.groupby(['capture_site', 'week_caught'])['turtles_rescued'].sum().reset_index()
    
    return df

In [73]:
test_df = formating_sample_sub(test_df)

In [75]:
test_df = pd.read_csv('data/test_df.csv')

# Train Dataset Formating

In [76]:
train_df = pd.read_csv('data/train.csv')

In [77]:
train_df.head()

Unnamed: 0,Rescue_ID,Date_TimeCaught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,Tag_1,...,Lost_Tags,T_Number,CCL_cm,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,ReleaseSite,Date_TimeRelease
0,2000_RE_0060,2000-12-22,Researcher_25,CaptureSite_0,Ocean,Net,Fisher_1072,LandingSite_CaptureSiteCategory_2,Species_6,CC00147,...,,,64.7,62.6,,Unknown,algae at rear of shell,Released,ReleaseSite_50,22/12/00
1,2001_RE_0187,2001-10-28,Researcher_6,CaptureSite_0,Ocean,Net,Fisher_520,LandingSite_CaptureSiteCategory_2,Species_6,W442,...,,,35.85,31.35,,Unknown,multiple b's on front flippers& a lot of alga...,Released,ReleaseSite_62,28/10/01
2,2001_RE_0197,2001-11-01,Researcher_6,CaptureSite_0,Ocean,Net,Fisher_1669,LandingSite_CaptureSiteCategory_2,Species_5,KE0376,...,,,51.8,49.2,,Unknown,clean,Released,ReleaseSite_50,01/11/01
3,2002_RE_0031,2002-03-11,Researcher_32,CaptureSite_0,Ocean,Net,Fisher_1798,LandingSite_CaptureSiteCategory_2,Species_6,CC00302,...,,,60.5,59.0,,Unknown,1 b 3 CS+ calcerous algae at rear end of shell...,Released,ReleaseSite_50,11/03/02
4,2002_RE_0118,2002-08-08,Researcher_25,CaptureSite_0,Ocean,Beached,Fisher_1918,LandingSite_CaptureSiteCategory_2,Species_5,NotTagged_0113,...,,,34.7,33.0,,Unknown,very lively+ right eye is hanging out + swolle...,Released,ReleaseSite_62,08/08/02


In [78]:
import re

# Defining function to Standartising column names 
def standardize_column_names(col):
    # Replace spaces with underscores
    col = col.replace(' ', '_')
    # Insert underscore before each uppercase letter preceded by a lowercase letter or followed by a lowercase letter
    col = re.sub(r'(?<=[a-z])(?=[A-Z])', '_', col)
    col = re.sub(r'(?<=[A-Z])(?=[A-Z][a-z])', '_', col)
    # Convert to lower case
    col = col.lower()
    # Ensure single underscores only (in case of consecutive underscores from initial spaces)
    col = re.sub(r'_+', '_', col)
    return col


In [79]:
# Applying function to df
train_df.columns = [standardize_column_names(col) for col in train_df.columns]

# Printing the updated column names to verify the changes
print(train_df.columns)

Index(['rescue_id', 'date_time_caught', 'researcher', 'capture_site',
       'foraging_ground', 'capture_method', 'fisher', 'landing_site',
       'species', 'tag_1', 'tag_2', 'lost_tags', 't_number', 'ccl_cm',
       'ccw_cm', 'weight_kg', 'sex', 'turtle_characteristics', 'status',
       'release_site', 'date_time_release'],
      dtype='object')


In [80]:
# Define the function to extract the number
def extract_number_split(s):
    return int(s.split('_')[-1])

# Define a function to apply the extraction to multiple columns
def apply_extraction(df, columns):
    for column in columns:
        # Convert column to string type if it's not already
        if df[column].dtype != 'object':
            df[column] = df[column].astype(str)

        # Apply the extraction function
        df[column] = df[column].apply(extract_number_split)
    return df

In [81]:
columns_to_extract_train = ['fisher', 'researcher', 'capture_site', 'species']
train_df = apply_extraction(train_df, columns_to_extract_train)

In [82]:
columns_to_drop = ['rescue_id', 'turtle_characteristics', 'tag_1', 'tag_2', 'lost_tags', 't_number', 'sex',
                   'capture_method', 'release_site', 'landing_site', 'status', 'foraging_ground', 'date_time_release']

train_df = train_df.drop(columns=columns_to_drop)

In [83]:
train_df

Unnamed: 0,date_time_caught,researcher,capture_site,fisher,species,ccl_cm,ccw_cm,weight_kg
0,2000-12-22,25,0,1072,6,64.70,62.60,
1,2001-10-28,6,0,520,6,35.85,31.35,
2,2001-11-01,6,0,1669,5,51.80,49.20,
3,2002-03-11,32,0,1798,6,60.50,59.00,
4,2002-08-08,25,0,1918,5,34.70,33.00,
...,...,...,...,...,...,...,...,...
18057,2018-12-18,30,9,569,5,57.13,50.57,21.09
18058,2018-12-18,30,9,125,6,42.07,38.37,9.02
18059,2018-12-24,30,9,1343,5,57.20,52.30,
18060,2018-12-24,30,9,1551,5,51.90,48.50,


In [84]:
def convert_and_split_datetime(df, columns):
    """
    Convert specified datetime columns to timestamp and split into year and week columns
    with new names based on the original column names.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the columns.
    columns (list): List of column names to convert and split.
    
    Returns:
    pd.DataFrame: The DataFrame with new year and week columns.
    """
    for column in columns:
        # Convert the column to datetime
        df[column] = pd.to_datetime(df[column], errors='coerce')

        # Extract the base name without 'date_time_' prefix
        base_name = column.replace('date_time_', '')

        # Create new columns for year and week with the desired names
        df[f'year_{base_name}'] = df[column].dt.year
        df[f'week_{base_name}'] = df[column].dt.isocalendar().week

        # Drop the original datetime column if desired
        df.drop(columns=[column], inplace=True)

    return df

In [85]:
# Apply function to train_df
columns_to_convert = ['date_time_caught']
train_df = convert_and_split_datetime(train_df, columns_to_convert)

train_df.head()

Unnamed: 0,researcher,capture_site,fisher,species,ccl_cm,ccw_cm,weight_kg,year_caught,week_caught
0,25,0,1072,6,64.7,62.6,,2000,51
1,6,0,520,6,35.85,31.35,,2001,43
2,6,0,1669,5,51.8,49.2,,2001,44
3,32,0,1798,6,60.5,59.0,,2002,11
4,25,0,1918,5,34.7,33.0,,2002,32


## Imputing Missing Data in Weight

In [86]:
from sklearn.impute import KNNImputer
def imput_missing_weight_values(df, n = 5):
    knn_df = df[['ccl_cm', 'ccw_cm', 'weight_kg']]
    imputer = KNNImputer(n_neighbors=n)
    imputer.set_output(transform='pandas')

    return imputer.fit_transform(knn_df)

In [87]:
imputed_df = imput_missing_weight_values(train_df)
train_df['ccl_cm'] = imputed_df['ccl_cm']
train_df['ccw_cm'] = imputed_df['ccw_cm']
train_df['weight_kg'] = imputed_df['weight_kg']

In [88]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062 entries, 0 to 18061
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   researcher    18062 non-null  int64  
 1   capture_site  18062 non-null  int64  
 2   fisher        18062 non-null  int64  
 3   species       18062 non-null  int64  
 4   ccl_cm        18062 non-null  float64
 5   ccw_cm        18062 non-null  float64
 6   weight_kg     18062 non-null  float64
 7   year_caught   18062 non-null  int32  
 8   week_caught   18062 non-null  UInt32 
dtypes: UInt32(1), float64(3), int32(1), int64(4)
memory usage: 1.1 MB


In [90]:
train_df = pd.read_csv('data/train_df.csv')

# Baseline Model

In [91]:
baseline_df = train_df.groupby(['year_caught', 'capture_site', 'week_caught']).size().reset_index(name='turtles_rescued')
baseline_df

Unnamed: 0,year_caught,capture_site,week_caught,turtles_rescued
0,1998,11,28,1
1,1998,11,32,1
2,1998,11,39,2
3,1998,11,43,1
4,1998,11,45,1
...,...,...,...,...
7952,2018,27,36,1
7953,2018,27,38,1
7954,2018,27,45,1
7955,2018,28,44,1


In [92]:
baseline_df = baseline_df[~baseline_df['year_caught'].between(1988, 2006)].reset_index(drop=True)
baseline_df

Unnamed: 0,year_caught,capture_site,week_caught,turtles_rescued
0,2007,2,2,1
1,2007,2,3,2
2,2007,2,4,1
3,2007,2,5,1
4,2007,2,7,1
...,...,...,...,...
6461,2018,27,36,1
6462,2018,27,38,1
6463,2018,27,45,1
6464,2018,28,44,1


In [93]:
baseline_df.drop(['year_caught'], axis=1, inplace=True)

In [94]:
baseline_test = test_df.copy()

In [95]:
class BaselinePredictor:
    def __init__(self, df):
        self.df = df

    def predict_turtles_rescued_all(self):
        # Initialize an empty list to store dataframes
        dfs = []

        # Get unique capture sites and weeks in the baseline_df
        capture_site_all = self.df['capture_site'].unique()
        weeks_in_year = self.df['week_caught'].unique()

        # Iterate over each capture site and week to calculate the mean turtles_rescued
        for capture_site in capture_site_all:
            for week in weeks_in_year:
                # Calculate mean turtles_rescued for the current capture site and week
                mean_turtles_rescued = self.df[(self.df['capture_site'] == capture_site) & (self.df['week_caught'] == week)]['turtles_rescued'].mean()

                # Append a dataframe to the list
                dfs.append(pd.DataFrame({'capture_site': [capture_site], 'week_caught': [week], 'turtles_rescued': [mean_turtles_rescued]}))

        # Concatenate all dataframes in the list
        predict_df = pd.concat(dfs, ignore_index=True)

        return predict_df

In [96]:
# Initialize the predictor with the baseline_df
predictor = BaselinePredictor(baseline_df)

# Predict the baseline values
predict_baseline = predictor.predict_turtles_rescued_all()

# Print the predicted baseline DataFrame
print(predict_baseline)

      capture_site  week_caught  turtles_rescued
0                2            2         1.666667
1                2            3         1.333333
2                2            4         1.000000
3                2            5         1.500000
4                2            7         1.000000
...            ...          ...              ...
1532            17           37         2.333333
1533            17           39         1.000000
1534            17           33         1.250000
1535            17           20         1.000000
1536            17           53         1.000000

[1537 rows x 3 columns]


## RMSE Baseline

In [152]:
from sklearn.metrics import mean_squared_error

# Calculate RMSE
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Summary statistics
print(test_df['turtles_rescued'].describe())

# Baseline RMSE
mean_turtles_rescued = baseline_test['turtles_rescued'].mean()
baseline_predictions = [mean_turtles_rescued] * len(test_df)
baseline_mse = mean_squared_error(baseline_test['turtles_rescued'], baseline_predictions)
baseline_rmse = np.sqrt(baseline_mse)
print(f"Baseline RMSE: {baseline_rmse}")


Root Mean Squared Error: 3.9440045387035063
count    1276.000000
mean        4.436520
std         2.878706
min         0.000000
25%         2.000000
50%         4.000000
75%         7.000000
max         9.000000
Name: turtles_rescued, dtype: float64
Baseline RMSE: 2.8775776437795377


## Interactive version

In [102]:
def predict_turtles_rescued(capture_site, week):
    predict_df = baseline_df[(baseline_df['capture_site'] == 5) & (baseline_df['week_caught'] == 5)]
    turtle_rescued = predict_df['turtles_rescued'].mean()
    return turtle_rescued

In [103]:
week = input(f'Enter a week number for which you would like to predict (from 1 to {len(baseline_df.week_caught.unique())}):')
capture_site = input(f'Enter a capture site number (from 1 to 29):')
print(f'Predicted Turtles Rescued for {capture_site} and {week}: {predict_turtles_rescued(capture_site, week)}')

Predicted Turtles Rescued for  and : 1.0


# Enseble Model Staking

## Transform data

In [104]:
from sklearn.impute import KNNImputer
import numpy as np
import pandas as pd

In [105]:
to_transform_train = train_df.groupby(['year_caught', 'capture_site', 'week_caught']).size().reset_index(name='turtles_rescued')
to_transform_test = test_df.copy()

In [106]:
class TurtleRescueModifierTrain:
    def __init__(self):
        # Generate arrays for year, week_caught, and capture_site
        years = np.repeat(np.arange(1988, 2019), 53 * 29)
        week_caught = np.tile(np.arange(1, 54), 31 * 29)
        capture_site = np.repeat(np.arange(1, 30), 53 * 31)

        # Create the DataFrame
        self.df = pd.DataFrame({
            'year_caught': years,
            'week_caught': week_caught,
            'capture_site': capture_site,
            'turtles_rescued': np.zeros(len(years))
        })

    def merge_data(self, source_df):
        for index, row in source_df.iterrows():
            # Match conditions based on year_caught, week_caught, and capture_site
            match_condition = (
                    (self.df['year_caught'] == row['year_caught']) &
                    (self.df['week_caught'] == row['week_caught']) &
                    (self.df['capture_site'] == row['capture_site'])
            )

            # Check if a matching row exists in the target_df
            matching_row_index = self.df.index[match_condition]

            if len(matching_row_index) > 0:
                # Update the existing row in target_df with data from source_df
                self.df.loc[matching_row_index[0], 'turtles_rescued'] = row['turtles_rescued']
            else:
                # If no matching row exists, create a new row in target_df
                new_row = {
                    'year_caught': row['year_caught'],
                    'week_caught': row['week_caught'],
                    'capture_site': row['capture_site'],
                    'turtles_rescued': row['turtles_rescued']
                }
                self.df = pd.concat([self.df, pd.DataFrame([new_row])], ignore_index=True)

        return self

    def impute_missing_values(self):
        """
        Impute missing values in the 'turtles_rescued' column using K-Nearest Neighbors (KNN).
        """
        # Ensure some 'turtles_rescued' values are NaN for imputation demonstration
        self.df.loc[self.df.sample(frac=0.1).index, 'turtles_rescued'] = np.nan

        # Select columns for imputation
        features = self.df[['year_caught', 'week_caught', 'capture_site']]
        targets = self.df[['turtles_rescued']]

        # Combine features and targets for imputation
        combined = pd.concat([features, targets], axis=1)

        # Initialize KNN Imputer with k=5
        imputer = KNNImputer(n_neighbors=5)

        # Impute missing values
        imputed_data = imputer.fit_transform(combined)
        self.df['turtles_rescued'] = imputed_data[:, -1]
        
        # Convert 'turtles_rescued' to integer type
        self.df['turtles_rescued'] = self.df['turtles_rescued'].astype(int)
        return self.df


In [107]:
merged_train = TurtleRescueModifierTrain()
merged_train.merge_data(to_transform_train)
imputed_train = TurtleRescueModifierTrain.impute_missing_values(merged_train)

In [108]:
class TurtleRescueModifierTest:
    def __init__(self, year):
        """
        Initialize the TurtleRescueModifierTrain with data for a specific year.
        
        :param year: The specific year for which to initialize the data.
        """
        # Generate arrays for the specified year, week_caught, and capture_site
        self.year = year
        weeks_per_year = 53
        capture_sites = 29

        year = np.repeat(year, weeks_per_year * capture_sites)
        week_caught = np.tile(np.arange(1, weeks_per_year + 1), capture_sites)
        capture_site = np.repeat(np.arange(1, capture_sites + 1), weeks_per_year)

        # Create the DataFrame
        self.df = pd.DataFrame({
            'year_caught': year,
            'week_caught': week_caught,
            'capture_site': capture_site,
            'turtles_rescued': np.zeros(len(year))
        })

    def merge_data(self, source_df):
        # Add 'year_caught' column to source_df if it doesn't exist
        if 'year_caught' not in source_df.columns:
            source_df['year_caught'] = self.year
        
        for index, row in source_df.iterrows():
            # Match conditions based on year_caught, week_caught, and capture_site
            match_condition = (
                (self.df['year_caught'] == row['year_caught']) &
                (self.df['week_caught'] == row['week_caught']) &
                (self.df['capture_site'] == row['capture_site'])
            )

            # Check if a matching row exists in the target_df
            matching_row_index = self.df.index[match_condition]

            if len(matching_row_index) > 0:
                # Update the existing row in target_df with data from source_df
                self.df.loc[matching_row_index[0], 'turtles_rescued'] = row['turtles_rescued']
            else:
                # If no matching row exists, create a new row in target_df
                new_row = {
                    'year_caught': row['year_caught'],
                    'week_caught': row['week_caught'],
                    'capture_site': row['capture_site'],
                    'turtles_rescued': row['turtles_rescued']
                }
                self.df = pd.concat([self.df, pd.DataFrame([new_row])], ignore_index=True)

        return self



    def impute_missing_values(self):
        """
        Impute missing values in the 'turtles_rescued' column using K-Nearest Neighbors (KNN).
        """
        # Introduce NaN values into the 'turtles_rescued' column to demonstrate imputation
        self.df.loc[self.df.sample(frac=0.1).index, 'turtles_rescued'] = np.nan

        # Initialize KNN Imputer with k=5
        imputer = KNNImputer(n_neighbors=5)

        # Select columns for imputation
        features = self.df[['year_caught', 'week_caught', 'capture_site']]
        targets = self.df[['turtles_rescued']]

        # Combine features and targets for imputation
        combined = pd.concat([features, targets], axis=1)

        # Impute missing values
        imputed_data = imputer.fit_transform(combined)
        self.df['turtles_rescued'] = imputed_data[:, -1]

        # Convert 'turtles_rescued' to integer type
        self.df['turtles_rescued'] = self.df['turtles_rescued'].astype(int)

        return self.df


In [109]:
merged_test = TurtleRescueModifierTest(year = 2019)
merged_test.merge_data(to_transform_test)
imputed_test = merged_test.impute_missing_values()

In [110]:
class TurtleRescueContainer:
    def __init__(self):
        # Initialize the DataFrame with 53 weeks and 29 capture sites
        self.df = pd.DataFrame({
            'week_caught': np.tile(np.arange(1, 54), 29),
            'capture_site': np.repeat(np.arange(1, 30), 53),
            'turtles_rescued': np.zeros(53 * 29),
            'weight_week': np.zeros(53 * 29),
            'weight_capture_site': np.zeros(53 * 29)
        })
        self.df['weight_combined'] = np.zeros(53 * 29)

    def update_with_mean(self, df1, df2):
        """
        Update the internal DataFrame with means of turtles_rescued, weight_week,
        and weight_capture_site from two DataFrames grouped by week_caught and capture_site.

        Parameters:
        df1 (pd.DataFrame): The DataFrame containing 'week_caught', 'turtles_rescued', 'weight_week'.
        df2 (pd.DataFrame): The DataFrame containing 'capture_site', 'turtles_rescued', 'weight_capture_site'.
        """
        # Calculate mean values for df1 grouped by week_caught
        if 'week_caught' in df1.columns:
            df1_grouped = df1.groupby('week_caught').agg({
                'turtles_rescued': 'mean',
                'weight_week': 'mean'
            }).reset_index()

            # Merge the means into the internal DataFrame
            self.df = self.df.merge(df1_grouped, on='week_caught', how='left', suffixes=('', '_df1'))
            self.df['turtles_rescued'] = self.df[['turtles_rescued', 'turtles_rescued_df1']].mean(axis=1)
            self.df['weight_week'] = self.df[['weight_week', 'weight_week_df1']].mean(axis=1)
            self.df.drop(columns=['turtles_rescued_df1', 'weight_week_df1'], inplace=True)

        # Calculate mean values for df2 grouped by capture_site
        if 'capture_site' in df2.columns:
            df2_grouped = df2.groupby('capture_site').agg({
                'turtles_rescued': 'mean',
                'weight_capture_site': 'mean'
            }).reset_index()

            # Merge the means into the internal DataFrame
            self.df = self.df.merge(df2_grouped, on='capture_site', how='left', suffixes=('', '_df2'))
            self.df['turtles_rescued'] = self.df[['turtles_rescued', 'turtles_rescued_df2']].mean(axis=1)
            self.df['weight_capture_site'] = self.df[['weight_capture_site', 'weight_capture_site_df2']].mean(axis=1)
            self.df.drop(columns=['turtles_rescued_df2', 'weight_capture_site_df2'], inplace=True)

        # Calculate the combined weight
        self.df['weight_combined'] = self.df['weight_week'] + self.df['weight_capture_site']
        
        def impute_missing(self):
         """
         Impute missing values in the 'turtles_rescued' column using K-Nearest Neighbors (KNN).
         """
         # Extract features for imputation
         features = self.df[['year', 'week_caught', 'capture_site']]
        
         # Initialize KNN Imputer with k=5 (you can adjust k as needed)
         imputer = KNNImputer(n_neighbors=5)
        
         # Impute missing values
         self.df['turtles_rescued'] = imputer.fit_transform(features)

        
        return self.df

## Simple ML model

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# loading data for model
train_simple = imputed_train.copy()
test_simple = imputed_test.copy()

# Extract features and target from train_week DataFrame
X_train_simple = train_simple[['year_caught', 'week_caught', 'capture_site']]
y_train_simple = train_simple['turtles_rescued']

X_test_simple = test_simple[['year_caught', 'week_caught', 'capture_site']]
y_test_simple = ['turtles_rescued']

# Define a list of regression models
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    SVR()
]

# Define a list to store the trained models
trained_models_simple = []

# Iterate over each model and fit it to the training data
for model in models:
    model.fit(X_train_simple, y_train_simple)
    trained_models_simple.append(model)

# Perform cross-validation for each model and evaluate their performance
for model in models:
    scores_week_caught = cross_val_score(model, X_train_simple, y_train_simple, scoring='neg_mean_squared_error', cv=5)
    rmse_scores_week_caught = np.sqrt(-scores_week_caught)
    print(f"{model.__class__.__name__}: Mean RMSE: {rmse_scores_week_caught.mean()}, Std RMSE: {rmse_scores_week_caught.std()}")

## Ensemble model 

In [151]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Define a list of regression models
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    SVR()
]

# Train and evaluate each model
trained_models_simple = []
for model in models:
    model.fit(X_train, y_train)
    trained_models_simple.append(model)

    # Perform cross-validation
    scores_simple = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    rmse_scores_simple = np.sqrt(-scores_simple)
    print(f"{model.__class__.__name__}: Mean RMSE (CV): {rmse_scores_simple.mean()}, Std RMSE (CV): {rmse_scores_simple.std()}")

    # Calculate RMSE on the test set
    predictions_test = model.predict(X_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, predictions_test))
    print(f"{model.__class__.__name__}: RMSE (Test): {rmse_test}")

# Define the base estimators for stacking
estimators = [
    ('ridge', Ridge(alpha=10)),
    ('lasso', Lasso(alpha=0.1)),
    ('gbr', GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5))
]

# Create the stacking regressor with Linear Regression as the final estimator
final_estimator = LinearRegression()
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=final_estimator)

# Train the stacking regressor
stacking_regressor.fit(X_train, y_train)

# Evaluate the stacking regressor on the test set
stacking_predictions = stacking_regressor.predict(X_test)
stacking_rmse = np.sqrt(mean_squared_error(y_test, stacking_predictions))
print(f"Stacking Model RMSE (Test): {stacking_rmse}")

# Perform cross-validation for the stacking model
stacking_cv_scores = cross_val_score(stacking_regressor, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
stacking_cv_rmse = np.sqrt(-stacking_cv_scores)
print(f"Stacking Model Mean RMSE (CV): {stacking_cv_rmse.mean()}, Std RMSE (CV): {stacking_cv_rmse.std()}")


LinearRegression: Mean RMSE (CV): 1.1423161391458305, Std RMSE (CV): 0.053385205539318116
LinearRegression: RMSE (Test): 1.1438915753380086
Ridge: Mean RMSE (CV): 1.1423161384208917, Std RMSE (CV): 0.053385213596612496
Ridge: RMSE (Test): 1.143891573906307
Lasso: Mean RMSE (CV): 1.200933182501268, Std RMSE (CV): 0.05879184303830722
Lasso: RMSE (Test): 1.2018821186165782
DecisionTreeRegressor: Mean RMSE (CV): 1.0912595546037704, Std RMSE (CV): 0.0614285750713496
DecisionTreeRegressor: RMSE (Test): 1.1020641503934943
RandomForestRegressor: Mean RMSE (CV): 0.8656135264110377, Std RMSE (CV): 0.057189134479072465
RandomForestRegressor: RMSE (Test): 0.8882171233162441
SVR: Mean RMSE (CV): 1.3201629784585647, Std RMSE (CV): 0.06265591337407421
SVR: RMSE (Test): 1.3106863473952135
Stacking Model RMSE (Test): 0.8249750684273498
Stacking Model Mean RMSE (CV): 0.8138215010045841, Std RMSE (CV): 0.052474907402054384
