In [3]:
import pandas as pd
import re

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, PoissonRegressor




from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



import warnings

from sklearn import linear_model

warnings.filterwarnings('ignore')



Read all input files into pandas DataFrames.

In [None]:
# Load the three CSV inputs from the local data/ directory.
capture_site = pd.read_csv('data/CaptureSite_category.csv')
sample_sub = pd.read_csv('data/Sample_sub.csv')
train_df = pd.read_csv('data/train.csv')

# Preview the first rows of the training data.
train_df.head()


We will drop irrelevant columns.

Rename all columns

In [None]:

# Standardising column names
def standardize_column_names(col):
    """Convert a column label to lower snake_case.

    Spaces become underscores, an underscore is inserted at every
    lower-to-upper camelCase boundary and between an acronym and the
    capitalised word that follows it, the result is lower-cased, and runs of
    underscores are collapsed into a single one.

    Parameters:
    col (str): The original column label.

    Returns:
    str: The normalised label.
    """
    # Word boundaries: "aB" -> "a_B", and "ABc" -> "A_Bc" (acronym followed by a word).
    boundary_patterns = (
        r'(?<=[a-z])(?=[A-Z])',
        r'(?<=[A-Z])(?=[A-Z][a-z])',
    )
    name = col.replace(' ', '_')
    for pattern in boundary_patterns:
        name = re.sub(pattern, '_', name)
    # Lower-case first, then squeeze any underscore runs produced above.
    return re.sub(r'_+', '_', name.lower())

# Normalise every column label of the training frame in one pass.
train_df.columns = train_df.columns.map(standardize_column_names)

# Show the resulting labels to verify the renaming.
print(train_df.columns)

In [None]:
# Columns excluded from the analysis.
columns_to_drop = [
    'rescue_id',
    'turtle_characteristics',
    'tag_1',
    'tag_2',
    'lost_tags',
    't_number',
    'sex',
    'capture_method',
    'release_site',
    'landing_site',
    'status',
    'foraging_ground',
]

train_df = train_df.drop(columns_to_drop, axis='columns')

Helper functions, and cleaning of the 'fisher' column and the other ID-like columns

In [None]:
# Extract the trailing number from a string of the form XXXX_000

def extract_number_split(s):
    """Return the integer that follows the last underscore in ``s``.

    A string without any underscore is converted as a whole. ``int`` raises
    ValueError when the trailing part is not an integer literal.
    """
    _, _, tail = s.rpartition('_')
    return int(tail)

extract_number_split('Fischer_5')  # quick manual check; the result is not used


# Replace each label in these columns with the integer after its last underscore.
for id_column in ['fisher', 'researcher', 'capture_site', 'species']:
    train_df[id_column] = train_df[id_column].apply(extract_number_split)




Convert the datetime columns and split them into year and week

In [None]:
import pandas as pd

def convert_and_split_datetime(df, columns):
    """
    Replace each listed datetime column by a year column and a week column.

    For a column named 'date_time_<name>' the new columns are 'year_<name>'
    and 'week_<name>'. Values that cannot be parsed become missing. The frame
    is modified in place and also returned.

    NOTE(review): the year is the calendar year while the week is the ISO week
    number, so dates around New Year can pair a year with a week that belongs
    to the neighbouring ISO year - confirm this is intended.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the columns.
    columns (list): List of column names to convert and split.

    Returns:
    pd.DataFrame: The same DataFrame with the new year and week columns.
    """
    for column in columns:
        # Unparseable entries are coerced to NaT instead of raising.
        parsed = pd.to_datetime(df[column], errors='coerce')

        # 'date_time_caught' -> 'caught'
        suffix = column.replace('date_time_', '')

        df[f'year_{suffix}'] = parsed.dt.year
        df[f'week_{suffix}'] = parsed.dt.isocalendar().week

        # The raw datetime column is no longer needed; remove it in place.
        del df[column]

    return df

# Split the capture and release timestamps of the training frame.
columns_to_convert = [
    'date_time_caught',
    'date_time_release',
]
train_df = convert_and_split_datetime(train_df, columns=columns_to_convert)



We will use KNN imputation on ccl_cm and ccw_cm to fill in the 5409 missing values of weight_kg.

In [None]:
from sklearn.impute import KNNImputer
def imput_missing_weight_values(df, n = 5):
    """Fill gaps in the size and weight columns with k-nearest-neighbour imputation.

    Parameters:
    df (pd.DataFrame): Frame containing 'ccl_cm', 'ccw_cm' and 'weight_kg'.
    n (int): Number of neighbours passed to KNNImputer.

    Returns:
    pd.DataFrame: The imputer's output for those three columns only; missing
    values in any of the three are imputed, not just the weight.
    """
    measurement_columns = ['ccl_cm', 'ccw_cm', 'weight_kg']
    # set_output returns the estimator itself, so the calls can be chained.
    knn_imputer = KNNImputer(n_neighbors=n).set_output(transform='pandas')
    return knn_imputer.fit_transform(df[measurement_columns])


In [None]:
# Display the current state of the training frame.
train_df

In [None]:
def split_column(df):
    """Split df['SiteInfo'] on '_' and store the first three parts on df.

    The parts are written in place to the columns 'site', 'y' and 'Year';
    nothing is returned.
    """
    parts = df['SiteInfo'].str.split('_', expand=True)

    # Part i of the split goes to the i-th target column.
    for position, target in enumerate(['site', 'y', 'Year']):
        df[target] = parts[position]

In [None]:
imputed_df = imput_missing_weight_values(train_df)

# Copy the imputed measurements back onto the training frame, column by column.
for measurement in ('ccl_cm', 'ccw_cm', 'weight_kg'):
    train_df[measurement] = imputed_df[measurement]

In [None]:
# Display the training frame again to inspect the imputed columns.
train_df


Split Function

In [None]:
# List the columns currently present in the training frame.
train_df.columns

In [None]:
# Discard the features derived from the release timestamp.
train_df = train_df.drop(['year_release', 'week_release'], axis='columns')


In [None]:
# Persist the cleaned frame next to the inputs. The input files are read from
# the lower-case 'data/' directory, so write there as well: 'Data/' is a
# different (possibly missing) directory on case-sensitive file systems.
train_df.to_csv('data/df.csv')

In [None]:
# Encode year and week as one number, e.g. year 2019 and week 5 -> 201905.
train_df['year_week'] = train_df['year_caught'] * 100 + train_df['week_caught']
train_df.info()

In [None]:
# Cast both parts to str so they can be concatenated into the ID.
train_df['year_week'] = train_df['year_week'].astype(str)
train_df['capture_site'] = train_df['capture_site'].astype(str)

# The ID has the form 'CaptureSite_<site>_<year_week>'.
train_df['ID'] = 'CaptureSite_'+ train_df['capture_site']+ '_' + train_df['year_week']

# drop() returns a new frame. The result used to be discarded, so the helper
# columns (two of them now strings) stayed in train_df; keep the result.
train_df = train_df.drop(columns=['capture_site', 'week_caught', 'year_week'])
train_df.head()

In [None]:
# One group per ID (capture site plus year/week).
captures_by_id = train_df.groupby('ID')

# numeric_only=True keeps the aggregation working while non-numeric columns
# are present: since pandas 2.0 mean()/median() raise on them instead of
# silently leaving them out.
df_median = captures_by_id.median(numeric_only=True).reset_index()
df_mean = captures_by_id.mean(numeric_only=True).reset_index()

# Number of rows (captures) per ID.
df_size = captures_by_id.size().reset_index(name='Capture_Number')


In [None]:
# Per-ID table: capture count, mean body measurements and the median of the
# integer-coded columns. NOTE: `df` is rebound further down
# (`df = split_colum(df_size)`), so this table is not what the models use.
df = pd.DataFrame({
    'Capture_Number': df_size['Capture_Number'],
    'ccl_cm': df_mean['ccl_cm'],
    'ccw_cm': df_mean['ccw_cm'],
    'weight_kg': df_mean['weight_kg'],
    'researcher': df_median['researcher'],
    'fisher': df_median['fisher'],
    'species': df_median['species'],
})




In [None]:
# Look at the ID column of the sample submission.
sample_sub['ID']

In [None]:
# Re-read the sample submission, replacing the frame loaded at the top of the notebook.
sample_sub = pd.read_csv('data/Sample_sub.csv')
# Show a single row as a format example.
sample_sub.head(1)

In [None]:
def split_colum(df_split):
    """Add integer 'year', 'week' and 'site' columns parsed from df_split['ID'].

    IDs are expected to look like '<prefix>_<site>_<YYYYWW>', for example
    'CaptureSite_0_201901'. The columns are added to ``df_split`` in place and
    the same frame (still containing 'ID') is returned.

    Parameters:
    df_split (pd.DataFrame): Frame with a string 'ID' column.

    Returns:
    pd.DataFrame: ``df_split`` itself, with the three new columns.
    """
    id_parts = df_split['ID'].str.split('_', expand=True)
    year_week = id_parts[2].astype(str)

    # Two-digit year, unchanged from before ('201901' -> 19).
    # NOTE(review): this wraps across centuries (1999 -> 99, 2000 -> 0); switch
    # to slice(0, 4) if the data spans a century boundary.
    df_split['year'] = year_week.str.slice(2, 4).astype(int)

    # The week is the two characters after the four-digit year. The previous
    # slice(5, 6) kept only the last digit, so week 12 was read as 2.
    df_split['week'] = year_week.str.slice(4, 6).astype(int)

    df_split['site'] = id_parts[1].astype(int)

    # 'ID' is kept on purpose: the former `df_split.drop(columns='ID')`
    # discarded its result, so callers have always received the column.
    return df_split


In [None]:
# split_colum adds 'year', 'week' and 'site' to sample_sub in place, so the return value is not needed here.
split_colum(sample_sub)
sample_sub


In [None]:
# Parse the same features from the per-ID capture counts; this rebinds `df` to df_size with the parsed columns added.
df = split_colum(df_size)
df

In [None]:
feature_columns = ['year', 'week', 'site']

# Training data: features parsed from the ID, target is the capture count.
X = df[feature_columns]
y = df['Capture_Number']

# NOTE(review): y_test is taken from the sample-submission file - confirm its
# Capture_Number values are real targets and not placeholders before trusting
# any score computed against it.
X_test = sample_sub[feature_columns]
y_test = sample_sub['Capture_Number']



In [None]:
def print_error_score(y_test, y_pred):
    """Print MSE, RMSE, MAE and R^2 of the predictions ``y_pred`` against ``y_test``."""
    mse = mean_squared_error(y_test, y_pred)
    report = [
        ('mean_squared_error: ', mse),
        ('root mean_squared_error: ', np.sqrt(mse)),
        ('mean_absolute_error: ', mean_absolute_error(y_test, y_pred)),
        ('r2_score: ', r2_score(y_test, y_pred)),
    ]
    for label, value in report:
        print(label, value)

In [None]:
# Baseline: ordinary least squares on the features parsed from the ID.
lg = LinearRegression().fit(X, y)  # fit() returns the fitted estimator
y_pred = lg.predict(X_test)

print('---- Lienar--------')
print_error_score(y_test, y_pred)

In [None]:
randomForesst = RandomForestRegressor(random_state= 100, )

# Hyper-parameter grid. n_estimators starts at 100: the former range(0, ...)
# included a forest with 0 trees, which is not a valid value, so every
# candidate using it failed to fit and only wasted cross-validation runs.
param_grid = {
    'n_estimators': range(100, 1000, 100),
    'criterion': [ 'squared_error', 'absolute_error']
}
grid_search_random = GridSearchCV(estimator=randomForesst, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search_random.fit(X=X, y=y)

# Predict with the best parameter combination found by the search.
y_pred_randomForest = grid_search_random.predict(X_test)

print('---- Random Forest ------')
print_error_score(y_test, y_pred_randomForest)

#Ridge Regression


In [None]:
ridge = Ridge()

# Search every integer alpha from 0 to 999, with and without an intercept,
# scored by 5-fold cross-validated negative MSE.
param_grid = {
    'alpha': range(1000),
    'fit_intercept': [True, False],
}
grid_search_ridge = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search_ridge.fit(X=X, y=y)

y_pred_ridge = grid_search_ridge.predict(X_test)

# Show which parameter combination won the search.
print(grid_search_ridge.best_estimator_)

print('------Ridge----------')
print_error_score(y_test, y_pred_ridge)

# PoissonRegressor

In [None]:
poisson = PoissonRegressor()

param_grid = {
    'alpha': range(0, 100, 1),
    'fit_intercept': [True, False],
}
# Tune the Poisson model itself. This cell used to pass `ridge` as the
# estimator and reuse the ridge variable names and label, so it silently
# re-ran (and overwrote) the ridge search and never fitted `poisson`.
grid_search_poisson = GridSearchCV(estimator=poisson, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search_poisson.fit(X=X, y=y)

y_pred_poisson = grid_search_poisson.predict(X_test)


print(grid_search_poisson.best_estimator_)

print('------Poisson----------')
print_error_score(y_test, y_pred_poisson)

In [None]:
def split_data(X , y, test_size = 0.3, random_state = 100):
    """Split features and target into a train part and a test part.

    Parameters:
    X: Feature table.
    y: Target values, aligned with ``X``.
    test_size (float): Share of the rows put into the test part (default 0.3).
    random_state (int): Seed that makes the shuffle reproducible (default 100).

    Returns:
    tuple: (X_train, X_test, y_train, y_test)
    """
    # The target used to be passed as `Y`, a name that is not defined in this
    # function, so the call failed with NameError (or picked up an unrelated
    # global); use the parameter `y`.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test