In [135]:
import pandas as pd
import re

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



import warnings

from sklearn import linear_model

warnings.filterwarnings('ignore')



Reading all files into Panda frames

In [136]:
capture_site = pd.read_csv('data/CaptureSite_category.csv')
sample_sub = pd.read_csv('data/Sample_sub.csv')
train_df = pd.read_csv('data/train.csv')

train_df.head()


Unnamed: 0,Rescue_ID,Date_TimeCaught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,Tag_1,...,Lost_Tags,T_Number,CCL_cm,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,ReleaseSite,Date_TimeRelease
0,2000_RE_0060,2000-12-22,Researcher_25,CaptureSite_0,Ocean,Net,Fisher_1072,LandingSite_CaptureSiteCategory_2,Species_6,CC00147,...,,,64.7,62.6,,Unknown,algae at rear of shell,Released,ReleaseSite_50,22/12/00
1,2001_RE_0187,2001-10-28,Researcher_6,CaptureSite_0,Ocean,Net,Fisher_520,LandingSite_CaptureSiteCategory_2,Species_6,W442,...,,,35.85,31.35,,Unknown,multiple b's on front flippers& a lot of alga...,Released,ReleaseSite_62,28/10/01
2,2001_RE_0197,2001-11-01,Researcher_6,CaptureSite_0,Ocean,Net,Fisher_1669,LandingSite_CaptureSiteCategory_2,Species_5,KE0376,...,,,51.8,49.2,,Unknown,clean,Released,ReleaseSite_50,01/11/01
3,2002_RE_0031,2002-03-11,Researcher_32,CaptureSite_0,Ocean,Net,Fisher_1798,LandingSite_CaptureSiteCategory_2,Species_6,CC00302,...,,,60.5,59.0,,Unknown,1 b 3 CS+ calcerous algae at rear end of shell...,Released,ReleaseSite_50,11/03/02
4,2002_RE_0118,2002-08-08,Researcher_25,CaptureSite_0,Ocean,Beached,Fisher_1918,LandingSite_CaptureSiteCategory_2,Species_5,NotTagged_0113,...,,,34.7,33.0,,Unknown,very lively+ right eye is hanging out + swolle...,Released,ReleaseSite_62,08/08/02


We will drop irrelvant columns 

Rename all columns

In [137]:

# Standartising column names 
def standardize_column_names(col):
    # Replace spaces with underscores
    col = col.replace(' ', '_')
    # Insert underscore before each uppercase letter preceded by a lowercase letter or followed by a lowercase letter
    col = re.sub(r'(?<=[a-z])(?=[A-Z])', '_', col)
    col = re.sub(r'(?<=[A-Z])(?=[A-Z][a-z])', '_', col)
    # Convert to lower case
    col = col.lower()
    # Ensure single underscores only (in case of consecutive underscores from initial spaces)
    col = re.sub(r'_+', '_', col)
    return col

train_df.columns = [standardize_column_names(col) for col in train_df.columns]

# Printing the updated column names to verify the changes
print(train_df.columns)

Index(['rescue_id', 'date_time_caught', 'researcher', 'capture_site',
       'foraging_ground', 'capture_method', 'fisher', 'landing_site',
       'species', 'tag_1', 'tag_2', 'lost_tags', 't_number', 'ccl_cm',
       'ccw_cm', 'weight_kg', 'sex', 'turtle_characteristics', 'status',
       'release_site', 'date_time_release'],
      dtype='object')


In [138]:
columns_to_drop = ['rescue_id', 'turtle_characteristics', 'tag_1', 'tag_2', 'lost_tags', 't_number', 'sex', 'capture_method', 'release_site', 'landing_site', 'status', 'foraging_ground']

train_df = train_df.drop(columns=columns_to_drop)

Some Helpmethods and clean the columns 'fischer'

In [139]:
# Extract a number of String of the form XXXX_000

def extract_number_split(s):
    num = s.split('_')[-1]
    return int(num)

extract_number_split('Fischer_5')


train_df['fisher'] = train_df['fisher'].apply(extract_number_split)
train_df['researcher'] = train_df['researcher'].apply(extract_number_split)
train_df['capture_site'] = train_df['capture_site'].apply(extract_number_split)
train_df['species'] = train_df['species'].apply(extract_number_split)




convert and split datetime

In [140]:
import pandas as pd

def convert_and_split_datetime(df, columns):
    """
    Convert specified datetime columns to timestamp and split into year and week columns
    with new names based on the original column names.
    
    Parameters:
    df (pd.DataFrame): The DataFrame containing the columns.
    columns (list): List of column names to convert and split.
    
    Returns:
    pd.DataFrame: The DataFrame with new year and week columns.
    """
    for column in columns:
        # Convert the column to datetime
        df[column] = pd.to_datetime(df[column], errors='coerce')

        # Extract the base name without 'date_time_' prefix
        base_name = column.replace('date_time_', '')

        # Create new columns for year and week with the desired names
        df[f'year_{base_name}'] = df[column].dt.year
        df[f'week_{base_name}'] = df[column].dt.isocalendar().week

        # Drop the original datetime column if desired
        df.drop(columns=[column], inplace=True)

    return df

# Example usage
# Assuming 'df' is your DataFrame
columns_to_convert = ['date_time_caught', 'date_time_release']
train_df = convert_and_split_datetime(train_df, columns_to_convert)



We will use Knn on ccl_cm and ccw_cm to compute the 5409 missing values of weight

In [141]:
from sklearn.impute import KNNImputer
def imput_missing_weight_values(df, n = 5):
    knn_df = df[['ccl_cm', 'ccw_cm', 'weight_kg']]
    imputer = KNNImputer(n_neighbors=n)
    imputer.set_output(transform='pandas')

    return imputer.fit_transform(knn_df)


In [142]:
train_df

Unnamed: 0,researcher,capture_site,fisher,species,ccl_cm,ccw_cm,weight_kg,year_caught,week_caught,year_release,week_release
0,25,0,1072,6,64.70,62.60,,2000,51,2000.0,51
1,6,0,520,6,35.85,31.35,,2001,43,2001.0,43
2,6,0,1669,5,51.80,49.20,,2001,44,2001.0,2
3,32,0,1798,6,60.50,59.00,,2002,11,2002.0,44
4,25,0,1918,5,34.70,33.00,,2002,32,2002.0,32
...,...,...,...,...,...,...,...,...,...,...,...
18057,30,9,569,5,57.13,50.57,21.09,2018,51,2018.0,51
18058,30,9,125,6,42.07,38.37,9.02,2018,51,2018.0,51
18059,30,9,1343,5,57.20,52.30,,2018,52,2018.0,52
18060,30,9,1551,5,51.90,48.50,,2018,52,2018.0,52


In [143]:
imputed_df = imput_missing_weight_values(train_df)
train_df['ccl_cm'] = imputed_df['ccl_cm']
train_df['ccw_cm'] = imputed_df['ccw_cm']
train_df['weight_kg'] = imputed_df['weight_kg']

In [144]:
train_df


Unnamed: 0,researcher,capture_site,fisher,species,ccl_cm,ccw_cm,weight_kg,year_caught,week_caught,year_release,week_release
0,25,0,1072,6,64.70,62.60,31.490,2000,51,2000.0,51
1,6,0,520,6,35.85,31.35,4.852,2001,43,2001.0,43
2,6,0,1669,5,51.80,49.20,16.776,2001,44,2001.0,2
3,32,0,1798,6,60.50,59.00,29.614,2002,11,2002.0,44
4,25,0,1918,5,34.70,33.00,5.310,2002,32,2002.0,32
...,...,...,...,...,...,...,...,...,...,...,...
18057,30,9,569,5,57.13,50.57,21.090,2018,51,2018.0,51
18058,30,9,125,6,42.07,38.37,9.020,2018,51,2018.0,51
18059,30,9,1343,5,57.20,52.30,18.744,2018,52,2018.0,52
18060,30,9,1551,5,51.90,48.50,16.612,2018,52,2018.0,52


Split Function

In [145]:
train_df.columns

Index(['researcher', 'capture_site', 'fisher', 'species', 'ccl_cm', 'ccw_cm',
       'weight_kg', 'year_caught', 'week_caught', 'year_release',
       'week_release'],
      dtype='object')

In [146]:
train_df = train_df.drop(columns=[  'year_release',
       'week_release'])
train_df

Unnamed: 0,researcher,capture_site,fisher,species,ccl_cm,ccw_cm,weight_kg,year_caught,week_caught
0,25,0,1072,6,64.70,62.60,31.490,2000,51
1,6,0,520,6,35.85,31.35,4.852,2001,43
2,6,0,1669,5,51.80,49.20,16.776,2001,44
3,32,0,1798,6,60.50,59.00,29.614,2002,11
4,25,0,1918,5,34.70,33.00,5.310,2002,32
...,...,...,...,...,...,...,...,...,...
18057,30,9,569,5,57.13,50.57,21.090,2018,51
18058,30,9,125,6,42.07,38.37,9.020,2018,51
18059,30,9,1343,5,57.20,52.30,18.744,2018,52
18060,30,9,1551,5,51.90,48.50,16.612,2018,52


In [156]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7957 entries, 0 to 7956
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year_caught   7957 non-null   int32  
 1   capture_site  7957 non-null   int64  
 2   week_caught   7957 non-null   UInt32 
 3   researcher    7957 non-null   float64
 4   fisher        7957 non-null   float64
 5   species       7957 non-null   float64
 6   ccl_cm        7957 non-null   float64
 7   ccw_cm        7957 non-null   float64
 8   weight_kg     7957 non-null   float64
dtypes: UInt32(1), float64(6), int32(1), int64(1)
memory usage: 505.2 KB


In [155]:
df = train_df.groupby(['year_caught', 'capture_site', 'week_caught']).mean().reset_index()


df

Unnamed: 0,year_caught,capture_site,week_caught,researcher,fisher,species,ccl_cm,ccw_cm,weight_kg
0,1998,11,28,37.0,1235.0,5.0,48.260,36.83,9.550
1,1998,11,32,4.0,835.0,5.0,44.450,33.00,10.246
2,1998,11,39,25.0,681.5,4.5,51.435,47.00,22.485
3,1998,11,43,37.0,1716.0,5.0,47.000,43.18,12.376
4,1998,11,45,4.0,1058.0,5.0,39.370,35.56,6.184
...,...,...,...,...,...,...,...,...,...
7952,2018,27,36,20.0,969.0,5.0,41.500,39.40,8.730
7953,2018,27,38,30.0,1721.0,5.0,48.100,46.10,12.400
7954,2018,27,45,20.0,1478.0,5.0,48.400,44.00,13.120
7955,2018,28,44,20.0,1115.0,6.0,34.130,32.90,4.430


In [None]:
sample_sub['ID']

0       CaptureSite_0_201901
1       CaptureSite_0_201902
2       CaptureSite_0_201903
3       CaptureSite_0_201904
4       CaptureSite_0_201905
                ...         
1271    CaptureSite_9_201940
1272    CaptureSite_9_201941
1273    CaptureSite_9_201942
1274    CaptureSite_9_201943
1275    CaptureSite_9_201944
Name: ID, Length: 1276, dtype: object

In [None]:
df  = train_df.groupby(['year_caught', 'capture_site', 'week_caught']).size().reset_index()
df

Unnamed: 0,year_caught,capture_site,week_caught,0
0,1998,11,28,1
1,1998,11,32,1
2,1998,11,39,2
3,1998,11,43,1
4,1998,11,45,1
...,...,...,...,...
7952,2018,27,36,1
7953,2018,27,38,1
7954,2018,27,45,1
7955,2018,28,44,1


In [None]:
df.sort_values('turtles_rescued')

KeyError: 'turtles_rescued'

In [None]:
def split_data(X , y):
    # Splitting the dataset into train and test 
    X_train, X_test, y_train, y_test = train_test_split(  
    X, Y, test_size = 0.3, random_state = 100) 
    return X_train, X_test, y_train, y_test

In [None]:

# Assuming 'X' is your feature matrix and 'y' is your target variable
# Split the data into training and testing sets (optional)
columns_to_drop = ['rescue_id', 'capture_site', 'turtle_characteristics', 'tag_1', 'tag_2', 'lost_tags', 't_number', 'sex', 'status', 'rescue_count', 'foraging_ground', 'capture_method', 'landing_site', 'status', 'release_site', 'week_release', 'year_release', 'weight_kg', 'ccl_cm', 'ccw_cm']

X = train_df.drop(columns=columns_to_drop)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the linear regression model
model = LinearRegression()

# Fit the model
model.fit(X_train, y_train)

# Optionally, make predictions
y_pred = model.predict(X_test)

# Optionally, evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


KeyError: "['rescue_id', 'turtle_characteristics', 'tag_1', 'tag_2', 'lost_tags', 't_number', 'sex', 'status', 'rescue_count', 'foraging_ground', 'capture_method', 'landing_site', 'status', 'release_site', 'week_release', 'year_release'] not found in axis"

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

# Compute regression metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Display the evaluation metrics
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared (R2):", r2)
