In [2]:
import seaborn as sns

import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

from sklearn import set_config
set_config(transform_output="pandas")

# Set random seed 
RSEED = 42

warnings.filterwarnings("ignore")

In [103]:
df = pd.read_csv("data/cleaned_data.csv")

In [104]:
df.head()

Unnamed: 0.1,Unnamed: 0,date_caught,capture_site,tag_2,ccl_cm,ccw_cm,weight_kg,status,release_site,date_released,...,cs_category_3,cs_category_4,species_0,species_1,species_2,species_3,species_4,species_5,species_6,species_7
0,0,2000-12-22,CaptureSite_0,0,64.7,62.6,8.5,429,237,2000-12-22 00:00:00,...,False,False,False,False,False,False,False,False,True,False
1,1,2001-10-28,CaptureSite_0,0,35.85,31.35,8.5,429,250,2001-10-28 00:00:00,...,False,False,False,False,False,False,False,False,True,False
2,2,2001-11-01,CaptureSite_0,0,51.8,49.2,8.5,429,237,2001-01-11 00:00:00,...,False,False,False,False,False,False,False,True,False,False
3,3,2002-03-11,CaptureSite_0,0,60.5,59.0,8.5,429,237,2002-11-03 00:00:00,...,False,False,False,False,False,False,False,False,True,False
4,4,2002-08-08,CaptureSite_0,0,34.7,33.0,8.5,429,250,2002-08-08 00:00:00,...,False,False,False,False,False,False,False,True,False,False


In [105]:
capture_site_category = pd.read_csv('data/CaptureSite_category.csv')

In [106]:
pd.set_option('display.max_colwidth', None)

In [107]:
df.columns

Index(['Unnamed: 0', 'date_caught', 'capture_site', 'tag_2', 'ccl_cm',
       'ccw_cm', 'weight_kg', 'status', 'release_site', 'date_released',
       'cm_beached', 'cm_by hand', 'cm_collected floater', 'cm_fish trap',
       'cm_jarife', 'cm_longline', 'cm_net', 'cm_not_recorded', 'cm_speargun',
       'cm_stranded', 'cm_uzio', 'foraging_ground_0', 'foraging_ground_1',
       'cs_category_0', 'cs_category_1', 'cs_category_2', 'cs_category_3',
       'cs_category_4', 'species_0', 'species_1', 'species_2', 'species_3',
       'species_4', 'species_5', 'species_6', 'species_7'],
      dtype='object')

In [108]:
capture_site_category.head()

Unnamed: 0,CaptureSite,CaptureSiteCategory,Type
0,CaptureSite_0,CaptureSiteCategory_2,Type_1
1,CaptureSite_1,CaptureSiteCategory_2,Type_1
2,CaptureSite_10,CaptureSiteCategory_2,Type_1
3,CaptureSite_11,CaptureSiteCategory_0,Type_0
4,CaptureSite_12,CaptureSiteCategory_2,Type_1


In [109]:
#format column names
capture_site_category = capture_site_category.rename({'CaptureSite':'capture_site','CaptureSiteCategory':'cs_category','Type':'type'},axis=1)

In [110]:
capture_site_category.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   capture_site  29 non-null     object
 1   cs_category   29 non-null     object
 2   type          29 non-null     object
dtypes: object(3)
memory usage: 828.0+ bytes


In [111]:
categorical_columns = ['capture_site', 'cs_category', 'type'
                       ]
# convert to categories
for col in categorical_columns:
    capture_site_category[col] = capture_site_category[col].astype('category')

In [112]:
capture_site_category.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   capture_site  29 non-null     category
 1   cs_category   29 non-null     category
 2   type          29 non-null     category
dtypes: category(3)
memory usage: 1.8 KB


In [113]:
df.columns

Index(['Unnamed: 0', 'date_caught', 'capture_site', 'tag_2', 'ccl_cm',
       'ccw_cm', 'weight_kg', 'status', 'release_site', 'date_released',
       'cm_beached', 'cm_by hand', 'cm_collected floater', 'cm_fish trap',
       'cm_jarife', 'cm_longline', 'cm_net', 'cm_not_recorded', 'cm_speargun',
       'cm_stranded', 'cm_uzio', 'foraging_ground_0', 'foraging_ground_1',
       'cs_category_0', 'cs_category_1', 'cs_category_2', 'cs_category_3',
       'cs_category_4', 'species_0', 'species_1', 'species_2', 'species_3',
       'species_4', 'species_5', 'species_6', 'species_7'],
      dtype='object')

In [115]:
df["date_caught"] = pd.to_datetime(df["date_caught"])
df["year"] = df["date_caught"].dt.year
df["week_of_year"] = df["date_caught"].dt.isocalendar().week  # Using isocalendar() method
df["year_woy"] = df["year"] * 100 + df["week_of_year"]
df.head()



Unnamed: 0.1,Unnamed: 0,date_caught,capture_site,tag_2,ccl_cm,ccw_cm,weight_kg,status,release_site,date_released,...,species_1,species_2,species_3,species_4,species_5,species_6,species_7,year,week_of_year,year_woy
0,0,2000-12-22,CaptureSite_0,0,64.7,62.6,8.5,429,237,2000-12-22 00:00:00,...,False,False,False,False,False,True,False,2000,51,200051
1,1,2001-10-28,CaptureSite_0,0,35.85,31.35,8.5,429,250,2001-10-28 00:00:00,...,False,False,False,False,False,True,False,2001,43,200143
2,2,2001-11-01,CaptureSite_0,0,51.8,49.2,8.5,429,237,2001-01-11 00:00:00,...,False,False,False,False,True,False,False,2001,44,200144
3,3,2002-03-11,CaptureSite_0,0,60.5,59.0,8.5,429,237,2002-11-03 00:00:00,...,False,False,False,False,False,True,False,2002,11,200211
4,4,2002-08-08,CaptureSite_0,0,34.7,33.0,8.5,429,250,2002-08-08 00:00:00,...,False,False,False,False,True,False,False,2002,32,200232


In [116]:
df.columns

Index(['Unnamed: 0', 'date_caught', 'capture_site', 'tag_2', 'ccl_cm',
       'ccw_cm', 'weight_kg', 'status', 'release_site', 'date_released',
       'cm_beached', 'cm_by hand', 'cm_collected floater', 'cm_fish trap',
       'cm_jarife', 'cm_longline', 'cm_net', 'cm_not_recorded', 'cm_speargun',
       'cm_stranded', 'cm_uzio', 'foraging_ground_0', 'foraging_ground_1',
       'cs_category_0', 'cs_category_1', 'cs_category_2', 'cs_category_3',
       'cs_category_4', 'species_0', 'species_1', 'species_2', 'species_3',
       'species_4', 'species_5', 'species_6', 'species_7', 'year',
       'week_of_year', 'year_woy'],
      dtype='object')

In [117]:
# Extracting the target variable from the dataset
target=df.groupby(["year_woy","capture_site"]).capture_site.count().rename("capture_number").reset_index()

In [118]:
target

Unnamed: 0,year_woy,capture_site,capture_number
0,199816,CaptureSite_27,1
1,199828,CaptureSite_11,1
2,199832,CaptureSite_11,1
3,199832,CaptureSite_27,1
4,199839,CaptureSite_11,2
...,...,...,...
7952,201852,CaptureSite_23,1
7953,201852,CaptureSite_25,4
7954,201852,CaptureSite_3,1
7955,201852,CaptureSite_7,1


In [119]:
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7957 entries, 0 to 7956
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_woy        7957 non-null   Int64 
 1   capture_site    7957 non-null   object
 2   capture_number  7957 non-null   int64 
dtypes: Int64(1), int64(1), object(1)
memory usage: 194.4+ KB


In [120]:
target.isnull().sum()

year_woy          0
capture_site      0
capture_number    0
dtype: int64

In [121]:
capture_site_category.head()

Unnamed: 0,capture_site,cs_category,type
0,CaptureSite_0,CaptureSiteCategory_2,Type_1
1,CaptureSite_1,CaptureSiteCategory_2,Type_1
2,CaptureSite_10,CaptureSiteCategory_2,Type_1
3,CaptureSite_11,CaptureSiteCategory_0,Type_0
4,CaptureSite_12,CaptureSiteCategory_2,Type_1


In [123]:
# merging the tables
df_1 = pd.merge (left=df, right=capture_site_category, left_on ='capture_site', right_on = 'capture_site', how='left')

In [124]:
df_1.head()

Unnamed: 0.1,Unnamed: 0,date_caught,capture_site,tag_2,ccl_cm,ccw_cm,weight_kg,status,release_site,date_released,...,species_3,species_4,species_5,species_6,species_7,year,week_of_year,year_woy,cs_category,type
0,0,2000-12-22,CaptureSite_0,0,64.7,62.6,8.5,429,237,2000-12-22 00:00:00,...,False,False,False,True,False,2000,51,200051,CaptureSiteCategory_2,Type_1
1,1,2001-10-28,CaptureSite_0,0,35.85,31.35,8.5,429,250,2001-10-28 00:00:00,...,False,False,False,True,False,2001,43,200143,CaptureSiteCategory_2,Type_1
2,2,2001-11-01,CaptureSite_0,0,51.8,49.2,8.5,429,237,2001-01-11 00:00:00,...,False,False,True,False,False,2001,44,200144,CaptureSiteCategory_2,Type_1
3,3,2002-03-11,CaptureSite_0,0,60.5,59.0,8.5,429,237,2002-11-03 00:00:00,...,False,False,False,True,False,2002,11,200211,CaptureSiteCategory_2,Type_1
4,4,2002-08-08,CaptureSite_0,0,34.7,33.0,8.5,429,250,2002-08-08 00:00:00,...,False,False,True,False,False,2002,32,200232,CaptureSiteCategory_2,Type_1


In [125]:
df_1.shape

(18062, 41)

In [126]:
final_data=df_1.merge(target,on=["year_woy","capture_site"],how="left")

In [127]:
final_data.head()

Unnamed: 0.1,Unnamed: 0,date_caught,capture_site,tag_2,ccl_cm,ccw_cm,weight_kg,status,release_site,date_released,...,species_4,species_5,species_6,species_7,year,week_of_year,year_woy,cs_category,type,capture_number
0,0,2000-12-22,CaptureSite_0,0,64.7,62.6,8.5,429,237,2000-12-22 00:00:00,...,False,False,True,False,2000,51,200051,CaptureSiteCategory_2,Type_1,1
1,1,2001-10-28,CaptureSite_0,0,35.85,31.35,8.5,429,250,2001-10-28 00:00:00,...,False,False,True,False,2001,43,200143,CaptureSiteCategory_2,Type_1,1
2,2,2001-11-01,CaptureSite_0,0,51.8,49.2,8.5,429,237,2001-01-11 00:00:00,...,False,True,False,False,2001,44,200144,CaptureSiteCategory_2,Type_1,1
3,3,2002-03-11,CaptureSite_0,0,60.5,59.0,8.5,429,237,2002-11-03 00:00:00,...,False,False,True,False,2002,11,200211,CaptureSiteCategory_2,Type_1,1
4,4,2002-08-08,CaptureSite_0,0,34.7,33.0,8.5,429,250,2002-08-08 00:00:00,...,False,True,False,False,2002,32,200232,CaptureSiteCategory_2,Type_1,2


In [128]:
final_data.isnull().sum()

Unnamed: 0              0
date_caught             0
capture_site            0
tag_2                   0
ccl_cm                  0
ccw_cm                  0
weight_kg               0
status                  0
release_site            0
date_released           0
cm_beached              0
cm_by hand              0
cm_collected floater    0
cm_fish trap            0
cm_jarife               0
cm_longline             0
cm_net                  0
cm_not_recorded         0
cm_speargun             0
cm_stranded             0
cm_uzio                 0
foraging_ground_0       0
foraging_ground_1       0
cs_category_0           0
cs_category_1           0
cs_category_2           0
cs_category_3           0
cs_category_4           0
species_0               0
species_1               0
species_2               0
species_3               0
species_4               0
species_5               0
species_6               0
species_7               0
year                    0
week_of_year            0
year_woy    

In [129]:
final_data['capture_number'].value_counts()

capture_number
1     4136
2     3336
3     2577
4     1768
5     1290
7     1071
6     1008
8      584
9      531
10     450
11     308
14     210
12     204
13     182
15      90
16      80
17      51
35      35
28      28
23      23
22      22
21      21
20      20
19      19
18      18
Name: count, dtype: int64

In [130]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062 entries, 0 to 18061
Data columns (total 42 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Unnamed: 0            18062 non-null  int64         
 1   date_caught           18062 non-null  datetime64[ns]
 2   capture_site          18062 non-null  object        
 3   tag_2                 18062 non-null  int64         
 4   ccl_cm                18062 non-null  float64       
 5   ccw_cm                18062 non-null  float64       
 6   weight_kg             18062 non-null  float64       
 7   status                18062 non-null  int64         
 8   release_site          18062 non-null  int64         
 9   date_released         18062 non-null  object        
 10  cm_beached            18062 non-null  bool          
 11  cm_by hand            18062 non-null  bool          
 12  cm_collected floater  18062 non-null  bool          
 13  cm_fish trap    

In [131]:
final_data['capture_site'].nunique()

29

In [132]:
final_data['capture_site'] = LabelEncoder().fit_transform(final_data['capture_site'])

In [133]:
final_data['cs_category'].nunique()

5

In [134]:
final_data['cs_category'] = LabelEncoder().fit_transform(final_data['cs_category'])

In [135]:
final_data['type'].nunique()

2

In [136]:
final_data['type'] = LabelEncoder().fit_transform(final_data['type'])

In [137]:
del final_data['date_released'] 

In [138]:
final_data['capture_number']

0        1
1        1
2        1
3        1
4        2
        ..
18057    5
18058    5
18059    3
18060    3
18061    3
Name: capture_number, Length: 18062, dtype: int64

In [139]:
# Select X and y features
X = final_data.drop(['capture_number'], axis = 1)
y = final_data['capture_number']

In [140]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062 entries, 0 to 18061
Data columns (total 40 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Unnamed: 0            18062 non-null  int64         
 1   date_caught           18062 non-null  datetime64[ns]
 2   capture_site          18062 non-null  int64         
 3   tag_2                 18062 non-null  int64         
 4   ccl_cm                18062 non-null  float64       
 5   ccw_cm                18062 non-null  float64       
 6   weight_kg             18062 non-null  float64       
 7   status                18062 non-null  int64         
 8   release_site          18062 non-null  int64         
 9   cm_beached            18062 non-null  bool          
 10  cm_by hand            18062 non-null  bool          
 11  cm_collected floater  18062 non-null  bool          
 12  cm_fish trap          18062 non-null  bool          
 13  cm_jarife       

In [141]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)

# Check the shape of the data sets
print("X_train:", X_train.shape)  
print("y_train:", y_train.shape)  
print("X_test:", X_test.shape) 
print("y_test:", y_test.shape)   

X_train: (12643, 40)
y_train: (12643,)
X_test: (5419, 40)
y_test: (5419,)


In [142]:
final_data['capture_number'].value_counts()

capture_number
1     4136
2     3336
3     2577
4     1768
5     1290
7     1071
6     1008
8      584
9      531
10     450
11     308
14     210
12     204
13     182
15      90
16      80
17      51
35      35
28      28
23      23
22      22
21      21
20      20
19      19
18      18
Name: count, dtype: int64

In [143]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18062 entries, 0 to 18061
Data columns (total 41 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Unnamed: 0            18062 non-null  int64         
 1   date_caught           18062 non-null  datetime64[ns]
 2   capture_site          18062 non-null  int64         
 3   tag_2                 18062 non-null  int64         
 4   ccl_cm                18062 non-null  float64       
 5   ccw_cm                18062 non-null  float64       
 6   weight_kg             18062 non-null  float64       
 7   status                18062 non-null  int64         
 8   release_site          18062 non-null  int64         
 9   cm_beached            18062 non-null  bool          
 10  cm_by hand            18062 non-null  bool          
 11  cm_collected floater  18062 non-null  bool          
 12  cm_fish trap          18062 non-null  bool          
 13  cm_jarife       

In [None]:
final_data.drop(columns="data_caught")

In [144]:
# export data to csv, index to True 
final_data.to_csv('data/wrangled_data.csv', index=True)