# To-Do
- pipreqs to update requirements.txt
- naive bayes outperforming catboost
- cast features as category dtype before training catboost
- engineer categorical features from 'number_of_outings_in_last_year' and 'trip_fishing_effort_hours'
- re-evaluate models after adding new features -> look at feature importance on catboost model

In [30]:
import duckdb
from pathlib import Path
from catboost import CatBoostClassifier, Pool
from catboost.utils import get_confusion_matrix
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

In [8]:
#constants
SCHEMA = 'analytics'
DUCKDB_PATH = str(Path().resolve().parent / "data/noaa_dw.duckdb")

In [9]:
#Database Connection and Query
with duckdb.connect(DUCKDB_PATH) as con:
    query = f"""
            SELECT
            fish_caught_time_of_day,
            trip_month_name,
            us_region,
            nautical_zone,
            fishing_method_collapsed,
            --number_of_outings_in_last_year,
            --trip_fishing_effort_hours,
            caught
            FROM
            {SCHEMA}.trip_details
            """
    df = con.sql(query).df() #materialize into pandas dataframe

In [10]:
#Preprocessing
df = df.dropna() #drop any rows that have NaNs

In [11]:
categorical_features = ['fish_caught_time_of_day',
                        'trip_month_name',
                        'us_region',
                        'nautical_zone',
                        'fishing_method_collapsed']

In [None]:
#Define Features and Label
X = df.drop(labels='caught', axis=1)
y = df['caught']

#Encoding
encoder = OrdinalEncoder()
X = encoder.fit_transform(X)

In [5]:
#Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#Baseline Model - Categorical Naive Bayes
nb_clf = CategoricalNB()
nb_clf.fit(X_train, y_train)
y_pred = nb_clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))

In [31]:
#Alternate Model - CatBoost
#Define Features and Label
X = df.drop(labels='caught', axis=1)
y = df['caught']

#Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = CatBoostClassifier(iterations=2,
                           depth=2,
                           learning_rate=1,
                           loss_function='Logloss',
                           verbose=True)

model.fit(X_train, y_train)
print(model.feature_names_)
print(model.get_feature_importance())
print(model.best_score_)
print(model.score(X_test, y_test))
print(get_confusion_matrix(model, Pool(X_train, y_train)))


0:	learn: 0.6007155	total: 15.5ms	remaining: 15.5ms
1:	learn: 0.5862046	total: 31.8ms	remaining: 0us
['fish_caught_time_of_day', 'trip_month_name', 'us_region', 'nautical_zone', 'fishing_method_collapsed']
[ 0.          0.         22.33486244  0.         77.66513756]
{'learn': {'Logloss': 0.5862046213959327}}
0.6874655389309708
[[378314.  79817.]
 [153457. 135629.]]
