In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Default size (width, height) for every matplotlib figure in this notebook.
plt.rcParams.update({'figure.figsize': (15, 8)})

In [4]:
# Load the pre-processed observations, then discard the "Unnamed: 0" column
# (presumably an index that was written out with the CSV - confirm).
use_db = pd.read_csv("output/db_obs_preproc.csv")
use_db = use_db.drop(columns=["Unnamed: 0"])

## Simple classification 
Considering a mortality threshold of 1 mg/L

In [5]:
# Preview the loaded table; pandas truncates the display of long frames.
use_db

Unnamed: 0,test_cas,exposure_type,obs_duration_mean,conc1_type,conc1_mean,class,tax_order,family,genus,species
0,10108642,S,0.032948,T,46.900000,Actinopterygii,Cypriniformes,Cyprinidae,Carassius,auratus
1,88302,S,-2.148755,A,2.920000,Actinopterygii,Salmoniformes,Salmonidae,Oncorhynchus,mykiss
2,1397940,S,0.798587,A,0.000063,Actinopterygii,Perciformes,Centrarchidae,Micropterus,salmoides
3,25474413,S,0.798587,A,0.012000,Actinopterygii,Perciformes,Centrarchidae,Lepomis,macrochirus
4,540727,F,-0.281992,T,144.000000,Actinopterygii,Salmoniformes,Salmonidae,Oncorhynchus,mykiss
...,...,...,...,...,...,...,...,...,...,...
56624,7646857,F,-0.721540,T,3.870000,Actinopterygii,Siluriformes,Clariidae,Clarias,gariepinus
56625,333415,AQUA,0.032948,F,1.800000,Actinopterygii,Cypriniformes,Cyprinidae,Cyprinus,carpio
56626,52645531,S,0.798587,A,0.007200,Actinopterygii,Perciformes,Centrarchidae,Lepomis,macrochirus
56627,2545600,R,-0.721540,A,17.500000,Actinopterygii,Salmoniformes,Salmonidae,Oncorhynchus,kisutch


# 1a
Let's start simply, using `pd.get_dummies`.

In [28]:
# Feature matrix 1a: six selected columns passed through pd.get_dummies.
# get_dummies returns a new frame, so use_db itself is left untouched.
# NOTE(review): by default get_dummies only encodes object/category columns;
# if test_cas is read from the CSV as an integer it stays a plain numeric
# feature here - confirm that this is intended.
X = pd.get_dummies(
    use_db.loc[:, [
        'test_cas',
        'exposure_type',
        'obs_duration_mean',
        'conc1_type',
        'species',
        'genus',
    ]]
)

# 1b
Let's again use `pd.get_dummies`, but keep only the categorical variables.

In [15]:
# Feature matrix 1b: same as 1a but without obs_duration_mean.
# get_dummies returns a new frame, so use_db itself is left untouched.
# NOTE(review): by default get_dummies only encodes object/category columns;
# if test_cas is read from the CSV as an integer it stays a plain numeric
# feature here - confirm that this is intended.
X = pd.get_dummies(
    use_db.loc[:, [
        'test_cas',
        'exposure_type',
        'conc1_type',
        'species',
        'genus',
    ]]
)

# 2
Let's try another encoding (here we have to specify the categorical columns explicitly).

In [8]:
# Feature matrix 2: obs_duration_mean kept as-is, the categorical columns
# replaced by integer codes from an OrdinalEncoder.
categorical_cols = ['test_cas', 'exposure_type', 'conc1_type', 'species', 'genus']

# Copy *after* selecting the columns: X then owns its data, so the column
# assignment below cannot raise SettingWithCopyWarning or touch use_db.
X = use_db[[
 'test_cas',
 'exposure_type',
 'obs_duration_mean',
 'conc1_type',
 'species',
 'genus']].copy()

enc = OrdinalEncoder(dtype=int)
# fit_transform is fit + transform in one step; "+ 1" shifts the encoder's
# 0-based codes so that they start at 1.
X[categorical_cols] = enc.fit_transform(X[categorical_cols]) + 1

# 3
And what about keeping only the categorical variables?

In [20]:
# Feature matrix 3: only the categorical columns, replaced by integer codes
# from an OrdinalEncoder.
categorical_cols = ['test_cas', 'exposure_type', 'conc1_type', 'species', 'genus']

# Copy *after* selecting the columns: X then owns its data, so the column
# assignment below cannot raise SettingWithCopyWarning or touch use_db.
X = use_db[categorical_cols].copy()

enc = OrdinalEncoder(dtype=int)
# fit_transform is fit + transform in one step; "+ 1" shifts the encoder's
# 0-based codes so that they start at 1.
X[categorical_cols] = enc.fit_transform(X[categorical_cols]) + 1

Setting threshold

In [21]:
# Binary target as an (n_rows, 1) array:
# 1 where conc1_mean is strictly greater than 1, otherwise 0.
y = np.where(use_db[["conc1_mean"]].to_numpy() > 1, 1, 0)

Checking whether the two classes are balanced:

In [22]:
# Number of samples labelled 0 and labelled 1, in that order.
print(np.count_nonzero(y == 0), np.count_nonzero(y == 1))

23412 33217


### KNN

In [29]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [34]:
# k-nearest-neighbours baseline with the estimator's default settings
# (a metric="hamming" variant is noted in the original but not enabled).
# fit() returns the estimator itself, so the call can be chained.
neigh = KNeighborsClassifier().fit(X_train, np.ravel(y_train))
y_pred = neigh.predict(X_test)

In [33]:
# Share of held-out samples whose predicted class matches the true class.
accuracy_score(y_test, y_pred)

0.890892551369863

### Random Forest

In [66]:
# Random forest with 100 trees. The forest is built with randomness (bootstrap
# samples, feature sub-sampling), so the seed is fixed - using the same value
# as the train/test split - to make the reported accuracy reproducible.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X_train, y_train.ravel())
y_pred = forest.predict(X_test)

In [67]:
# Share of held-out samples whose predicted class matches the true class.
accuracy_score(y_test, y_pred)

0.8318707191780822

### Gradient boosting

In [14]:
# Gradient boosting with default hyper-parameters. The seed is fixed - using
# the same value as the train/test split - so repeated runs give the same model.
grboost = GradientBoostingClassifier(random_state=42)
grboost.fit(X_train, y_train.ravel())
y_pred = grboost.predict(X_test)

In [15]:
# Share of held-out samples whose predicted class matches the true class.
accuracy_score(y_test, y_pred)

0.7498929794520548

### CV on KNN (best result so far)

In [16]:
# 5-fold cross-validated search over n_neighbors = 1..14 for the KNN model,
# scored by accuracy.
grid = GridSearchCV(
    neigh,
    param_grid={"n_neighbors": list(range(1, 15))},
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-2,
)
grid.fit(X_train, y_train.ravel());

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-2)]: Done  70 out of  70 | elapsed:  1.7min finished


In [17]:
# Parameter setting with the best mean cross-validated score.
grid.best_params_

{'n_neighbors': 1}

The best result is obtained with **k=1**

In [21]:
# Mean cross-validated accuracy obtained with the best parameter setting.
grid.best_score_

0.9015576816636356

In [22]:
# Predict the held-out set through the fitted grid search object.
y_pred = grid.predict(X_test)

In [23]:
# Share of held-out samples whose predicted class matches the true class.
accuracy_score(y_test, y_pred)

0.9041630993150684

#### 90% accuracy with KNN, k=1, threshold 1 mg/L

## Simple regression

In [24]:
# Start the regression features from a fresh copy so use_db is left untouched.
X = use_db.copy()

In [25]:
# Keep only the six feature columns used by the regression.
X = X.loc[:, [
    'test_cas',
    'exposure_type',
    'obs_duration_mean',
    'conc1_type',
    'species',
    'genus',
]]

Categorical variables with get_dummies:

In [26]:
# NOTE(review): by default get_dummies only encodes object/category columns;
# if test_cas is an integer column it stays a plain numeric feature - confirm.
X = pd.get_dummies(X)

In [27]:
# Regression target: the raw conc1_mean values, kept as a one-column DataFrame.
# Taken from use_db, the frame X was built from, so rows of X and y line up;
# the previous `base_db` is never defined in this notebook and raised a
# NameError on a fresh kernel.
y = use_db[["conc1_mean"]].copy()

CV on ridge

In [43]:
# Same 67/33 split and seed as in the classification part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Ridge regression that picks its regularisation strength by cross-validation
# among 13 log-spaced candidates from 1e-6 to 1e6. fit() returns the estimator.
reg = RidgeCV(alphas=np.logspace(-6, 6, 13)).fit(X_train, y_train);

In [44]:
# Regularisation strength selected by RidgeCV.
reg.alpha_

100.0

In [45]:
# Predictions of the fitted ridge model on the held-out set.
y_pred = reg.predict(X_test)

In [46]:
# Mean squared error on the held-out set.
mean_squared_error(y_test, y_pred)

221955983.9236199

In [47]:
# Mean squared error on the training set, for comparison with the test error.
y_pred_train = reg.predict(X_train)
mean_squared_error(y_true=y_train, y_pred=y_pred_train)

282426813.18152875

As expected, **the loss is huge**