In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

In [3]:
# Default size (width, height in inches) for every matplotlib figure in this notebook.
plt.rc('figure', figsize=(15, 8))

In [5]:
# Load the preprocessed observations, then discard the "Unnamed: 0" column
# (presumably an index written out by an earlier to_csv call — confirm).
use_db = pd.read_csv("output/db_obs_preproc.csv")
use_db = use_db.drop(columns="Unnamed: 0")

## Simple classification 
Considering a mortality threshold of 1 mg/L

In [6]:
# Feature matrix: select the six predictor columns from use_db and one-hot
# encode the categorical ones in a single expression.
X = pd.get_dummies(
    use_db.loc[:, [
        'test_cas',
        'exposure_type',
        'obs_duration_mean',
        'conc1_type',
        'species',
        'genus',
    ]]
)

Setting threshold

In [7]:
# Binary target: 1 when conc1_mean is above the 1 mg/L threshold, 0 otherwise.
# The target is read from `use_db`, the same frame X is built from, so the rows
# of X and y stay aligned. The previous `base_db` name is not defined anywhere
# in the visible notebook and raises NameError on a fresh kernel.
y = use_db[["conc1_mean"]].copy().values
y = np.where(y > 1, 1, 0)

Checking whether the two classes are imbalanced:

In [172]:
# Number of samples labelled 0 and labelled 1, in that order.
print(np.count_nonzero(y == 0), np.count_nonzero(y == 1))

23412 33217


### KNN

In [9]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [10]:
# K-nearest-neighbours with default hyper-parameters. fit() returns the
# estimator itself, so construction and fitting can be chained.
neigh = KNeighborsClassifier().fit(X_train, np.ravel(y_train))
y_pred = neigh.predict(X_test)

In [11]:
# Fraction of held-out rows whose predicted class equals the true class.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8902504280821918

### Random Forest

In [12]:
# Random forest with 100 trees. The forest is a stochastic estimator, so a
# fixed random_state (same seed as the train/test split) makes the reported
# accuracy reproducible across kernel restarts.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
# ravel() flattens the (n, 1) target into the 1-D shape the classifier expects.
forest.fit(X_train, y_train.ravel())
y_pred = forest.predict(X_test)

In [13]:
# Test-set accuracy of the predictions currently stored in y_pred.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.834064640410959

### Gradient boosting

In [14]:
# Gradient-boosted trees with default hyper-parameters. random_state seeds the
# per-tree randomness (including the feature permutation at each split), so the
# score is repeatable; the seed matches the one used for the train/test split.
grboost = GradientBoostingClassifier(random_state=42)
# ravel() flattens the (n, 1) target into the 1-D shape the classifier expects.
grboost.fit(X_train, y_train.ravel())
y_pred = grboost.predict(X_test)

In [15]:
# Test-set accuracy of the predictions currently stored in y_pred.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7498929794520548

### CV on KNN (best result so far)

In [16]:
# 5-fold cross-validated search over n_neighbors = 1..14, scored by accuracy.
# n_jobs=-2 runs the fits in parallel on all CPUs but one.
grid = GridSearchCV(
    estimator=neigh,
    param_grid={"n_neighbors": list(range(1, 15))},
    scoring="accuracy",
    cv=5,
    verbose=1,
    n_jobs=-2,
)
# Trailing semicolon suppresses the estimator repr in the cell output.
grid.fit(X_train, y_train.ravel());

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-2)]: Done  70 out of  70 | elapsed:  1.7min finished


In [17]:
# Hyper-parameter setting with the highest mean cross-validated accuracy.
grid.best_params_

{'n_neighbors': 1}

The best result is **k=1**

In [21]:
# Mean cross-validated accuracy obtained with the best n_neighbors value.
grid.best_score_

0.9015576816636356

In [22]:
# GridSearchCV refits the best configuration on the full training set
# (refit=True is the default); predict with that refitted estimator.
y_pred = grid.best_estimator_.predict(X_test)

In [23]:
# Test-set accuracy of the predictions currently stored in y_pred.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9041630993150684

#### 90% accuracy with KNN, k=1, threshold 1 mg/L

## Simple regression

In [24]:
# Start from an independent (deep) copy so use_db itself is never modified.
X = use_db.copy(deep=True)

In [25]:
# Keep only the six predictor columns (all rows).
X = X.loc[:, [
    'test_cas',
    'exposure_type',
    'obs_duration_mean',
    'conc1_type',
    'species',
    'genus',
]]

Categorical variables with get_dummies:

In [26]:
# One-hot encode the categorical predictors; numeric columns pass through unchanged.
X = pd.get_dummies(data=X)

In [27]:
# Regression target: the raw conc1_mean values, kept as a one-column DataFrame.
# The target is read from `use_db`, the same frame X is built from, so the rows
# of X and y stay aligned. The previous `base_db` name is not defined anywhere
# in the visible notebook and raises NameError on a fresh kernel.
y = use_db[["conc1_mean"]].copy()

CV on ridge

In [43]:
# Same 67/33 split and seed as in the classification section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Ridge regression with built-in cross-validation over 13 log-spaced
# penalties from 1e-6 to 1e6.
reg = RidgeCV(alphas=np.logspace(start=-6, stop=6, num=13))
# Trailing semicolon suppresses the estimator repr in the cell output.
reg.fit(X=X_train, y=y_train);

In [44]:
# Regularisation strength selected by RidgeCV's internal cross-validation.
reg.alpha_

100.0

In [45]:
# Predict the held-out rows with the fitted ridge model.
y_pred = reg.predict(X_test)

In [46]:
# Mean squared error on the held-out rows (in squared units of conc1_mean).
mean_squared_error(y_true=y_test, y_pred=y_pred)

221955983.9236199

In [47]:
# Training-set error, for comparison with the test-set error.
y_pred_train = reg.predict(X_train)
mean_squared_error(y_true=y_train, y_pred=y_pred_train)

282426813.18152875

As expected, **the loss is huge**