In [37]:
import pandas as pd
import sklearn 
import numpy as np
from os.path import dirname

from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [38]:
#path = dirname(__file__)

data_all = pd.read_csv("data_jun_jul_aug_sep_oct.csv")

data_clean = data_all.drop(['trips'], axis=1)
#data_clean = data_clean[data_clean.iwrap_type_from_dataset != 'Other ship']
data_clean = data_clean.dropna() #drop nan values, our models can't handle this
data_processed = data_clean[data_clean.length_from_data_set < 400] #vessels that are over 400m do not exist
data_processed = data_processed[data_processed.Speed_max <65] #boats that go faster than that are helicopters
data_processed = data_processed[data_processed.Speed_max >0] #boats that stand still

data_processed = shuffle(data_processed) #shuffle the data to get different boats for limited versions
data_processed = data_processed.reset_index(drop =True) #reset index

types_of_vessels = data_processed.iwrap_type_from_dataset.value_counts()
print(types_of_vessels)

General cargo ship     14794
Pleasure boat          13297
Passenger ship          7044
Other ship              5011
Oil products tanker     4371
Support ship            2547
Fishing ship             734
Fast ferry               197
Name: iwrap_type_from_dataset, dtype: int64


In [52]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
#lb = preprocessing.MultiLabelBinarizer()

lb.fit(data_processed['iwrap_type_from_dataset'])
labels = lb.transform(data_processed['iwrap_type_from_dataset'])
X = data_processed[['Speed_mean', 'Speed_median', 'Speed_min', 'Speed_max', 'Speed_std', 'ROT_mean', 'ROT_median', 'ROT_min', 'ROT_max', 'ROT_std']]
y = lb.inverse_transform(labels)

In [53]:
y[:10]

array(['Pleasure boat', 'Pleasure boat', 'Pleasure boat',
       'Oil products tanker', 'Oil products tanker', 'Pleasure boat',
       'General cargo ship', 'Oil products tanker', 'General cargo ship',
       'General cargo ship'], dtype='<U19')

In [54]:
#split into test and train+val 
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2,  random_state=0, stratify = y)
# split train+validation set into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=1, stratify = y_trainval)

print("\nSize of training set: {}   size of validation set: {}   size of test set:" " {}\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

#Scaling the data
scaler = RobustScaler() #accounts for outliers
scaler.fit(X_trainval)
X_trainval_scaled = scaler.transform(X_trainval)
X_valid_scaled = scaler.transform(X_valid)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(lb.classes_)
#print(lb.y_type_)


Size of training set: 28797   size of validation set: 9599   size of test set: 9599

['Fast ferry' 'Fishing ship' 'General cargo ship' 'Oil products tanker'
 'Other ship' 'Passenger ship' 'Pleasure boat' 'Support ship']


In [55]:
logreg = LogisticRegression(solver = 'liblinear', multi_class='ovr')
logreg.fit(X_train_scaled, y_train)

print("Training set score: {:.3f}".format(logreg.score(X_train_scaled, y_train_new)))
print("Test set score: {:.3f}\n".format(logreg.score(X_test_scaled, y_test)))
y_pred = logreg.predict(X_test_scaled)

Training set score: 0.277
Test set score: 0.587



In [56]:
logreg = LogisticRegression(max_iter=10000,C=0.5)
# # Validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
corss_val_log = cross_val_score(logreg, X_valid_scaled, y_valid, cv=kfold)
print("Cross-validation scores for Logistic reg:\n{}".format(corss_val_log))
print("Mean cross validation for Logistic reg {:.3f}".format(np.mean(corss_val_log)))

# #Training and test
logreg.fit(X_train_scaled, y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train_scaled, y_train)))
print("Test set score: {:.3f}\n".format(logreg.score(X_test_scaled, y_test)))
y_pred = logreg.predict(X_test_scaled)
#Classification report
print(classification_report(y_test, y_pred, target_names=lb.classes_))
print('Precision is positive predictive value \nRecall is true positive rate or sensitivity \n')

Cross-validation scores for Logistic reg:
[0.59166667 0.59635417 0.58125    0.58802083 0.59614382]
Mean cross validation for Logistic reg 0.591
Training set score: 0.586
Test set score: 0.591

                     precision    recall  f1-score   support

         Fast ferry       0.00      0.00      0.00        39
       Fishing ship       0.00      0.00      0.00       147
 General cargo ship       0.58      0.89      0.70      2959
Oil products tanker       0.00      0.00      0.00       874
         Other ship       0.47      0.26      0.33      1002
     Passenger ship       0.41      0.22      0.29      1409
      Pleasure boat       0.66      0.92      0.77      2659
       Support ship       0.38      0.01      0.01       510

           accuracy                           0.59      9599
          macro avg       0.31      0.29      0.26      9599
       weighted avg       0.49      0.59      0.51      9599

Precision is positive predictive value 
Recall is true positive rate or 

  _warn_prf(average, modifier, msg_start, len(result))
