In [None]:
!pip install requests
!pip install pyquery
!pip install pandas
!pip install sklearn
!pip install pycountry
!pip install scipy



In [None]:
# Mount Google Drive at /content/drive; the dataset and the pickled artifacts
# used below all live under /content/drive/MyDrive/Shared.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

In [None]:
# Read the attacks CSV from the mounted Drive into a DataFrame.
datapath = '/content/drive/MyDrive/Shared/clean_pirate_attacks_dataset.csv'
data = pd.read_csv(datapath)
# Preview the first rows.
data.head()

Unnamed: 0,Ship name,IMO Number,Flag,Gross tonnage,Type of ship,Date,Time,Area,Latitude,Longitude,Place,Label
0,Vantage Wave,9506514.0,Liberia,22064.0,General cargo ship,2021-01-28,18:20,Malacca Strait,1° 16.87' N,104° 18.70' E,In international waters,1
1,Ark Royal,9219446.0,Barbados,27011.0,Bulk carrier,2021-01-25,03:25,South China Sea,1° 16.58' N,104° 15.70' E,In international waters,1
2,Maersk Cardiff,9529255.0,Singapore,50869.0,Container ship,2021-01-13,19:20,West Africa,2° 23.58' N,5° 31.00' E,In international waters,1
3,Myrto,9518086.0,Marshall Islands,43012.0,Bulk carrier,2021-01-14,08:49,Arabian Sea,12° 06.10' N,44° 26.50' E,In international waters,1
4,SBI Orion,9705330.0,Marshall Islands,36421.0,Bulk carrier,2021-01-03,03:30,South China Sea,0° 15.80' S,117° 34.30' E,In port area,1


In [None]:
# (rows, columns) of the frame as loaded.
data.shape

(10489, 12)

In [None]:
# 'IMO Number' is not used by any later cell, so remove that column.
data = data.drop(columns=['IMO Number'])

In [None]:
# Confirm the column count after dropping 'IMO Number'.
data.shape

(10489, 11)

In [None]:
# Drop every row that has a missing value in any remaining column.
# Note: the index is not reset, so row labels keep their original values.
data = data.dropna(axis=0)

In [None]:
# Class balance of the target column (used as `y` further down).
data['Label'].value_counts()

0    7004
1    3485
Name: Label, dtype: int64

In [None]:
# Frequency of each distinct value in 'Flag'.
data['Flag'].value_counts()

Panama                              1770
Liberia                             1212
Singapore                           1137
Marshall Islands                    1109
Hong Kong, China                     581
                                    ... 
Lebanon                                1
St. Pierre and Miquelon (France)       1
Mauritius                              1
Cameroon                               1
Sao Tome and Principe                  1
Name: Flag, Length: 144, dtype: int64

In [None]:
# Frequency of each distinct value in 'Type of ship'.
data['Type of ship'].value_counts()

Bulk carrier                      2001
Tanker                            1234
Container ship                    1158
Chemical tanker                    823
General cargo ship                 698
                                  ... 
Salvage search and rescue ship       1
Tug/Supply Ship (O.R.S.V.)           1
Dry Cargo ship                       1
Diving Support Vessel                1
Passenger  ship                      1
Name: Type of ship, Length: 195, dtype: int64

In [None]:
# Frequency of each distinct value in 'Area'.
data['Area'].value_counts()

South China Sea         2298
East Africa             1832
West Africa             1413
Indian Ocean            1234
North Atlantic Ocean    1092
Malacca Strait           920
Arabian Sea              468
Mediterranean Sea        330
South America (A)        252
South America (P)        234
South America (C)        220
Yellow Sea               109
Far East                  45
Persian Gulf              24
North Pacific Ocean       14
China Sea                  3
North Sea                  1
Name: Area, dtype: int64

In [None]:
# Length of the 'Gross tonnage' column, i.e. the number of rows left after dropna.
data['Gross tonnage'].shape

(10489,)

In [None]:
# Frequency of each distinct "HH:MM" string in 'Time'.
data['Time'].value_counts()

00:00    358
03:00    170
01:00    166
02:00    135
03:30    134
        ... 
17:33      1
06:51      1
00:32      1
00:54      1
09:27      1
Name: Time, Length: 1154, dtype: int64

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pickle
import numpy as np
import time 
import datetime 

# Bucket the hour part of "HH:MM" into 2-hour bins (hour // 2) so that time of
# day can be treated as a categorical feature.
data['ShortTime'] = data['Time'].apply(lambda x: int(x.split(':')[0])//2)
cat_cols = ['Flag', 'Type of ship', 'Area', 'Place', 'ShortTime']

# handle_unknown='ignore' keeps the encoding of every category seen here
# unchanged, but makes transform() emit an all-zero block for a value that was
# not present at fit time (e.g. a new flag or ship type at inference) instead
# of raising.
encoders = OneHotEncoder(handle_unknown='ignore')
encoders.fit(data[cat_cols])

# Persist the fitted encoder so the inference code can reproduce the same
# one-hot column layout.
with open('/content/drive/MyDrive/Shared/encoders.pickle', 'wb') as f:
    pickle.dump(encoders, f)

In [None]:
def get_timestamp(x):
    """Convert a "YYYY-MM-DD HH:MM" string into a scaled elapsed-time feature.

    The result is the number of seconds between 1990-01-01 00:00 and ``x``,
    divided by 1e9.

    The two instants are subtracted as naive datetimes instead of being passed
    through ``time.mktime``. ``mktime`` interprets its argument in the
    machine's local timezone, so on a host with daylight saving time the
    result would shift by the DST offset for part of the year. Direct
    subtraction gives the same value on every machine (and the same value
    ``mktime`` gives on a UTC host).

    Args:
        x: date-time string in the format "%Y-%m-%d %H:%M".

    Returns:
        float: elapsed seconds since 1990-01-01 00:00, divided by 1e9.

    Raises:
        ValueError: if ``x`` does not match the expected format; the message
            includes the offending value.
    """
    try:
        moment = datetime.datetime.strptime(x, "%Y-%m-%d %H:%M")
    except ValueError as e:
        # Name the bad value in the error so a failing row is easy to locate.
        raise ValueError(f"cannot parse timestamp {x!r}: {e}") from e
    origin = datetime.datetime(1990, 1, 1)
    return (moment - origin).total_seconds() / 1e9

# Join the date and time strings as "<Date> <Time>" and convert each joined
# string to the scaled numeric feature in one step.
data['Timestamp'] = (data['Date'] + ' ' + data['Time']).apply(get_timestamp)
data.head()

Unnamed: 0,Ship name,Flag,Gross tonnage,Type of ship,Date,Time,Area,Latitude,Longitude,Place,Label,ShortTime,Timestamp
0,Vantage Wave,Liberia,22064.0,General cargo ship,2021-01-28,18:20,Malacca Strait,1° 16.87' N,104° 18.70' E,In international waters,1,9,0.980706
1,Ark Royal,Barbados,27011.0,Bulk carrier,2021-01-25,03:25,South China Sea,1° 16.58' N,104° 15.70' E,In international waters,1,1,0.980393
2,Maersk Cardiff,Singapore,50869.0,Container ship,2021-01-13,19:20,West Africa,2° 23.58' N,5° 31.00' E,In international waters,1,9,0.979414
3,Myrto,Marshall Islands,43012.0,Bulk carrier,2021-01-14,08:49,Arabian Sea,12° 06.10' N,44° 26.50' E,In international waters,1,4,0.979462
4,SBI Orion,Marshall Islands,36421.0,Bulk carrier,2021-01-03,03:30,South China Sea,0° 15.80' S,117° 34.30' E,In port area,1,1,0.978493


In [None]:
def extract_features(encoders, data):
    """Assemble the dense feature matrix for a frame of incidents.

    Columns, left to right:
      * the dense one-hot block returned by ``encoders.transform`` for the
        'Flag', 'Type of ship', 'Area', 'Place' and 'ShortTime' columns,
      * 'Gross tonnage' divided by 10000,
      * 'Timestamp' as-is.

    Returns a 2-D numpy array with one row per row of ``data``.
    """
    categorical = ['Flag', 'Type of ship', 'Area', 'Place', 'ShortTime']
    one_hot = encoders.transform(data[categorical]).toarray()
    # Each numeric feature becomes a single column so it can be stacked
    # beside the one-hot block.
    tonnage = (data['Gross tonnage'] / 10000).to_numpy().reshape(-1, 1)
    timestamp = data['Timestamp'].to_numpy().reshape(-1, 1)
    return np.hstack([one_hot, tonnage, timestamp])

# Build the feature matrix for the whole frame and print its shape and contents.
X = extract_features(encoders, data)
print(X.shape)
print(X)

(10489, 373)
[[0.         0.         0.         ... 0.         2.2064     0.980706  ]
 [0.         0.         0.         ... 0.         2.7011     0.9803931 ]
 [0.         0.         0.         ... 0.         5.0869     0.9794136 ]
 ...
 [0.         0.         0.         ... 0.         1.3881     0.41395008]
 [0.         0.         0.         ... 0.         1.0941     0.37391376]
 [0.         0.         0.         ... 0.         2.455      0.37283808]]


In [None]:
# Target vector: the 'Label' column as a numpy array, row-aligned with X.
y = data['Label'].to_numpy()

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, \
    cross_val_score, train_test_split

# 80/20 train/test split; the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=12
)

In [None]:
# Check the sizes of the four arrays produced by the split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((8391, 373), (8391,), (2098, 373), (2098,))

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression

# L2-regularised logistic regression fitted on the training split.
# max_iter is set very high, presumably so lbfgs is not stopped before it
# converges on these unscaled features.
lr = LogisticRegression(penalty='l2', random_state=12, 
                        solver='lbfgs',max_iter=100000)
lr.fit(X_train, y_train)
# Persist the fitted model so it can be reloaded for single-sample inference.
with open('/content/drive/MyDrive/Shared/risk_assessment_model.pickle', 'wb') as f:
    pickle.dump(lr, f)

In [None]:
# Hard class predictions for the held-out rows.
y_pred = lr.predict(X_test)
print(y_pred)

[0 0 0 ... 0 0 0]


In [None]:
from sklearn.metrics import accuracy_score, \
    classification_report, confusion_matrix
    
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.6487130600571973

In [None]:
# Confusion matrix on the held-out rows (rows: true label, columns: predicted label).
print(confusion_matrix(y_test, y_pred))

[[1180  220]
 [ 517  181]]


In [None]:
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.84      0.76      1400
           1       0.45      0.26      0.33       698

    accuracy                           0.65      2098
   macro avg       0.57      0.55      0.55      2098
weighted avg       0.61      0.65      0.62      2098



In [None]:
# Class probabilities for the held-out rows; columns follow lr.classes_.
y_proba = lr.predict_proba(X_test)
# Show only the first ten rows.
y_proba[:10]

array([[0.61208083, 0.38791917],
       [0.58265499, 0.41734501],
       [0.97814205, 0.02185795],
       [0.67981837, 0.32018163],
       [0.98315711, 0.01684289],
       [0.95845795, 0.04154205],
       [0.50556999, 0.49443001],
       [0.74350476, 0.25649524],
       [0.97944086, 0.02055914],
       [0.59859826, 0.40140174]])

In [None]:
# Display the frame, which now includes the derived ShortTime and Timestamp columns.
data

Unnamed: 0,Ship name,Flag,Gross tonnage,Type of ship,Date,Time,Area,Latitude,Longitude,Place,Label,ShortTime,Timestamp
0,Vantage Wave,Liberia,22064.0,General cargo ship,2021-01-28,18:20,Malacca Strait,1° 16.87' N,104° 18.70' E,In international waters,1,9,0.980706
1,Ark Royal,Barbados,27011.0,Bulk carrier,2021-01-25,03:25,South China Sea,1° 16.58' N,104° 15.70' E,In international waters,1,1,0.980393
2,Maersk Cardiff,Singapore,50869.0,Container ship,2021-01-13,19:20,West Africa,2° 23.58' N,5° 31.00' E,In international waters,1,9,0.979414
3,Myrto,Marshall Islands,43012.0,Bulk carrier,2021-01-14,08:49,Arabian Sea,12° 06.10' N,44° 26.50' E,In international waters,1,4,0.979462
4,SBI Orion,Marshall Islands,36421.0,Bulk carrier,2021-01-03,03:30,South China Sea,0° 15.80' S,117° 34.30' E,In port area,1,1,0.978493
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10484,GLORY,Democratic People's Republic of Korea,8842.0,General cargo ship,2006-04-03,08:43,Indian Ocean,30° 06.74' N,47° 55.90' E,In port area,0,4,0.512902
10485,REGAL STAR,Saint Vincent and the Grenadines,12539.0,Bulk carrier,2006-04-01,20:08,East Africa,13° 32.70' N,49° 32.80' E,In international waters,0,10,0.512770
10486,HANDY RAINBOW,"Hong Kong, China",13881.0,Bulk carrier,2003-02-13,02:08,South China Sea,3° 20.00' N,111° 25.00' E,In territorial waters,0,1,0.413950
10487,SHA HE KOU,China,10941.0,General cargo ship,2001-11-06,16:56,South China Sea,11° 31.00' N,110° 42.00' E,In territorial waters,0,8,0.373914


In [None]:
# Take the first row by position. `.loc[0, :]` would look the row up by index
# label, and labels are not renumbered after the earlier dropna(), so label 0
# only exists if the first CSV row survived filtering.
sample = data.iloc[0].to_dict()
sample

{'Area': 'Malacca Strait',
 'Date': '2021-01-28',
 'Flag': 'Liberia',
 'Gross tonnage': 22064.0,
 'Label': 1,
 'Latitude': "1° 16.87' N",
 'Longitude': "104° 18.70' E",
 'Place': 'In international waters',
 'Ship name': 'Vantage Wave',
 'ShortTime': 9,
 'Time': '18:20',
 'Timestamp': 0.980706,
 'Type of ship': 'General cargo ship'}

In [None]:
encoders_path = '/content/drive/MyDrive/Shared/encoders.pickle'
model_path = '/content/drive/MyDrive/Shared/risk_assessment_model.pickle'

# NOTE: unpickling can execute arbitrary code from the file, so only load
# pickles that were written by a trusted source.
# The `with` blocks close each file handle as soon as loading finishes.
with open(encoders_path, 'rb') as f:
    encoders = pickle.load(f)
with open(model_path, 'rb') as f:
    model = pickle.load(f)

def get_timestamp(x):
    """Convert a "YYYY-MM-DD HH:MM" string into a scaled elapsed-time feature.

    The result is the number of seconds between 1990-01-01 00:00 and ``x``,
    divided by 1e9. This must produce the same values as the conversion used
    to build the training 'Timestamp' column.

    The two instants are subtracted as naive datetimes instead of being passed
    through ``time.mktime``. ``mktime`` interprets its argument in the
    machine's local timezone, so the result would depend on the host's
    timezone and DST rules. Direct subtraction gives the same value on every
    machine (and the same value ``mktime`` gives on a UTC host).

    Args:
        x: date-time string in the format "%Y-%m-%d %H:%M".

    Returns:
        float: elapsed seconds since 1990-01-01 00:00, divided by 1e9.

    Raises:
        ValueError: if ``x`` does not match the expected format; the message
            includes the offending value.
    """
    try:
        moment = datetime.datetime.strptime(x, "%Y-%m-%d %H:%M")
    except ValueError as e:
        # Name the bad value in the error so the failing input is easy to spot.
        raise ValueError(f"cannot parse timestamp {x!r}: {e}") from e
    origin = datetime.datetime(1990, 1, 1)
    return (moment - origin).total_seconds() / 1e9

def extract_sample_features(encoders, sample):
    """Build the 1-row feature matrix for a single incident given as a dict.

    ``sample`` must provide 'Date', 'Time', 'Gross tonnage' and the
    categorical keys 'Flag', 'Type of ship', 'Area' and 'Place'. Any
    'Timestamp' or 'ShortTime' entries already present are ignored and
    recomputed from 'Date' and 'Time'.

    Columns, left to right: the dense one-hot block from
    ``encoders.transform``, 'Gross tonnage' / 10000, then the timestamp
    feature from ``get_timestamp``.

    The caller's dict is not modified.

    Returns:
        numpy array of shape (1, n_features).
    """
    # Work on a shallow copy so the derived 'Timestamp' is not written back
    # into the dict the caller passed in.
    row = dict(sample)
    row['Timestamp'] = get_timestamp(row['Date'] + ' ' + row['Time'])

    df = pd.DataFrame([row])
    # Same 2-hour bucketing of the hour as used when the encoder was fitted.
    df['ShortTime'] = df['Time'].apply(lambda x: int(x.split(':')[0])//2)
    cat_cols = ['Flag', 'Type of ship', 'Area', 'Place', 'ShortTime']

    one_hot = encoders.transform(df[cat_cols]).toarray()
    tonnage = np.array(row['Gross tonnage'] / 10000).reshape(1, 1)
    timestamp = df['Timestamp'].to_numpy().reshape(df['Timestamp'].shape[0], 1)

    return np.hstack([one_hot, tonnage, timestamp])

# Class probabilities for the single sample; columns follow model.classes_.
model.predict_proba(extract_sample_features(encoders, sample))

array([[0.29824376, 0.70175624]])