In [47]:
import pandas as pd
import geopandas as gpd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.preprocessing import scale
from matplotlib import pyplot as plt
import  numpy as np
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
%matplotlib inline

In [2]:
# feature extraction functions
def get_mean_meeting_time(meeting):
    meeting_mean = meetings.groupby('ves_id2')['duration_min'].mean().reset_index()
    meeting_mean.columns = ['vessel_id', 'meeting_duration_min']
    return meeting_mean

def get_num_visited_ports(ports):
    vessels_ports = ports.groupby(by='vessel_id')['port_id'].unique().reset_index()
    vessels_ports['num_ports'] = vessels_ports['port_id'].apply(lambda x: len(x))
    return vessels_ports

def get_mean_duration_in_ports(ports):
    mean_duration = ports.groupby(by='vessel_id')['duration_min'].mean().reset_index()
    mean_duration.columns = ['vessel_id', 'durat_mean']
    return mean_duration

def get_num_country_visited(ports):
    num_country_visited = ports.groupby(by='vessel_id')['country'].count().reset_index()
    num_country_visited.columns = ['vessel_id', 'num_countries']
    return num_country_visited

In [3]:
labels = pd.read_csv('./input/vessels_labels_train.csv')
ports = pd.read_csv('./input/port_visits_train.csv')
meetings  = pd.read_csv('./input/meetings_train.csv')
meeting_one_hot = pd.read_csv('./input/meeting_with_whom_num.csv')
countries = pd.read_csv('./input/meeting_countries_num.csv')

In [57]:
# data is unbalanced!
labels['type'].value_counts()

Bulk Carrier        7404
Container Vessel    4057
Tug                 3419
Oil Tanker          2981
Fishing Vessel      2200
Passenger Vessel    1038
Reefer               543
Name: type, dtype: int64

In [4]:
meeting_one_hot.drop('Unnamed: 0', inplace=True, axis=1)
countries.drop(['Unnamed: 0', 'Unnamed: 5'], axis=1, inplace=True)

In [67]:
vassels_types = labels['type']
types = vassels_types.unique()
types2values = dict()
values2dict = dict()
for i, t in enumerate(types):
    types2values[t] = i
    values2dict[i] = t

In [60]:
types2values

{'Bulk Carrier': 1,
 'Container Vessel': 0,
 'Fishing Vessel': 5,
 'Oil Tanker': 2,
 'Passenger Vessel': 6,
 'Reefer': 4,
 'Tug': 3}

In [61]:
labels['num_types'] = labels['type'].apply(lambda x: types2values[x])

In [8]:
ports.columns = ['vessel_id', 'start_time', 'duration_min', 'port_id', 'country', 'Lat',
       'Long', 'port_name'] 

In [9]:
ports_full = pd.merge(ports, labels, left_on='vessel_id', right_on='vessel_id')

In [10]:
vessels_ports = get_num_visited_ports(ports_full)

In [11]:
vessels_ports = pd.merge(vessels_ports, labels, left_on='vessel_id', right_on='vessel_id')

In [12]:
vessels_ports.shape

(21122, 5)

In [13]:
mean_duration_in_ports = get_mean_duration_in_ports(ports_full)
vessels_ports = pd.merge(vessels_ports, mean_duration_in_ports, left_on='vessel_id', right_on='vessel_id')

In [14]:
vessels_ports.shape

(21122, 6)

In [15]:
meeting_mean = get_mean_meeting_time(meetings)
vessels_ports = pd.merge(vessels_ports, meeting_mean, left_on='vessel_id', right_on='vessel_id', how='outer')

In [16]:
vessels_ports.shape

(42823, 7)

In [17]:
vessels_ports.head()

Unnamed: 0,vessel_id,port_id,num_ports,type,num_types,durat_mean,meeting_duration_min
0,56d83489e4b01c7098934f48,"[54880c9b2f16092e9efe8a57, 5358fc77b68ca120a07...",7.0,Bulk Carrier,1.0,7532.875,
1,56d83489e4b01c7098934f4f,"[5358fc78b68ca120a07dbbed, 5358fc78b68ca120a07...",3.0,Fishing Vessel,5.0,4490.444444,93.5
2,56d83489e4b01c7098934f50,"[5358fc77b68ca120a07dae0b, 5358fc78b68ca120a07...",11.0,Bulk Carrier,1.0,5388.75,
3,56d83489e4b01c7098934f51,"[5358fc78b68ca120a07db895, 5358fc78b68ca120a07...",14.0,Bulk Carrier,1.0,469.272727,
4,56d83489e4b025a920ad5e6e,"[5358fc78b68ca120a07dbcd9, 5358fc77b68ca120a07...",6.0,Bulk Carrier,1.0,9145.833333,280.25


In [18]:
num_country_visited = get_num_country_visited(ports_full)
vessels_ports = pd.merge(vessels_ports, num_country_visited, left_on='vessel_id', right_on='vessel_id', how='outer')

In [19]:
vessels_ports.shape

(42823, 8)

In [20]:
vessels_ports = pd.merge(vessels_ports, meeting_one_hot, left_on='vessel_id', right_on='vessel_id', how='outer')

In [21]:
vessels_ports = pd.merge(vessels_ports, countries, left_on='vessel_id', right_on='vessel_id', how='outer')
vessels_ports.shape

(42823, 171)

In [22]:
vessels_ports.shape

(42823, 171)

In [23]:
vessels_ports = vessels_ports[pd.notnull(vessels_ports['port_id'])]
vessels_ports.shape

(21122, 171)

In [24]:
vessels_ports = vessels_ports.drop('type', axis=1)

In [25]:
vessels_ports.head()

Unnamed: 0,vessel_id,port_id,num_ports,num_types,durat_mean,meeting_duration_min,num_countries,type_0,type_1,type_2,...,libya,italy,slovenia,namibia,newzealand,mauritius,angola,oman,solomonislands,senegal
0,56d83489e4b01c7098934f48,"[54880c9b2f16092e9efe8a57, 5358fc77b68ca120a07...",7.0,1.0,7532.875,,8.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,56d83489e4b01c7098934f4f,"[5358fc78b68ca120a07dbbed, 5358fc78b68ca120a07...",3.0,5.0,4490.444444,93.5,9.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,56d83489e4b01c7098934f50,"[5358fc77b68ca120a07dae0b, 5358fc78b68ca120a07...",11.0,1.0,5388.75,,12.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,56d83489e4b01c7098934f51,"[5358fc78b68ca120a07db895, 5358fc78b68ca120a07...",14.0,1.0,469.272727,,44.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,56d83489e4b025a920ad5e6e,"[5358fc78b68ca120a07dbcd9, 5358fc77b68ca120a07...",6.0,1.0,9145.833333,280.25,6.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
vessels_ports.shape

(21122, 170)

In [27]:
ports_by_vessels = pd.read_csv('./input/ports_by_vessel.csv')

In [28]:
ports_by_vessels.head()
cols = ports_by_vessels.columns.values
cols[0] = 'vessel_id' 
ports_by_vessels.columns = cols

In [29]:
print(vessels_ports.shape)
print(ports_by_vessels.shape)

(21122, 170)
(21122, 3591)


In [30]:
vessels_all = pd.merge(vessels_ports, ports_by_vessels, left_on='vessel_id', right_on='vessel_id')

In [31]:
vessels = vessels_all.drop(['port_id', 'num_types', 'vessel_id'], axis=1)
answer = vessels_all['num_types']

In [42]:
vessels.head(30)

Unnamed: 0,num_ports,durat_mean,meeting_duration_min,num_countries,type_0,type_1,type_2,type_3,type_4,type_5,...,57d5268a376b25d85e8577d9,57d5300d8f3ea8dd5e7e5995,57d530a5376b25d85e8577e3,57d531d4be57e5de5e75ff1e,57d536d2376b25d85e8577e9,57d53f2f2e4e5fe35e7b30e5,57d5408b2e4e5fe35e7b30e9,57d543d0be57e5de5e75ff33,57d547ab8f3ea8dd5e7e59a6,57d552ff376b25d85e8577ff
0,7.0,7532.875,,8.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
1,3.0,4490.444444,93.5,9.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,11.0,5388.75,,12.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
3,14.0,469.272727,,44.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,6.0,9145.833333,280.25,6.0,0.0,0.0,2.0,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,26.0,1705.4,,40.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
6,17.0,890.591837,,49.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
7,8.0,1723.1,,10.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
8,9.0,2115.416667,,12.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
9,13.0,3365.405405,,37.0,,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [33]:
print(vessels.shape)
print(answer.shape)

(21122, 3757)
(21122,)


In [34]:
imputer = Imputer()
scaler = StandardScaler()

In [35]:
vessels1 = imputer.fit_transform(vessels)
# vessels1 = scaler.fit_transform(vessels1)

In [36]:
num = vessels1.shape[0]

In [37]:
folding = KFold(num, n_folds=5, shuffle=True, random_state=42)

In [38]:
clf = RandomForestClassifier(n_estimators=100, bootstrap=True, random_state=42)

In [39]:
score = cross_val_score(clf, vessels1, answer, cv=folding, scoring='accuracy')
print(score)
print(np.mean(score))

[ 0.85940828  0.86627219  0.87144886  0.87144886  0.85961174]
0.865637988614


In [40]:
clf = RandomForestClassifier(n_estimators=200, bootstrap=True, random_state=42)

In [41]:
score = cross_val_score(clf, vessels1, answer, cv=folding, scoring='accuracy')
print(score)
print(np.mean(score))

[ 0.86130178  0.86863905  0.87097538  0.87097538  0.86079545]
0.866537408105


In [43]:
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(vessels1,
                                                                            answer, 
                                                                            random_state=42,
                                                                            test_size=0.3)

In [45]:
clf = RandomForestClassifier(n_estimators=100, bootstrap=True, random_state=42)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [46]:
y_train_predict = clf.predict(X_train)
y_test_predict = clf.predict(X_test)

In [48]:
print('prediction on train', accuracy_score(y_train_predict, y_train))

prediction on train 0.999594183294


In [49]:
print('prediction on test', accuracy_score(y_test_predict, y_test))

prediction on test 0.858923780969


In [65]:
y_test_predict_names = list(map(lambda x: values2dict[x], y_test_predict))
y_test_names = list(map(lambda x: values2dict[x], y_test))

In [53]:
print('classification report on test\n', classification_report(y_test, y_test_predict))

classification report on test
              precision    recall  f1-score   support

        0.0       0.93      0.93      0.93      1232
        1.0       0.89      0.95      0.92      2190
        2.0       0.89      0.72      0.80       836
        3.0       0.71      0.87      0.78       950
        4.0       0.95      0.53      0.68       180
        5.0       0.84      0.84      0.84       628
        6.0       0.91      0.51      0.65       321

avg / total       0.87      0.86      0.86      6337



In [66]:
print('classification report on test\n', classification_report(y_test_names, y_test_predict_names))

classification report on test
              precision    recall  f1-score   support

Bulk Carrier       0.89      0.95      0.92      2190
Container Vessel       0.93      0.93      0.93      1232
Fishing Vessel       0.84      0.84      0.84       628
 Oil Tanker       0.89      0.72      0.80       836
Passenger Vessel       0.91      0.51      0.65       321
     Reefer       0.95      0.53      0.68       180
        Tug       0.71      0.87      0.78       950

avg / total       0.87      0.86      0.86      6337



In [71]:
vessels.columns[clf.feature_importances_> 0.005]

Index(['num_ports', 'durat_mean', 'meeting_duration_min', 'num_countries',
       'type_0', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6',
       'japan', 'indonesia', 'australia', 'brazil', 'southkorea', 'china',
       'turkey', 'netherlands', 'unitedstates', 'india', 'russia', 'singapore',
       '5358fc78b68ca120a07db85b', '5358fc78b68ca120a07dbc71',
       '5358fc78b68ca120a07dbce5', '5358fc78b68ca120a07dbcfb',
       '5358fc78b68ca120a07dbcfc', '53748c958925ee691cb9b4cd'],
      dtype='object')

In [75]:
ports_full[ports_full.port_id == '5358fc78b68ca120a07db85b']['port_name'].unique()

array(['Taicang'], dtype=object)

In [76]:
ports_full[ports_full.port_id == '5358fc78b68ca120a07dbc71']['port_name'].unique()

array(['Busan'], dtype=object)

In [77]:
ports_full[ports_full.port_id == '5358fc78b68ca120a07dbce5']['port_name'].unique()

array(['Hong Kong'], dtype=object)

In [78]:
ports_full[ports_full.port_id == '5358fc78b68ca120a07dbcfb']['port_name'].unique()

array(['Shanghai'], dtype=object)

In [79]:
ports_full[ports_full.port_id == '5358fc78b68ca120a07dbcfc']['port_name'].unique()

array(['Singapore'], dtype=object)