In [1]:
import pandas as pd

In [2]:
from fuzzywuzzy import fuzz

In [3]:
from fuzzywuzzy import process

In [4]:
# Relative path of the CSV holding the reference port names (read with pd.read_csv below).
port_name_path = './data/port_name.csv'

In [5]:
# Relative path of the CSV holding the destination records (read with pd.read_csv below).
destination_path = './data/destination.csv'

In [6]:
# Load the port-name CSV; its 'port_name' column supplies the candidate names matched against later.
df_port_name = pd.read_csv(port_name_path)

In [7]:
# Load the destination CSV; the 'imo' and 'destination' columns are selected from it further down.
df_destination = pd.read_csv(destination_path)

In [8]:
df_port_name.head()

Unnamed: 0,port_name
0,Aabenraa
1,Aaheim m
2,Aalborg
3,Aalvik
4,Aandalsnes


In [9]:
df_destination.head()

Unnamed: 0,destination,imo,update_time
0,AJAIO,1008205,21/4/2020 03:41:37.744+00
1,AMSTERDAM,9682370,21/4/2020 03:41:37.744+00
2,BALBOA,9694414,21/4/2020 03:41:37.744+00
3,BARCELONA,9297163,21/4/2020 03:41:37.744+00
4,BCN ANCHORAGE,9470894,21/4/2020 03:41:37.744+00


In [10]:
# Keep only the 'imo' and 'destination' columns, in that order.
df_target = df_destination.loc[:, ['imo', 'destination']]

In [11]:
df_target.head()

Unnamed: 0,imo,destination
0,1008205,AJAIO
1,9682370,AMSTERDAM
2,9694414,BALBOA
3,9297163,BARCELONA
4,9470894,BCN ANCHORAGE


In [12]:
len(df_target)

75141

In [13]:
# Number of rows per distinct 'imo' value (used here to see how often an IMO repeats).
df_target['imo'].value_counts()

1000000    63
1234567    45
8388608    32
1048576    28
9999999    28
           ..
9079688     1
9271858     1
9126833     1
9314478     1
9351414     1
Name: imo, Length: 69994, dtype: int64

In [14]:
# Plain Python list of candidate port names for the fuzzy matcher.
port_name_ls = df_port_name.loc[:, 'port_name'].tolist()

In [15]:
# Plain Python list of destination strings, one per row of df_target.
destination_list = df_target.loc[:, 'destination'].tolist()

In [16]:
def _top_port_matches(query, choices, limit=5):
    """Return the `limit` best ``(name, score)`` pairs for `query` among `choices`.

    Each choice is scored with ``fuzz.token_set_ratio``. ``sorted`` is stable,
    so choices with equal scores keep their original relative order.
    """
    scored = [(choice, fuzz.token_set_ratio(query, choice)) for choice in choices]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)[:limit]


# Build one flat row per destination: [top1, score1, top2, score2, ..., top5, score5].
# A destination string that occurs on several rows is scored against the full
# port list only once; repeated rows reuse the cached result, so the output is
# the same as scoring every row independently.
match_cache = {}
data = []
for des in destination_list:
    if des not in match_cache:
        ranked = _top_port_matches(des, port_name_ls)
        match_cache[des] = [field for pair in ranked for field in pair]
    # Append a copy so rows never share the same list object.
    data.append(list(match_cache[des]))

In [17]:
print(data[:5])

[['Ajaccio', 83, 'Itajai', 73, 'Pajaritos', 71, 'Aioi', 67, 'Nanaimo', 67], ['Amsterdam', 100, 'New Amsterdam', 100, 'Mosterhamn', 74, 'Vasteras', 71, 'Monsteras', 67], ['Balboa', 100, 'Balamban', 71, 'Balongan', 71, 'Alotau', 67, 'Balhaf', 67], ['Barcelona, Spain', 100, 'Balongan', 71, 'Barahona', 71, 'Ancona', 67, 'Balboa', 67], ['Anchorage', 100, 'Muara Berau Anchorage', 82, 'Muara Pantai Anchorage', 82, 'Muara Sabak Anchorage', 82, 'Taboneo Anchorage', 82]]


In [18]:
# Column headers for the flattened matches: a name/score pair for each rank 1 through 5.
col_name = [label + str(rank) for rank in range(1, 6) for label in ('top', 'score')]

In [19]:
# One row per destination, ten columns named by col_name.
data_df = pd.DataFrame(data=data, columns=col_name)

In [20]:
data_df.head()

Unnamed: 0,top1,score1,top2,score2,top3,score3,top4,score4,top5,score5
0,Ajaccio,83,Itajai,73,Pajaritos,71,Aioi,67,Nanaimo,67
1,Amsterdam,100,New Amsterdam,100,Mosterhamn,74,Vasteras,71,Monsteras,67
2,Balboa,100,Balamban,71,Balongan,71,Alotau,67,Balhaf,67
3,"Barcelona, Spain",100,Balongan,71,Barahona,71,Ancona,67,Balboa,67
4,Anchorage,100,Muara Berau Anchorage,82,Muara Pantai Anchorage,82,Muara Sabak Anchorage,82,Taboneo Anchorage,82


In [21]:
# Place the match columns next to the imo/destination columns (aligned on the row index).
frames_side_by_side = [df_target, data_df]
df_result = pd.concat(frames_side_by_side, axis='columns')

In [22]:
len(df_result)

75141

In [25]:
df_result.head()

Unnamed: 0,imo,destination,top1,score1,top2,score2,top3,score3,top4,score4,top5,score5
0,1008205,AJAIO,Ajaccio,83,Itajai,73,Pajaritos,71,Aioi,67,Nanaimo,67
1,9682370,AMSTERDAM,Amsterdam,100,New Amsterdam,100,Mosterhamn,74,Vasteras,71,Monsteras,67
2,9694414,BALBOA,Balboa,100,Balamban,71,Balongan,71,Alotau,67,Balhaf,67
3,9297163,BARCELONA,"Barcelona, Spain",100,Balongan,71,Barahona,71,Ancona,67,Balboa,67
4,9470894,BCN ANCHORAGE,Anchorage,100,Muara Berau Anchorage,82,Muara Pantai Anchorage,82,Muara Sabak Anchorage,82,Taboneo Anchorage,82


In [27]:
# Write the combined table to disk with a header row and without the row index.
df_result.to_csv(
    './data/result_total.csv',
    encoding='utf_8',
    header=True,
    index=False,
)

In [28]:
# Relative path of the CSV holding port codes and names (read with pd.read_csv below).
port_code_path = './data/port_code.csv'

In [29]:
# Load the port-code CSV; its 'unctad' and 'name' columns are used to build a code -> name mapping.
df_port_code = pd.read_csv(port_code_path)

In [30]:
df_port_code.head()

Unnamed: 0,unctad,name,country_code
0,CNSZX,Shenzhen,CN
1,VNSGN,Ho Chi Min City (formerly Saigon),VN
2,DKAAB,Aabenraa,DK
3,SGSIN,Singapore,SG
4,INNSA,NHAVA SHEVA,IN


In [31]:
# Values of the 'unctad' column as a plain list.
code_list = df_port_code.loc[:, 'unctad'].tolist()

In [32]:
# Values of the 'name' column as a plain list, row-aligned with code_list.
name_list = df_port_code.loc[:, 'name'].tolist()

In [33]:
# Map each code to its name; when a code appears more than once, the last row wins.
code_dic = dict(zip(code_list, name_list))

In [37]:
# NOTE(review): this prints the entire mapping; the saved output below is cut off mid-entry.
print(code_dic)

{'CNSZX': 'Shenzhen', 'VNSGN': 'Ho Chi Min City (formerly Saigon)', 'DKAAB': 'Aabenraa', 'SGSIN': 'Singapore', 'INNSA': 'Jawaharlal Nehru Port', '123': 'Nanjingjing', 'ARROS': 'Rosario, Argentina', 'NOAHM': 'Aaheim m', 'COMAM': 'Mamonal', 'JPHSM': 'Hososhima', 'XZANT': 'Ardjuna', 'FIPRS': 'Jacobstad', 'VEMTV': 'Matanzas, Venezuela', 'MYTMP': 'Tanjung Manis', 'AERWP': 'RUWAIS', 'AESHJ': 'Sharjah', 'AEMZD': 'Umm Al Nar', 'JPKUC': 'Kuchinotsu', 'AEZUR': 'Zirku Island', 'GRKLL': 'Kali Limenes', 'CMKBI': 'Kribi', 'CNHUI': 'Huizhou', 'USBTJ': 'Bethel', 'INIXY': 'Kandla', 'CUANT': 'Antilla', 'VEPCZ': 'Puerto La Cruz', 'NZNPL': 'New Plymouth', 'FIKRS': 'Kristinestad', 'ARMDQ': 'Mar del Plata', 'AUWEI': 'Weipa', 'AUKUR': 'Kurnell', 'BRBEL': 'Belem', 'BRPNG': 'Paranagua', 'CLCNR': 'Chanaral', 'CNLSN': 'Lanshan', 'CRLIO': 'Puerto Limon', 'COTCO': 'Tumaco', 'ESALD': 'Alcudia', 'FRMRS': 'Marseilles', 'IEWAT': 'Waterford', 'ILETH': 'Eilat', 'GBHRW': 'Harwich', 'GBHTP': 'Hartlepool', 'GBRUN': 'Runcor

In [45]:
# Five best port-name candidates for the first destination, using
# process.extract with its default scorer.
first_destination = destination_list[0]
scores = process.extract(first_destination, port_name_ls, limit=5)

print(scores)

[('Ajaccio', 83), ('Itajai', 73), ('Bandar e Shahid Rajai', 72), ('Pajaritos', 72), ('Aioi', 67)]


In [43]:
# Score every port name against the first destination with token_set_ratio
# and keep the five highest-scoring (name, score) pairs.
query = destination_list[0]
ratio_list = [(name, fuzz.token_set_ratio(query, name)) for name in port_name_ls]
ratio_sort = sorted(ratio_list, key=lambda pair: pair[1], reverse=True)[:5]

print(ratio_sort)

[('Ajaccio', 83), ('Itajai', 73), ('Pajaritos', 71), ('Aioi', 67), ('Nanaimo', 67)]


In [56]:
# Compare process.extract (default scorer) with a token_set_ratio ranking on
# a destination string that carries extra text after the port name.
sample_destination = 'CAPE PRESTON AUS.'

# Stored under its own name instead of `scores`: a later cell flattens
# `scores`, and its saved output is the first-destination result. Overwriting
# `scores` here would make that cell produce different output on a
# top-to-bottom run.
sample_scores = process.extract(sample_destination, port_name_ls, limit=5)
print(sample_scores)

ratio_list = [(name, fuzz.token_set_ratio(sample_destination, name)) for name in port_name_ls]
ratio_sort = sorted(ratio_list, key=lambda pair: pair[1], reverse=True)[:5]
print(ratio_sort)

[('Cape Preston', 95), ('Preston', 90), ('Cape Town', 86), ('Sandy Cape', 86), ('Brest', 72)]
[('Cape Preston', 100), ('Preston', 100), ('Cape Charles', 64), ('Cape Town', 64), ('Charleston', 62)]


In [2]:
# Sample upper-case string for the case-insensitive substring check in the next cell.
s = 'GUARD AREA'

In [3]:
# Case-insensitive test for the substring 'guard'.
'guard' in s.lower()

True

In [25]:
# Flatten the (name, score) pairs into one row: [name1, score1, name2, score2, ...].
tmp = [field for (name, score) in scores for field in (name, score)]

In [27]:
print(tmp)

['Ajaccio', 83, 'Itajai', 73, 'Bandar e Shahid Rajai', 72, 'Pajaritos', 72, 'Aioi', 67]


In [32]:
# Start with an empty container for flattened match rows.
l = list()

In [33]:
# Add the flattened row as a single element so the DataFrame built from `l` gets one row.
# NOTE(review): re-running this cell appends the same row again.
l.append(tmp)

In [35]:
print(l)

[['Ajaccio', 83, 'Itajai', 73, 'Bandar e Shahid Rajai', 72, 'Pajaritos', 72, 'Aioi', 67]]


In [38]:
# Headers for one flattened row: 'topN' followed by 'scoreN' for N = 1..5.
col_name = [prefix + str(position) for position in range(1, 6) for prefix in ('top', 'score')]

In [39]:
# Build a frame from the collected rows, labelling the columns with col_name.
tmp_df = pd.DataFrame(data=l, columns=col_name)

In [40]:
tmp_df

Unnamed: 0,top1,score1,top2,score2,top3,score3,top4,score4,top5,score5
0,Ajaccio,83,Itajai,73,Bandar e Shahid Rajai,72,Pajaritos,72,Aioi,67
