In [1]:
import pandas as pd
import os

In [2]:
# Port reference table: one row per port with its name, longitude and latitude.
# The first CSV column ("Unnamed: 0") is a saved row index that carries no
# information, so it is dropped; read_csv already supplies a fresh 0..n-1 index.
locs_raw = pd.read_csv("ais/locs.csv")
locs = locs_raw.drop(columns=["Unnamed: 0"])
locs

Unnamed: 0,name,longitude,latitude
0,SHAKOTAN (RU),146.833333,43.866667
1,EL GHARDAQA (EG),33.850000,27.216667
2,RAS SHUKHIER (EG),33.283333,28.133333
3,RAS GHARIB (EG),33.100000,28.350000
4,AS SUWAYS (EG),32.550000,29.966667
...,...,...,...
147,PORT OKTYABRSK (UA),31.933333,46.833333
148,UST-DUNAISK (UA),29.700000,45.466667
149,SIDI KERIR (KURAYR) (EG),29.616667,31.100000
150,PORT BERINGOVSKY (RU),179.366667,63.066667


In [3]:
# Column types applied to every ship file. mmsi is read as text so the
# identifier is kept as written in the file rather than parsed as a number.
ais_dtypes = {
    'vessel_type': 'string',
    'mmsi': 'string',
    'date_year': 'int64',
    'date_month': 'int64',
    'geo_name': 'string',
}

# Every CSV in ais/ except the port lookup table, in sorted filename order.
ship_files = [
    entry
    for entry in sorted(os.listdir("ais"))
    if entry.endswith(".csv") and entry != "locs.csv"
]

# One frame per file; the saved "Unnamed: 0" index column is discarded.
dfs = [
    pd.read_csv(os.path.join("ais", entry), dtype=ais_dtypes).drop(columns=["Unnamed: 0"])
    for entry in ship_files
]

# Stack all files and order the rows by period, port and vessel.
sort_keys = ["date_year", "date_month", "geo_name", "mmsi"]
stacked = pd.concat(dfs)
df = stacked.sort_values(sort_keys).reset_index(drop=True)
df

Unnamed: 0,vessel_type,mmsi,date_year,date_month,geo_name
0,Tanker,311000953.0,2021,7,AL ISKANDARIYH (ALEXANDRIA) (EG)
1,Tanker,215473000.0,2021,7,AL KHAIR OIL TERMINAL (SD)
2,Cargo,218350000.0,2021,7,BUR SAID (PORT SAID) (EG)
3,Tanker,311133000.0,2021,7,BUR SAID (PORT SAID) (EG)
4,Cargo,636017867.0,2021,7,BUR SAID (PORT SAID) (EG)
...,...,...,...,...,...
96462,Cargo,636017435,2022,10,ZEIT BAY LPG TERMINAL (EG)
96463,Cargo,636018028,2022,10,ZEIT BAY LPG TERMINAL (EG)
96464,Cargo,671160100,2022,10,ZEIT BAY LPG TERMINAL (EG)
96465,Cargo,620552000,2022,10,ZEIT BAY TERMINAL (EG)


In [4]:
# List the distinct mmsi strings. The output shows two spellings in the data:
# float-like values such as '311000953.0' and plain-integer values such as
# '538003231'.
df["mmsi"].unique()

<StringArray>
['311000953.0', '215473000.0', '218350000.0', '311133000.0', '636017867.0',
 '371587000.0', '240716000.0', '636014494.0', '374408000.0', '311000937.0',
 ...
   '538003231',   '538010007',   '636015750',   '636020574',   '210452000',
   '352001447',   '613199630',   '538010161',   '447161000',   '677071708']
Length: 20699, dtype: string

In [5]:
# Normalise mmsi to "S<digits>": drop a float-style fractional part
# ('311000953.0' -> '311000953') and prefix with "S"
# (presumably so downstream tools keep treating the id as text - confirm).
#
# The patterns are raw strings so that "\." reaches the regex engine instead of
# being parsed as an invalid Python string escape. A leading "S" left by a
# previous run of this cell is stripped first, so re-running the cell does not
# produce "SS...".
mmsi_digits = (
    df["mmsi"]
    .str.replace(r"^S", "", regex=True)
    .str.replace(r"\..+", "", regex=True)
)
df["mmsi"] = "S" + mmsi_digits

In [6]:
# Inspect df after mmsi has been rewritten into the "S"-prefixed form.
df

Unnamed: 0,vessel_type,mmsi,date_year,date_month,geo_name
0,Tanker,S311000953,2021,7,AL ISKANDARIYH (ALEXANDRIA) (EG)
1,Tanker,S215473000,2021,7,AL KHAIR OIL TERMINAL (SD)
2,Cargo,S218350000,2021,7,BUR SAID (PORT SAID) (EG)
3,Tanker,S311133000,2021,7,BUR SAID (PORT SAID) (EG)
4,Cargo,S636017867,2021,7,BUR SAID (PORT SAID) (EG)
...,...,...,...,...,...
96462,Cargo,S636017435,2022,10,ZEIT BAY LPG TERMINAL (EG)
96463,Cargo,S636018028,2022,10,ZEIT BAY LPG TERMINAL (EG)
96464,Cargo,S671160100,2022,10,ZEIT BAY LPG TERMINAL (EG)
96465,Cargo,S620552000,2022,10,ZEIT BAY TERMINAL (EG)


In [7]:
# Attach port coordinates to every ship record by matching geo_name against
# the port name in locs.
#
# The merge is done as a left join with an indicator column so that records
# whose geo_name has no entry in locs can be reported before they are dropped;
# a plain inner join would discard them without any trace.
merged = df.merge(locs, left_on="geo_name", right_on="name", how="left", indicator=True)

unmatched = merged.loc[merged["_merge"] == "left_only", "geo_name"]
if not unmatched.empty:
    print(f"{len(unmatched)} rows dropped: geo_name has no entry in locs")
    print(unmatched.value_counts().to_string())

# Keep only matched rows (the same rows an inner join returns), remove the
# helper columns, and restore the period/port/vessel ordering.
full_df = (
    merged.loc[merged["_merge"] == "both"]
    .drop(columns=["name", "_merge"])
    .sort_values(["date_year", "date_month", "geo_name", "mmsi"])
    .reset_index(drop=True)
)
full_df

Unnamed: 0,vessel_type,mmsi,date_year,date_month,geo_name,longitude,latitude
0,Tanker,S311000953,2021,7,AL ISKANDARIYH (ALEXANDRIA) (EG),29.833333,31.166667
1,Tanker,S215473000,2021,7,AL KHAIR OIL TERMINAL (SD),37.250000,19.583333
2,Cargo,S218350000,2021,7,BUR SAID (PORT SAID) (EG),32.300000,31.266667
3,Tanker,S311133000,2021,7,BUR SAID (PORT SAID) (EG),32.300000,31.266667
4,Cargo,S636017867,2021,7,BUR SAID (PORT SAID) (EG),32.300000,31.266667
...,...,...,...,...,...,...,...
96452,Cargo,S636017435,2022,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000
96453,Cargo,S636018028,2022,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000
96454,Cargo,S671160100,2022,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000
96455,Cargo,S620552000,2022,10,ZEIT BAY TERMINAL (EG),33.600000,27.833333


In [8]:
# Take the trailing parenthesised suffix of geo_name as iso2, e.g.
# "BUR SAID (PORT SAID) (EG)" -> "EG".
#
# The pattern is a raw string so that "\(" and "\)" reach the regex engine
# instead of being parsed as invalid Python string escapes. str.extract returns
# the captured group directly; a name without a trailing "(..)" yields <NA>
# rather than having the whole name copied into iso2.
full_df["iso2"] = full_df["geo_name"].str.extract(r"^.+\(([^)]+)\)$", expand=False)

In [9]:
# Persist the enriched table. The row index is just a positional counter, so it
# is left out of the file.
output_path = "all_ships.csv"
full_df.to_csv(output_path, index=False)

In [10]:
# Final look at the table written to all_ships.csv, now including the iso2 column.
full_df

Unnamed: 0,vessel_type,mmsi,date_year,date_month,geo_name,longitude,latitude,iso2
0,Tanker,S311000953,2021,7,AL ISKANDARIYH (ALEXANDRIA) (EG),29.833333,31.166667,EG
1,Tanker,S215473000,2021,7,AL KHAIR OIL TERMINAL (SD),37.250000,19.583333,SD
2,Cargo,S218350000,2021,7,BUR SAID (PORT SAID) (EG),32.300000,31.266667,EG
3,Tanker,S311133000,2021,7,BUR SAID (PORT SAID) (EG),32.300000,31.266667,EG
4,Cargo,S636017867,2021,7,BUR SAID (PORT SAID) (EG),32.300000,31.266667,EG
...,...,...,...,...,...,...,...,...
96452,Cargo,S636017435,2022,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000,EG
96453,Cargo,S636018028,2022,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000,EG
96454,Cargo,S671160100,2022,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000,EG
96455,Cargo,S620552000,2022,10,ZEIT BAY TERMINAL (EG),33.600000,27.833333,EG
