In [1]:
import pandas as pd
import os

In [2]:
# Directory (relative to the working directory) that holds the input CSVs:
# locs.csv and the ship extract files loaded further down.
path = "ais_3"

In [3]:
# Port lookup table: one row per location, with the columns name, longitude
# and latitude that the later merge relies on.
locs_csv = os.path.join(path, "locs.csv")
locs = pd.read_csv(locs_csv)
locs

Unnamed: 0,name,longitude,latitude
0,SHAKOTAN (RU),146.833333,43.866667
1,EL GHARDAQA (EG),33.850000,27.216667
2,RAS SHUKHIER (EG),33.283333,28.133333
3,RAS GHARIB (EG),33.100000,28.350000
4,AS SUWAYS (EG),32.550000,29.966667
...,...,...,...
167,PORT BERINGOVSKY (RU),179.366667,63.066667
168,SALIF (YE),42.666667,15.300000
169,RENI (UA),28.300000,45.433333
170,Suez (EG),32.355877,30.443370


In [4]:
# Load every ship extract CSV in `path` whose filename starts with one of these
# prefixes; any other file in the directory (e.g. locs.csv) is skipped.
ship_file_prefixes = ("black_sea", "rus_ukr", "strait")

dfs = []
for fname in sorted(os.listdir(path)):
    # str.startswith accepts a tuple, so a single call covers all prefixes.
    if not (fname.endswith(".csv") and fname.startswith(ship_file_prefixes)):
        continue
    fullname = os.path.join(path, fname)
    print(fname)
    cur = pd.read_csv(fullname, dtype={
        'vessel_type': 'string',
        # Read identifiers as text so they are not parsed as numbers.
        'mmsi': 'string',
        'date_year': 'int64',
        'date_month': 'int64',
        'geo_name': 'string',
    })
    # "Unnamed: 0" is presumably the row index saved with the file (TODO
    # confirm); it carries no information here, so drop it and keep the
    # default positional index.
    cur = cur.drop(columns=["Unnamed: 0"])
    dfs.append(cur)

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when the directory holds no matching files.
if not dfs:
    raise ValueError(
        f"no ship CSV files with prefixes {ship_file_prefixes} found in {path!r}"
    )

# Stack all extracts and order them by time, port and vessel.
df = (
    pd.concat(dfs)
    .sort_values(["date_year", "date_month", "geo_name", "mmsi"])
    .reset_index(drop=True)
)
df

black_sea_horn_africa_ships_2018-12-01.csv
black_sea_horn_africa_ships_2019-01-01.csv
black_sea_horn_africa_ships_2019-02-01.csv
black_sea_horn_africa_ships_2019-03-01.csv
black_sea_horn_africa_ships_2019-04-01.csv
black_sea_horn_africa_ships_2019-05-01.csv
black_sea_horn_africa_ships_2019-06-01.csv
black_sea_horn_africa_ships_2019-07-01.csv
black_sea_horn_africa_ships_2019-08-01.csv
black_sea_horn_africa_ships_2019-09-01.csv
black_sea_horn_africa_ships_2019-10-01.csv
black_sea_horn_africa_ships_2019-11-01.csv
black_sea_horn_africa_ships_2019-12-01.csv
black_sea_horn_africa_ships_2020-01-01.csv
black_sea_horn_africa_ships_2020-02-01.csv
black_sea_horn_africa_ships_2020-03-01.csv
black_sea_horn_africa_ships_2020-04-01.csv
black_sea_horn_africa_ships_2020-05-01.csv
black_sea_horn_africa_ships_2020-06-01.csv
black_sea_horn_africa_ships_2020-07-01.csv
black_sea_horn_africa_ships_2020-08-01.csv
black_sea_horn_africa_ships_2020-09-01.csv
black_sea_horn_africa_ships_2020-10-01.csv
black_sea_h

Unnamed: 0,vessel_type,mmsi,date_year,date_month,geo_name
0,Tanker,219598000,2018,12,AIN SUKHNA TERMINAL (EG)
1,Tanker,235108533,2018,12,AIN SUKHNA TERMINAL (EG)
2,Cargo,248206000,2018,12,AIN SUKHNA TERMINAL (EG)
3,Tanker,248642000,2018,12,AIN SUKHNA TERMINAL (EG)
4,Tanker,249437000,2018,12,AIN SUKHNA TERMINAL (EG)
...,...,...,...,...,...
558135,Cargo,565762000,2023,10,ZEIT BAY LPG TERMINAL (EG)
558136,Tanker,565829000,2023,10,ZEIT BAY LPG TERMINAL (EG)
558137,Tanker,622120925,2023,10,ZEIT BAY LPG TERMINAL (EG)
558138,Tanker,622120926,2023,10,ZEIT BAY LPG TERMINAL (EG)


In [5]:
# Inspect the distinct vessel identifiers (string dtype) before they are
# rewritten in the next cell.
df["mmsi"].unique()

<StringArray>
['219598000', '235108533', '248206000', '248642000', '249437000', '305859000',
 '309918000', '370167000', '370653000', '372066000',
 ...
 '244371282', '244660697', '244670581', '244890469', '267131516', '269056666',
 '538010999', '620999144', '563160100', '636022921']
Length: 41014, dtype: string

In [6]:
# Normalise vessel identifiers: remove everything from the first "." onwards
# (e.g. a ".0" suffix) and prefix the result with "S".
# NOTE(review): the "S" prefix presumably keeps the ids textual when the CSV is
# read back elsewhere -- confirm with the consumers of all_ships.csv.
#
# The patterns are raw strings: "\." in a normal literal is an invalid escape
# sequence, which current Python versions warn about.
# An existing leading "S" is stripped first so that re-running this cell does
# not keep stacking prefixes ("SS219598000", ...).
mmsi_digits = (
    df["mmsi"]
    .str.replace(r"^S", "", regex=True)
    .str.replace(r"\..+", "", regex=True)
)
df["mmsi"] = "S" + mmsi_digits

In [7]:
# Display the frame to check the rewritten mmsi column.
df

Unnamed: 0,vessel_type,mmsi,date_year,date_month,geo_name
0,Tanker,S219598000,2018,12,AIN SUKHNA TERMINAL (EG)
1,Tanker,S235108533,2018,12,AIN SUKHNA TERMINAL (EG)
2,Cargo,S248206000,2018,12,AIN SUKHNA TERMINAL (EG)
3,Tanker,S248642000,2018,12,AIN SUKHNA TERMINAL (EG)
4,Tanker,S249437000,2018,12,AIN SUKHNA TERMINAL (EG)
...,...,...,...,...,...
558135,Cargo,S565762000,2023,10,ZEIT BAY LPG TERMINAL (EG)
558136,Tanker,S565829000,2023,10,ZEIT BAY LPG TERMINAL (EG)
558137,Tanker,S622120925,2023,10,ZEIT BAY LPG TERMINAL (EG)
558138,Tanker,S622120926,2023,10,ZEIT BAY LPG TERMINAL (EG)


In [8]:
# Attach port coordinates to every ship record by matching df.geo_name
# against locs.name.
#
# The join is an inner join, so records whose geo_name has no entry in `locs`
# are dropped from full_df. Report them instead of losing them silently.
unmatched = df.loc[~df["geo_name"].isin(locs["name"]), "geo_name"]
if not unmatched.empty:
    print(
        f"WARNING: {len(unmatched)} row(s) have a geo_name missing from locs "
        f"and are dropped by the join: {sorted(unmatched.dropna().unique())}"
    )

full_df = (
    df.merge(locs, left_on="geo_name", right_on="name")
    # `name` duplicates geo_name after the join.
    .drop(columns=["name"])
    .sort_values(["date_year", "date_month", "geo_name", "mmsi"])
    .reset_index(drop=True)
)
full_df

Unnamed: 0,vessel_type,mmsi,date_year,date_month,geo_name,longitude,latitude
0,Tanker,S219598000,2018,12,AIN SUKHNA TERMINAL (EG),32.366667,29.583333
1,Tanker,S235108533,2018,12,AIN SUKHNA TERMINAL (EG),32.366667,29.583333
2,Cargo,S248206000,2018,12,AIN SUKHNA TERMINAL (EG),32.366667,29.583333
3,Tanker,S248642000,2018,12,AIN SUKHNA TERMINAL (EG),32.366667,29.583333
4,Tanker,S249437000,2018,12,AIN SUKHNA TERMINAL (EG),32.366667,29.583333
...,...,...,...,...,...,...,...
558098,Cargo,S565762000,2023,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000
558099,Tanker,S565829000,2023,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000
558100,Tanker,S622120925,2023,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000
558101,Tanker,S622120926,2023,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000


In [9]:
# Derive a code column from the trailing parenthesised part of the port name,
# e.g. "AIN SUKHNA TERMINAL (EG)" -> "EG".
# NOTE(review): the values look like ISO 3166-1 alpha-2 country codes -- confirm.
#
# The pattern is a raw string: "\(" and "\)" in a normal literal are invalid
# escape sequences, which current Python versions warn about.
# Names that do not match the pattern are left unchanged by str.replace, so
# such rows keep the full geo_name in this column.
full_df["iso2"] = full_df["geo_name"].str.replace(
    r"^.+\(([^)]+)\)$",
    lambda match: match.group(1),
    regex=True,
)

In [10]:
# Persist the enriched table to the working directory; index=False leaves the
# positional row index out of the file.
output_csv = "all_ships.csv"
full_df.to_csv(output_csv, index=False)

In [11]:
# Display the final table, now including the iso2 column.
full_df

Unnamed: 0,vessel_type,mmsi,date_year,date_month,geo_name,longitude,latitude,iso2
0,Tanker,S219598000,2018,12,AIN SUKHNA TERMINAL (EG),32.366667,29.583333,EG
1,Tanker,S235108533,2018,12,AIN SUKHNA TERMINAL (EG),32.366667,29.583333,EG
2,Cargo,S248206000,2018,12,AIN SUKHNA TERMINAL (EG),32.366667,29.583333,EG
3,Tanker,S248642000,2018,12,AIN SUKHNA TERMINAL (EG),32.366667,29.583333,EG
4,Tanker,S249437000,2018,12,AIN SUKHNA TERMINAL (EG),32.366667,29.583333,EG
...,...,...,...,...,...,...,...,...
558098,Cargo,S565762000,2023,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000,EG
558099,Tanker,S565829000,2023,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000,EG
558100,Tanker,S622120925,2023,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000,EG
558101,Tanker,S622120926,2023,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000,EG
