In [16]:
import pandas as pd
import collections

In [21]:
# Port reference table: one row per port name with its coordinates.
# The CSV carries a saved index column ("Unnamed: 0") that is dropped here.
# "iso2" is the text inside the final parentheses of the name, e.g.
# "SIDI KERIR (KURAYR) (EG)" -> "EG"; a name that does not end in "(...)"
# is left unchanged by the replace.
locs = (
    pd.read_csv("ais/locs.csv")
    .drop(columns="Unnamed: 0")
    .assign(
        # Raw string: "\(" in a normal literal is an invalid escape sequence.
        iso2=lambda frame: frame["name"].str.replace(
            r"^.+\(([^)]+)\)$", r"\1", regex=True
        )
    )
)
locs

Unnamed: 0,name,longitude,latitude,iso2
0,SHAKOTAN (RU),146.833333,43.866667,RU
1,EL GHARDAQA (EG),33.850000,27.216667,EG
2,RAS SHUKHIER (EG),33.283333,28.133333,EG
3,RAS GHARIB (EG),33.100000,28.350000,EG
4,AS SUWAYS (EG),32.550000,29.966667,EG
...,...,...,...,...
147,PORT OKTYABRSK (UA),31.933333,46.833333,UA
148,UST-DUNAISK (UA),29.700000,45.466667,UA
149,SIDI KERIR (KURAYR) (EG),29.616667,31.100000,EG
150,PORT BERINGOVSKY (RU),179.366667,63.066667,RU


In [2]:
# Load the ship records; later cells read the columns vessel_type, mmsi,
# date_year, date_month, geo_name and iso2 from this frame.
df = pd.read_csv("all_ships.csv")
df

Unnamed: 0,vessel_type,mmsi,date_year,date_month,geo_name,longitude,latitude,iso2
0,Tanker,S311000953,2021,7,AL ISKANDARIYH (ALEXANDRIA) (EG),29.833333,31.166667,EG
1,Tanker,S215473000,2021,7,AL KHAIR OIL TERMINAL (SD),37.250000,19.583333,SD
2,Cargo,S218350000,2021,7,BUR SAID (PORT SAID) (EG),32.300000,31.266667,EG
3,Tanker,S311133000,2021,7,BUR SAID (PORT SAID) (EG),32.300000,31.266667,EG
4,Cargo,S636017867,2021,7,BUR SAID (PORT SAID) (EG),32.300000,31.266667,EG
...,...,...,...,...,...,...,...,...
96452,Cargo,S636017435,2022,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000,EG
96453,Cargo,S636018028,2022,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000,EG
96454,Cargo,S671160100,2022,10,ZEIT BAY LPG TERMINAL (EG),33.566667,27.800000,EG
96455,Cargo,S620552000,2022,10,ZEIT BAY TERMINAL (EG),33.600000,27.833333,EG


In [36]:
# Column names of the ship records as a plain Python list.
df.columns.tolist()

['vessel_type',
 'mmsi',
 'date_year',
 'date_month',
 'geo_name',
 'longitude',
 'latitude',
 'iso2']

In [9]:
# Rows whose iso2 code is "UA".
df.loc[df["iso2"].eq("UA")]

Unnamed: 0,vessel_type,mmsi,date_year,date_month,geo_name,longitude,latitude,iso2
1604,Cargo,S636020153,2021,8,BALAKLAVA (UA),33.600000,44.500000,UA
1605,Tanker,S677081200,2021,8,BALAKLAVA (UA),33.600000,44.500000,UA
1657,Cargo,S272785000,2021,8,BELGOROD-DNESTROVSKY (UA),30.366667,46.183333,UA
1658,Cargo,S214181219,2021,8,BERDYANSK (UA),36.783333,46.750000,UA
1659,Cargo,S214182689,2021,8,BERDYANSK (UA),36.783333,46.750000,UA
...,...,...,...,...,...,...,...,...
96411,Cargo,S667001896,2022,10,YUZHNYY (UA),31.016667,46.600000,UA
96412,Cargo,S671304100,2022,10,YUZHNYY (UA),31.016667,46.600000,UA
96413,Cargo,S671480000,2022,10,YUZHNYY (UA),31.016667,46.600000,UA
96414,Cargo,S671954000,2022,10,YUZHNYY (UA),31.016667,46.600000,UA


In [35]:
# (mmsi, date_year, date_month) -> set of iso2 codes; populated by hnd().
ship_bags = {}


def hnd(cur):
    """Record the per-month iso2 sets of one vessel in ``ship_bags``.

    For every (date_year, date_month) found in ``cur`` the stored set holds
    the iso2 codes of that month, unioned with the codes of the preceding
    month present in ``cur`` when that month is exactly one calendar month
    earlier (December -> January of the next year included).

    Args:
        cur: DataFrame with the columns ``mmsi``, ``date_year``,
            ``date_month`` and ``iso2``. Only the first ``mmsi`` value is
            read, so the rows are presumably all from one vessel (this is
            how the groupby driver calls it).

    Returns:
        None. Results are written into the module-level ``ship_bags``.
    """
    vessel_id = cur["mmsi"].iloc[0]
    visits = (
        cur.value_counts(["date_year", "date_month", "iso2"])
        .reset_index()
        .sort_values(["date_year", "date_month"])
    )

    # First pass: iso2 codes grouped by month. Dict insertion order follows
    # the chronological sort above.
    by_month = {}
    for year, month, country in zip(
        visits["date_year"], visits["date_month"], visits["iso2"]
    ):
        by_month.setdefault((year, month), set()).add(country)

    # Second pass: merge each month with its predecessor when they are adjacent.
    earlier = None
    for period, countries in by_month.items():
        year, month = period
        bag = set(countries)
        if earlier is not None:
            prev_year, prev_month = earlier
            same_year_next = year == prev_year and month == prev_month + 1
            year_rollover = (
                year == prev_year + 1 and month == 1 and prev_month == 12
            )
            if same_year_next or year_rollover:
                bag |= by_month[earlier]
        ship_bags[(vessel_id, year, month)] = bag
        earlier = period

# Fill ship_bags one vessel at a time. Plain iteration replaces
# groupby(...).apply(hnd): hnd is called only for its side effect and reads
# cur["mmsi"], so every group must still contain the grouping column.
# Iterating the groups guarantees that, whereas newer pandas versions
# deprecate (and then stop) passing grouping columns to apply().
for _mmsi, vessel_rows in df.groupby("mmsi"):
    hnd(vessel_rows)

len(ship_bags)

59888

In [44]:
# (geo_name, vessel_type, date_year, date_month) -> counters:
#   "total": rows seen for that key,
#   "UA" / "RU": rows whose vessel has that code in its ship_bags entry
#                for the same year and month.
res = collections.defaultdict(lambda: {"total": 0, "UA": 0, "RU": 0})

# Rows without an mmsi cannot be looked up in ship_bags; they are counted
# and skipped.
nan_count = 0
for mmsi, geo_name, vessel_type, year, month in zip(
    df["mmsi"], df["geo_name"], df["vessel_type"], df["date_year"], df["date_month"]
):
    # pd.isna is an explicit missing-value test, unlike comparing the
    # formatted value against the string "nan".
    if pd.isna(mmsi):
        nan_count += 1
        continue
    # A missing key here raises KeyError on purpose: it would mean ship_bags
    # was not built from the same frame.
    ports = ship_bags[(mmsi, year, month)]
    tally = res[(geo_name, vessel_type, year, month)]
    tally["total"] += 1
    if "UA" in ports:
        tally["UA"] += 1
    if "RU" in ports:
        tally["RU"] += 1

nan_count

178

In [47]:
# Flatten res into column lists. Keys of res are
# (geo_name, vessel_type, date_year, date_month) tuples and values are the
# {"total", "UA", "RU"} counters.
agg_df = {
    "geo_name": [port_key[0] for port_key in res],
    "date_year": [port_key[2] for port_key in res],
    "date_month": [port_key[3] for port_key in res],
    "vessel_type": [port_key[1] for port_key in res],
    "total": [tally["total"] for tally in res.values()],
    "UA": [tally["UA"] for tally in res.values()],
    "RU": [tally["RU"] for tally in res.values()],
}

# Attach the port coordinates and iso2 from locs by port name (default inner
# merge), order by port and date, then drop the duplicated join column.
port_df = (
    pd.DataFrame(agg_df)
    .merge(locs, left_on="geo_name", right_on="name")
    .sort_values(["geo_name", "date_year", "date_month"])
    .drop(columns=["name"])
)
port_df

Unnamed: 0,geo_name,date_year,date_month,vessel_type,total,UA,RU,longitude,latitude,iso2
0,AL ISKANDARIYH (ALEXANDRIA) (EG),2021,7,Tanker,1,0,0,29.833333,31.166667,EG
1,AL ISKANDARIYH (ALEXANDRIA) (EG),2021,8,Cargo,195,22,24,29.833333,31.166667,EG
2,AL ISKANDARIYH (ALEXANDRIA) (EG),2021,8,Tanker,85,4,9,29.833333,31.166667,EG
3,AL ISKANDARIYH (ALEXANDRIA) (EG),2021,9,Cargo,187,29,33,29.833333,31.166667,EG
4,AL ISKANDARIYH (ALEXANDRIA) (EG),2021,9,Tanker,79,9,17,29.833333,31.166667,EG
...,...,...,...,...,...,...,...,...,...,...
1803,SOCHI (RU),2022,7,Tanker,2,0,2,39.733333,43.583333,RU
1804,SOCHI (RU),2022,9,Tanker,2,0,2,39.733333,43.583333,RU
1805,SOCHI (RU),2022,10,Tanker,1,0,1,39.733333,43.583333,RU
1806,KHORLY (UA),2022,8,Cargo,1,1,0,33.283333,46.083333,UA
