<a href="https://colab.research.google.com/github/nemuulen/INFOSCI301_Final_Project/blob/main/intl_students_migration_vis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
from pathlib import Path
import pandas as pd
import plotly.express as px

# ─── 0) Point to your Colab upload folder ───
# Every input file loaded in this cell is resolved relative to this directory.
DATA_DIR = Path("/content")

# ─── 1) Generic World Bank reader for both Excel & CSV exports ───
def read_wb(path: Path, var_name: str) -> pd.DataFrame:
    """Read a wide World Bank export and return it in long (tidy) form.

    Args:
        path: File to read. ``.xls``/``.xlsx`` files are parsed as Excel
            (first sheet, real header on the fifth row); anything else is
            parsed as a CSV whose header is on the first row.
        var_name: Name given to the value column in the result.

    Returns:
        A DataFrame with columns ``Country``, ``Country Code``, ``Year``
        (int) and ``var_name`` (numeric, NaN where the cell could not be
        parsed). Columns whose label is not numeric are discarded because
        they cannot be interpreted as a year.

    Raises:
        KeyError: If the file has no ``Country`` / ``Country Code`` columns
            after renaming (raised by ``DataFrame.melt``).
    """
    if path.suffix.lower() in (".xls", ".xlsx"):
        # Reading straight from the path lets pandas open *and close* the
        # workbook; an explicit pd.ExcelFile would stay open until GC.
        df = pd.read_excel(path, sheet_name=0, header=None)
        df.columns = df.iloc[4]  # row 5 has real headers
        df = df.iloc[5:].rename(
            columns={df.columns[0]: "Country", df.columns[1]: "Country Code"}
        )
    else:
        # CSV export: header on first row
        df = pd.read_csv(path).rename(columns={"Country Name": "Country"})

    # Indicator columns are metadata, not years; drop them when present so
    # they are not melted into throw-away rows.
    df = df.drop(columns=["Indicator Name", "Indicator Code"], errors="ignore")

    # Melt all remaining (year) columns into long form.
    df = df.melt(
        id_vars=["Country", "Country Code"], var_name="Year", value_name=var_name
    )

    # Keep only rows whose former column label parses as a number, then
    # store the year as int and the value as a numeric column.
    df = df.assign(Year=pd.to_numeric(df["Year"], errors="coerce"))
    df = df.dropna(subset=["Year"])
    df = df.assign(
        **{
            "Year": df["Year"].astype(int),
            var_name: pd.to_numeric(df[var_name], errors="coerce"),
        }
    )
    return df

# ─── 2) Load the GDP series ───
gdp = read_wb(DATA_DIR / "GDP.csv", "GDP_USD")

# ─── 3) Country lookup (CSV) ───
country_map = (
    pd.read_csv(DATA_DIR / "Country_names.csv")
      .rename(columns={"COUNTRY_ID": "Country Code", "COUNTRY_NAME_EN": "Country"})
)


def _load_flow(filename, value_name):
    """Load one student-flow CSV from DATA_DIR as Country / Country Code / Year / value.

    The CSV's ``geoUnit``, ``year`` and ``value`` columns are renamed to
    ``Country Code``, ``Year`` and ``value_name``. Rows whose year cannot be
    parsed are dropped *as rows* (so ``Year`` is a true int column), and rows
    whose code is missing from ``country_map`` are dropped as well.
    """
    flow = pd.read_csv(DATA_DIR / filename).rename(
        columns={"geoUnit": "Country Code", "year": "Year", "value": value_name}
    )
    flow["Year"] = pd.to_numeric(flow["Year"], errors="coerce")
    flow[value_name] = pd.to_numeric(flow[value_name], errors="coerce")
    # Drop the rows first, then cast: calling .dropna() on the Series alone
    # and assigning it back re-aligns on the index and re-introduces the NaNs.
    flow = flow.dropna(subset=["Year"])
    flow = flow.assign(Year=flow["Year"].astype(int))
    return (
        flow.merge(country_map, on="Country Code", how="left")
            .dropna(subset=["Country"])
            [["Country", "Country Code", "Year", value_name]]
    )


# ─── 4) Inbound student counts (CSV) ───
inb = _load_flow("inbound_intl.csv", "Inbound")

# ─── 5) Net = (inbound − outbound) (CSV) ───
net = _load_flow("inbound-outbound_intl.csv", "Net")

# ─── 6) Compute Outbound = Inbound − Net (no negatives) ───
df_io = inb.merge(net, on=["Country", "Country Code", "Year"], how="inner")
df_io["Outbound"] = (df_io["Inbound"] - df_io["Net"]).clip(lower=0)

# ─── 7) Merge flows with macro series ───
# NOTE(review): this joins on the country *name*; names in Country_names.csv
# and in the GDP export may be spelled differently, in which case GDP stays
# NaN for that country. Joining on "Country Code" may be more robust —
# confirm both files use the same code scheme before switching.
df = (
    df_io[["Country", "Year", "Inbound", "Outbound"]]
      .merge(gdp, on=["Country", "Year"], how="left")
)

# ─── 8) Keep only 2000–2022 ───
# .copy() makes the filtered frame independent, so the column assignment
# below is not a chained assignment on a slice (SettingWithCopyWarning).
df = df[df["Year"].between(2000, 2022)].copy()

# ─── 9) Prepare hover-text so missing values show "N/A" ───
df["GDP_text"] = df["GDP_USD"].apply(lambda x: f"{x:,.0f}" if pd.notna(x) else "N/A")

# ─── 10) Melt to long for Plotly ───
# One row per (Country, Year, Type) where Type is "Inbound" or "Outbound".
long = df.melt(
    id_vars=["Country", "Year", "GDP_text"],
    value_vars=["Inbound", "Outbound"],
    var_name="Type",
    value_name="Students"
).dropna(subset=["Students"])
# Only non-negative counts are kept (the column drives the bubble size);
# copy for the same reason as above.
long = long[long["Students"] >= 0].copy()
# Year becomes a string so it is treated as a category label;
# `years` fixes the frame order.
long["Year"] = long["Year"].astype(str)
years = sorted(long["Year"].unique())

# ─── 11) Draw animated geo-scatter with custom colors & N/A in hover ───
color_map = {"Inbound": "#1E90FF", "Outbound": "#FF69B4"}

# One bubble per (country, flow type) and one animation frame per year.
fig = px.scatter_geo(
    long,
    locations="Country",
    locationmode="country names",
    size="Students",
    color="Type",
    color_discrete_map=color_map,
    hover_name="Country",
    hover_data={
        "Students": ":,",     # thousands separator
        "GDP_text": True,     # pre-formatted string, "N/A" when missing
        "Type": False,
        "Year": False
    },
    labels={"GDP_text": "GDP (USD)"},
    animation_frame="Year",
    projection="natural earth",
    size_max=40,
    template="plotly_white",
    category_orders={"Year": years},
    # The title previously contained mis-encoded bytes; the intended
    # characters (globe emoji, en dash, proportional-to sign) are restored.
    title=(
        "🌐 International Student Migration (2000–2022)<br>"
        "<sub>Blue = Inbound | Pink = Outbound; bubble size ∝ student count</sub>"
    )
)

fig.update_traces(marker=dict(opacity=0.6, line_color="darkgrey", line_width=0.5))
fig.update_geos(showcountries=True, countrycolor="lightgray",
                showland=True, landcolor="whitesmoke",
                showocean=True, oceancolor="lightblue")
fig.update_layout(
    margin=dict(l=0, r=0, t=70, b=0),
    legend_title_text="Flow Type",
    # Play / Pause buttons for the animation.
    updatemenus=[{
        "type": "buttons", "direction": "left", "showactive": True,
        "x": 0.1, "xanchor": "right", "y": 0, "yanchor": "top",
        "pad": {"r": 10, "t": 70},
        "buttons": [
            {"method": "animate", "label": "Play",
             "args": [None, {"frame": {"duration": 1500, "redraw": True}, "fromcurrent": True}]},
            # Animating to [None] in immediate mode halts the running animation.
            {"method": "animate", "label": "Pause",
             "args": [[None], {"frame": {"duration": 0, "redraw": False},
                               "mode": "immediate", "transition": {"duration": 0}}]}
        ]
    }],
    sliders=[{"pad": {"b": 10, "t": 50}, "currentvalue": {"prefix": "Year: "}}]
)

fig.show()

In [None]:
# Install packages into the current notebook runtime.
# The leading "!" is an IPython shell escape, so this line only works inside a
# notebook cell, not as plain Python.
# NOTE(review): geopandas is not imported by any cell visible here — confirm it
# is still needed.
!pip install pandas plotly geopandas

In [120]:
import requests
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go

# ─── 0) Paths to your Colab CSVs ───
DATA_DIR = Path("/content")
share_fp = DATA_DIR / "Share_students_origin_to_destination.csv"
total_fp = DATA_DIR / "Total_num_students_going_abroad.csv"

# ─── 1) Load & clean origin–destination share data ───
# Percentages may use a decimal comma, so "," is swapped for "." before
# parsing; rows lacking any required field are discarded.
share = (
    pd.read_csv(share_fp)
      .rename(columns={
          "REF_AREA":               "Origin_Code",
          "TIME_PERIOD":            "Year",
          "Percentage_of_students": "Share_pct",
      })
      .assign(Share_pct=lambda d: pd.to_numeric(
          d["Share_pct"].astype(str).str.replace(",", "."), errors="coerce"))
      .dropna(subset=["Origin_Code", "Origin", "Destination", "Share_pct"])
)

# ─── 2) Load & clean total outbound students data ───
# Same comma handling as for the shares.
total = (
    pd.read_csv(total_fp)
      .rename(columns={
          "REF_AREA":       "Origin_Code",
          "TOTAL_STUDENTS": "Total_Outbound",
      })
      .assign(Total_Outbound=lambda d: pd.to_numeric(
          d["Total_Outbound"].astype(str).str.replace(",", "."), errors="coerce"))
      .dropna(subset=["Origin_Code", "Total_Outbound"])
)

# ─── 3) Merge & compute absolute flows ───
# Flow = origin's total outbound students × percentage going to the destination.
df = share.merge(
    total[["Origin_Code", "Total_Outbound"]],
    on="Origin_Code",
    how="inner",
)
df["Flow"] = df["Total_Outbound"] * df["Share_pct"] / 100.0

# ─── 4) Fetch country centroids and continents ───
# Only the fields used below are requested. A timeout keeps the cell from
# hanging forever, and raise_for_status() turns an HTTP error into an exception
# instead of letting an error payload be iterated as if it were the country list.
_countries_response = requests.get(
    "https://restcountries.com/v3.1/all",
    params={"fields": "cca3,latlng,name,region"},
    timeout=30,
)
_countries_response.raise_for_status()
resp = _countries_response.json()

iso3_to_latlon = {}     # ISO3 code -> [lat, lon]
iso3_to_continent = {}  # ISO3 code -> region name from the API
name_to_iso3 = {}       # common English name -> ISO3 code
for c in resp:
    iso3 = c.get("cca3", "").upper()
    latlng = c.get("latlng", [])
    name = c.get("name", {}).get("common", "")
    continent = c.get("region", "Other")
    # Countries without a usable [lat, lon] pair get neither a centroid
    # nor a continent entry.
    if iso3 and len(latlng) == 2:
        iso3_to_latlon[iso3] = latlng
        iso3_to_continent[iso3] = continent
    if name and iso3:
        name_to_iso3[name] = iso3

def resolve_latlon(code_or_name):
    """Return the stored ``[lat, lon]`` for an ISO3 code or a country name.

    The argument is stringified and first tried, upper-cased, as a key of
    ``iso3_to_latlon``. If that fails, the unmodified string is looked up in
    ``name_to_iso3`` and the resulting code is tried instead. ``None`` is
    returned when neither lookup succeeds.
    """
    label = str(code_or_name)
    code = label.upper()
    if code not in iso3_to_latlon:
        # Not a known code: fall back to an exact country-name match.
        code = name_to_iso3.get(label)
        if not code or code not in iso3_to_latlon:
            return None
    return iso3_to_latlon[code]

def resolve_continent(code_or_name):
    """Return the continent recorded for an ISO3 code or a country name.

    Lookup order: the upper-cased string as a key of ``iso3_to_continent``,
    then the unmodified string translated through ``name_to_iso3``.
    ``"Other"`` is returned when neither lookup succeeds.
    """
    label = str(code_or_name)
    code = label.upper()
    if code in iso3_to_continent:
        return iso3_to_continent[code]
    # Not a known code: fall back to an exact country-name match.
    code = name_to_iso3.get(label)
    found = bool(code) and code in iso3_to_continent
    return iso3_to_continent[code] if found else "Other"

# ─── 5) Attach coordinates and continent ───
# Column-wise map() calls the resolvers once per value without building a
# pd.Series per row, and still yields the three columns when df is empty
# (the row-wise apply() did not, so the dropna() below raised KeyError).
coords = pd.DataFrame(
    {
        "origin_latlon": df["Origin_Code"].map(resolve_latlon),
        "dest_latlon":   df["Destination"].map(resolve_latlon),
        "continent":     df["Origin_Code"].map(resolve_continent),
    },
    index=df.index,
)
# Flows whose origin or destination could not be located cannot be drawn.
df = pd.concat([df, coords], axis=1).dropna(subset=["origin_latlon", "dest_latlon"])

# ─── 6) Keep only flows > 1000 students ───
df = df[df["Flow"] > 1000]

# ─── 7) Color by continent ───
# Keys are the continent values produced by resolve_continent for the origin.
continent_colors = {
    "Asia": "red",
    "Europe": "blue",
    "Africa": "green",
    "Oceania": "orange",
    "Americas": "purple",
    "Antarctic": "cyan",
    "Other": "gray"
}

# ─── 8) Draw flow map with arrows ───
fig = go.Figure()
max_flow = df["Flow"].max()  # used to scale line widths relative to the largest flow

for _, r in df.iterrows():
    lat0, lon0 = r["origin_latlon"]
    lat1, lon1 = r["dest_latlon"]
    # Width grows linearly with the flow, capped at 8 px and never below 1 px.
    width = max(1.0, (r["Flow"] / max_flow) * 8)
    color = continent_colors.get(r["continent"], "gray")

    # Main line from origin to destination.
    # Hover text reads From = origin, To = destination, matching the line
    # direction and the trace name (the two labels used to be swapped).
    fig.add_trace(go.Scattergeo(
        lon=[lon0, lon1],
        lat=[lat0, lat1],
        mode="lines",
        line=dict(width=width, color=color),
        hoverinfo="text",
        text=f"<b>From:</b> {r['Origin']}<br><b>To:</b> {r['Destination']}<br><b>Students:</b> {int(r['Flow']):,}",
        name=f"{r['Origin']} → {r['Destination']}"
    ))

    # Marker at the origin end of the line.
    # NOTE(review): the symbol is "circle", on which `angle` has no visible
    # effect; if a direction arrow is intended, an arrow symbol would be
    # needed — confirm the intended look before changing.
    fig.add_trace(go.Scattergeo(
        lon=[lon0],
        lat=[lat0],
        mode="markers",
        marker=dict(size=8, symbol="circle", color=color, angleref="north", angle=(90 if lon1 > lon0 else 270)),
        showlegend=False,
        hoverinfo="skip"
    ))

fig.update_layout(
    title_text="Major Student Migrations (>1000 Students, 2022, OECD countries)",
    showlegend=False,
    geo=dict(
        projection_type="natural earth",
        showcountries=True, countrycolor="lightgray",
        showland=True, landcolor="whitesmoke",
        showocean=True, oceancolor="lightblue"
    ),
    margin=dict(l=0, r=0, t=50, b=0)
)

fig.show()