# Here we will explore international collaboration in the corpus

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import nbformat

pio.renderers.default = 'notebook'

import pickle
import re
from itertools import combinations

### load in the information on the countries and their coordinates

In [2]:
# Reference table of country names with their continent and longitude/latitude,
# taken from publicly available data. The first CSV column is only a saved row
# index, so it is consumed as the frame's index rather than kept as data.
countries = pd.read_csv(
    "countries.csv",
    index_col=0,
)

# Preview the first rows to confirm the expected columns are present.
countries.head()

Unnamed: 0,countries,continent,Longitude,Latitude
0,AFGHANISTAN,ASIA,66.004734,33.835231
1,ALBANIA,EUROPE,20.049834,41.14245
2,ALGERIA,AFRICA,2.617323,28.158938
3,ANDORRA,EUROPE,1.560544,42.542291
4,ANGOLA,AFRICA,17.537368,-12.293361


### load in our dataframe

In [3]:
# Bibliographic records for the corpus (appears to be one row per publication).
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle(
    "peds_spine_v1_from_R.pkl",
)

# Preview the first few records.
df.head()

Unnamed: 0,AU,DE,ID,C1,CR,AB,PA,affiliations,AR,EM,...,web.of.science.index,PY,RP,DB,J9,AU_UN,AU1_UN,AU_UN_NR,SR_FULL,SR
0,SAKAI N;YAMADA H;NISHIMURA Y;SHIRAKAMI S;FUTAM...,CAVERNOUS ANGIOMA; SURGERY; LITERATURE REVIEW,HEMANGIOMA; MALFORMATIONS; DIAGNOSIS; CHILDREN...,"SAKAI, N (CORRESPONDING AUTHOR), GIFU UNIV, SC...","CHADDUCK WM, 1985, NEUROSURGERY, V16, P189, DO...",THREE CASES OF SYMPTOMATIC CAVERNOUS ANGIOMA O...,"233 SPRING ST, NEW YORK, NY 10013 USA",,,,...,SCIENCE CITATION INDEX EXPANDED (SCI-EXPANDED),1992,"SAKAI, N (CORRESPONDING AUTHOR), GIFU UNIV, SC...",ISI,CHILDS NERV SYST,GIFU UNIV,GIFU UNIV,,"SAKAI N, 1992, CHILDS NERV SYST","SAKAI N, 1992, CHILDS NERV SYST"
1,TEWARI MK;TRIPATHI LN;MATHURIYA SN;KHANDELWAL ...,SPONTANEOUS SPINAL EXTRADURAL HEMATOMA; SPINAL...,EPIDURAL HEMATOMA; PARAPLEGIA; RECOVERY,"POSTGRAD INST MED EDUC \& RES,DEPT NEUROSURG,C...","BRAWN LA, 1986, POSTGRAD MED J, V62, P885, DOI...",THREE YOUNG CHILDREN WHO PRESENTED WITH ACUTE ...,"175 FIFTH AVE, NEW YORK, NY 10010",POST GRADUATE INSTITUTE OF MEDICAL EDUCATION \...,,,...,SCIENCE CITATION INDEX EXPANDED (SCI-EXPANDED),1992,"POSTGRAD INST MED EDUC \& RES,DEPT NEUROSURG,C...",ISI,CHILDS NERV SYST,POSTGRAD INST MED EDUC AND RES;POSTGRAD INST M...,POSTGRAD INST MED EDUC AND RES,,"TEWARI MK, 1992, CHILDS NERV SYST","TEWARI MK, 1992, CHILDS NERV SYST"
2,MIGLIORE A;CALZOLARI F;MARZOLA A;GHADIRPOUR MM,PEDIATRIC CRANIOPHARYNGIOMA; III-VENTRICLE; TR...,INTRAVENTRICULAR CRANIOPHARYNGIOMA; TOTAL REMOVAL,"MIGLIORE, A (CORRESPONDING AUTHOR), UNIV FERRA...","ALTINORS N, 1984, J NEUROSURG, V60, P842, DOI ...",A CASE OF INTRINSIC III VENTRICLE CRANIOPHARYN...,"175 FIFTH AVE, NEW YORK, NY 10010",UNIVERSITY OF FERRARA; UNIVERSITY OF FERRARA; ...,,,...,SCIENCE CITATION INDEX EXPANDED (SCI-EXPANDED),1992,"MIGLIORE, A (CORRESPONDING AUTHOR), UNIV FERRA...",ISI,CHILDS NERV SYST,UNIV FERRARA;UNIV FERRARA,UNIV FERRARA,,"MIGLIORE A, 1992, CHILDS NERV SYST","MIGLIORE A, 1992, CHILDS NERV SYST"
3,SCHWEITZER JS;BATZDORF U,CAUDA EQUINA; EPENDYMOMA; MAGNETIC RESONANCE I...,SACROCOCCYGEAL MYXOPAPILLARY EPENDYMOMA; SPINA...,"UNIV CALIF LOS ANGELES,CTR HLTH SCI,DIV NEUROS...","BARONE BM, 1970, J NEUROSURG, V33, P428, DOI 1...",NEW DIAGNOSTIC IMAGING TECHNIQUES MAKE POSSIBL...,"351 WEST CAMDEN ST, BALTIMORE, MD 21201-2436",UNIVERSITY OF CALIFORNIA SYSTEM; UNIVERSITY OF...,,,...,SCIENCE CITATION INDEX EXPANDED (SCI-EXPANDED),1992,"UNIV CALIF LOS ANGELES,CTR HLTH SCI,DIV NEUROS...",ISI,NEUROSURGERY,UNIV CALIF LOS ANGELES;UNIV CALIF LOS ANGELES;...,UNIV CALIF LOS ANGELES,,"SCHWEITZER JS, 1992, NEUROSURGERY","SCHWEITZER JS, 1992, NEUROSURGERY"
4,WONG CC;PEREIRA B;PHO RWH,,,"NATL UNIV SINGAPORE HOSP,DEPT ORTHOPAED SURG,L...",,FOUR CASES OF SYMPTOMATIC CERVICAL DISC CALCIF...,"227 EAST WASHINGTON SQ, PHILADELPHIA, PA 19106",NATIONAL UNIVERSITY OF SINGAPORE,,,...,SCIENCE CITATION INDEX EXPANDED (SCI-EXPANDED),1992,"NATL UNIV SINGAPORE HOSP,DEPT ORTHOPAED SURG,L...",ISI,SPINE,NATL UNIV SINGAPORE HOSP,NATL UNIV SINGAPORE HOSP,,"WONG CC, 1992, SPINE","WONG CC, 1992, SPINE"


In [4]:
# here we are going to extract the countries for all the authors in the 'C1' column

# First build a single regex that matches any of the known country names.
# - Missing names are dropped so that str.join cannot fail on a NaN.
# - Longer names are tried first: alternatives are attempted in order, so a
#   short name that is a prefix of a longer one (followed by a word boundary,
#   e.g. a space or hyphen) would otherwise win and truncate the match.
# - Each name is escaped so characters such as "." or "(" are matched
#   literally instead of being interpreted as regex syntax.
country_names = sorted(
    countries["countries"].dropna().astype(str), key=len, reverse=True
)
pattern = re.compile(
    r"\b(" + "|".join(re.escape(name) for name in country_names) + r")\b",
    re.IGNORECASE,
)

# Helper that pulls every country mention out of one affiliation string
def extract_countries(s):
    """Return all country names found in ``s``.

    The search uses the module-level ``pattern``. Matches are returned in the
    order they occur, repeats included, with the casing found in the text.
    Anything that is not a string (for example a missing value) yields an
    empty list.
    """
    if not isinstance(s, str):
        return []
    return pattern.findall(s)

# Run the extractor over every affiliation string in 'C1'; each row receives
# the list of country names found in it.
df["countries"] = df["C1"].map(extract_countries)

# Spot-check the extraction on 10 randomly chosen rows, showing the source
# text next to the countries found in it.
df.loc[:, ["C1", "countries"]].sample(n=10)


Unnamed: 0,C1,countries
4150,"KRENZLIN, H (CORRESPONDING AUTHOR), HARVARD ME...","[USA, GERMANY, USA, GERMANY, GERMANY]"
6335,"HUANG, JH (CORRESPONDING AUTHOR), FOURTH MIL M...","[CHINA, CHINA, CHINA, CHINA]"
5471,"ERSAHIN, Y (CORRESPONDING AUTHOR), EGE UNIV,FA...",[TURKEY]
9504,"FILIPOVIC, V (CORRESPONDING AUTHOR), UNIV ZAGR...","[CROATIA, CROATIA, CROATIA]"
4974,"SHASHWAT, M (CORRESPONDING AUTHOR), ALL INDIA ...","[INDIA, INDIA, INDIA, INDIA]"
10645,"MACKERLE, Z (CORRESPONDING AUTHOR), BRNO FAC H...","[CZECH REPUBLIC, CZECH REPUBLIC]"
2108,"SPONSELLER, PD (CORRESPONDING AUTHOR), JOHNS H...","[USA, USA]"
8916,"VASSILYADI, M (CORRESPONDING AUTHOR), CHILDREN...","[CANADA, CANADA, CANADA]"
4013,"PIATT, J (CORRESPONDING AUTHOR), ALFRED I DUPO...","[USA, USA, USA, USA, USA]"
11450,"KELLY, DM (CORRESPONDING AUTHOR), CAMPBELL CLI...","[USA, USA, USA, USA, USA]"


In [5]:
# Reduce each paper's matches to its distinct countries.
#
# "UNITED STATES" is rewritten to "USA" so that both spellings count as the
# same country. This rename must happen BEFORE de-duplicating: doing it
# afterwards would turn a paper that matched both spellings into
# ["USA", "USA"], which later produces a bogus (USA, USA) "collaboration".
#
# The result is sorted so the list order is deterministic from run to run
# (iteration order of a set of strings is not).
df["countries"] = df["countries"].apply(
    lambda names: sorted(
        {"USA" if name == "UNITED STATES" else name for name in names}
    )
)

In [6]:
# now we need to look at the unique pairings of countries for each paper
# we will use the itertools package to generate the unique pairings

# Build every unordered pair of countries for one paper
def generate_pairings(countries):
    """Return all 2-element combinations of ``countries`` as a list of tuples.

    The input is sorted first, so each tuple is in ascending order and the
    same two countries always produce the same tuple regardless of the order
    they were listed in. Fewer than two entries gives an empty list.
    """
    ordered = sorted(countries)
    return [
        (first, second)
        for position, first in enumerate(ordered)
        for second in ordered[position + 1 :]
    ]


# Attach the list of country pairs to every paper
df["pairings"] = df["countries"].map(generate_pairings)

# One row per (paper, pair); papers without any pair end up with a missing
# value in 'pairings'
df_expanded = df.explode("pairings")

# Lookup tables: country name -> longitude / latitude
longitudes = dict(zip(countries["countries"], countries["Longitude"]))
latitudes = dict(zip(countries["countries"], countries["Latitude"]))

# Count how often each pair occurs per publication year, then split the pair
# into its two endpoints and look up the coordinates of each endpoint.
# (groupby leaves out the rows whose pair is missing.)
intl_collab_country_pair_tallies_by_year = (
    df_expanded.groupby(["PY", "pairings"])
    .size()
    .reset_index(name="count")
    .assign(
        from_country=lambda tallies: [pair[0] for pair in tallies["pairings"]],
        to_country=lambda tallies: [pair[1] for pair in tallies["pairings"]],
    )
    .assign(
        from_country_Longitude=lambda tallies: tallies["from_country"].map(longitudes),
        from_country_Latitude=lambda tallies: tallies["from_country"].map(latitudes),
        to_country_Longitude=lambda tallies: tallies["to_country"].map(longitudes),
        to_country_Latitude=lambda tallies: tallies["to_country"].map(latitudes),
    )
)

# Look at 10 random rows of the result
intl_collab_country_pair_tallies_by_year.sample(10)

Unnamed: 0,PY,pairings,count,from_country,to_country,from_country_Longitude,from_country_Latitude,to_country_Longitude,to_country_Latitude
179,2003,"(ENGLAND, SWITZERLAND)",1,ENGLAND,SWITZERLAND,-2.865632,54.123872,8.208675,46.797859
1067,2015,"(ARGENTINA, NETHERLANDS)",1,ARGENTINA,NETHERLANDS,-65.179807,-35.381349,5.281448,52.10079
774,2012,"(MEXICO, USA)",1,MEXICO,USA,-102.523452,23.947537,-112.461674,45.679547
2898,2023,"(NEW ZEALAND, USA)",1,NEW ZEALAND,USA,171.484923,-41.811136,-112.461674,45.679547
1213,2016,"(AUSTRALIA, CANADA)",2,AUSTRALIA,CANADA,134.491,-25.732887,-98.30777,61.362063
1478,2017,"(POLAND, SWITZERLAND)",1,POLAND,SWITZERLAND,19.390128,52.127596,8.208675,46.797859
50,1998,"(CANADA, ITALY)",1,CANADA,ITALY,-98.30777,61.362063,12.070013,42.796626
1155,2015,"(FRANCE, JAPAN)",1,FRANCE,JAPAN,-2.761729,42.17344,138.030896,37.592301
707,2011,"(KOREA, SWITZERLAND)",1,KOREA,SWITZERLAND,127.839161,36.38524,8.208675,46.797859
1357,2016,"(NETHERLANDS, WALES)",1,NETHERLANDS,WALES,5.281448,52.10079,-2.865632,54.123872


In [7]:
# Human-readable copy of the tallies for manual inspection
intl_collab_country_pair_tallies_by_year.to_csv(
    path_or_buf="intl_collab_country_pair_tallies_by_year.csv"
)

# Pickled copy as well: unlike the CSV it keeps the column data types
# (for instance the tuples stored in 'pairings')
with open("intl_collab_country_pair_tallies_by_year.pkl", "wb") as pickle_file:
    pickle.dump(intl_collab_country_pair_tallies_by_year, pickle_file)

## Creating figure 4 (as an animation here for the notebook)

In [18]:
# Publication-year windows (both ends inclusive) over which the yearly pair
# tallies are summed.
periods = [
    ("1900", "1910"),
    ("1911", "1920"),
    ("1921", "1930"),
    ("1931", "1940"),
    ("1941", "1950"),
    ("1951", "1960"),
    ("1961", "1970"),
    ("1971", "1980"),
    ("1981", "1990"),
    ("1991", "1995"),
    ("1996", "2000"),
    ("2001", "2005"),
    ("2006", "2010"),
    ("2011", "2023"),
]

# Columns of the final table, in display order
output_columns = ["From", "To", "Frequency", "Period", "From_lat_long", "To_lat_long"]

dfs = []  # one aggregated dataframe per non-empty period

for start, end in periods:
    # Rows whose publication year falls inside this window
    in_period = intl_collab_country_pair_tallies_by_year["PY"].between(
        int(start), int(end)
    )
    period_rows = intl_collab_country_pair_tallies_by_year[in_period]

    # Windows without any collaboration contribute no rows. Skipping them
    # keeps empty frames out of pd.concat, which is deprecated in recent
    # pandas versions and emits a FutureWarning.
    if period_rows.empty:
        continue

    # Sum the yearly counts per country pair. The coordinates are the same on
    # every row of a pair, so the first value is representative.
    period_data = (
        period_rows.groupby(["from_country", "to_country"])
        .agg(
            Frequency=("count", "sum"),
            from_lat=("from_country_Latitude", "first"),
            from_lon=("from_country_Longitude", "first"),
            to_lat=("to_country_Latitude", "first"),
            to_lon=("to_country_Longitude", "first"),
        )
        .reset_index()
        .rename(columns={"from_country": "From", "to_country": "To"})
    )

    # Label the rows with their window and pack the coordinates of each
    # endpoint into a (latitude, longitude) tuple
    period_data["Period"] = f"{start}-{end}"
    period_data["From_lat_long"] = list(
        zip(period_data["from_lat"], period_data["from_lon"])
    )
    period_data["To_lat_long"] = list(
        zip(period_data["to_lat"], period_data["to_lon"])
    )

    dfs.append(period_data[output_columns])

# Stack the per-period tables; fall back to an empty table with the right
# columns if no period contained any data (pd.concat rejects an empty list)
converted_df = (
    pd.concat(dfs).reset_index(drop=True)
    if dfs
    else pd.DataFrame(columns=output_columns)
)

# inspect the data for a random sample of 10 rows
converted_df.sample(10)

Unnamed: 0,From,To,Frequency,Period,From_lat_long,To_lat_long
868,ETHIOPIA,GREECE,1,2011-2023,"(8.6227867931, 39.6008009763)","(39.0746962307, 22.9555579369)"
976,GHANA,GREECE,1,2011-2023,"(7.95345643541, -1.21676565807)","(39.0746962307, 22.9555579369)"
616,CAMEROON,WALES,1,2011-2023,"(5.69109848986, 12.7396415575)","(54.1238715577, -2.86563164084)"
844,ENGLAND,NEW ZEALAND,9,2011-2023,"(54.1238715577, -2.86563164084)","(-41.811135569, 171.484923466)"
1338,SERBIA,U ARAB EMIRATES,1,2011-2023,"(44.2215031993, 20.7895833363)","(23.9052818785, 54.3001671016)"
1095,IRELAND,ISRAEL,1,2011-2023,"(53.175448704, -8.13793568667)","(31.4611010118, 35.0044469277)"
1098,IRELAND,NETHERLANDS,1,2011-2023,"(53.175448704, -8.13793568667)","(52.1007899002, 5.28144793007)"
1073,IRAN,SPAIN,3,2011-2023,"(32.575032915, 54.2740700448)","(40.2444869811, -3.64755047323)"
1188,LEBANON,SCOTLAND,1,2011-2023,"(33.9230663057, 35.880160715)","(54.1238715577, -2.86563164084)"
284,ENGLAND,GERMANY,4,2006-2010,"(54.1238715577, -2.86563164084)","(51.1069818075, 10.385780508)"


In [17]:
# Periods that actually contain data, in order of first appearance
periods = list(converted_df["Period"].unique())

# Every country that occurs in any period. This does not depend on the period,
# so it is computed once; it is sorted so that every frame holds the same
# traces in the same order (a frame's traces are applied to the figure's
# traces by position).
all_countries = sorted(set(converted_df["From"]).union(converted_df["To"]))

# Map styling shared by the per-period figures and the animated figure
geo_layout = dict(
    projection=dict(type="natural earth"),
    showland=True,
    landcolor="rgb(243, 243, 243)",
    countrycolor="rgb(204, 204, 204)",
)

# One figure per period; they are turned into animation frames further down
frames = []

for period in periods:
    # Boolean masks are used instead of DataFrame.query with interpolated
    # strings, which fails for names that contain a quote character.
    data_period = converted_df[converted_df["Period"] == period]

    # One trace per country, holding all of that country's links in the period
    traces = []
    for country in all_countries:
        is_from = data_period["From"] == country
        is_to = data_period["To"] == country
        df_country = data_period[is_from | is_to]

        # NOTE(review): Scattergeo takes a single width per trace, so the
        # width is the frequency of the FIRST pair involving this country
        # (preferring pairs where it is the "From" side), not of every link
        # drawn. Confirm this is the intended encoding.
        if is_from.any():
            line_width = data_period.loc[is_from, "Frequency"].iloc[0]
        elif is_to.any():
            line_width = data_period.loc[is_to, "Frequency"].iloc[0]
        else:
            line_width = 0  # no interactions in this period

        # Draw each collaboration as its own From -> To segment. The None
        # after every segment breaks the line, so consecutive rows are not
        # joined into one path linking countries that never collaborated.
        lons, lats = [], []
        for (from_lat, from_lon), (to_lat, to_lon) in zip(
            df_country["From_lat_long"], df_country["To_lat_long"]
        ):
            lons.extend([from_lon, to_lon, None])
            lats.extend([from_lat, to_lat, None])

        traces.append(
            go.Scattergeo(
                lon=lons,
                lat=lats,
                mode="lines",
                line=dict(width=line_width, color="red"),
                opacity=0.25,
                name=country,
                showlegend=False,
            )
        )

    # Create the figure for this period
    fig_period = go.Figure(data=traces)
    fig_period.update_layout(
        title_text=f"Country interactions for {period}",
        geo=geo_layout,
    )
    frames.append(fig_period)

# Start the animated figure from the first period (or empty if there is no data)
fig = go.Figure(frames[0]) if frames else go.Figure()

# Update the layout of the figure
fig.update_layout(
    title={
        "text": "Country interactions",
        "font": {"size": 50},  # large title font
    },
    geo=geo_layout,
)

# Register EVERY period as a named animation frame. The first period is
# included too; otherwise it would only exist as the initial state and would
# never be shown (with its title) as part of the animation.
new_frames = [
    go.Frame(data=frame["data"], layout=frame["layout"], name=period)
    for period, frame in zip(periods, frames)
]
fig.frames = new_frames

# Play / Pause buttons driving the animation
updatemenu = [
    {
        'buttons': [
            {
                'args': [None, {'frame': {'duration': 1000, 'redraw': True}, 'fromcurrent': True}],
                'label': 'Play',
                'method': 'animate'
            },
            {
                'args': [[None], {'frame': {'duration': 0, 'redraw': True}, 'mode': 'immediate', 'transition': {'duration': 0}}],
                'label': 'Pause',
                'method': 'animate'
            }
        ],
    }
]


fig.update_layout(updatemenus=updatemenu)
fig.show()

