# Erasmus Data Exploration
This notebook explores and analyses the data. The plots can also be found here: https://chart-studio.plotly.com/~Mahuvej#/


First I created some fucntions for plotting an read in the data:

In [1]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import pycountry
import plotly.subplots as sp
from geopy.geocoders import Nominatim
from plotly.offline import init_notebook_mode, iplot

init_notebook_mode(connected=True)

def plot_world_dist(data,column):
    counts = data.groupby(column).size().reset_index(name='Count')
    all_countries = pd.DataFrame(
        {'Country': [c.name for c in pycountry.countries]})
    merged = all_countries.merge(
        counts, left_on='Country', right_on=column, how='left')
    merged['Count'].fillna(0, inplace=True)
    fig = px.choropleth(merged, locations='Country', locationmode='country names', color='Count',
                        color_continuous_scale='Cividis',  hover_data=['Count'],
                        center={'lat': 50, 'lon': 10})
    fig.update_layout(title=column)
    fig.show()
    

def plot_hist(data, column):
    data = data.groupby(column).size().reset_index(name='Count')
    fig = px.bar(data_frame=data, x=column,y = 'Count', title='Histogram of '+column)
    fig = fig.update_layout(xaxis_categoryorder='total descending')
    fig.update_xaxes(tickangle=45)

    fig.show()
    

data = pd.read_pickle('Erasmus_Data/data_clean.pkl')
    

Now I created some distribution plots:

In [2]:
plot_world_dist(data, 'Participant Nationality')
plot_world_dist(data, 'Sending Country')
plot_world_dist(data, 'Receiving Country')

plot_hist(data, 'Activity')
plot_hist(data, 'Education Level')
plot_hist(data, 'Mobility Duration')


Next a plot for the duration by each possible activity

In [3]:
fig = px.bar(data_frame=data.groupby('Activity')['Mobility Duration'].mean().reset_index(),
             x='Activity', y='Mobility Duration',
             title='Mean Mobility Duration by Activity',
             labels={'Activity': 'Activity', 'Mobility Duration': 'Mean Mobility Duration'})

fig = fig.update_layout(xaxis_categoryorder='total descending')
fig.show()


Next distribution of age and age by activity

In [4]:
plot_hist(data, 'Participant Age')

fig = px.bar(data_frame=data.groupby('Activity')['Participant Age'].mean().reset_index(),
             x='Activity', y='Participant Age',
             title='Mean Participant Age by Activity',
             labels={'Activity': 'Activity', 'Participant Age': 'Mean Participant Age'})

fig = fig.update_layout(xaxis_categoryorder='total descending')
fig.show()

The distribution of the gender:

In [5]:
plot_hist(data, 'Participant Gender')

And the distribution of the field of education

In [6]:
plot_hist(data, 'Field of Education')

In which month did the participants start their project:

In [7]:
plot_hist(data,'Mobility Start Month')


How many participants with special needs:

In [8]:
print(data['Special Needs'].sum())
print(data['Special Needs'].sum() / data.shape[0])


29969
0.007136495689860457


And with fewer opportunities:

In [9]:
print(data['Fewer Opportunities'].sum())
print(data['Fewer Opportunities'].sum() / data.shape[0])


316749
0.07542720388626947


Now to the more complicated plot. I wanted to show the movement from one country to the others. For that I first filtered the data to only have the necessatry information.

In [10]:
sending_country = 'Germany'  # Replace with the country you want to focus on
filtered_df = data[data['Sending Country'] == sending_country][['Sending Country', 'Receiving Country']]

geolocator = Nominatim(user_agent="erasmus_app", timeout=10) 

I created a function to get the latitude and longitude

In [11]:
def get_latitude_longitude(country):
    manual_mapping = {
        'Georgia': (42.3154, 43.3569),
        'Palestine': (31.9466, 35.2731)
    }

    if country in manual_mapping:
        return manual_mapping[country]

    location = geolocator.geocode(country)

    if location is not None:
        return location.latitude, location.longitude
    else:
        print(f"Failed to find coordinates for {country}")
        return None, None
sending_lat, sending_lon = get_latitude_longitude(sending_country)


Than I grouped by receiving country and count the number of students

In [12]:
country_counts = filtered_df.groupby('Receiving Country').size().reset_index(name='Number of Students')
country_counts['Receiving Latitude'], country_counts['Receiving Longitude'] = zip(*country_counts['Receiving Country'].apply(get_latitude_longitude))
country_counts.dropna(subset=['Receiving Latitude', 'Receiving Longitude'], inplace=True)


Failed to find coordinates for Taiwan, Province of China


For the plot I create a subplot with a world map in the background and added a choropleth map for the receiving countries with custom hovertemplate

In [13]:
fig = sp.make_subplots(rows=1, cols=1, specs=[[{"type": "scattergeo"}]])

fig.add_trace(go.Choropleth(
    locations=country_counts['Receiving Country'],
    z=country_counts['Number of Students'],
    text=country_counts['Receiving Country'],
    hovertemplate='%{text}<br>Number of Students: %{z}',
    colorscale='Viridis',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_tickprefix='',
    colorbar_title='Number of Students',
    locationmode="country names",
))

I wanted to show the movement with arrors

In [14]:
arrow_width = 1.5
base_opacity = 0.8

for index, row in country_counts.iterrows():
    fig.add_trace(
        go.Scattergeo(
            lon=[sending_lon, row['Receiving Longitude']],
            lat=[sending_lat, row['Receiving Latitude']],
            mode='lines',
            line=dict(width=arrow_width, color='red'),
            opacity=base_opacity,
            showlegend=False,
            hoverinfo='none',  # Suppress the display of coordinates when hovering over the arrows
        )
    )

# Update the layout for the world map
fig.update_geos(
    showland=True,
    landcolor='rgb(243, 243, 243)',
    countrycolor='rgb(204, 204, 204)',
    showcountries=True,
    projection_type="equirectangular",
)

fig.update_layout(title_text=f'Movement of Erasmus Students from {sending_country}')

fig.show()