In [None]:
import altair as alt
import pandas as pd
from vega_datasets import data

#### First visualization: Repos grouped by topic/keyword and connections by shared contributors

In [None]:
# Load the input tables. Only the repository table comes from ../cleaned_data;
# the contributor table and both relationship tables are read from ../raw_data.
repos = pd.read_csv(filepath_or_buffer='../cleaned_data/repo_info_stop75.csv')
contributors = pd.read_csv(filepath_or_buffer='../raw_data/10_contributor_info_all.csv')

# Relationship tables: repo <-> contributor and repo <-> topic (judging by the file names).
repo_contrib_relation = pd.read_csv(
    filepath_or_buffer='../raw_data/repo_contributor_relationship_table_all.csv'
)
repo_topic_relation = pd.read_csv(
    filepath_or_buffer='../raw_data/topic_relationship_table_stop75.csv'
)

In [None]:
# Original example, TODO: remove
# "states" feature of the US 10m topology file; it was originally the
# background image for the airports example and is what `background` draws.
# It is defined exactly once here -- the identical assignment used to be
# repeated a few lines further down.
states = alt.topo_feature(data.us_10m.url, feature="states")

# Original example, TODO: remove
# URLs of the example airports / flights datasets shipped with vega_datasets.
airports = data.airports.url
flights_airport = data.flights_airport.url

# Original example, TODO: remove
# Hover selection: decides which blob is picked by the mouse hover-over.
# It triggers on "mouseover", snaps to the nearest mark and is keyed on the
# "origin" field; empty="none" is passed so nothing is selected by default.
# Can maybe be dropped for the static graph.
select_city = alt.selection_single(
    on="mouseover",
    nearest=True,
    fields=["origin"],
    empty="none",
)

# NEW
# Create mouseover selection
# select_repo = alt.selection_single(
#     # Display the topic keyword on mouseover (assuming we grouped by topic)
#     on="mouseover", nearest=True, fields=["topic"], empty="none"
# )


# TODO:
# It seems we have to map each topic to (self-chosen) coordinates in order
# to draw it in the right spot of the graph -> build a topic-to-(x, y) map somehow.


# Original example, TODO: remove
# Lookup source shared by the transform_lookup steps: records of the airports
# dataset are matched on "iata" and supply state, latitude and longitude.
lookup_data = alt.LookupData(
    data=airports,
    key="iata",
    fields=["state", "latitude", "longitude"],
)

# TODO:
# Define which attributes to lookup from repos
# Might need to build a new dataframe joining repos and the repo_topic_relations so that we can group by topic
# lookup_data = alt.LookupData(
#     # We have to actually build x and y first!
#     repos, key="iata", fields=["topic", "x", "y"]
#     # NOTE: 'iata' is the airport-code key column used by the original example's lookup; replace it with our own key column
# )


# TODO: Need to change the background to maybe just white instead of the United States
# Background layer: light-gray state shapes with white outlines, rendered
# with the "albersUsa" projection at 750 x 500.
background = (
    alt.Chart(states)
    .mark_geoshape(fill="lightgray", stroke="white")
    .properties(width=750, height=500)
    .project("albersUsa")
)

# Edge layer -- this should become the actual network edges.
# TODO: Probably need to change lat and long to the x and y coordinates that
# still have to be built.
# TODO: Build another dataframe that connects topics sharing a certain number
# of contributors (requires going over their contributed repositories).
# "origin" and "destination" then have to be chosen from that new dataframe,
# where the topics with enough support are connected.
connections = (
    alt.Chart(flights_airport)
    .mark_rule(opacity=0.35)
    .encode(
        latitude="latitude:Q",
        longitude="longitude:Q",
        latitude2="lat2:Q",
        longitude2="lon2:Q",
    )
    # First lookup: fetch the lookup_data fields for the "origin" end.
    .transform_lookup(lookup="origin", from_=lookup_data)
    # Second lookup: fetch the "destination" end, stored as state / lat2 / lon2.
    .transform_lookup(
        lookup="destination",
        from_=lookup_data,
        as_=["state", "lat2", "lon2"],
    )
    # Keep only the rules matching the current hover selection.
    .transform_filter(select_city)
)

# Node layer: one circle per "origin", sized by its number of rows.
# TODO: Again need to change lat and long to the x and y coordinates.
# TODO: Do the sizing according to the amount of repos corresponding to the topic.
# TODO: Change tooltip to the topic name and amount of repos corresponding to it.
points = (
    alt.Chart(flights_airport)
    .mark_circle()
    .encode(
        latitude="latitude:Q",
        longitude="longitude:Q",
        size=alt.Size("routes:Q", scale=alt.Scale(range=[0, 1000]), legend=None),
        order=alt.Order("routes:Q", sort="descending"),
        tooltip=["origin:N", "routes:Q"],
    )
    # TODO: Count repos, grouped by topic.
    .transform_aggregate(routes="count()", groupby=["origin"])
    # TODO: Lookup repo name.
    .transform_lookup(lookup="origin", from_=lookup_data)
    # Drops the "PR" and "VI" states of the example data; probably not needed.
    .transform_filter((alt.datum.state != "PR") & (alt.datum.state != "VI"))
    .add_selection(select_city)
)

# Stack the three layers (background first, then edges, then nodes) and
# display the result with the view stroke set to None.
layered_chart = background + connections + points
layered_chart.configure_view(stroke=None)