In [3]:
import pandas as pd
import numpy as np
import plotly.express as px

In [4]:
# Source data: the TidyTuesday 2022-03-29 "sports" CSV, read straight from GitHub.
SPORTS_CSV_URL = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-03-29/sports.csv'

df = pd.read_csv(SPORTS_CSV_URL)




In [6]:
# Print a structural summary of the loaded frame: number of entries, column
# names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132327 entries, 0 to 132326
Data columns (total 28 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   year                  132327 non-null  int64  
 1   unitid                132327 non-null  int64  
 2   institution_name      132327 non-null  object 
 3   city_txt              132282 non-null  object 
 4   state_cd              132282 non-null  object 
 5   zip_text              132282 non-null  float64
 6   classification_code   132327 non-null  int64  
 7   classification_name   132327 non-null  object 
 8   classification_other  1679 non-null    object 
 9   ef_male_count         132327 non-null  int64  
 10  ef_female_count       132327 non-null  int64  
 11  ef_total_count        132327 non-null  int64  
 12  sector_cd             132327 non-null  int64  
 13  sector_name           132282 non-null  object 
 14  sportscode            132327 non-null  int64  
 15  

In [7]:
# Preview the first five rows; kept as the cell's last expression so the
# notebook renders it.
df.head(n=5)

Unnamed: 0,year,unitid,institution_name,city_txt,state_cd,zip_text,classification_code,classification_name,classification_other,ef_male_count,ef_female_count,ef_total_count,sector_cd,sector_name,sportscode,partic_men,partic_women,partic_coed_men,partic_coed_women,sum_partic_men,sum_partic_women,rev_men,rev_women,total_rev_menwomen,exp_men,exp_women,total_exp_menwomen,sports
0,2015,100654,Alabama A & M University,Normal,AL,35762.0,2,NCAA Division I-FCS,,1923,2300,4223,1,"Public, 4-year or above",1,31.0,,,,31,0,345592.0,,345592.0,397818.0,,397818.0,Baseball
1,2015,100654,Alabama A & M University,Normal,AL,35762.0,2,NCAA Division I-FCS,,1923,2300,4223,1,"Public, 4-year or above",2,19.0,16.0,,,19,16,1211095.0,748833.0,1959928.0,817868.0,742460.0,1560328.0,Basketball
2,2015,100654,Alabama A & M University,Normal,AL,35762.0,2,NCAA Division I-FCS,,1923,2300,4223,1,"Public, 4-year or above",3,61.0,46.0,,,61,46,183333.0,315574.0,498907.0,246949.0,251184.0,498133.0,All Track Combined
3,2015,100654,Alabama A & M University,Normal,AL,35762.0,2,NCAA Division I-FCS,,1923,2300,4223,1,"Public, 4-year or above",7,99.0,,,,99,0,2808949.0,,2808949.0,3059353.0,,3059353.0,Football
4,2015,100654,Alabama A & M University,Normal,AL,35762.0,2,NCAA Division I-FCS,,1923,2300,4223,1,"Public, 4-year or above",8,9.0,,,,9,0,78270.0,,78270.0,83913.0,,83913.0,Golf


In [12]:
# Narrow the frame to the columns used further down: the year, the
# `ef_total_count` figure, the combined expenditure/revenue totals and the sport.
plot_columns = [
    'year',
    'ef_total_count',
    'total_exp_menwomen',
    'total_rev_menwomen',
    'sports',
]
df = df[plot_columns]

In [15]:
# Discard every row that has a missing value in any remaining column.
df = df.dropna(axis='index', how='any')

In [19]:
def _safe_log10(values):
    """Return the base-10 logarithm of a pandas Series.

    Entries that are zero or negative are masked to NaN before the logarithm
    is taken, because log10(0) is -inf and log10 of a negative number is NaN
    (both emitted with a NumPy RuntimeWarning).
    """
    return np.log10(values.where(values > 0))


# Add log10-scaled copies of the combined expenditure and revenue totals.
# `assign` builds a new frame rather than writing into one that was derived
# from an earlier selection, which avoids pandas' SettingWithCopyWarning and
# keeps the cell safe to re-run. The row count is unchanged.
df = df.assign(
    log_total_exp_menwomen=_safe_log10(df['total_exp_menwomen']),
    log_total_rev_menwomen=_safe_log10(df['total_rev_menwomen']),
)

In [21]:
# Show the distinct values of the `sports` column; kept as the cell's last
# expression so the notebook renders the array.
df['sports'].unique()

array(['Baseball', 'Basketball', 'All Track Combined', 'Football', 'Golf', 'Soccer', 'Softball', 'Tennis',
       'Volleyball', 'Bowling', 'Beach Volleyball', 'Ice Hockey', 'Lacrosse', 'Gymnastics', 'Rowing',
       'Swimming and Diving', 'Track and Field, X-Country', 'Equestrian', 'Track and Field, Indoor',
       'Track and Field, Outdoor', 'Wrestling', 'Other Sports', 'Skiing', 'Swimming', 'Water Polo', 'Rodeo',
       'Archery', 'Field Hockey', 'Fencing', 'Sailing', 'Badminton', 'Squash', 'Diving', 'Rifle',
       'Synchronized Swimming', 'Table Tennis', 'Weight Lifting'], dtype=object)

In [37]:
def _axis_style(label):
    """Return the layout dict shared by both axes.

    Parameters
    ----------
    label : str
        Text shown as the axis title.

    Returns
    -------
    dict
        Axis settings with a 24pt title and 20pt tick labels, both in black
        "Noto Sans".
    """
    return dict(
        title=dict(
            text=label,
            font=dict(family="Noto Sans", size=24, color="black")),
        tickfont=dict(family="Noto Sans", size=20, color="black"),
    )


# Bubble chart: log10 expenditure on x, log10 revenue on y, marker size from
# `ef_total_count`, one colour per sport.
fig = px.scatter(
    df,
    x='log_total_exp_menwomen',
    y='log_total_rev_menwomen',
    size='ef_total_count',
    template='ggplot2',
    color='sports',
    title='How profitable can college sports be? - USA Facts',
    width=1344,
    height=756,
)

fig.update_layout(
    title=dict(
        font=dict(size=48, color="black", family="Noto Sans", weight='bold'),
        x=.5,
        xanchor='center',
    ),
    # Axis titles must follow the plotted columns: x carries expenditure and
    # y carries revenue. Both columns are log10-transformed, so the titles
    # say so.
    xaxis=_axis_style("Total expenditure (log10)"),
    yaxis=_axis_style("Total revenue (log10)"),
    showlegend=False,
    annotations=[
        # Credit line, positioned in paper coordinates just below the plot
        # area at its right edge.
        dict(
            text="Original Work =  Federica Gazzelloni| Recrated by MhKirmizi",
            xref="paper", yref="paper",
            x=1, y=-.06,
            showarrow=False,
            font=dict(size=14, color="black"),
            align="left",
        ),
    ],
    margin=dict(t=72, b=50),
    paper_bgcolor="#DCDCDC",
    plot_bgcolor="#DCDCDC",
)

fig.show()
# Static export; the width/height given here override the figure's on-screen size.
fig.write_image("colligate_sport.png", width=1920, height=1080, scale=2)