In [1]:
import pandas as pd
import numpy as np
from dash import Dash, dcc, html, Input, Output, callback
import plotly.graph_objects as go, plotly.express as px

In [2]:
# Path to the cleaned startup dataset; relative, so it is resolved against the
# notebook's current working directory.
file = r"clean_startup_data_replace.csv"
# NOTE(review): the preview of this frame shows "Unnamed: 0"-style columns,
# which usually means the CSV was written with its index -- confirm whether
# those columns are needed or should be dropped / read via index_col.
df = pd.read_csv(file)

In [3]:
# Let pandas display up to 50 columns so this wide frame is not truncated
# when previewed.
pd.set_option('display.max_columns', 50)

In [4]:
# Preview the first rows to sanity-check the column names and value formats.
df.head()

Unnamed: 0.1,Unnamed: 0,state_code,latitude,longitude,zip_code,id,city,name,labels,founded_at,closed_at,first_funding_at,last_funding_at,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,is_CA,is_NY,is_MA,is_TX,is_otherstate,category_code,is_software,is_web,is_mobile,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,object_id,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
0,0,CA,42.35888,-71.05682,92101,c:6669,San Diego,Bandsintown,True,2007-01-01,2007-01-01,2009-04-01,2010-01-01,2.2493,3.0027,4.6685,6.7041,3.0,3.0,375000.0,3.0,True,False,False,False,False,music,False,False,False,False,False,False,False,False,False,True,c:6669,False,True,False,False,False,False,1.0,False,acquired
1,1,CA,37.238916,-121.973718,95032,c:16283,Los Gatos,TriCipher,True,2000-01-01,2000-01-01,2005-02-14,2009-12-28,5.126,9.9973,3.0,7.0055,9.0,4.0,40100000.0,1.0,True,False,False,False,False,enterprise,False,False,False,True,False,False,False,False,False,False,c:16283,True,False,False,True,True,True,4.75,True,acquired
2,2,CA,32.901049,-117.192656,92121,c:65620,San Diego,Plixi,True,2009-03-18,2009-03-18,2010-03-30,2010-03-30,1.0329,1.0329,1.4575,2.2055,5.0,1.0,2600000.0,2.0,True,False,False,False,False,web,False,True,False,False,False,False,False,False,False,False,c:65620,False,False,True,False,False,False,4.0,True,acquired
3,3,CA,37.320309,-122.05004,95014,c:42668,Cupertino,Solidcore Systems,True,2002-01-01,2002-01-01,2005-02-17,2007-04-25,3.1315,5.3151,6.0027,6.0027,5.0,3.0,40000000.0,1.0,True,False,False,False,False,software,True,False,False,False,False,False,False,False,False,False,c:42668,False,False,False,True,True,True,3.3333,True,acquired
4,4,CA,37.779281,-122.419236,94105,c:65806,San Francisco,Inhale Digital,False,2010-08-01,2012-10-01,2010-08-01,2012-04-01,1.8301,1.6685,0.0384,0.0384,2.0,2.0,1300000.0,1.0,True,False,False,False,False,games_video,False,False,False,False,False,True,False,False,False,False,c:65806,True,True,False,False,False,False,1.0,True,closed


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(923, 47)

In [6]:
app = Dash(__name__)

# NOTE(review): every selection below is positional, so it silently changes if
# the column order of the CSV changes -- confirm the slices against df.columns
# (in particular df.columns[0], which ends up as the default pie variable).

# variables offered in the histogram dropdown
hist_list = [*df.columns[12:20], df.columns[-3]]

# variables offered in the scatter plot x / y dropdowns
# (kept as its own list object, independent of hist_list)
scatter_list = [*df.columns[12:20], df.columns[-3]]

# variables offered in the pie chart dropdown
pie_list = [
    *df.columns[20:36],
    *df.columns[37:43],
    df.columns[-2],
    df.columns[0],
]

# Inline style shared by every dropdown. Dash style dictionaries use camelCased
# CSS property names (fontSize / marginBottom rather than font-size /
# margin-bottom).
DROPDOWN_STYLE = {
    'width': '50%',
    'color': 'blue',
    'fontSize': '16px',
    'marginBottom': '10px',
}

# Page layout, top to bottom: histogram section, pie chart section (one chart
# per status), scatter plot section. Component ids are referenced by the
# callbacks, so they must not be renamed here without updating those.
app.layout = html.Div(
    children=[
        html.H1('Visualizing Data - Exploratory Data Analysis of Startup Data'),

        # ---- histogram ----
        html.P('Select the variable you want to see the distribution for:'),
        dcc.Dropdown(
            id='hist_column',
            options=hist_list,
            value=hist_list[0],
            # a selection is always required, so the dropdown cannot be cleared
            clearable=False,
            style=DROPDOWN_STYLE,
        ),
        dcc.Graph(id='histogram'),

        html.Br(),

        # ---- pie charts ----
        html.P('Select the variable you want to see the pie charts for:'),
        dcc.Dropdown(
            id='col_name',
            options=pie_list,
            # default to the last entry of pie_list
            value=pie_list[-1],
            clearable=False,
            style=DROPDOWN_STYLE,
        ),
        html.P('Select number of top categories you want to view:'),
        # starts at 1: asking for the "top 0" categories would draw empty charts
        dcc.Slider(
            min=1, max=20, step=1, value=5, id='top_cat',
            tooltip={"placement": "bottom", "always_visible": True},
        ),
        dcc.Graph(id='pie_chart_acquired'),
        dcc.Graph(id='pie_chart_closed'),

        html.Br(),

        # ---- scatter plot ----
        html.P('Select the x variable:'),
        dcc.Dropdown(
            id='x',
            options=scatter_list,
            value=scatter_list[0],
            clearable=False,
            style=DROPDOWN_STYLE,
        ),
        html.P('Select the y variable:'),
        dcc.Dropdown(
            id='y',
            options=scatter_list,
            # default y differs from default x so the initial plot is not a diagonal
            value=scatter_list[1],
            clearable=False,
            style=DROPDOWN_STYLE,
        ),
        dcc.Graph(id='scatter_plot'),
    ]
)


# callback: redraw the histogram whenever the dropdown selection changes
@app.callback(
    Output('histogram', 'figure'),
    Input('hist_column', 'value'),
)
def histogram(hist_column):
  """ Builds the histogram for the selected column, coloured by status
  Args:
    hist_column (str): name of the column to plot on the x axis
  Returns:
    plotly figure: histogram of hist_column split by the 'status' column
  """
  # the two status groups differ in size (original author's note: 327 closed
  # vs 597 acquired rows; not re-verified here) -- be careful when comparing
  # raw bar heights between them
  base = px.histogram(df, x=hist_column, color='status')

  # update_layout returns the same figure, so it can be returned directly;
  # the rgba value with alpha 0 makes the paper background transparent
  return base.update_layout(
      title_text='Distribution of ' + hist_column,
      paper_bgcolor='rgba(0, 0, 0, 0)',
  )


# note -- comparing the two pie charts can be a little misleading since the
# colouring may be different for the same category between the two charts
def _status_pie(status, col_name, top_cat):
  """ Builds the pie chart shared by both status callbacks. Shows the
  frequency of the most common categories of one column, restricted to rows
  with the given status
  Args:
    status (str): value of the 'status' column to keep ('acquired' / 'closed')
    col_name (str): column whose categories are counted
    top_cat (int): how many of the most frequent categories to show
  Returns:
    plotly figure: pie chart of the top_cat most frequent categories
  """
  subset = df[df['status'] == status]

  # a value of 0 (or a missing value) would draw an empty chart, so always
  # show at least one slice
  top_n = max(int(top_cat or 1), 1)

  # value_counts sorts by frequency in descending order, so the first top_n
  # entries are the most common categories; computed once and reused for both
  # the slice labels and the slice sizes
  counts = subset[col_name].value_counts().iloc[:top_n]

  fig = px.pie(
      names=counts.index.tolist(),
      values=counts.tolist(),
      title=status.capitalize() + ' Status Pie Chart for ' + col_name + ' variable',
  )

  # transparent paper background
  fig.update_layout(paper_bgcolor='rgba(0, 0, 0, 0)')

  # %{label} and %{value} are plotly's own placeholders for the slice name and
  # slice size; <br> is the line break tag and needs no closing tag
  fig.update_traces(hovertemplate=col_name + ': %{label}<br>Count: %{value}')

  return fig


@app.callback(
    Output('pie_chart_acquired', 'figure'),
    Input('col_name', 'value'),
    Input('top_cat', 'value')
)
def pie_a(col_name, top_cat):
  """ Returns pie chart figure of acquired status given user inputs. Shows
  frequency of each category
  Args:
    col_name (str): column name of interest
    top_cat (int): top categories of interest
  Returns:
    fig_pie_a: pie chart figure of acquired status
  """
  return _status_pie('acquired', col_name, top_cat)


@app.callback(
    Output('pie_chart_closed', 'figure'),
    Input('col_name', 'value'),
    Input('top_cat', 'value')
)
def pie_c(col_name, top_cat):
  """ Returns pie chart figure of closed status given user inputs. Shows
  frequency of each category
  Args:
    col_name (str): column name of interest
    top_cat (int): top categories of interest
  Returns:
    fig_pie_c: pie chart figure of closed status
  """
  return _status_pie('closed', col_name, top_cat)

@app.callback(
    Output('scatter_plot', 'figure'),
    Input('x', 'value'),
    Input('y', 'value'),
)
def scatter(x, y):
  """ Builds the scatter plot for the selected pair of columns, coloured by
  status
  Args:
    x (str): name of the column plotted on the x axis
    y (str): name of the column plotted on the y axis
  Returns:
    plotly figure: scatter plot of y against x split by the 'status' column
  """
  heading = 'Relationship between ' + y + ' and ' + x
  points = px.scatter(df, x=x, y=y, color='status', title=heading)

  # update_layout returns the same figure; alpha 0 makes the paper transparent
  return points.update_layout(paper_bgcolor='rgba(0, 0, 0, 0)')

# Start the Dash server from inside the notebook. With jupyter_mode="external"
# the app is not rendered in the cell; the cell output only reports the URL
# the app is served on.
app.run(jupyter_mode="external")

Dash app running on http://127.0.0.1:8050/
