In [1]:
import pandas as pd
from arrow import Arrow

def parse_dates(arg):
    """Convert a slash-separated ``month/day/year`` string into an ``Arrow`` date.

    Parameters
    ----------
    arg : str
        Date text such as ``'1/23/23'``. A year below 100 is treated as
        shorthand for 20YY (``'23'`` -> 2023); a year of 100 or more is
        taken as already complete (``'2023'`` -> 2023).

    Returns
    -------
    Arrow
        Built from the year, month and day only; no time of day is passed.

    Raises
    ------
    ValueError
        If ``arg`` does not contain exactly three ``/``-separated parts, or
        if any part is not an integer.
    """
    pieces = arg.split('/')
    if len(pieces) != 3:
        raise ValueError(f"expected a date like 'M/D/YY', got {arg!r}")
    month, day, year = (int(item) for item in pieces)
    # Only expand two-digit years; adding 2000 unconditionally would turn
    # '1/23/2023' into the year 4023 without any error.
    if year < 100:
        year += 2000
    return Arrow(year=year, month=month, day=day)
    
# Load the deal table and derive the numeric / calendar columns used by the
# charts further down.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/largest-us-venture-funding-deals-of-2023/Largest US Venture Funding Deals Of 2023.csv',
)

# 'Amount' is text such as "$10,000,000,000". Strip the currency symbol and
# thousands separators with vectorized string methods, then cast to integer.
# regex=False is required for '$': as a regex it is an end-of-string anchor,
# not a literal dollar sign.
df['amount'] = (
    df['Amount']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype('int64')
)

# 'Date reported' is M/D/YY text; parse_dates turns each value into an Arrow
# object, from which the month number and day-of-year are read.
df['date reported'] = df['Date reported'].apply(func=parse_dates)
df['month'] = df['date reported'].apply(func=lambda x: x.month)
df['day of year'] = df['date reported'].apply(func=lambda x: x.datetime.timetuple().tm_yday)

df.head()

Unnamed: 0,Company,Amount,Lead investors,Valuation,Industry,Date reported,amount,date reported,month,day of year
0,OpenAI,"$10,000,000,000",Microsoft,,Artificial intelligence,1/23/23,10000000000,2023-01-23T00:00:00+00:00,1,23
1,Stripe,"$6,500,000,000",,"$50,000,000,000",Fintech,3/15/23,6500000000,2023-03-15T00:00:00+00:00,3,74
2,Inflection AI,"$1,300,000,000","Microsoft, Reid Hoffman, Bill Gates, Eric Schm...","$4,000,000,000",Artificial intelligence,6/29/23,1300000000,2023-06-29T00:00:00+00:00,6,180
3,Anthropic,"$1,250,000,000",Amazon,"$4,000,000,000",Artificial intelligence,9/25/23,1250000000,2023-09-25T00:00:00+00:00,9,268
4,Generate Capital,"$1,030,900,000",,,Energy,1/6/23,1030900000,2023-01-06T00:00:00+00:00,1,6


In [2]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Company         171 non-null    object
 1   Amount          171 non-null    object
 2   Lead investors  145 non-null    object
 3   Valuation       47 non-null     object
 4   Industry        171 non-null    object
 5   Date reported   171 non-null    object
 6   amount          171 non-null    int64 
 7   date reported   171 non-null    object
 8   month           171 non-null    int64 
 9   day of year     171 non-null    int64 
dtypes: int64(3), object(7)
memory usage: 13.5+ KB


In [3]:
# Count the distinct values in each column.
df.nunique()

Company           161
Amount             69
Lead investors    133
Valuation          29
Industry           63
Date reported      97
amount             69
date reported      97
month               9
day of year        97
dtype: int64

In [4]:
from plotly.express import histogram

# Deal sizes binned along x, with a logarithmic y-axis. Because y is also
# supplied, each bar aggregates the amounts in its bin (plotly's default
# aggregation when y is given is a sum) rather than counting deals.
histogram(
    data_frame=df,
    x='amount',
    y='amount',
    log_y=True,
)

In [5]:
from plotly.express import bar

# Total amount raised per calendar month, one bar per month.
bar(
    data_frame=df[['month', 'amount']].groupby(by='month', as_index=False).sum(),
    x='month',
    y='amount',
)

In [6]:
# Total amount raised per industry, ordered from the largest total down.
industry_df = (
    df[['Industry', 'amount']]
    .groupby(by='Industry')
    .sum()
    .reset_index()
    .sort_values(by='amount', ascending=False)
)
bar(data_frame=industry_df, x='Industry', y='amount')

This is probably the nut graf right here.

In [7]:
from plotly.express import pie

# industry_df is sorted by total amount (descending), so its first ten
# rows name the ten best-funded industries.
top_ten = industry_df['Industry'].values[:10]

# Relabel every industry outside the top ten as 'Other', then re-total so
# the long tail collapses into a single slice.
other_df = df[['Industry', 'amount']].copy()
in_top_ten = other_df['Industry'].isin(top_ten)
other_df['Industry'] = other_df['Industry'].where(in_top_ten, other='Other')

industry_top_df = (
    other_df
    .groupby(by='Industry')
    .sum()
    .reset_index()
    .sort_values(by='amount', ascending=False)
)
pie(data_frame=industry_top_df, names='Industry', values='amount', color='amount')

If we roll up the tail (everything outside the top ten) into an Other slice, the AI slice still looks big, but not as big as it does in the bar graph.

In [8]:
from plotly.express import colors, scatter

# Tag each deal with its own industry when that industry is in the top ten,
# and with 'Other' otherwise, so the legend stays readable.
df['industry'] = df['Industry'].where(df['Industry'].isin(top_ten), other='Other')

# One point per deal: position in the year on x, deal size (log scale) on y,
# colored by the rolled-up industry label.
scatter(
    data_frame=df,
    x='day of year',
    y='amount',
    color='industry',
    hover_name='Company',
    log_y=True,
    color_discrete_sequence=colors.qualitative.Vivid,
)

If we use the day of year, we can build a scatter plot that lets us look at the individual deals in context. We need a different color palette because we have more than ten categories to color.