In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from tempfile import mkdtemp
from shutil import rmtree


from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.ensemble import AdaBoostRegressor, VotingRegressor, GradientBoostingRegressor, StackingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectPercentile, mutual_info_regression, VarianceThreshold, SelectFromModel
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler, MultiLabelBinarizer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
data = pd.read_csv("../predictor/raw_data/X_y_data2.csv")
data = data[data.last_funding_amount > 10000]
X = data.iloc[:, :-1]
y = np.log(data.loc[:, ["last_funding_amount"]])

In [3]:
np.log(y.sort_values(by=["last_funding_amount"]).astype("int"))

Unnamed: 0,last_funding_amount
1817,2.197225
323,2.197225
2150,2.197225
1310,2.197225
1367,2.197225
...,...
1760,2.995732
1114,2.995732
473,3.091042
1647,3.091042


In [4]:
np.exp(y.sort_values(by=["last_funding_amount"]).astype("int"))

Unnamed: 0,last_funding_amount
1817,8.103084e+03
323,8.103084e+03
2150,8.103084e+03
1310,8.103084e+03
1367,8.103084e+03
...,...
1760,4.851652e+08
1114,4.851652e+08
473,3.584913e+09
1647,3.584913e+09


In [5]:
data.funding_status.value_counts().index


Index(['Seed', 'Early Stage Venture', 'M&A', 'Late Stage Venture',
       'Private Equity', 'IPO'],
      dtype='object')

In [6]:
one_hot_category = [
    "funding_status", "industry_groups"
]

"""multi_hot_category = [
    "industry"
]"""

ordinal_category = [
    "no_employees", "revenue_range"
]

numerical_features = [
    'months_since_founded', "no_founders", "lat", "lon",
    'no_investors', 'no_fund_rounds',
    'no_sub_orgs', 'has_preseed', 'has_seed', 'has_series_a', 'has_series_b', 'has_series_c',
    'has_debt_financing', 'has_grant'
]

In [7]:
no_employees_ordinal = [
    '11-50', '51-100', '101-250', '251-500', '501-1000', '1001-5000', '5001-10000', '10001+'
]

In [8]:
revenue_range_ordinal = [
    'Less than $1M', '$1M to $10M', '$10M to $50M', '$50M to $100M', '$100M to $500M', '$500M to $1B', '$1B to $10B', '$10B+'
]

In [9]:
industry_multi = [
 'Biotechnology',
 'Energy',
 'Professional Services',
 'Government and Military',
 'Media and Entertainment',
 'Data and Analytics',
 'Consumer Electronics',
 'Apps',
 'Blockchain and Cryptocurrency',
 'Community and Lifestyle',
 'Software',
 'Gaming',
 'Consumer Goods',
 'Clothing and Apparel',
 'Artificial Intelligence (AI)',
 'Commerce and Shopping',
 'Health Care',
 'Sustainability',
 'Design',
 'Transportation',
 'Administrative Services',
 'Hardware',
 'Content and Publishing',
 'Information Technology',
 'Education',
 'Mobile',
 'Natural Resources',
 'Food and Beverage',
 'Travel and Tourism',
 'Events',
 'Agriculture and Farming',
 'Science and Engineering',
 'Other',
 'Privacy and Security',
 'Financial Services',
 'Manufacturing',
 'Sports',
 'Sales and Marketing',
 'Real Estate',
 'Advertising',
 'Internet Services',
 'Platforms',
 'Payments',
 'Lending and Investments',
 'Video',
 'Music and Audio',
 'Navigation and Mapping',
 'Social Impact',
 'Messaging and Telecommunications'
]

In [10]:
feat_ordinal_dict = {
    "no_employees": no_employees_ordinal,
    "revenue_range": revenue_range_ordinal
}

In [11]:
encoder_ordinal = OrdinalEncoder(
    categories = [feat_ordinal_dict[i] for i in ordinal_category],
    dtype = np.int64
)


preproc_ordinal = make_pipeline(
    SimpleImputer(strategy = "most_frequent"),
    encoder_ordinal,
    MinMaxScaler()
)

preproc_ordinal

In [12]:
preproc_min_numerical = make_pipeline(
    KNNImputer(),
    MinMaxScaler())

preproc_min_numerical

In [13]:
preproc_nominal = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

preproc_nominal

In [14]:
preproc_multi = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    MultiLabelBinarizer(classes=industry_multi)
)

preproc_multi

In [15]:
preproc_robust_numerical = make_pipeline(
    KNNImputer(),
    RobustScaler())

preproc_robust_numerical

In [16]:
preproc = make_column_transformer(
        (preproc_ordinal, ordinal_category),
        (preproc_min_numerical, numerical_features),
        (preproc_nominal, one_hot_category),
        # (preproc_multi, multi_hot_category),
        # (preproc_robust_numerical, robust_category),
        remainder="drop"
)

In [17]:
preproc

In [18]:
X_preprocessed = pd.DataFrame(preproc.fit_transform(X, y))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y.iloc[:, -1:], test_size=.20, random_state=1)


In [20]:
! pip install tensorflow



In [21]:
from tensorflow import keras
from keras import Model, Sequential, layers, regularizers, optimizers
from keras.callbacks import EarlyStopping

2024-06-05 11:12:56.639935: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-05 11:12:56.751053: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-05 11:12:56.756230: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-05 11:12:56.756249: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [22]:
X_preprocessed.shape

(1839, 41)

In [23]:
reg = regularizers.l1_l2(l2=0.005)

model = Sequential()
model.add(layers.Input(shape=(X_preprocessed.shape[1],)))
model.add(layers.Dense(50, activation="relu", kernel_regularizer=reg))
model.add(layers.BatchNormalization(momentum=0.9))
model.add(layers.Dropout(rate=0.1))
model.add(layers.Dense(20, activation="tanh"))
model.add(layers.BatchNormalization(momentum=0.9))
model.add(layers.Dropout(rate=0.1))
model.add(layers.Dense(1, activation="linear"))

2024-06-05 11:12:57.927357: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:966] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-05 11:12:57.927515: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-06-05 11:12:57.927557: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-06-05 11:12:57.927594: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-06-05 11:12:57.927630: W tensorflow/stream_executor/platform/default/dso_loader.cc:6

In [24]:
learning_rate = 0.005

optimizer = optimizers.Adam(learning_rate=learning_rate)
model.compile(loss='mae',
              optimizer='adam',
              metrics=['mse'])

In [25]:
es = EarlyStopping(
    patience=20,
    restore_best_weights=True,
    verbose=1
)

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=200,
    batch_size=16,
    callbacks=[es],
    verbose=0
)

Restoring model weights from the end of the best epoch: 124.
Epoch 144: early stopping


In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                2100      
                                                                 
 batch_normalization (BatchN  (None, 50)               200       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 20)                1020      
                                                                 
 batch_normalization_1 (Batc  (None, 20)               80        
 hNormalization)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 20)                0

In [27]:
pd.DataFrame(history.history)

Unnamed: 0,loss,mse,val_loss,val_mse
0,17.415997,230.871017,16.534567,217.324417
1,16.138178,215.637955,15.122737,194.451004
2,14.705515,189.378021,13.765849,168.696106
3,12.910334,152.158615,11.944830,131.747543
4,10.707806,110.933899,9.635286,89.901894
...,...,...,...,...
139,1.325903,2.503954,1.005441,1.526778
140,1.284622,2.356836,1.022559,1.558445
141,1.341088,2.511621,1.069060,1.652118
142,1.326927,2.493844,1.018364,1.563029


In [28]:
#df = pd.read_csv("../predictor/raw_data/X_y_data2.csv")

In [29]:
#X_pred = df.iloc[:, :-1]

In [30]:
#X_pred_preprocessed = preproc.transform(X_pred)

In [31]:
#model.predict(X_pred_preprocessed)

In [32]:
#df["y_pred"] = model.predict(X_pred_preprocessed)

In [33]:
#df

In [34]:
#df.to_csv("../predictor/raw_data/predictions3.csv")

In [36]:
model.save("second_model.keras")

In [38]:
np.exp(15.349234923)

4635408.667508026