In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from tempfile import mkdtemp
from shutil import rmtree


from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.ensemble import AdaBoostRegressor, VotingRegressor, GradientBoostingRegressor, StackingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectPercentile, mutual_info_regression, VarianceThreshold, SelectFromModel
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [33]:
data = pd.read_csv("../raw_data/X_y_data2.csv")
data = data[data.y < 2]
X = data.iloc[:, :-2]
y = data.iloc[:, -1:]

In [43]:
dict(X.iloc[:1, :])

{'Unnamed: 0': 1    1
 Name: Unnamed: 0, dtype: int64,
 'state': 1     Hamburg
 Name: state, dtype: object,
 'funding_status': 1    Early Stage Venture
 Name: funding_status, dtype: object,
 'revenue_range': 1    $1M to $10M
 Name: revenue_range, dtype: object,
 'no_employees': 1    51-100
 Name: no_employees, dtype: object,
 'no_founders': 1    1.0
 Name: no_founders, dtype: float64,
 'industry_groups': 1    Healthcare and Biotechnology
 Name: industry_groups, dtype: object,
 'website': 1    1
 Name: website, dtype: int64,
 'phone': 1    0
 Name: phone, dtype: int64,
 'email': 1    1
 Name: email, dtype: int64,
 'linkedin': 1    1
 Name: linkedin, dtype: int64,
 'twitter': 1    0
 Name: twitter, dtype: int64,
 'facebook': 1    0
 Name: facebook, dtype: int64,
 'founded_year': 1    2018
 Name: founded_year, dtype: int64,
 'no_investors': 1    14.0
 Name: no_investors, dtype: float64,
 'no_fund_rounds': 1    8.0
 Name: no_fund_rounds, dtype: float64,
 'private_ipo': 1    1
 Name: privat

In [3]:
one_hot_category = [
    "state", "funding_status", "industry_groups",
]

ordinal_category = [
    "no_employees", "revenue_range"
]

numerical_features = [
    'founded_year', 'website', 'phone', "no_founders",
    'email', 'linkedin', 'twitter', 'facebook', 'no_investors', 'no_fund_rounds',
    'no_sub_orgs', 'has_preseed', 'has_seed', 'has_series_a', 'has_series_b', 'has_series_c',
    'has_series_d', 'has_series_e', 'has_angel', 'has_debt_financing',
    'has_grant', 'has_corporate_round', 'has_series_x'
]

In [4]:
no_employees_ordinal = [
    '11-50', '51-100', '101-250', '251-500', '501-1000', '1001-5000', '5001-10000', '10001+'
]

In [5]:
revenue_range_ordinal = [
    'Less than $1M', '$1M to $10M', '$10M to $50M', '$50M to $100M', '$100M to $500M', '$500M to $1B', '$1B to $10B', '$10B+'
]

In [6]:
feat_ordinal_dict = {
    "no_employees": no_employees_ordinal,
    "revenue_range": revenue_range_ordinal
}

In [7]:
encoder_ordinal = OrdinalEncoder(
    categories = [feat_ordinal_dict[i] for i in ordinal_category],  
    dtype = np.int64
)


preproc_ordinal = make_pipeline(
    SimpleImputer(strategy = "most_frequent"),
    encoder_ordinal, 
    MinMaxScaler()
)

preproc_ordinal

In [8]:
preproc_min_numerical = make_pipeline(
    KNNImputer(),
    MinMaxScaler())

preproc_min_numerical

In [9]:
preproc_nominal = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

preproc_nominal

In [10]:
preproc_robust_numerical = make_pipeline(
    KNNImputer(),
    RobustScaler())

preproc_robust_numerical

In [11]:
preproc = make_column_transformer(
        (preproc_ordinal, ordinal_category),
        (preproc_min_numerical, numerical_features),
        (preproc_nominal, one_hot_category),
        # (preproc_robust_numerical, robust_category),
        remainder="drop"
)

In [12]:
preproc

In [13]:
X_preprocessed = pd.DataFrame(preproc.fit_transform(X, y))

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y.iloc[:, -1:], test_size=.20, random_state=1)


In [15]:
! pip install tensorflow



In [16]:
from tensorflow import keras
from keras import Model, Sequential, layers, regularizers, optimizers
from keras.callbacks import EarlyStopping

2024-05-31 13:09:53.110324: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-31 13:09:53.114441: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-31 13:09:53.117447: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-31 13:09:53.202183: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
X_preprocessed.shape

(855, 66)

In [18]:
reg = regularizers.l1_l2(l2=0.005)

model = Sequential()
model.add(layers.Input(shape=(X_preprocessed.shape[1],)))
model.add(layers.Dense(50, activation="relu", kernel_regularizer=reg))
model.add(layers.BatchNormalization(momentum=0.9))
model.add(layers.Dropout(rate=0.1))
model.add(layers.Dense(20, activation="tanh"))
model.add(layers.BatchNormalization(momentum=0.9))
model.add(layers.Dropout(rate=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

2024-05-31 13:09:54.726354: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-31 13:09:54.726720: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [19]:
learning_rate = 0.01

optimizer = optimizers.Adam(learning_rate=learning_rate)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [20]:
es = EarlyStopping(
    patience=20,
    restore_best_weights=True,
    verbose=1
)

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=700,
    batch_size=32,
    callbacks=[es],
    verbose=0
)

Epoch 54: early stopping
Restoring model weights from the end of the best epoch: 34.


In [39]:
model.summary()

In [21]:
history.history

{'accuracy': [0.5338208675384521,
  0.5831809639930725,
  0.7001827955245972,
  0.7404022216796875,
  0.7586837410926819,
  0.7824497222900391,
  0.8043875694274902,
  0.8446069359779358,
  0.8574039936065674,
  0.8903107643127441,
  0.868372917175293,
  0.8976234197616577,
  0.8866544961929321,
  0.9031078815460205,
  0.9049360156059265,
  0.9104204773902893,
  0.8994515538215637,
  0.9396709203720093,
  0.9378427863121033,
  0.9360146522521973,
  0.9414991140365601,
  0.9414991140365601,
  0.9506398439407349,
  0.9341864585876465,
  0.9305301904678345,
  0.9451553821563721,
  0.9469835758209229,
  0.9469835758209229,
  0.9634369015693665,
  0.9579524397850037,
  0.9597806334495544,
  0.9652650952339172,
  0.9524679780006409,
  0.9652650952339172,
  0.9670932292938232,
  0.9542961716651917,
  0.9616087675094604,
  0.9634369015693665,
  0.97074955701828,
  0.9634369015693665,
  0.974405825138092,
  0.9762340188026428,
  0.9616087675094604,
  0.9652650952339172,
  0.9670932292938232,
  

In [23]:
df = pd.read_csv("../raw_data/X_y_data2.csv")
df = df[df.y > 1]
X_pred = df.iloc[:, :-2]

In [24]:
X_pred_preprocessed = preproc.transform(X_pred)

In [25]:
model.predict(X_pred_preprocessed)

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


array([[0.9611442 ],
       [0.9547018 ],
       [0.64973736],
       ...,
       [0.8771408 ],
       [0.4069949 ],
       [0.10386123]], dtype=float32)

In [26]:
df["y_pred"] = model.predict(X_pred_preprocessed)

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [27]:
df

Unnamed: 0.1,Unnamed: 0,state,funding_status,revenue_range,no_employees,no_founders,industry_groups,website,phone,email,...,has_series_e,has_angel,has_debt_financing,has_grant,has_corporate_round,has_series_x,has_ico,total_funding_usd,y,y_pred
0,0,Bayern,Early Stage Venture,$100M to $500M,251-500,5.0,Retail and E-commerce,1,0,1,...,0,0,0,0,0,1,0,67922929.0,2,0.961144
4,4,Nordrhein-Westfalen,Early Stage Venture,$50M to $100M,251-500,3.0,Technology and Software,1,0,1,...,0,0,0,0,0,0,0,73000000.0,2,0.954702
8,8,Berlin,Early Stage Venture,$1M to $10M,51-100,2.0,Technology and Software,1,0,1,...,0,0,0,0,0,1,0,40196721.0,2,0.649737
9,9,Berlin,Seed,Less than $1M,101-250,3.0,Technology and Software,1,1,1,...,0,0,0,0,0,0,0,95000000.0,2,0.379408
13,13,Nordrhein-Westfalen,Early Stage Venture,$10M to $50M,101-250,1.0,Retail and E-commerce,1,1,0,...,0,0,0,0,0,1,0,116787884.0,2,0.923170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2169,2180,Baden-Wurttemberg,Seed,$1M to $10M,11-50,1.0,Technology and Software,1,1,1,...,0,0,0,0,0,0,0,27284.0,2,0.784431
2170,2181,Hessen,Seed,Less than $1M,11-50,1.0,Business Services,1,1,1,...,0,0,0,1,0,0,0,81868.0,2,0.649491
2173,2184,Berlin,Seed,$1M to $10M,11-50,1.0,Technology and Software,1,0,1,...,0,0,0,0,0,0,0,145515.0,2,0.877141
2174,2185,Hessen,Seed,$1M to $10M,11-50,2.0,Consumer Products,1,0,1,...,0,0,0,0,0,0,0,75000.0,2,0.406995


In [28]:
df.to_csv("../raw_data/predictions.csv")

In [31]:
model.save("dummy_model.keras")

In [32]:
new_df = pd.read_csv("../raw_data/new_df2.csv")

In [34]:
new_df["old_y"] = data["y"]

In [37]:
new_df.funding_status

0       Venture - Series Unknown
1                       Series B
2                       Series C
3                       Series C
4                       Series B
                  ...           
2184                        Seed
2185                        Seed
2186                        Seed
2187                        Seed
2188                         M&A
Name: funding_status, Length: 2189, dtype: object