In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from tempfile import mkdtemp
from shutil import rmtree


from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.ensemble import AdaBoostRegressor, VotingRegressor, GradientBoostingRegressor, StackingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectPercentile, mutual_info_regression, VarianceThreshold, SelectFromModel
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [41]:
data = pd.read_csv("../raw_data/X_y_data1.csv")
data = data[data.y < 2]
X = data.iloc[:, :-2]
y = data.iloc[:, -2:]

In [43]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 897 entries, 1 to 2183
Data columns (total 33 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           897 non-null    int64  
 1   state                897 non-null    object 
 2   funding_status       897 non-null    object 
 3   revenue_range        897 non-null    object 
 4   no_employees         897 non-null    object 
 5   no_founders          897 non-null    float64
 6   industry_groups      897 non-null    object 
 7   website              897 non-null    int64  
 8   phone                897 non-null    int64  
 9   email                897 non-null    int64  
 10  linkedin             897 non-null    int64  
 11  twitter              897 non-null    int64  
 12  facebook             897 non-null    int64  
 13  founded_year         897 non-null    int64  
 14  no_investors         897 non-null    float64
 15  no_fund_rounds       896 non-null    fl

In [44]:
X.head(5)

Unnamed: 0.1,Unnamed: 0,state,funding_status,revenue_range,no_employees,no_founders,industry_groups,website,phone,email,...,has_series_a,has_series_b,has_series_c,has_series_d,has_series_e,has_angel,has_debt_financing,has_grant,has_corporate_round,has_series_x
1,1,Hamburg,Early Stage Venture,$1M to $10M,51-100,1.0,Health and Biotechnology,1,0,1,...,1,1,0,0,0,0,0,1,0,1
2,2,Baden-Wurttemberg,Late Stage Venture,$1M to $10M,101-250,2.0,Industrial and Manufacturing,1,1,1,...,1,1,1,0,0,0,0,0,0,0
3,3,Bayern,Late Stage Venture,$10M to $50M,251-500,3.0,Technology and Software,1,0,1,...,1,1,0,0,0,0,0,0,0,0
5,5,Sachsen,Late Stage Venture,$1M to $10M,101-250,8.0,Technology and Software,1,1,1,...,1,1,1,0,0,0,0,1,0,0
6,6,Bayern,M&A,$50M to $100M,501-1000,3.0,Technology and Software,1,1,1,...,0,0,1,0,0,0,0,0,0,1


In [45]:
X.drop(columns=["Unnamed: 0"])

Unnamed: 0,state,funding_status,revenue_range,no_employees,no_founders,industry_groups,website,phone,email,linkedin,...,has_series_a,has_series_b,has_series_c,has_series_d,has_series_e,has_angel,has_debt_financing,has_grant,has_corporate_round,has_series_x
1,Hamburg,Early Stage Venture,$1M to $10M,51-100,1.0,Health and Biotechnology,1,0,1,1,...,1,1,0,0,0,0,0,1,0,1
2,Baden-Wurttemberg,Late Stage Venture,$1M to $10M,101-250,2.0,Industrial and Manufacturing,1,1,1,1,...,1,1,1,0,0,0,0,0,0,0
3,Bayern,Late Stage Venture,$10M to $50M,251-500,3.0,Technology and Software,1,0,1,1,...,1,1,0,0,0,0,0,0,0,0
5,Sachsen,Late Stage Venture,$1M to $10M,101-250,8.0,Technology and Software,1,1,1,1,...,1,1,1,0,0,0,0,1,0,0
6,Bayern,M&A,$50M to $100M,501-1000,3.0,Technology and Software,1,1,1,1,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2170,Berlin,M&A,$1M to $10M,11-50,3.0,Consumer Goods and Services,1,1,1,1,...,0,0,0,0,0,0,0,1,0,0
2177,Berlin,M&A,Less than $1M,11-50,2.0,Technology and Software,1,1,1,1,...,0,0,0,0,0,0,0,1,0,0
2178,Berlin,Early Stage Venture,$1M to $10M,11-50,2.0,Technology and Software,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2181,Bayern,Seed,$1M to $10M,11-50,2.0,Miscellaneous,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
X.private_ipo

1       1
2       1
3       1
5       1
6       1
       ..
2170    1
2177    1
2178    1
2181    1
2183    1
Name: private_ipo, Length: 897, dtype: int64

In [47]:
X.columns

Index(['Unnamed: 0', 'state', 'funding_status', 'revenue_range',
       'no_employees', 'no_founders', 'industry_groups', 'website', 'phone',
       'email', 'linkedin', 'twitter', 'facebook', 'founded_year',
       'no_investors', 'no_fund_rounds', 'private_ipo', 'company_type',
       'operting_status', 'no_lead_investors', 'no_sub_orgs', 'has_preseed',
       'has_seed', 'has_series_a', 'has_series_b', 'has_series_c',
       'has_series_d', 'has_series_e', 'has_angel', 'has_debt_financing',
       'has_grant', 'has_corporate_round', 'has_series_x'],
      dtype='object')

In [48]:
one_hot_category = ["state", "funding_status", "no_founders", "industry_groups", 'private_ipo', ]
ordinal_category = ["no_employees", "revenue_range"]
numerical_features = ['founded_year', 'private_ipo', 'website', 'phone',
       'email', 'linkedin', 'twitter', 'facebook', 'no_investors', 'no_fund_rounds', 'operting_status',
       'no_sub_orgs', 'has_preseed', 'has_seed', 'has_series_a', 'has_series_b', 'has_series_c',
       'has_series_d', 'has_series_e', 'has_angel', 'has_debt_financing',
       'has_grant', 'has_corporate_round', 'has_series_x']

In [49]:
no_employees_ordinal = [
    '11-50', '51-100', '101-250', '251-500', '501-1000', '1001-5000', '5001-10000', '10001+'
]

In [50]:
revenue_range_ordinal = [
    'Less than $1M', '$1M to $10M', '$10M to $50M', '$50M to $100M', '$100M to $500M', '$500M to $1B', '$1B to $10B', '$10B+'
]

In [51]:
feat_ordinal_dict = {
    "no_employees": no_employees_ordinal,
    "revenue_range": revenue_range_ordinal
}

In [52]:
encoder_ordinal = OrdinalEncoder(
    categories = [feat_ordinal_dict[i] for i in ordinal_category],  
    dtype = np.int64
)


preproc_ordinal = make_pipeline(
    SimpleImputer(strategy = "most_frequent"),
    encoder_ordinal, 
    MinMaxScaler()
)

preproc_ordinal

In [53]:
preproc_min_numerical = make_pipeline(
    KNNImputer(),
    MinMaxScaler())

preproc_min_numerical

In [54]:
preproc_nominal = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

preproc_nominal

In [55]:
preproc_robust_numerical = make_pipeline(
    KNNImputer(),
    RobustScaler())

preproc_robust_numerical

In [56]:
preproc = make_column_transformer(
        (preproc_ordinal, ordinal_category),
        (preproc_min_numerical, numerical_features),
        (preproc_nominal, one_hot_category),
        # (preproc_robust_numerical, robust_category),
        remainder="drop"
)

In [57]:
preproc

In [179]:
X_preprocessed = pd.DataFrame(preproc.fit_transform(X, y))

In [180]:
X_preprocessed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,58,59,60,61,62,63,64,65,66,67
0,0.142857,0.142857,0.642857,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.285714,0.142857,0.642857,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.428571,0.285714,0.642857,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.285714,0.142857,0.571429,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.571429,0.428571,0.642857,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,0.000000,0.142857,0.357143,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
893,0.000000,0.000000,0.357143,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
894,0.000000,0.142857,0.357143,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
895,0.000000,0.142857,0.357143,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [182]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y.iloc[:, -1:], test_size=.20, random_state=1)


In [61]:
! pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow)
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow)
  Using cached h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata

In [62]:
from tensorflow import keras
from keras import Model, Sequential, layers, regularizers, optimizers
from keras.callbacks import EarlyStopping

2024-05-30 16:07:33.525847: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-30 16:07:33.529335: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-30 16:07:33.666845: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-30 16:07:34.132995: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [64]:
X_preprocessed.shape

(897, 68)

In [183]:
reg = regularizers.l1_l2(l2=0.005)

model = Sequential()
model.add(layers.Input(shape=(X_preprocessed.shape[1],)))
model.add(layers.Dense(50, activation="relu", kernel_regularizer=reg))
model.add(layers.BatchNormalization(momentum=0.9))
model.add(layers.Dropout(rate=0.1))
model.add(layers.Dense(20, activation="tanh"))
model.add(layers.BatchNormalization(momentum=0.9))
model.add(layers.Dropout(rate=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

In [184]:
learning_rate = 0.01

optimizer = optimizers.Adam(learning_rate=learning_rate)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [185]:
es = EarlyStopping(
    patience=20,
    restore_best_weights=True,
    verbose=1
)

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=700,
    batch_size=32,
    callbacks=[es],
    verbose=0
)

Epoch 41: early stopping
Restoring model weights from the end of the best epoch: 21.


In [186]:
history.history

{'accuracy': [0.49389180541038513,
  0.5759162306785583,
  0.6300174593925476,
  0.6858638525009155,
  0.6806282997131348,
  0.7312390804290771,
  0.7521815299987793,
  0.7783595323562622,
  0.7870855331420898,
  0.8115183115005493,
  0.830715537071228,
  0.8272251486778259,
  0.8481675386428833,
  0.8394415378570557,
  0.8551483154296875,
  0.8621291518211365,
  0.8638743162155151,
  0.8621291518211365,
  0.8708551526069641,
  0.8917975425720215,
  0.8726003766059875,
  0.8813263773918152,
  0.8900523781776428,
  0.904013991355896,
  0.8795811533927917,
  0.8935427665710449,
  0.9075043797492981,
  0.9179755449295044,
  0.9109947681427002,
  0.9179755449295044,
  0.9127399921417236,
  0.9127399921417236,
  0.9232111573219299,
  0.9249563813209534,
  0.9249563813209534,
  0.933682382106781,
  0.926701545715332,
  0.9301919937133789,
  0.9249563813209534,
  0.9232111573219299,
  0.9284467697143555],
 'loss': [1.0661908388137817,
  0.9466670155525208,
  0.8869998455047607,
  0.8208155035

In [187]:
results = model.evaluate(X_test, y_test, )


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7358 - loss: 0.6788 
