In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from tempfile import mkdtemp
from shutil import rmtree


from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.ensemble import AdaBoostRegressor, VotingRegressor, GradientBoostingRegressor, StackingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectPercentile, mutual_info_regression, VarianceThreshold, SelectFromModel
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [2]:
data = pd.read_csv("../raw_data/X_y_data2 (2).csv")
data = data[data.y < 2]
X = data.iloc[:, :-2]
y = data.iloc[:, -2:]

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 855 entries, 1 to 2177
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           855 non-null    int64  
 1   state                855 non-null    object 
 2   funding_status       855 non-null    object 
 3   revenue_range        855 non-null    object 
 4   no_employees         855 non-null    object 
 5   no_founders          855 non-null    float64
 6   industry_groups      855 non-null    object 
 7   website              855 non-null    int64  
 8   phone                855 non-null    int64  
 9   email                855 non-null    int64  
 10  linkedin             855 non-null    int64  
 11  twitter              855 non-null    int64  
 12  facebook             855 non-null    int64  
 13  founded_year         855 non-null    int64  
 14  no_investors         855 non-null    float64
 15  no_fund_rounds       855 non-null    fl

In [4]:
X.head(5)

Unnamed: 0.1,Unnamed: 0,state,funding_status,revenue_range,no_employees,no_founders,industry_groups,website,phone,email,...,has_series_b,has_series_c,has_series_d,has_series_e,has_angel,has_debt_financing,has_grant,has_corporate_round,has_series_x,has_ico
1,1,Hamburg,Early Stage Venture,$1M to $10M,51-100,1.0,Healthcare and Biotechnology,1,0,1,...,1,0,0,0,0,0,1,0,1,0
2,2,Baden-Wurttemberg,Late Stage Venture,$1M to $10M,101-250,2.0,Energy and Natural Resources,1,1,1,...,1,1,0,0,0,0,0,0,0,0
3,3,Bayern,Late Stage Venture,$10M to $50M,251-500,3.0,Technology and Software,1,0,1,...,1,0,0,0,0,0,0,0,0,0
5,5,Sachsen,Late Stage Venture,$1M to $10M,101-250,8.0,Technology and Software,1,1,1,...,1,1,0,0,0,0,1,0,0,0
6,6,Bayern,M&A,$50M to $100M,501-1000,3.0,Technology and Software,1,1,1,...,0,1,0,0,0,0,0,0,1,0


In [5]:
X.drop(columns=["Unnamed: 0"])

Unnamed: 0,state,funding_status,revenue_range,no_employees,no_founders,industry_groups,website,phone,email,linkedin,...,has_series_b,has_series_c,has_series_d,has_series_e,has_angel,has_debt_financing,has_grant,has_corporate_round,has_series_x,has_ico
1,Hamburg,Early Stage Venture,$1M to $10M,51-100,1.0,Healthcare and Biotechnology,1,0,1,1,...,1,0,0,0,0,0,1,0,1,0
2,Baden-Wurttemberg,Late Stage Venture,$1M to $10M,101-250,2.0,Energy and Natural Resources,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0
3,Bayern,Late Stage Venture,$10M to $50M,251-500,3.0,Technology and Software,1,0,1,1,...,1,0,0,0,0,0,0,0,0,0
5,Sachsen,Late Stage Venture,$1M to $10M,101-250,8.0,Technology and Software,1,1,1,1,...,1,1,0,0,0,0,1,0,0,0
6,Bayern,M&A,$50M to $100M,501-1000,3.0,Technology and Software,1,1,1,1,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2164,Berlin,M&A,$1M to $10M,11-50,3.0,Hardware and Electronics,1,1,1,1,...,0,0,0,0,0,0,1,0,0,0
2171,Berlin,M&A,Less than $1M,11-50,2.0,Technology and Software,1,1,1,1,...,0,0,0,0,0,0,1,0,0,0
2172,Berlin,Early Stage Venture,$1M to $10M,11-50,2.0,Technology and Software,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2175,Bayern,Seed,$1M to $10M,11-50,2.0,Manufacturing and Industry,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
X.private_ipo

1       1
2       1
3       1
5       1
6       1
       ..
2164    1
2171    1
2172    1
2175    1
2177    1
Name: private_ipo, Length: 855, dtype: int64

In [7]:
X.columns

Index(['Unnamed: 0', 'state', 'funding_status', 'revenue_range',
       'no_employees', 'no_founders', 'industry_groups', 'website', 'phone',
       'email', 'linkedin', 'twitter', 'facebook', 'founded_year',
       'no_investors', 'no_fund_rounds', 'private_ipo', 'company_type',
       'operting_status', 'no_lead_investors', 'no_sub_orgs', 'has_preseed',
       'has_seed', 'has_series_a', 'has_series_b', 'has_series_c',
       'has_series_d', 'has_series_e', 'has_angel', 'has_debt_financing',
       'has_grant', 'has_corporate_round', 'has_series_x', 'has_ico'],
      dtype='object')

In [8]:
one_hot_category = ["state", "funding_status", "industry_groups", 'private_ipo', ]
ordinal_category = ["no_employees", "revenue_range"]
numerical_features = ['founded_year', 'private_ipo', 'website', 'phone', "no_founders",
       'email', 'linkedin', 'twitter', 'facebook', 'no_investors', 'no_fund_rounds', 'operting_status',
       'no_sub_orgs', 'has_preseed', 'has_seed', 'has_series_a', 'has_series_b', 'has_series_c',
       'has_series_d', 'has_series_e', 'has_angel', 'has_debt_financing',
       'has_grant', 'has_corporate_round', 'has_series_x']

In [9]:
no_employees_ordinal = [
    '11-50', '51-100', '101-250', '251-500', '501-1000', '1001-5000', '5001-10000', '10001+'
]

In [10]:
revenue_range_ordinal = [
    'Less than $1M', '$1M to $10M', '$10M to $50M', '$50M to $100M', '$100M to $500M', '$500M to $1B', '$1B to $10B', '$10B+'
]

In [11]:
feat_ordinal_dict = {
    "no_employees": no_employees_ordinal,
    "revenue_range": revenue_range_ordinal
}

In [12]:
encoder_ordinal = OrdinalEncoder(
    categories = [feat_ordinal_dict[i] for i in ordinal_category],
    dtype = np.int64
)


preproc_ordinal = make_pipeline(
    SimpleImputer(strategy = "most_frequent"),
    encoder_ordinal,
    MinMaxScaler()
)

preproc_ordinal

In [13]:
preproc_min_numerical = make_pipeline(
    KNNImputer(),
    MinMaxScaler())

preproc_min_numerical

In [14]:
preproc_nominal = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

preproc_nominal

In [15]:
preproc_robust_numerical = make_pipeline(
    KNNImputer(),
    RobustScaler())

preproc_robust_numerical

In [16]:
preproc = make_column_transformer(
        (preproc_ordinal, ordinal_category),
        (preproc_min_numerical, numerical_features),
        (preproc_nominal, one_hot_category),
        # (preproc_robust_numerical, robust_category),
        remainder="drop"
)

In [17]:
preproc

In [18]:
X_preprocessed = pd.DataFrame(preproc.fit_transform(X, y))

In [19]:
X_preprocessed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,0.142857,0.142857,0.642857,1.0,1.0,0.0,0.000,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.285714,0.142857,0.642857,1.0,1.0,1.0,0.125,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.428571,0.285714,0.642857,1.0,1.0,0.0,0.250,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.285714,0.142857,0.571429,1.0,1.0,1.0,0.875,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.571429,0.428571,0.642857,1.0,1.0,1.0,0.250,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
850,0.000000,0.142857,0.357143,1.0,1.0,1.0,0.250,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
851,0.000000,0.000000,0.357143,1.0,1.0,1.0,0.125,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
852,0.000000,0.142857,0.357143,1.0,1.0,0.0,0.125,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
853,0.000000,0.142857,0.357143,1.0,1.0,0.0,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y.iloc[:, -1:], test_size=.20, random_state=1)

In [21]:
! pip install tensorflow



In [22]:
from tensorflow import keras
from keras import Model, Sequential, layers, regularizers, optimizers
from keras.callbacks import EarlyStopping

2024-05-30 21:52:06.135903: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-30 21:52:06.140827: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-30 21:52:06.272218: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-30 21:52:06.807808: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [23]:
X_preprocessed.shape

(855, 70)

In [24]:
reg = regularizers.l1_l2(l2=0.005)

model = Sequential()
model.add(layers.Input(shape=(X_preprocessed.shape[1],)))
model.add(layers.Dense(50, activation="relu", kernel_regularizer=reg))
model.add(layers.BatchNormalization(momentum=0.9))
model.add(layers.Dropout(rate=0.1))
model.add(layers.Dense(20, activation="tanh"))
model.add(layers.BatchNormalization(momentum=0.9))
model.add(layers.Dropout(rate=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

2024-05-30 21:52:12.520595: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-30 21:52:12.521884: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [25]:
learning_rate = 0.01

optimizer = optimizers.Adam(learning_rate=learning_rate)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [26]:
es = EarlyStopping(
    patience=20,
    restore_best_weights=True,
    verbose=1
)

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=700,
    batch_size=32,
    callbacks=[es],
    verbose=0
)

Epoch 77: early stopping
Restoring model weights from the end of the best epoch: 57.


In [27]:
history.history

{'accuracy': [0.4899451434612274,
  0.6014625430107117,
  0.6563071012496948,
  0.678244948387146,
  0.7568555474281311,
  0.7714807987213135,
  0.7824497222900391,
  0.8153564929962158,
  0.8135283589363098,
  0.835466206073761,
  0.8537477254867554,
  0.8775137066841125,
  0.8976234197616577,
  0.9159049391746521,
  0.9159049391746521,
  0.8957952260971069,
  0.9067641496658325,
  0.9049360156059265,
  0.9140768051147461,
  0.9213894009590149,
  0.9067641496658325,
  0.9250457286834717,
  0.9159049391746521,
  0.9378427863121033,
  0.9195612668991089,
  0.9305301904678345,
  0.9341864585876465,
  0.9323583245277405,
  0.9341864585876465,
  0.9305301904678345,
  0.9232175350189209,
  0.9506398439407349,
  0.9396709203720093,
  0.9451553821563721,
  0.9506398439407349,
  0.9396709203720093,
  0.9579524397850037,
  0.9451553821563721,
  0.9305301904678345,
  0.9488117098808289,
  0.9506398439407349,
  0.9506398439407349,
  0.9506398439407349,
  0.9451553821563721,
  0.9561243057250977,


In [28]:
results = model.evaluate(X_test, y_test)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8864 - loss: 0.4460 


In [None]:
model.save("../models/first_dummy.keras")