In [7]:
#import necessary libraries
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
warnings.filterwarnings('ignore')




In [8]:
company = pd.read_csv("C:\\Users\\Administrator\\Desktop\\companies.csv")

In [9]:
company.drop_duplicates(inplace=True)

In [10]:
company.drop(columns=['region', 'city', 'state_code'], inplace=True)
company.drop(columns=['id', 'Unnamed: 0.1', 'entity_type', 'entity_id', 'parent_id', 'created_by', 'created_at', 'updated_at'], inplace=True)
company.drop(columns=['domain', 'homepage_url', 'twitter_username', 'logo_url', 'logo_width', 'logo_height', 'short_description', 'description', 'overview', 'tag_list', 'name', 'normalized_name', 'permalink', 'invested_companies'], inplace=True)

In [11]:
company.drop(columns=['first_investment_at', 'last_investment_at','investment_rounds','ROI'], inplace=True)

In [12]:
columns_to_check = ['status', 'country_code', 'category_code', 'founded_at']
company = company.dropna(subset=columns_to_check)

In [13]:
company.columns

Index(['category_code', 'status', 'founded_at', 'closed_at', 'country_code',
       'first_funding_at', 'last_funding_at', 'funding_rounds',
       'funding_total_usd', 'first_milestone_at', 'last_milestone_at',
       'milestones', 'relationships', 'lat', 'lng'],
      dtype='object')

In [14]:
date_columns = ['founded_at', 'first_funding_at', 'last_funding_at', 'first_milestone_at', 'last_milestone_at']

for col in date_columns:
    company[col] = pd.to_datetime(company[col], errors='coerce')

# Extract the year from each date column and create new columns with the years
for col in date_columns:
    new_col_name = col + '_year'
    company[new_col_name] = company[col].dt.year

In [15]:
company.drop(columns=['founded_at', 'first_funding_at', 'last_funding_at', 'first_milestone_at', 'last_milestone_at'], inplace=True)

In [16]:
mapping = {'operating': 1, 'ipo': 1, 'acquired': 0, 'closed': 0}

# Use the 'map' function to create the 'isClosed' feature
company['isClosed'] = company['status'].map(mapping)

# Display the updated DataFrame
company

Unnamed: 0,category_code,status,closed_at,country_code,funding_rounds,funding_total_usd,milestones,relationships,lat,lng,founded_at_year,first_funding_at_year,last_funding_at_year,first_milestone_at_year,last_milestone_at_year,isClosed
0,web,operating,,USA,3.0,39750000.0,5.0,17.0,47.606209,-122.332071,2005,2005.0,2008.0,2010.0,2013.0,1
5,advertising,operating,,MAR,,,,2.0,30.427755,-9.598107,2007,,,,,1
6,cleantech,operating,,IND,,,,,22.307159,73.181219,2008,,,,,1
12,advertising,operating,,USA,,,1.0,2.0,35.686975,-105.937799,2008,,,2008.0,2008.0,1
13,web,acquired,,USA,1.0,5000000.0,3.0,14.0,37.386052,-122.083851,2007,2008.0,2008.0,2008.0,2012.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,ecommerce,operating,,USA,,,2.0,5.0,37.774929,-122.419415,2007,,,2013.0,2013.0,1
196549,public_relations,operating,,USA,1.0,750000.0,1.0,14.0,37.338208,-121.886329,2007,2008.0,2008.0,2013.0,2013.0,1
196550,consulting,operating,,USA,,,3.0,44.0,38.882334,-77.171091,1959,,,2012.0,2013.0,1
196551,search,operating,,USA,,,,1.0,34.052234,-118.243685,2008,,,,,1


In [17]:
def update_closed_at(row):
    if row['status'] in ['operating', 'ipo']:
        return '2021'
    elif row['status'] in ['acquired', 'closed']:
        return '0'
    else:
        return row['closed_at']  # Keep the original value if not in the specified statuses

# Apply the function to the 'closed_at' column
company['closed_at'] = company.apply(update_closed_at, axis=1)

In [18]:
company['closed_at'] = pd.to_numeric(company['closed_at'], errors='coerce')
company['founded_at_year'] = pd.to_numeric(company['founded_at_year'], errors='coerce')

# Calculate the difference in years
company['active_years'] = company['closed_at'] - company['founded_at_year']

In [19]:
columns_to_replace_null = ['funding_rounds', 'funding_total_usd', 'milestones', 'relationships', 'lat', 'lng', 
                            'first_funding_at_year', 'last_funding_at_year', 'first_milestone_at_year', 
                            'last_milestone_at_year']

# Calculate mean values for each of the specified columns
mean_values = company[columns_to_replace_null].mean()

# Replace null values with the mean values for each column
company[columns_to_replace_null] = company[columns_to_replace_null].fillna(mean_values)

In [20]:
company.isna().sum()

category_code              0
status                     0
closed_at                  0
country_code               0
funding_rounds             0
funding_total_usd          0
milestones                 0
relationships              0
lat                        0
lng                        0
founded_at_year            0
first_funding_at_year      0
last_funding_at_year       0
first_milestone_at_year    0
last_milestone_at_year     0
isClosed                   0
active_years               0
dtype: int64

In [21]:
company.isnull().sum().any()

False

In [22]:
company.head()

Unnamed: 0,category_code,status,closed_at,country_code,funding_rounds,funding_total_usd,milestones,relationships,lat,lng,founded_at_year,first_funding_at_year,last_funding_at_year,first_milestone_at_year,last_milestone_at_year,isClosed,active_years
0,web,operating,2021,USA,3.0,39750000.0,5.0,17.0,47.606209,-122.332071,2005,2005.0,2008.0,2010.0,2013.0,1,16
5,advertising,operating,2021,MAR,1.805688,15819780.0,1.415262,2.0,30.427755,-9.598107,2007,2009.948925,2010.937102,2009.536583,2010.17684,1,14
6,cleantech,operating,2021,IND,1.805688,15819780.0,1.415262,4.43452,22.307159,73.181219,2008,2009.948925,2010.937102,2009.536583,2010.17684,1,13
12,advertising,operating,2021,USA,1.805688,15819780.0,1.0,2.0,35.686975,-105.937799,2008,2009.948925,2010.937102,2008.0,2008.0,1,13
13,web,acquired,0,USA,1.0,5000000.0,3.0,14.0,37.386052,-122.083851,2007,2008.0,2008.0,2008.0,2012.0,0,-2007


In [23]:
company.columns


Index(['category_code', 'status', 'closed_at', 'country_code',
       'funding_rounds', 'funding_total_usd', 'milestones', 'relationships',
       'lat', 'lng', 'founded_at_year', 'first_funding_at_year',
       'last_funding_at_year', 'first_milestone_at_year',
       'last_milestone_at_year', 'isClosed', 'active_years'],
      dtype='object')

In [24]:
company.describe()

Unnamed: 0,closed_at,funding_rounds,funding_total_usd,milestones,relationships,lat,lng,founded_at_year,first_funding_at_year,last_funding_at_year,first_milestone_at_year,last_milestone_at_year,isClosed,active_years
count,64099.0,64099.0,64099.0,64099.0,64099.0,64099.0,64099.0,64099.0,64099.0,64099.0,64099.0,64099.0,64099.0,64099.0
mean,1846.674207,1.805688,15819780.0,1.415262,4.43452,37.308357,-50.638153,2005.464126,2009.948925,2010.937102,2009.536583,2010.17684,0.913743,-158.789919
std,567.386965,0.783298,39501680.0,0.54782,11.517605,15.494886,69.424185,9.668363,1.685916,1.479593,2.975963,2.873369,0.280746,566.593322
min,0.0,1.0,291.0,1.0,1.0,-50.942326,-159.485278,1901.0,1960.0,1960.0,1960.0,1960.0,0.0,-2013.0
25%,2021.0,1.805688,15000000.0,1.0,2.0,34.052234,-111.658534,2004.0,2009.948925,2010.937102,2009.536583,2010.17684,1.0,10.0
50%,2021.0,1.805688,15819780.0,1.415262,3.0,38.984652,-74.204868,2008.0,2009.948925,2010.937102,2009.536583,2010.17684,1.0,12.0
75%,2021.0,1.805688,15819780.0,1.415262,4.43452,44.786568,0.121817,2011.0,2009.948925,2010.937102,2011.0,2012.0,1.0,16.0
max,2021.0,15.0,5700000000.0,9.0,1189.0,77.553604,176.16513,2014.0,2013.0,2013.0,2014.0,2014.0,1.0,120.0


In [26]:
company_numeric = company.apply(pd.to_numeric, errors='coerce')
corr_matrix = company_numeric.corr()


In [27]:
corr_matrix["isClosed"].sort_values(ascending=False)

closed_at                  1.000000
isClosed                   1.000000
active_years               0.999856
last_funding_at_year       0.258929
first_funding_at_year      0.230185
lng                        0.091154
founded_at_year            0.090549
first_milestone_at_year    0.085742
last_milestone_at_year     0.037659
funding_rounds             0.005797
funding_total_usd         -0.005215
lat                       -0.029214
relationships             -0.036650
milestones                -0.140219
category_code                   NaN
status                          NaN
country_code                    NaN
Name: isClosed, dtype: float64

In [28]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

In [29]:
# One-hot encoding to category_code
dummies = pd.get_dummies(company.category_code, prefix='category')
dummies

Unnamed: 0,category_advertising,category_analytics,category_automotive,category_biotech,category_cleantech,category_consulting,category_design,category_ecommerce,category_education,category_enterprise,...,category_real_estate,category_search,category_security,category_semiconductor,category_social,category_software,category_sports,category_transportation,category_travel,category_web
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
5,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
196549,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
196550,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
196551,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False


In [30]:
# Concatenate category with company dataset
company = pd.concat([company,dummies],axis='columns')
company

Unnamed: 0,category_code,status,closed_at,country_code,funding_rounds,funding_total_usd,milestones,relationships,lat,lng,...,category_real_estate,category_search,category_security,category_semiconductor,category_social,category_software,category_sports,category_transportation,category_travel,category_web
0,web,operating,2021,USA,3.000000,3.975000e+07,5.000000,17.00000,47.606209,-122.332071,...,False,False,False,False,False,False,False,False,False,True
5,advertising,operating,2021,MAR,1.805688,1.581978e+07,1.415262,2.00000,30.427755,-9.598107,...,False,False,False,False,False,False,False,False,False,False
6,cleantech,operating,2021,IND,1.805688,1.581978e+07,1.415262,4.43452,22.307159,73.181219,...,False,False,False,False,False,False,False,False,False,False
12,advertising,operating,2021,USA,1.805688,1.581978e+07,1.000000,2.00000,35.686975,-105.937799,...,False,False,False,False,False,False,False,False,False,False
13,web,acquired,0,USA,1.000000,5.000000e+06,3.000000,14.00000,37.386052,-122.083851,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,ecommerce,operating,2021,USA,1.805688,1.581978e+07,2.000000,5.00000,37.774929,-122.419415,...,False,False,False,False,False,False,False,False,False,False
196549,public_relations,operating,2021,USA,1.000000,7.500000e+05,1.000000,14.00000,37.338208,-121.886329,...,False,False,False,False,False,False,False,False,False,False
196550,consulting,operating,2021,USA,1.805688,1.581978e+07,3.000000,44.00000,38.882334,-77.171091,...,False,False,False,False,False,False,False,False,False,False
196551,search,operating,2021,USA,1.805688,1.581978e+07,1.415262,1.00000,34.052234,-118.243685,...,False,True,False,False,False,False,False,False,False,False


In [31]:
# Since, We've added the encoded country_code , let's delete original category_code
company.drop(['category_code'], axis=1,inplace=True)

In [32]:
# One-hot encoding to category_code
dummies = pd.get_dummies(company.country_code, prefix='country')
dummies

Unnamed: 0,country_AFG,country_AGO,country_ALB,country_AND,country_ANT,country_ARA,country_ARE,country_ARG,country_ARM,country_ATG,...,country_UZB,country_VCT,country_VEN,country_VGB,country_VIR,country_VNM,country_YEM,country_ZAF,country_ZMB,country_ZWE
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
196549,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
196550,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
196551,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [33]:
# Concatenate category with company dataset
company = pd.concat([company,dummies],axis='columns')
company

Unnamed: 0,status,closed_at,country_code,funding_rounds,funding_total_usd,milestones,relationships,lat,lng,founded_at_year,...,country_UZB,country_VCT,country_VEN,country_VGB,country_VIR,country_VNM,country_YEM,country_ZAF,country_ZMB,country_ZWE
0,operating,2021,USA,3.000000,3.975000e+07,5.000000,17.00000,47.606209,-122.332071,2005,...,False,False,False,False,False,False,False,False,False,False
5,operating,2021,MAR,1.805688,1.581978e+07,1.415262,2.00000,30.427755,-9.598107,2007,...,False,False,False,False,False,False,False,False,False,False
6,operating,2021,IND,1.805688,1.581978e+07,1.415262,4.43452,22.307159,73.181219,2008,...,False,False,False,False,False,False,False,False,False,False
12,operating,2021,USA,1.805688,1.581978e+07,1.000000,2.00000,35.686975,-105.937799,2008,...,False,False,False,False,False,False,False,False,False,False
13,acquired,0,USA,1.000000,5.000000e+06,3.000000,14.00000,37.386052,-122.083851,2007,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,operating,2021,USA,1.805688,1.581978e+07,2.000000,5.00000,37.774929,-122.419415,2007,...,False,False,False,False,False,False,False,False,False,False
196549,operating,2021,USA,1.000000,7.500000e+05,1.000000,14.00000,37.338208,-121.886329,2007,...,False,False,False,False,False,False,False,False,False,False
196550,operating,2021,USA,1.805688,1.581978e+07,3.000000,44.00000,38.882334,-77.171091,1959,...,False,False,False,False,False,False,False,False,False,False
196551,operating,2021,USA,1.805688,1.581978e+07,1.415262,1.00000,34.052234,-118.243685,2008,...,False,False,False,False,False,False,False,False,False,False


In [34]:
# Since, We've added the encoded country_code , let's delete original category_code
company.drop(['country_code'], axis=1,inplace=True)

In [35]:
company.drop(['status'], axis=1,inplace=True)

In [36]:
company.columns

Index(['closed_at', 'funding_rounds', 'funding_total_usd', 'milestones',
       'relationships', 'lat', 'lng', 'founded_at_year',
       'first_funding_at_year', 'last_funding_at_year',
       ...
       'country_UZB', 'country_VCT', 'country_VEN', 'country_VGB',
       'country_VIR', 'country_VNM', 'country_YEM', 'country_ZAF',
       'country_ZMB', 'country_ZWE'],
      dtype='object', length=221)

In [37]:
X = company.drop("isClosed", axis=1)
y = company["isClosed"]

In [41]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [42]:
from collections import Counter
Counter(y_train)

Counter({1: 46822, 0: 4457})

In [43]:
from sklearn.preprocessing import StandardScaler
scaling = StandardScaler()

In [44]:
scaling.fit_transform(X_train)

array([[ 3.08529245e-01,  5.98908332e-04, -1.97235659e-03, ...,
        -5.86858144e-02, -4.41605487e-03, -8.83236811e-03],
       [ 3.08529245e-01,  5.98908332e-04, -1.97235659e-03, ...,
        -5.86858144e-02, -4.41605487e-03, -8.83236811e-03],
       [ 3.08529245e-01, -1.02666129e+00, -3.82758561e-01, ...,
        -5.86858144e-02, -4.41605487e-03, -8.83236811e-03],
       ...,
       [ 3.08529245e-01,  5.98908332e-04, -1.97235659e-03, ...,
        -5.86858144e-02, -4.41605487e-03, -8.83236811e-03],
       [ 3.08529245e-01,  5.98908332e-04, -1.97235659e-03, ...,
        -5.86858144e-02, -4.41605487e-03, -8.83236811e-03],
       [ 3.08529245e-01,  1.52335911e+00,  9.95619288e-01, ...,
        -5.86858144e-02, -4.41605487e-03, -8.83236811e-03]])

In [45]:
scaling.fit_transform(X_test)

array([[ 0.30207544, -0.00241125,  0.01000489, ..., -0.05594542,
         0.        ,  0.        ],
       [ 0.30207544, -0.00241125,  0.01000489, ..., -0.05594542,
         0.        ,  0.        ],
       [ 0.30207544, -0.00241125,  0.01000489, ..., -0.05594542,
         0.        ,  0.        ],
       ...,
       [ 0.30207544, -0.00241125,  0.01000489, ..., -0.05594542,
         0.        ,  0.        ],
       [ 0.30207544, -1.03638657,  1.00329495, ..., -0.05594542,
         0.        ,  0.        ],
       [ 0.30207544, -0.00241125,  0.01000489, ..., -0.05594542,
         0.        ,  0.        ]])

In [46]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [47]:
log_reg_predict = log_reg.predict(X_test)

In [48]:
log_reg.score(X_train, y_train)

1.0

In [49]:
log_reg.score(X_test, y_test)

1.0

In [50]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(confusion_matrix(y_test,log_reg_predict))
print(accuracy_score(y_test,log_reg_predict))
print(classification_report(y_test,log_reg_predict))

[[ 1072     0]
 [    0 11748]]
1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1072
           1       1.00      1.00      1.00     11748

    accuracy                           1.00     12820
   macro avg       1.00      1.00      1.00     12820
weighted avg       1.00      1.00      1.00     12820



In [51]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

In [52]:
# Create a Gaussian Naive Bayes classifier
model = GaussianNB()

# Fit the model to the training data
model.fit(X_train, y_train)

In [53]:
# Use the model to make predictions on the test data
y_pred = model.predict(X_test)

In [54]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display a classification report with additional metrics
print(classification_report(y_test, y_pred))

Accuracy: 0.9139625585023401
              precision    recall  f1-score   support

           0       0.29      0.02      0.04      1072
           1       0.92      1.00      0.95     11748

    accuracy                           0.91     12820
   macro avg       0.60      0.51      0.50     12820
weighted avg       0.86      0.91      0.88     12820



In [55]:
from sklearn.metrics import confusion_matrix

# Assuming you have already made predictions and calculated the confusion matrix
confusion = confusion_matrix(y_test, y_pred)

# Calculate Specificity and Sensitivity
tn, fp, fn, tp = confusion.ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)

print("Specificity (True Negative Rate):", specificity)
print("Sensitivity (True Positive Rate):", sensitivity)


Specificity (True Negative Rate): 0.01958955223880597
Sensitivity (True Positive Rate): 0.9955737146748382


In [57]:
from joblib import dump

In [58]:
dump(model,'./../savedModels/model.joblib')

['./../savedModels/model.joblib']