In [35]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn import metrics

In [71]:
bin_df = pd.read_csv('data/dataframes/df_after_cols_reduction3.csv').iloc[:,1:]
pca_df = pd.read_csv('data/dataframes/pca_df.csv').iloc[:,1:]

print(f'Binary dataframe shape: {bin_df.shape}')
print(f'PCA dataframe shape: {pca_df.shape}')


Binary dataframe shape: (10070, 1856)
PCA dataframe shape: (10070, 26)


In [72]:
success_rate = 4000000

# bin_df.loc[(bin_df["status"] == 1) & (bin_df['total_raised'] >= success_rate), 'suceeded'] = 1
# bin_df.loc[(bin_df["status"] == 0) | (bin_df['total_raised'] < success_rate), 'suceeded'] = 0

bin_df.suceeded.sum()

1620.0

In [73]:
cat_cols = ['company_name', 'company_about','founded', 'business model','employees','product stage','status','fund_stage','suceeded']
num_cols = ['total_raised','total_rounds', 'investors','ipo_price', 'geo_market_per']
tag_cols = [col for col in bin_df.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in bin_df.columns if col.startswith('targetmarket_')]
sector_list = [col for col in bin_df.columns if col.startswith("sector_")]
target_ind_list = [col  for col in bin_df.columns if col.startswith("industry_")]
technology_list = [col  for col in bin_df.columns if col.startswith("technology_")]


pca_cols = [col for col in pca_df.columns if col not in cat_cols and col not in num_cols]
bin_cols = tag_cols + targetmarket_cols + sector_list + target_ind_list + technology_list

In [40]:
print(f"Categorical cols : {len(cat_cols)}")
print(f"Numerical cols : {len(num_cols)}")
print(f"Tag cols : {len(tag_cols)}")
print(f"Targetmarket cols : {len(targetmarket_cols)}")
print(f"Sector cols : {len(sector_list)}")
print(f"Industry cols : {len(target_ind_list)}")
print(f"Technology cols : {len(technology_list)}")
print(f"Total binary cols : {len(bin_cols)}")
print(f"Toatl PCA cols : {len(pca_cols)}")



Categorical cols : 9
Numerical cols : 5
Tag cols : 1571
Targetmarket cols : 116
Sector cols : 41
Industry cols : 81
Technology cols : 79
Total binary cols : 1888
Toatl PCA cols : 12


In [6]:
Xtrain, Xtest, ytrain, ytest = train_test_split(bin_df[num_cols + bin_cols], bin_df['suceeded'],test_size = 0.2, stratify=bin_df['suceeded'])
Xtrain_pca, Xtest_pca, ytrain_pca, ytest_pca = train_test_split(pca_df[num_cols + pca_cols], bin_df['suceeded'],test_size = 0.2, stratify=bin_df['suceeded'])

print(f"Xtrain shape: {Xtrain.shape}")
print(f"ytrain shape: {ytrain.shape}")

print(f"Xtrain_pca shape: {Xtrain_pca.shape}")
print(f"ytrain_pca shape: {ytrain_pca.shape}")



Xtrain shape: (8056, 1786)
ytrain shape: (8056,)
Xtrain_pca shape: (8056, 17)
ytrain_pca shape: (8056,)


In [7]:
lr = LogisticRegression(max_iter = 150)
lr.fit(Xtrain, ytrain)

LogisticRegression(max_iter=150)

In [8]:
ytrain_pred = lr.predict(Xtrain)
ytest_pred = lr.predict(Xtest)

In [9]:
print("Train results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
print("---------------------")
print("Test results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

Train results:
accuracy is: 0.7848808341608738
precision is: 0.4001384562132226
recall is: 1.0
f1 is: 0.5715698393077874
---------------------
Test results:
accuracy is: 0.7994041708043694
precision is: 0.417027417027417
recall is: 1.0
f1 is: 0.5885947046843177


Train results:  
accuracy is: 0.7919563058589871  
precision is: 0.4092351075079309  
recall is: 1.0  
f1 is: 0.5807903951975988  

Test results:  
accuracy is: 0.79493545183714  
precision is: 0.41251778093883357  
recall is: 1.0  
f1 is: 0.5840886203423967  

In [74]:
# create a new dataframe with suceeded = 1 and suceeded = 0

bin_df_suceeded = bin_df[bin_df['suceeded'] == 1]
bin_df_failed = bin_df[bin_df['suceeded'] == 0]

size = bin_df_suceeded.shape[0]
bin_df_fialed_sampled = bin_df_failed.sample(n = size , random_state = 42)

equal_df = pd.concat([bin_df_suceeded, bin_df_fialed_sampled])

print(equal_df.shape)
print(equal_df['suceeded'].sum())



(3240, 1856)
1620.0


In [11]:
Xtrain, Xtest, ytrain, ytest = train_test_split(equal_df[num_cols + bin_cols], equal_df['suceeded'],test_size = 0.2)
lr = LogisticRegression()
lr.fit(Xtrain, ytrain)


LogisticRegression()

In [12]:
ytrain_pred = lr.predict(Xtrain)
ytest_pred = lr.predict(Xtest)

In [13]:
print("Train results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
print("---------------------")
print("Test results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

Train results:
accuracy is: 0.8814878892733564
precision is: 0.8098542678695351
recall is: 1.0
f1 is: 0.8949386503067485
---------------------
Test results:
accuracy is: 0.8737024221453287
precision is: 0.792022792022792
recall is: 1.0
f1 is: 0.8839427662957074


In [76]:
Xtrain, Xtest, ytrain, ytest = train_test_split(equal_df[bin_cols], equal_df['suceeded'],test_size = 0.2)

print(f"Xtrain shape: {Xtrain.shape}")
print(f"ytrain shape: {ytrain.shape}")
print(f"Xtest shape: {Xtest.shape}")
print(f"ytest shape: {ytest.shape}")

lr = LogisticRegression(max_iter=1500)
lr.fit(Xtrain, ytrain)

ytrain_pred = lr.predict(Xtrain)
ytest_pred = lr.predict(Xtest)

print("\n\nTrain results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
print("---------------------")
print("Test results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

Xtrain shape: (2592, 1842)
ytrain shape: (2592,)
Xtest shape: (648, 1842)
ytest shape: (648,)


Train results:
accuracy is: 0.8969907407407407
precision is: 0.9033515198752923
recall is: 0.8901689708141322
f1 is: 0.8967117988394585
---------------------
Test results:
accuracy is: 0.6882716049382716
precision is: 0.6847133757961783
recall is: 0.6761006289308176
f1 is: 0.680379746835443


In [15]:
success_rate = 4200000

pca_df.loc[(pca_df["status"]==1)&(pca_df['total_raised']>=success_rate), 'suceeded'] = 1
pca_df.loc[(pca_df["status"]==0)|(pca_df['total_raised']<success_rate), 'suceeded'] = 0

# print(pca_df.head())

pca_df_suceeded = pca_df[pca_df['suceeded'] == 1]
pca_df_failed = pca_df[pca_df['suceeded'] == 0]

size = pca_df_suceeded.shape[0]
pca_df_fialed_sampled = pca_df_failed.sample(n = size , random_state = 42)

equal_df = pd.concat([pca_df_suceeded, pca_df_fialed_sampled])

print(equal_df.shape)
print(equal_df['suceeded'].sum())

pca_cols = [col for col in equal_df.columns if col not in cat_cols and col not in num_cols]
Xtrain, Xtest, ytrain, ytest = train_test_split(equal_df[pca_cols], equal_df['suceeded'],test_size = 0.2)

print(f"Xtrain shape: {Xtrain.shape}")
print(f"ytrain shape: {ytrain.shape}")
print(f"Xtest shape: {Xtest.shape}")
print(f"ytest shape: {ytest.shape}")

lr = LogisticRegression(max_iter=1500)
lr.fit(Xtrain, ytrain)

ytrain_pred = lr.predict(Xtrain)
ytest_pred = lr.predict(Xtest)

print("\n\nTrain results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytrain_pred, y_true = ytrain))
print("precision is:",metrics.precision_score(y_pred = ytrain_pred, y_true = ytrain))
print("recall is:",metrics.recall_score(y_pred = ytrain_pred, y_true = ytrain))
print("f1 is:",metrics.f1_score(y_pred = ytrain_pred, y_true = ytrain))
print("---------------------")
print("Test results:")
print("accuracy is:",metrics.accuracy_score(y_pred = ytest_pred, y_true = ytest))
print("precision is:",metrics.precision_score(y_pred = ytest_pred, y_true = ytest))
print("recall is:",metrics.recall_score(y_pred = ytest_pred, y_true = ytest))
print("f1 is:",metrics.f1_score(y_pred = ytest_pred, y_true = ytest))

(2890, 26)
1445.0
Xtrain shape: (2312, 12)
ytrain shape: (2312,)
Xtest shape: (578, 12)
ytest shape: (578,)


Train results:
accuracy is: 0.6600346020761245
precision is: 0.6632173095014111
recall is: 0.6222418358340689
f1 is: 0.6420765027322405
---------------------
Test results:
accuracy is: 0.657439446366782
precision is: 0.7050359712230215
recall is: 0.6282051282051282
f1 is: 0.6644067796610169


In [16]:
from lib.unsupervised_learning import *

In [68]:
bin_df_suceeded = bin_df[bin_df['suceeded'] == 1]
bin_df_failed = bin_df[bin_df['suceeded'] == 0]

size = bin_df_suceeded.shape[0]
bin_df_fialed_sampled = bin_df_failed.sample(n = size , random_state = 42)

equal_df = pd.concat([bin_df_suceeded, bin_df_fialed_sampled])

print(equal_df.shape)
print(equal_df['suceeded'].sum())

# print(get_best_init_params_for_k_means(equal_df[bin_cols],8,['k-means++','random'],range(5,50,5),42))

(4054, 1902)
2027.0


In [79]:
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=15, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=150)

scores = cross_val_score(clf, equal_df[bin_cols], equal_df['suceeded'], cv=cv, scoring='f1_macro')
print(scores)
print(f'Best score after cross-validation: {scores.max()}')


[0.66970274 0.70824926 0.71654088 0.66332046 0.66203623 0.68207151
 0.69582467 0.66819012 0.69711538 0.69798008 0.69903895 0.68300709
 0.69748476 0.72990675 0.67123288]
Best score after cross-validation: 0.7299067535220494


In [88]:
scoring = ['precision_macro', 'recall_macro','f1_macro']
scores = cross_validate(clf, equal_df[bin_cols], equal_df['suceeded'],cv = cv, scoring=scoring, return_train_score=True)

In [85]:
for d in ['test','train']:
    print(f"{d}_precision_macro: {scores[f'{d}_precision_macro'].max()}")
    print(f"{d}_recall_macro: {scores[f'{d}_recall_macro'].max()}")
    print(f"{d}_f1_macro: {scores[f'{d}_f1_macro'].max()}")
    print("---------------------")

test_precision_macro: 0.7301166692084795
test_recall_macro: 0.7299785663253155
test_f1_macro: 0.7299067535220494
---------------------
train_precision_macro: 0.9012110170399743
train_recall_macro: 0.9012798250691363
train_f1_macro: 0.9012260996313126
---------------------


In [89]:
df_outliers = pd.read_csv("data/dataframes/final_cleaned.csv").iloc[:,1:]

success_rate = 4000000

df_outliers.loc[(df_outliers["status"] == 1) & (df_outliers['total_raised'] >= success_rate), 'suceeded'] = 1
df_outliers.loc[(df_outliers["status"] == 0) | (df_outliers['total_raised'] < success_rate), 'suceeded'] = 0

cat_cols = ['company_name', 'company_about','founded', 'business model','employees','product stage','status','fund_stage','suceeded']
num_cols = ['total_raised','total_rounds', 'investors','ipo_price', 'geo_market_per']
tag_cols = [col for col in df_outliers.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in df_outliers.columns if col.startswith('targetmarket_')]
sector_list = [col for col in df_outliers.columns if col.startswith("sector_")]
target_ind_list = [col  for col in df_outliers.columns if col.startswith("industry_")]
technology_list = [col  for col in df_outliers.columns if col.startswith("technology_")]


pca_cols = [col for col in pca_df.columns if col not in cat_cols and col not in num_cols]
bin_cols = tag_cols + targetmarket_cols + sector_list + target_ind_list + technology_list

df_outliers_suceeded = df_outliers[df_outliers['suceeded'] == 1]
df_outliers_failed = df_outliers[df_outliers['suceeded'] == 0]

size = df_outliers_suceeded.shape[0]
df_outliers_fialed_sampled = df_outliers_failed.sample(n = size , random_state = 42)

equal_df = pd.concat([df_outliers_suceeded, df_outliers_fialed_sampled])

print(equal_df.shape)
print(equal_df['suceeded'].sum())

from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=15, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=150)

scoring = ['precision_macro', 'recall_macro','f1_macro']
scores = cross_validate(clf, equal_df[bin_cols], equal_df['suceeded'],cv = cv, scoring=scoring, return_train_score=True)

for d in ['test','train']:
    print(f"{d}_precision_macro: {scores[f'{d}_precision_macro'].max()}")
    print(f"{d}_recall_macro: {scores[f'{d}_recall_macro'].max()}")
    print(f"{d}_f1_macro: {scores[f'{d}_f1_macro'].max()}")
    print("---------------------")

(2992, 2864)
1496.0
test_precision_macro: 0.7113748320644873
test_recall_macro: 0.710577974079279
test_f1_macro: 0.710640139616056
---------------------
train_precision_macro: 0.9106606669544963
train_recall_macro: 0.910609032524857
train_f1_macro: 0.9105712381740049
---------------------
