In [1]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Load the two pre-computed feature tables. In each frame the first column is
# discarded by position (presumably an index column written out by an earlier
# `to_csv` call -- TODO confirm), so only the remaining columns are kept.
bin_df = pd.read_csv('data/dataframes/df_after_cols_reduction.csv')
bin_df = bin_df.iloc[:, 1:]

pca_df = pd.read_csv('data/dataframes/pca_df.csv')
pca_df = pca_df.iloc[:, 1:]

# Quick sanity check on the size of what was loaded.
print(f'Binary dataframe shape: {bin_df.shape}')
print(f'PCA dataframe shape: {pca_df.shape}')


Binary dataframe shape: (10070, 1795)
PCA dataframe shape: (10070, 26)


In [3]:
def _columns_with_prefix(frame, prefix):
    """Return the column names of ``frame`` that start with ``prefix``, in frame order."""
    return [name for name in frame.columns if name.startswith(prefix)]


# Hand-maintained column groups. Note that `cat_cols` also lists 'suceeded',
# the column used as the prediction target further down.
cat_cols = [
    'company_name', 'company_about', 'founded', 'business model', 'employees',
    'product stage', 'status', 'fund_stage', 'suceeded',
]
num_cols = ['total_raised', 'total_rounds', 'investors', 'ipo_price', 'geo_market_per']

# Column groups discovered from the binary frame by name prefix.
tag_cols = _columns_with_prefix(bin_df, 'tag_')
targetmarket_cols = _columns_with_prefix(bin_df, 'targetmarket_')
sector_list = _columns_with_prefix(bin_df, "sector_")
target_ind_list = _columns_with_prefix(bin_df, "industry_")
technology_list = _columns_with_prefix(bin_df, "technology_")

# Everything in the PCA frame that is neither a categorical nor a numerical column.
pca_cols = [
    name for name in pca_df.columns
    if not (name in cat_cols or name in num_cols)
]
# All prefix-discovered columns, concatenated in a fixed group order.
bin_cols = [
    *tag_cols,
    *targetmarket_cols,
    *sector_list,
    *target_ind_list,
    *technology_list,
]

In [6]:
# Report how many columns ended up in each group, one "<label> : <count>" line
# per group. The labels are emitted exactly as before so the printed text is
# unchanged.
print("\n".join(
    f"{label} : {len(columns)}"
    for label, columns in (
        ("Categorical cols", cat_cols),
        ("Numerical cols", num_cols),
        ("Tag cols", tag_cols),
        ("Targetmarket cols", targetmarket_cols),
        ("Sector cols", sector_list),
        ("Industry cols", target_ind_list),
        ("Technology cols", technology_list),
        ("Total binary cols", bin_cols),
        ("Toatl PCA cols", pca_cols),
    )
))



Categorical cols : 9
Numerical cols : 5
Tag cols : 1470
Targetmarket cols : 115
Sector cols : 40
Industry cols : 80
Technology cols : 76
Total binary cols : 1781
Toatl PCA cols : 12


In [24]:
# Hold out 20% of the rows for evaluation. Features are the numerical columns
# plus every binary indicator column; the label is 'suceeded'.
# `stratify` keeps the label proportions the same in both partitions, and the
# fixed `random_state` makes the partition (and therefore every metric computed
# from it) reproducible across kernel restarts.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    bin_df[num_cols + bin_cols],
    bin_df['suceeded'],
    test_size=0.2,
    stratify=bin_df['suceeded'],
    random_state=42,
)

In [34]:
# Logistic-regression classifier with the solver's iteration cap set to 150.
lr = LogisticRegression(max_iter=150)
# Fit on the training partition; leaving the call as the last expression keeps
# the fitted estimator's repr as the cell output.
lr.fit(X=Xtrain, y=ytrain)

LogisticRegression(max_iter=150)

In [35]:
# Predicted labels for both partitions, train first and then test.
ytrain_pred, ytest_pred = (lr.predict(features) for features in (Xtrain, Xtest))

In [37]:
def _print_scores(heading, actual, predicted):
    """Print ``heading`` followed by accuracy, precision, recall and F1.

    Parameters
    ----------
    heading : str
        Line printed before the scores.
    actual
        Ground-truth labels, passed to each scorer as ``y_true``.
    predicted
        Predicted labels, passed to each scorer as ``y_pred``.
    """
    print(heading)
    for label, scorer in (
        ("accuracy", metrics.accuracy_score),
        ("precision", metrics.precision_score),
        ("recall", metrics.recall_score),
        ("f1", metrics.f1_score),
    ):
        print(f"{label} is:", scorer(y_pred=predicted, y_true=actual))


# Same four scores for the training and the held-out partition, so the two
# reports can be compared line by line.
_print_scores("Train results:", ytrain, ytrain_pred)
print("---------------------")
_print_scores("Test results:", ytest, ytest_pred)

Train results:
accuracy is: 0.7919563058589871
precision is: 0.4092351075079309
recall is: 1.0
f1 is: 0.5807903951975988
---------------------
Test results:
accuracy is: 0.79493545183714
precision is: 0.41251778093883357
recall is: 1.0
f1 is: 0.5840886203423967
