In [61]:
import pandas as pd
import numpy as np
from sklearn import preprocessing


In [35]:
def vectorize_and_replace(df, cols: list, prefix: str):
    """Vectorize the {cols} columns in {dataframe}\n
    and returns a {dataframe} copy with the vector after removing the {cols}\n
    new col name is {prefix}_vec """
    # df = dataframe.copy()

    lb = preprocessing.LabelBinarizer()
    vec = lb.fit_transform(df[cols]).tolist()

    df[f"{prefix}_vec"] = vec
    df = df.drop(cols, axis = 1)
    return df

In [70]:
def decode_price(price: str):
    '''
    format: $XXXX.XXXXA 
    where X is digit from the range [0,9]
    and A is an action multiplier where K means thousands and M means Millions. 
    '''

    if(price[0]!='$'):
        price = '$' + price

    k = 1000
    m = 1000000
    multiplier = 0
    try:
        symbol = price[-1].upper() # The upper method is used to reduce the need to check wether the symbol is 'k' or 'K'
    except:
        print(f"Error: The end of the string '{price}' does not contain 'K' or 'M'!")
        return None
    if(symbol=="K"):
        multiplier = k
    elif(symbol == "M"):
        multiplier = m
    else:
        print(f"Error: The end of the string '{price}' does not contain 'K' or 'M'!")
        return None
    
    new_price= int(float(price[1:-1]) * multiplier)
    print(f"Price before manipulation: {price}")
    print(f"Price after manipulation: {new_price}")

    # TODO: after the function test we can remove the prints above.
    
    return new_price

In [118]:
df = pd.read_csv("df0_2000.csv").iloc[:,1:]
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,company_name,company_about,founded,business model,employees,funding stage,raised,product stage,status,geographical markets,...,tag_phone-calls,tag_salesforce,tag_professional-networking,tag_customer-analytics,tag_video-conferencing,tag_ethereum,tag_rent,tag_parking,tag_strategy,tag_data-mining
0,Tastewise,Tastewise is an AI platform designed to help f...,7/2017,B2B,51-200,ROUND A,$21.5M,Released,active,"australia, canada, france, india, united kingd...",...,,,,,,,,,,
1,Wilk Technologies,Wilk is dedicated to revolutionizing the dairy...,6/2018,"B2B, B2B2C",11-50,Public,$4.69M,R&D,active,,...,,,,,,,,,,
2,Eco Pack Green Box,Eco Pack Green Box has developed and patented ...,3/2008,B2B,11-50,Revenue Financed,,Released,not_active,"canada, mexico, spain, united states",...,,,,,,,,,,
3,BeeHero,BeeHero has developed a platform that can pred...,10/2017,B2B,1-10,ROUND A,$24M,Released,active,"global, united states",...,,,,,,,,,,
4,Cham Foods,Cham Foods is a multinational company with man...,12/1970,"B2B, B2B2C",11-50,Public,,Released,active,"north america, europe, global, france, germany...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1991,Intimate City,Intimate City is a portal that allows you to i...,2013,B2C,1-10,Bootstrapped,,Released,not_active,,...,,,,,,,,,,
1992,WeTicket,WeTicket is a social cultural events platform ...,1/2014,B2C,1-10,Bootstrapped,,Released,not_active,,...,,,,,,,,,,
1993,Nostalgic,Nostalgic is a business-to-business-to-consume...,4/2011,B2B2C,1-10,Bootstrapped,,R&D,not_active,,...,,,,,,,,,,
1994,StoreX.me,StoreX.me is an online platform that connects ...,2014,B2C,1-10,Bootstrapped,,Beta,not_active,,...,,,,,,,,,,


In [119]:
df_test = df.copy()
df_test.shape

(1996, 1696)

In [120]:
tag_cols = [col for col in df_test.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in df_test.columns if col.startswith('targetmarket_')]
sector_list = [col for col in df_test.columns if col.startswith("sector_")]
target_ind_list = [col  for col in df_test.columns if col.startswith("target_industry_")]
technology_list = [col  for col in df_test.columns if col.startswith("core_technology_")]


In [121]:
df_test[tag_cols] = df_test[tag_cols].fillna(0)
df_test[targetmarket_cols] = df_test[targetmarket_cols].fillna(0)
df_test[sector_list] = df_test[sector_list].fillna(0)
df_test[target_ind_list] = df_test[target_ind_list].fillna(0)
df_test[technology_list] = df_test[technology_list].fillna(0)


vectorize

In [122]:
df_test = vectorize_and_replace(df_test, tag_cols, "tag")
df_test = vectorize_and_replace(df_test, targetmarket_cols, "targetmarket")
df_test = vectorize_and_replace(df_test, sector_list, "sector")
df_test = vectorize_and_replace(df_test, target_ind_list, "target_ind")
df_test = vectorize_and_replace(df_test, technology_list, "tech")
df_test.shape

(1996, 23)

Replacing null values to 0

In [123]:
df_test['raised'] = df_test['raised'].fillna(0)
df_test['total_rounds'] = df_test['total_rounds'].fillna(0)
df_test['investors'] = df_test['investors'].fillna(0)
df_test['ipo_price'] = df_test['ipo_price'].fillna(0)

In [124]:
df_test.status.replace({'active' : 1, 'not_active' : 0 }, inplace=True)

Removing unused columns

In [125]:
df_test.drop(['use cases','academic spin-off','total_raised'], axis = 1, inplace = True)
df_test.shape

(1996, 20)

Decoding str to numeric value

In [126]:
for i,val in enumerate(df_test['ipo_price']):
    if val != 0:
        df_test.loc[i,'ipo_price'] = decode_price(val)

for i,val in enumerate(df_test['raised']):
    if val != 0:
        df_test.loc[i,'raised'] = decode_price(val)

Price before manipulation: $30.6M
Price after manipulation: 30600000
Price before manipulation: $12M
Price after manipulation: 12000000
Price before manipulation: $28M
Price after manipulation: 28000000
Price before manipulation: $5.58M
Price after manipulation: 5580000
Price before manipulation: $4.34M
Price after manipulation: 4340000
Price before manipulation: $13M
Price after manipulation: 13000000
Price before manipulation: $12M
Price after manipulation: 12000000
Price before manipulation: $7.5M
Price after manipulation: 7500000
Price before manipulation: $9.177M
Price after manipulation: 9177000
Price before manipulation: $4.81M
Price after manipulation: 4810000
Price before manipulation: $10.22M
Price after manipulation: 10220000
Price before manipulation: $5.15M
Price after manipulation: 5150000
Price before manipulation: $7M
Price after manipulation: 7000000
Price before manipulation: $31M
Price after manipulation: 31000000
Price before manipulation: $22.74M
Price after manipu

Converting the numeric colums

In [127]:
df_test['raised'] = df_test['raised'].astype('float')
df_test['total_rounds'] = df_test['total_rounds'].astype('int')
df_test['investors'] = df_test['investors'].astype('int')
df_test['ipo_price'] = df_test['ipo_price'].astype('float')

In [128]:
df_test.raised.mean()

4444618.980460922

In [129]:
df_test.corr()

Unnamed: 0,raised,status,total_rounds,investors,ipo_price
raised,1.0,0.107952,0.373286,0.468932,0.270553
status,0.107952,1.0,0.2065,0.169824,0.037246
total_rounds,0.373286,0.2065,1.0,0.740356,0.05184
investors,0.468932,0.169824,0.740356,1.0,0.033812
ipo_price,0.270553,0.037246,0.05184,0.033812,1.0


In [131]:
df_test.to_csv('cleaned.csv')

In [138]:
df_test.loc[(df_test["status"]==1)&(df_test['raised']>4000000), 'suceeded'] = 1
df_test.loc[(df_test["status"]==0)|(df_test['raised']<=4000000), 'suceeded'] = 0

In [139]:
# df_test.loc["succeeded"] = df_test[(df_test['status'] == 1) & (df_test['raised'] > 5000000)]
df_test

Unnamed: 0,company_name,company_about,founded,business model,employees,funding stage,raised,product stage,status,geographical markets,...,fund_stage,total_rounds,investors,ipo_price,tag_vec,targetmarket_vec,sector_vec,target_ind_vec,tech_vec,suceeded
0,Tastewise,Tastewise is an AI platform designed to help f...,7/2017,B2B,51-200,ROUND A,21500000.0,Released,1,"australia, canada, france, india, united kingd...",...,A,3,3,0.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
1,Wilk Technologies,Wilk is dedicated to revolutionizing the dairy...,6/2018,"B2B, B2B2C",11-50,Public,4690000.0,R&D,1,,...,Public,2,2,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, ...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
2,Eco Pack Green Box,Eco Pack Green Box has developed and patented ...,3/2008,B2B,11-50,Revenue Financed,0.0,Released,0,"canada, mexico, spain, united states",...,,0,0,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
3,BeeHero,BeeHero has developed a platform that can pred...,10/2017,B2B,1-10,ROUND A,24000000.0,Released,1,"global, united states",...,A,4,12,0.0,"[0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
4,Cham Foods,Cham Foods is a multinational company with man...,12/1970,"B2B, B2B2C",11-50,Public,0.0,Released,1,"north america, europe, global, france, germany...",...,,0,0,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1991,Intimate City,Intimate City is a portal that allows you to i...,2013,B2C,1-10,Bootstrapped,0.0,Released,0,,...,,0,0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.0
1992,WeTicket,WeTicket is a social cultural events platform ...,1/2014,B2C,1-10,Bootstrapped,0.0,Released,0,,...,,0,0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.0
1993,Nostalgic,Nostalgic is a business-to-business-to-consume...,4/2011,B2B2C,1-10,Bootstrapped,0.0,R&D,0,,...,,0,0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.0
1994,StoreX.me,StoreX.me is an online platform that connects ...,2014,B2C,1-10,Bootstrapped,0.0,Beta,0,,...,,0,0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.0


In [145]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [172]:

xtrain, xtest, ytrain, ytest = train_test_split(df_test.iloc[:,:-1].select_dtypes(include=np.number), df_test.iloc[:,-1])


In [176]:
lr = LogisticRegression()
lr.fit(xtrain,ytrain)

LogisticRegression()

In [177]:
ytrain_pred = lr.predict(xtrain)
ytest_pred = lr.predict(xtest)

In [178]:
metrics.confusion_matrix(ytrain_pred, ytrain)

array([[1054,    4],
       [ 299,  140]], dtype=int64)

In [179]:
print("accuracy is:",metrics.accuracy_score(ytrain_pred, ytrain))
print("precision is:",metrics.precision_score(ytrain_pred, ytrain))
print("recall is:",metrics.recall_score(ytrain_pred, ytrain))
print("f1 is:",metrics.f1_score(ytrain_pred, ytrain))

accuracy is: 0.7975951903807615
precision is: 0.9722222222222222
recall is: 0.31890660592255127
f1 is: 0.48027444253859347


In [180]:
print("accuracy is:",metrics.accuracy_score(ytest_pred, ytest))
print("precision is:",metrics.precision_score(ytest_pred, ytest))
print("recall is:",metrics.recall_score(ytest_pred, ytest))
print("f1 is:",metrics.f1_score(ytest_pred, ytest))

accuracy is: 0.8016032064128257
precision is: 0.975
recall is: 0.2846715328467153
f1 is: 0.4406779661016949
