In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [28]:
def decode_price(price: str):
    '''
    format: $XXXX.XXXXA 
    where X is digit from the range [0,9]
    and A is an action multiplier where K means thousands and M means Millions. 
    '''


    k = 1000
    m = 1000000
    multiplier = 0
    try:
        if(price[0]!='$'):
            price = '$' + price
        symbol = price[-1].upper() # The upper method is used to reduce the need to check wether the symbol is 'k' or 'K'
    except:
        print(f"Error: The end of the string '{price}' does not contain 'K' or 'M'!")
        return None
    if(symbol=="K"):
        multiplier = k
    elif(symbol == "M"):
        multiplier = m
    elif symbol == 'B':
        multiplier = m * 100
    else:
        print(f"Error: The end of the string '{price}' does not contain 'K' or 'M'!")
        return None
    
    new_price= int(float(price[1:-1]) * multiplier)
    # print(f"Price before manipulation: {price}")
    # print(f"Price after manipulation: {new_price}")

    # TODO: after the function test we can remove the prints above.
    
    return new_price

def conv_to_float(df):
    for i, val in enumerate(df):
        if val != 0:
            df.iloc[i] = decode_price(val)
    return df

In [29]:
df = pd.read_csv("df0_5000.csv").iloc[:,2:]
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,company_name,company_about,founded,business model,employees,funding stage,raised,product stage,status,geographical markets,...,tag_application-optimization,tag_mainframe,tag_sourcing,tag_plastic-surgery,tag_dermatology,tag_card-payments,tag_paycheck,tag_remittances,targetmarket_unbanked,tag_it-architecture
0,Tastewise,Tastewise is an AI platform designed to help f...,7/2017,B2B,51-200,ROUND A,$21.5M,Released,active,"australia, canada, france, india, united kingd...",...,,,,,,,,,,
1,Wilk Technologies,Wilk is dedicated to revolutionizing the dairy...,6/2018,"B2B, B2B2C",11-50,Public,$4.69M,R&D,active,,...,,,,,,,,,,
2,Eco Pack Green Box,Eco Pack Green Box has developed and patented ...,3/2008,B2B,11-50,Revenue Financed,,Released,not_active,"canada, mexico, spain, united states",...,,,,,,,,,,
3,BeeHero,BeeHero has developed a platform that can pred...,10/2017,B2B,1-10,ROUND A,$24M,Released,active,"global, united states",...,,,,,,,,,,
4,Cham Foods,Cham Foods is a multinational company with man...,12/1970,"B2B, B2B2C",11-50,Public,,Released,active,"north america, europe, global, france, germany...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4983,Zaponet,"Zaponet is analytics solutions provider, offer...",2011,B2B,11-50,Bootstrapped,,Released,not_active,,...,,,,,,,,,,
4984,Storydoc,Storydoc can easily transform static PDFs and ...,1/2020,B2B,1-10,Bootstrapped,,Released,active,global,...,,,,,,,,,,
4985,PATX.io,PatX developed an AI system for the assessment...,8/2019,B2B,1-10,Bootstrapped,,Alpha,active,,...,,,,,,,,,,
4986,Peck,Peck's SaaS platform enables successful digita...,8/2019,B2B,1-10,Bootstrapped,,Beta,active,north america,...,,,,,,,,,,


In [4]:
df.shape

(4988, 2213)

In [30]:
tag_cols = [col for col in df.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in df.columns if col.startswith('targetmarket_')]
sector_list = [col for col in df.columns if col.startswith("sector_")]
target_ind_list = [col  for col in df.columns if col.startswith("target_industry_")]
technology_list = [col  for col in df.columns if col.startswith("core_technology_")]

In [31]:
df[tag_cols] = df[tag_cols].fillna(0)
df[targetmarket_cols] = df[targetmarket_cols].fillna(0)
df[sector_list] = df[sector_list].fillna(0)
df[target_ind_list] = df[target_ind_list].fillna(0)
df[technology_list] = df[technology_list].fillna(0)
df['raised'] = df['raised'].fillna(0)
df['total_rounds'] = df['total_rounds'].fillna(0)
df['investors'] = df['investors'].fillna(0)
df['ipo_price'] = df['ipo_price'].fillna(0)
df.status.replace({'active' : 1, 'not_active' : 0 }, inplace=True)
df.drop(['use cases','academic spin-off','total_raised'], axis = 1, inplace = True)

In [32]:
df['ipo_price'] = conv_to_float(df["ipo_price"])
df['raised'] = conv_to_float(df["raised"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [33]:
df['raised'] = df['raised'].astype('float')
df['total_rounds'] = df['total_rounds'].astype('int')
df['investors'] = df['investors'].astype('int')
df['ipo_price'] = df['ipo_price'].astype('float')

In [35]:
df.raised.mean()

6148873.9212109065

In [38]:
df.loc[(df["status"]==1)&(df['raised']>4000000), 'suceeded'] = 1
df.loc[(df["status"]==0)|(df['raised']<=4000000), 'suceeded'] = 0

In [39]:
xtrain, xtest, ytrain, ytest = train_test_split(df.iloc[:,:-1].select_dtypes(include=np.number), df.iloc[:,-1])


In [46]:
lr = LogisticRegression()
lr.fit(xtrain,ytrain)

LogisticRegression()

In [47]:
ytrain_pred = lr.predict(xtrain)
ytest_pred = lr.predict(xtest)

In [50]:
print("Train results:")
print("accuracy is:",metrics.accuracy_score(ytrain_pred, ytrain))
print("precision is:",metrics.precision_score(ytrain_pred, ytrain))
print("recall is:",metrics.recall_score(ytrain_pred, ytrain))
print("f1 is:",metrics.f1_score(ytrain_pred, ytrain))
print("------------")
print("Test results:")
print("accuracy is:",metrics.accuracy_score(ytest_pred, ytest))
print("precision is:",metrics.precision_score(ytest_pred, ytest))
print("recall is:",metrics.recall_score(ytest_pred, ytest))
print("f1 is:",metrics.f1_score(ytest_pred, ytest))

Train results:
accuracy is: 0.7834803528468324
precision is: 0.9974226804123711
recall is: 0.32357859531772576
f1 is: 0.48863636363636365
------------
Test results:
accuracy is: 0.7923015236567763
precision is: 0.9927536231884058
recall is: 0.3468354430379747
f1 is: 0.5140712945590994


This is the results without vectorize