In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [58]:
def decode_price(price: str):
    '''
    format: $XXXX.XXXXA 
    where X is digit from the range [0,9]
    and A is an action multiplier where K means thousands and M means Millions. 
    '''


    k = 1000
    m = 1000000
    multiplier = 0
    try:
        if(price[0]!='$'):
            price = '$' + price
        symbol = price[-1].upper() # The upper method is used to reduce the need to check wether the symbol is 'k' or 'K'
    except:
        print(f"Error: The end of the string '{price}' does not contain 'K' or 'M'!")
        return None
    if(symbol=="K"):
        multiplier = k
    elif(symbol == "M"):
        multiplier = m
    elif symbol == 'B':
        multiplier = m * 100
    else:
        # print(f"Error: The end of the string '{price}' does not contain 'K' or 'M'!")
        return float(price[1:])
    
    new_price= int(float(price[1:-1]) * multiplier)
    # print(f"Price before manipulation: {price}")
    # print(f"Price after manipulation: {new_price}")

    # TODO: after the function test we can remove the prints above.
    
    return new_price

def conv_to_float(df):
    for i, val in enumerate(df):
        if val != 0:
            df.iloc[i] = decode_price(val)
    return df

In [59]:
df = pd.read_csv("df_complete.csv").iloc[:,3:]
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,company_name,company_about,founded,business model,employees,funding stage,raised,product stage,status,geographical markets,...,tag_e-mobility,targetmarket_public-transportation,targetmarket_smart-mobility,tag_hydraulic-drive,tag_hud,tag_simulation-software,tag_luggage,tag_traffic-violations,tag_car-audio,tag_trip
0,Tastewise,Tastewise is an AI platform designed to help f...,7/2017,B2B,51-200,ROUND A,$21.5M,Released,active,"australia, canada, france, india, united kingd...",...,,,,,,,,,,
1,Wilk Technologies,Wilk is dedicated to revolutionizing the dairy...,6/2018,"B2B, B2B2C",11-50,Public,$4.69M,R&D,active,,...,,,,,,,,,,
2,Eco Pack Green Box,Eco Pack Green Box has developed and patented ...,3/2008,B2B,11-50,Revenue Financed,,Released,not_active,"canada, mexico, spain, united states",...,,,,,,,,,,
3,BeeHero,BeeHero has developed a platform that can pred...,10/2017,B2B,1-10,ROUND A,$24M,Released,active,"global, united states",...,,,,,,,,,,
4,Cham Foods,Cham Foods is a multinational company with man...,12/1970,"B2B, B2B2C",11-50,Public,,Released,active,"north america, europe, global, france, germany...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13043,TriEye,TriEye is a fabless semiconductor company that...,11/2017,B2B,51-200,ROUND A,$96M,R&D,active,global,...,,,,,,,,,,
13044,LYNX Smartcars,LYNX is developing software for connected and ...,1/2016,B2B,1-10,Bootstrapped,,R&D,not_active,,...,,,,,,,,,,
13045,Deeyook Location Technologies,Deeyook seeks to redefine location technology ...,3/2017,B2B,11-50,Seed,,Released,active,global,...,,,,,,,,,,
13046,SafeCue,SafeCue combines the power of deep learning wi...,1/2016,B2B,1-10,Seed,$500K,Beta,not_active,"asia, germany, india, united states",...,,,,,,,,,,


In [60]:
df.shape

(13048, 2868)

In [61]:
tag_cols = [col for col in df.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in df.columns if col.startswith('targetmarket_')]
sector_list = [col for col in df.columns if col.startswith("sector_")]
target_ind_list = [col  for col in df.columns if col.startswith("target_industry_")]
technology_list = [col  for col in df.columns if col.startswith("core_technology_")]

In [62]:
df[tag_cols] = df[tag_cols].fillna(0)
df[targetmarket_cols] = df[targetmarket_cols].fillna(0)
df[sector_list] = df[sector_list].fillna(0)
df[target_ind_list] = df[target_ind_list].fillna(0)
df[technology_list] = df[technology_list].fillna(0)
df['raised'] = df['raised'].fillna(0)
df['total_rounds'] = df['total_rounds'].fillna(0)
df['investors'] = df['investors'].fillna(0)
df['ipo_price'] = df['ipo_price'].fillna(0)
df.status.replace({'active' : 1, 'not_active' : 0 }, inplace=True)
df.drop(['use cases','academic spin-off','total_raised'], axis = 1, inplace = True)

In [63]:
df['ipo_price'] = conv_to_float(df["ipo_price"])
df['raised'] = conv_to_float(df["raised"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [64]:
df['raised'] = df['raised'].astype('float')
df['total_rounds'] = df['total_rounds'].astype('int')
df['investors'] = df['investors'].astype('int')
df['ipo_price'] = df['ipo_price'].astype('float')

In [65]:
df['business model'] = preprocessing.LabelEncoder().fit_transform(df['business model'])

In [66]:
df.raised.mean()

8837951.85239117

In [67]:
df.loc[(df["status"]==1)&(df['raised']>4000000), 'succeeded'] = 1
df.loc[(df["status"]==0)|(df['raised']<=4000000), 'succeeded'] = 0

In [68]:
xtrain, xtest, ytrain, ytest = train_test_split(df.iloc[:,:-1].select_dtypes(include=np.number), df.iloc[:,-1])


In [69]:
lr = LogisticRegression()
lr.fit(xtrain,ytrain)

LogisticRegression()

In [70]:
ytrain_pred = lr.predict(xtrain)
ytest_pred = lr.predict(xtest)

In [77]:
proba = df[df['succeeded'] == 1].shape[0] / df.shape[0]
print("The probability to succeed is : ", proba)

The probability to succeed is :  0.14530962599632127


In [72]:
print("Train results:")
print("accuracy is:",metrics.accuracy_score(ytrain_pred, ytrain))
print("precision is:",metrics.precision_score(ytrain_pred, ytrain))
print("recall is:",metrics.recall_score(ytrain_pred, ytrain))
print("f1 is:",metrics.f1_score(ytrain_pred, ytrain))
print("---------------------")
print("Test results:")
print("accuracy is:",metrics.accuracy_score(ytest_pred, ytest))
print("precision is:",metrics.precision_score(ytest_pred, ytest))
print("recall is:",metrics.recall_score(ytest_pred, ytest))
print("f1 is:",metrics.f1_score(ytest_pred, ytest))

Train results:
accuracy is: 0.786429593296546
precision is: 1.0
recall is: 0.4089366515837104
f1 is: 0.5804897631473305
---------------------
Test results:
accuracy is: 0.7863274064990803
precision is: 1.0
recall is: 0.3923278116826504
f1 is: 0.5635566687539136


This is the results without vectorize