In [61]:
import pandas as pd
import numpy as np
from sklearn import preprocessing


In [35]:
def vectorize_and_replace(df, cols: list, prefix: str):
    """Vectorize the {cols} columns in {dataframe}\n
    and returns a {dataframe} copy with the vector after removing the {cols}\n
    new col name is {prefix}_vec """
    # df = dataframe.copy()

    lb = preprocessing.LabelBinarizer()
    vec = lb.fit_transform(df[cols]).tolist()

    df[f"{prefix}_vec"] = vec
    df = df.drop(cols, axis = 1)
    return df

In [70]:
def decode_price(price: str):
    '''
    format: $XXXX.XXXXA 
    where X is digit from the range [0,9]
    and A is an action multiplier where K means thousands and M means Millions. 
    '''

    if(price[0]!='$'):
        price = '$' + price

    k = 1000
    m = 1000000
    multiplier = 0
    try:
        symbol = price[-1].upper() # The upper method is used to reduce the need to check wether the symbol is 'k' or 'K'
    except:
        print(f"Error: The end of the string '{price}' does not contain 'K' or 'M'!")
        return None
    if(symbol=="K"):
        multiplier = k
    elif(symbol == "M"):
        multiplier = m
    else:
        print(f"Error: The end of the string '{price}' does not contain 'K' or 'M'!")
        return None
    
    new_price= int(float(price[1:-1]) * multiplier)
    print(f"Price before manipulation: {price}")
    print(f"Price after manipulation: {new_price}")

    # TODO: after the function test we can remove the prints above.
    
    return new_price

In [5]:
df = pd.read_csv("df0_1000.csv").iloc[:,1:]
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,company_name,company_about,founded,business model,employees,funding stage,raised,product stage,status,geographical markets,...,tag_chat,tag_tv,tag_mobile-games,tag_genealogy,target_industry_Enterprise & Professional Services_Translation & Linguistic Services,tag_text-analytics,tag_document-management,tag_character-recognition,tag_presentations,tag_sales
0,Tastewise,Tastewise is an AI platform designed to help f...,7/2017,B2B,51-200,ROUND A,$21.5M,Released,active,"australia, canada, france, india, united kingd...",...,,,,,,,,,,
1,Wilk Technologies,Wilk is dedicated to revolutionizing the dairy...,6/2018,"B2B, B2B2C",11-50,Public,$4.69M,R&D,active,,...,,,,,,,,,,
2,Eco Pack Green Box,Eco Pack Green Box has developed and patented ...,3/2008,B2B,11-50,Revenue Financed,,Released,not_active,"canada, mexico, spain, united states",...,,,,,,,,,,
3,BeeHero,BeeHero has developed a platform that can pred...,10/2017,B2B,1-10,ROUND A,$24M,Released,active,"global, united states",...,,,,,,,,,,
4,Cham Foods,Cham Foods is a multinational company with man...,12/1970,"B2B, B2B2C",11-50,Public,,Released,active,"north america, europe, global, france, germany...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,DoroTree Technologies,DoroTree Technologies provides software create...,3/1998,B2C,1-10,Revenue Financed,,Released,not_active,global,...,,,,1.0,,,,,,
996,Ligature,"Ligature develops, markets, and supports optic...",1/1989,"B2B, B2C",1-10,Revenue Financed,,Released,not_active,israel,...,,,,,1.0,1.0,1.0,1.0,,
997,ComeToArt,"ComeToArt has developed HeART, a multimedia ap...",4/2013,"B2B, B2C",1-10,Revenue Financed,,Released,not_active,"canada, ireland, united states",...,,,,,,,,,,
998,PicaScreen,PicaScreen provides a complete presentation su...,10/2013,B2C,1-10,Bootstrapped,,Released,not_active,,...,,,,,,,,,1.0,1.0


In [62]:
df_test = df.copy()
df_test.shape

(1000, 1333)

In [63]:
tag_cols = [col for col in df_test.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in df_test.columns if col.startswith('targetmarket_')]
sector_list = [col for col in df_test.columns if col.startswith("sector_")]
target_ind_list = [col  for col in df_test.columns if col.startswith("target_industry_")]
technology_list = [col  for col in df_test.columns if col.startswith("core_technology_")]


In [64]:
df_test[tag_cols] = df_test[tag_cols].fillna(0)
df_test[targetmarket_cols] = df_test[targetmarket_cols].fillna(0)
df_test[sector_list] = df_test[sector_list].fillna(0)
df_test[target_ind_list] = df_test[target_ind_list].fillna(0)
df_test[technology_list] = df_test[technology_list].fillna(0)


vectorize

In [65]:
df_test = vectorize_and_replace(df_test, tag_cols, "tag")
df_test = vectorize_and_replace(df_test, targetmarket_cols, "targetmarket")
df_test = vectorize_and_replace(df_test, sector_list, "sector")
df_test = vectorize_and_replace(df_test, target_ind_list, "target_ind")
df_test = vectorize_and_replace(df_test, technology_list, "tech")
df_test.shape

(1000, 23)

Replacing null values to 0

In [73]:
df_test['raised'] = df_test['raised'].fillna(0)
df_test['total_rounds'] = df_test['total_rounds'].fillna(0)
df_test['investors'] = df_test['investors'].fillna(0)
df_test['ipo_price'] = df_test['ipo_price'].fillna(0)

Removing unused columns

In [106]:
df_test.drop(['use cases','academic spin-off','total_raised'], axis = 1, inplace = True)
df_test.shape

KeyError: "['use cases' 'academic spin-off'] not found in axis"

Decoding str to numeric value

In [99]:
for i,val in enumerate(df_test['ipo_price']):
    if val != 0:
        df_test.loc[i,'ipo_price'] = decode_price(val)

for i,val in enumerate(df_test['raised']):
    if val != 0:
        df_test.loc[i,'raised'] = decode_price(val)

Price before manipulation: $30.6M
Price after manipulation: 30600000
Price before manipulation: $12M
Price after manipulation: 12000000
Price before manipulation: $28M
Price after manipulation: 28000000
Price before manipulation: $5.58M
Price after manipulation: 5580000
Price before manipulation: $4.34M
Price after manipulation: 4340000
Price before manipulation: $13M
Price after manipulation: 13000000
Price before manipulation: $12M
Price after manipulation: 12000000
Price before manipulation: $7.5M
Price after manipulation: 7500000
Price before manipulation: $9.177M
Price after manipulation: 9177000
Price before manipulation: $4.81M
Price after manipulation: 4810000
Price before manipulation: $10.22M
Price after manipulation: 10220000
Price before manipulation: $5.15M
Price after manipulation: 5150000
Price before manipulation: $7M
Price after manipulation: 7000000
Price before manipulation: $31M
Price after manipulation: 31000000
Price before manipulation: $22.74M
Price after manipu

Converting the numeric colums

In [100]:
df_test['raised'] = df_test['raised'].astype('float')
df_test['total_rounds'] = df_test['total_rounds'].astype('int')
df_test['investors'] = df_test['investors'].astype('int')
df_test['ipo_price'] = df_test['ipo_price'].astype('float')

In [110]:
df_test.raised.mean()

203117.0

In [108]:
df_test.shape

(1000, 20)