In [100]:
import pandas as pd


import numpy as np
from sklearn import preprocessing


from lib.preprop import *
from lib.geo_to_vector import vectorize_geo
from lib.eda_visualization import *

Loading the dataframe

In [127]:
# Read the complete dataset, then discard its first three columns.
# NOTE(review): the three leading columns are presumably leftover index/id
# columns from earlier exports — confirm before relying on the positional slice.
df = pd.read_csv('data/dataframes/df_complete.csv')
df = df.iloc[:, 3:]
df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(13048, 2868)

Removing duplicated rows

In [128]:
# Drop duplicated rows with the project helper, passing 'company_name' as the key column.
dedup_keys = ['company_name']
df_no_dups = remove_duplicates(df, dedup_keys)
df_no_dups.shape

(10070, 2868)

Sorting all the columns by groups

In [129]:
# Column groups used throughout the notebook.
# 'funding stage' (not 'fund_stage') is the categorical column the later cells
# work with; 'fund_stage' is dropped further down as a redundant column.
cat_cols = ['company_name', 'company_about', 'founded', 'business model', 'employees',
            'product stage', 'status', 'geographical markets', 'funding stage']
num_cols = ['total_raised', 'total_rounds', 'investors', 'ipo_price']

# Binary (one-hot style) groups are identified by their column-name prefix.
source_columns = list(df_no_dups.columns)
tag_cols = [col for col in source_columns if col.startswith('tag_')]
targetmarket_cols = [col for col in source_columns if col.startswith('targetmarket_')]
sector_list = [col for col in source_columns if col.startswith("sector_")]
target_ind_list = [col for col in source_columns if col.startswith("target_industry_")]
technology_list = [col for col in source_columns if col.startswith("core_technology_")]


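Renaming the `target_industry_*` and `core_technology_*` columns to `industry_*` and `technology_*`, so that every column-group prefix is a single word followed by one underscore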

In [130]:

# Shorten the two-word prefixes so that every binary-column group is identified
# by a single word followed by one underscore.
PREFIX_RENAMES = {
    "target_industry_": "industry_",
    "core_technology_": "technology_",
}

# Build the rename map directly from the frame's current columns and slice by
# the actual prefix length (no hard-coded offset). Once the columns have been
# renamed nothing matches the old prefixes any more, so re-running this cell
# is a harmless no-op instead of mangling the already-renamed names.
prefix_rename_map = {
    col: new_prefix + col[len(old_prefix):]
    for old_prefix, new_prefix in PREFIX_RENAMES.items()
    for col in df_no_dups.columns
    if col.startswith(old_prefix)
}
df_no_dups = df_no_dups.rename(columns=prefix_rename_map)

# Refresh the group lists so they carry the new column names.
target_ind_list = [col for col in df_no_dups.columns if col.startswith("industry_")]
technology_list = [col for col in df_no_dups.columns if col.startswith("technology_")]
bin_cols = tag_cols + targetmarket_cols + sector_list + target_ind_list + technology_list



<h1>Repairing columns</h1>
<p>Before handling NaN values, we first define which values we expect in each of these columns and convert the columns accordingly.</p>

<h2> Column 'founded'  </h2>
<p>We will change the string representation of founded ("month/year") to an integer year.<br>
A regular expression is used to find the four-digit year.</p>

In [132]:
# Fixing 'founded' column
# Converts the "month/year" strings of 'founded' into an integer year, writing
# the result into a copy of the de-duplicated frame (df_founded).

print(f'type of "founded" col is : {df_no_dups.founded.dtype}')
print('\nRunning script..')
import re  # NOTE(review): belongs in the import cell at the top of the notebook

founded_pattern = r"(\d{4})"


def _founded_year(val):
    """Return the first 4-digit run in the last '/'-separated part of `val`, or None.

    `val` is converted with str() first, so NaN and values that are already
    numeric do not raise.
    """
    m = re.search(founded_pattern, str(val).split('/')[-1])
    return int(m.group(0)) if m else None


df_founded = df_no_dups.copy()

# One result per row, aligned on the frame's index. Rows without a parsable
# year become <NA> instead of being skipped, which previously made the result
# shorter than the frame and broke the column assignment.
years = pd.Series(
    [_founded_year(val) for val in df_founded['founded']],
    index=df_founded.index,
    dtype='Int64',
)

n_unparsed = int(years.isna().sum())
if n_unparsed:
    print(f'\nWarning: {n_unparsed} "founded" values contain no 4-digit year and were left missing')
    df_founded['founded'] = years
else:
    # Every row parsed: store plain int64.
    df_founded['founded'] = years.astype('int64')


print('\nOperation succesfull!')
print(f'type of "founded" col is : {df_founded.founded.dtype}')

df_founded.shape
        

type of "founded" col is : object

Running script..

Operation succesfull!
type of "founded" col is : int64


(10070, 2868)

<h2> Column 'geographical markets'</h2>
<p>The column contains a string listing the geographical markets the company targets.<br>
The script lib/geo_to_vector.py calculates the percentage of the market and adds it to the dataframe as the 'geo_market_per' column.</p>

In [133]:
# Fixing 'geographical markets' column
# Replaces the free-text 'geographical markets' column with the numeric
# 'geo_market_per' column produced by the project helper vectorize_geo.

print("Geographical markets col:")
print(df_founded['geographical markets'].head())
print('----------------------------------------')

print('\nRunning the script...\n')
# NOTE(review): the meaning of the 'c' argument is defined in lib/geo_to_vector.py
# and is not visible here. The recorded output contains chromedriver log lines,
# so the helper presumably launches a browser — confirm.
df_geo_market = vectorize_geo(df_founded, 'c')

print('\n----------------------------------------')
print('Operation succesfull!\n')
print("Geographical percentage col:")
print(df_geo_market['geo_market_per'].head())

# Keep the column-group lists in sync with the frame. The guards make this
# cell safe to re-run: an unconditional append would duplicate the entry and
# an unconditional remove would raise ValueError on the second run.
if 'geo_market_per' not in num_cols:
    num_cols.append('geo_market_per')
if 'geographical markets' in cat_cols:
    cat_cols.remove('geographical markets')

df_geo_market = df_geo_market.drop(['geographical markets'], axis=1)
df_geo_market = df_geo_market.dropna(subset=['company_name'])





Geographical markets col:
0    australia, canada, france, india, united kingd...
1                                                  NaN
2                 canada, mexico, spain, united states
3                                global, united states
4    north america, europe, global, france, germany...
Name: geographical markets, dtype: object
----------------------------------------

Running the script...



Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [C:\Users\matan\.wdm\drivers\chromedriver\win32\102.0.5005.61\chromedriver.exe] found in cache



----------------------------------------
Operation succesfull!

Geographical percentage col:
0    0.239716
1         NaN
2    0.068422
3    1.000000
4    1.000000
Name: geo_market_per, dtype: float64


# Checkpoint 1 : save the new df

In [134]:
# Persist the frame so later sessions can resume from this point.
# The index is written as the first CSV column, which is why the reload
# snippet below skips one column.
cp1_path = 'data/dataframes/cp1.csv'
df_geo_market.to_csv(cp1_path)
# To resume from the checkpoint instead of recomputing:
# df_geo_market = pd.read_csv('data/dataframes/cp1.csv').iloc[:,1:]
df_geo_market.shape

(10070, 2868)

In [121]:
# Re-declare the column groups against df_geo_market (needed when resuming
# from checkpoint 1, where the earlier cells were not run).
cat_cols = ['company_name','company_about', 'founded', 'business model','employees','product stage','status','funding stage']
num_cols = ['total_raised','total_rounds', 'investors','ipo_price','geo_market_per']

# Binary groups are selected by column-name prefix, in frame order.
geo_columns = df_geo_market.columns
tag_cols = [name for name in geo_columns if name.startswith('tag_')]
targetmarket_cols = [name for name in geo_columns if name.startswith('targetmarket_')]
sector_list = [name for name in geo_columns if name.startswith("sector_")]
target_ind_list = [name for name in geo_columns if name.startswith("industry_")]
technology_list = [name for name in geo_columns if name.startswith("technology_")]


<h1>Handling NaN values</h1>

<p>First, we will check how many null values there are in each of the non-binary columns</p>

In [116]:
# Report the number of null entries in every numeric and categorical column.
print("missing values:")
cols_to_check = num_cols + cat_cols
for name in cols_to_check:
    n_missing = df_geo_market[name].isna().sum()
    print(f'\t{name}: {n_missing} missing values')

missing values:
	total_raised: 5704 missing values
	total_rounds: 5704 missing values
	investors: 5704 missing values
	ipo_price: 9920 missing values
	company_name: 0 missing values
	company_about: 2 missing values
	founded: 0 missing values
	business model: 68 missing values
	employees: 32 missing values
	product stage: 163 missing values
	status: 0 missing values
	funding stage: 292 missing values


In [117]:
# Show the current contents of the numeric and categorical column lists.
print(f"num cols: {num_cols}\ncat cols: {cat_cols}")

num cols: ['total_raised', 'total_rounds', 'investors', 'ipo_price']
cat cols: ['company_name', 'company_about', 'founded', 'business model', 'employees', 'product stage', 'status', 'funding stage']


In [119]:
# Drop three columns and run the project's missing-value helpers on the
# categorical columns, printing the frame shape before and after.
print(f'before :{df_geo_market.shape}')

# NOTE(review): 'fund_stage' is presumably a redundant variant of 'funding stage'
# (which stays in cat_cols) — confirm before dropping.
drop_cols = ['fund_stage','products','raised']
new_df = df_geo_market.drop(drop_cols, axis =1)

# Both helpers are project code whose source is not visible in this notebook.
# NOTE(review): judging by the recorded outputs, no rows are removed
# (10070 before and after) and categorical gaps end up as the string 'na'
# — confirm against lib.preprop.
new_df = remove_missing_str_val_rows(new_df, cat_cols)
new_df = repair_categorical_missing_vals(new_df, cat_cols)

# Numeric repair is currently disabled.
# NOTE(review): a later recorded output reports 0 missing values in the numeric
# columns, which suggests these lines were active in an earlier run — confirm
# whether they should be re-enabled.
# new_df = repair_numeric_missing_vals_zero(new_df, [col for col in num_cols if col != 'geo_market_per'])
# new_df = repair_numeric_missing_vals_median(new_df, ['geo_market_per'])

print(f'after :{new_df.shape}')

before :(10070, 2868)
after :(10070, 2865)


In [122]:
# Pass every numeric column through the project helper conv_to_float and store
# the result back into new_df.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: 'int' object is not subscriptable". The helper's source is not
# visible here, so verify what input it expects before relying on this cell.
for col in num_cols:
    new_df[col] = conv_to_float(new_df[col])

TypeError: 'int' object is not subscriptable

<h3> Dealing with NaN in binary columns  </h3>

In [58]:
# A missing value in a binary indicator column is treated as 0.
# The groups are processed one at a time, in the same order as before.
for group in (tag_cols, targetmarket_cols, sector_list, target_ind_list, technology_list):
    new_df[group] = new_df[group].fillna(0)

In [59]:
# Sanity check: list the remaining null count of every non-binary column.
print("Checking missing values in non binary columns...\n")

for name in [*num_cols, *cat_cols]:
    n_missing = new_df[name].isnull().sum()
    print(f'{name}: {n_missing} missing values')

Checking missing values in non binary columns...

total_raised: 0 missing values
total_rounds: 0 missing values
investors: 0 missing values
ipo_price: 0 missing values
geo_market_per: 0 missing values
company_name: 0 missing values
company_about: 0 missing values
founded: 0 missing values
business model: 0 missing values
employees: 0 missing values
product stage: 0 missing values
status: 0 missing values
funding stage: 0 missing values


In [61]:
# Sanity check: report every binary column that still contains nulls, or a
# single confirmation line when none does.
bin_cols = tag_cols + targetmarket_cols + sector_list + target_ind_list + technology_list
print("Checking missing values in binary columns...\n")

null_counts = [(name, new_df[name].isnull().sum()) for name in bin_cols]
with_nulls = [(name, count) for name, count in null_counts if count != 0]

for name, count in with_nulls:
    print(f'{name}: {count} missing values')

are_missing = bool(with_nulls)
if not are_missing:
    print("No Nan values in binary columns!")


Checking missing values in binary columns...

No Nan values in binary columns!


<h1>Checkpoint 2: save the new df</h1>


In [62]:
# Persist the NaN-repaired frame so later sessions can resume from this point.
# The index is written as the first CSV column, which is why the reload
# snippet below skips one column.
cp2_path = 'data/dataframes/cp2.csv'
new_df.to_csv(cp2_path)
# To resume from the checkpoint instead of recomputing:
# new_df = pd.read_csv('data/dataframes/cp2.csv').iloc[:,1:]
# new_df.shape

In [63]:
# Defining cols

def _with_prefix(prefix):
    """Return the names of new_df's columns that start with `prefix`, in frame order."""
    return [name for name in new_df.columns if name.startswith(prefix)]


# Re-declare the column groups against new_df (needed when resuming from
# checkpoint 2, where the earlier cells were not run).
cat_cols = ['company_name','company_about', 'founded', 'business model','employees','product stage','status','funding stage']
num_cols = ['total_raised','total_rounds', 'investors','ipo_price', 'geo_market_per']
tag_cols = _with_prefix('tag_')
targetmarket_cols = _with_prefix('targetmarket_')
sector_list = _with_prefix("sector_")
target_ind_list = _with_prefix("industry_")
technology_list = _with_prefix("technology_")
bin_cols = tag_cols + targetmarket_cols + sector_list + target_ind_list + technology_list

<h1> Categorical columns </h1>

<p> First, we need to explore the categorical columns:</p>

In [64]:
# Work on a copy so the encoding steps below leave new_df untouched, then
# summarise the categorical columns.
cat_df = new_df.copy()
cat_df.loc[:, cat_cols].describe(include='all')

Unnamed: 0,company_name,company_about,founded,business model,employees,product stage,status,funding stage
count,10070,10070,10070.0,10070,10070,10070,10070,10070
unique,10070,10015,,16,6,7,2,11
top,Tastewise,This company is a known business entity but la...,,B2B,1-10,Released,active,Bootstrapped
freq,1,48,,4743,6232,6674,6554,2810
mean,,,2011.537736,,,,,
std,,,10.052327,,,,,
min,,,1901.0,,,,,
25%,,,2010.0,,,,,
50%,,,2014.0,,,,,
75%,,,2017.0,,,,,


<p>We see that the [employees, business model, status, product stage and funding stage] columns<br>
have only a few unique values, therefore we can encode them.</p>

In [65]:
# Employees col
print("Value counts for 'employees' column:\n")
print(cat_df.employees.value_counts())

print("\nApplying LabelEncoder.\n")

le = preprocessing.LabelEncoder()
cat_df.employees = le.fit_transform(cat_df.employees)
print("New value counts for 'employees' column:\n")
print(cat_df.employees.value_counts())

Value counts for 'employees' column:

1-10       6232
11-50      2683
51-200      822
201-500     189
500+        112
na           32
Name: employees, dtype: int64

Applying LabelEncoder.

New value counts for 'employees' column:

0    6232
1    2683
4     822
2     189
3     112
5      32
Name: employees, dtype: int64


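The remaining low-cardinality columns (business model, product stage, funding stage) are label-encoded, and status is mapped to 1 (active) / 0 (not_active)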

In [66]:
# Label-encode the remaining low-cardinality columns and print the class order,
# because later cells refer to categories by their integer code.
cols = ['business model', 'product stage', 'funding stage']

for col in cols:
    le = preprocessing.LabelEncoder()
    cat_df[col] = le.fit_transform(cat_df[col])
    print(f'{col} : {le.classes_}')

# Map status to 1/0. The result is assigned back to the column: an in-place
# replace on `cat_df.status` is a chained operation that does not update the
# frame when pandas Copy-on-Write is active. infer_objects() keeps the column
# integer-typed on pandas versions where replace no longer downcasts.
replace_map = {'active' : 1, 'not_active' : 0}
cat_df['status'] = cat_df['status'].replace(replace_map).infer_objects()

business model : ['B2B' 'B2B, B2B2C' 'B2B, B2C' 'B2B, B2C, B2B2C' 'B2B, B2C, B2G'
 'B2B, B2C, B2G, B2B2C' 'B2B, B2G' 'B2B, B2G, B2B2C' 'B2B2C' 'B2C'
 'B2C, B2B2C' 'B2C, B2G' 'B2C, B2G, B2B2C' 'B2G' 'B2G, B2B2C' 'na']
product stage : ['Alpha' 'Beta' 'Clinical Trial' 'Customer development' 'R&D' 'Released'
 'na']
funding stage : ['Acquired' 'Bootstrapped' 'Established' 'Pre-Seed' 'Public' 'ROUND A'
 'ROUND B' 'ROUND C+' 'Revenue Financed' 'Seed' 'na']


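Next step: derive the target column 'suceeded', which marks a company as successful based on its status, product stage, funding stage and total amount raised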

In [98]:
# add suceeded column
success_rate = 4000000

PRODUCT_SUCCESS = (cat_df['status'] == 1) & (cat_df['product stage'] == 5)
FUNDING_SUCCESS = (cat_df['funding stage'] == 1) | ((cat_df['funding stage'] == 7) &
                  (cat_df['status'] == 1)) | ((cat_df['total_raised'] >= success_rate) &
                  (cat_df['status'] == 1))

cat_df.loc[PRODUCT_SUCCESS | FUNDING_SUCCESS , 'suceeded'] = 1
cat_df.loc[~PRODUCT_SUCCESS & ~FUNDING_SUCCESS, 'suceeded'] = 0

cat_cols.append('suceeded')
print(f'Total succeeded companies: {cat_df.suceeded.sum()}')

Total succeeded companies: 7009.0


<h1>Final cleaned dataframe</h1>
<p>Saving the df as a csv file with the columns in the following order:<br>
cat_cols, num_cols, tag_cols, targetmarket_cols, sector_list, target_ind_list, technology_list</p>

In [97]:
# Final cleaned non-binary cols

# Final cleaned frame: categorical, numeric and binary columns, in that order.
# dict.fromkeys drops repeated names while keeping first-seen order, so a name
# that was added to a group list more than once cannot duplicate a column.
final_cols = list(dict.fromkeys(
    cat_cols + num_cols + tag_cols + targetmarket_cols + sector_list + target_ind_list + technology_list
))
final_df = pd.DataFrame(cat_df, columns=final_cols)
final_df.to_csv('data/dataframes/final_cleaned.csv')

In [99]:
# Display the final frame; the recorded output shows a truncated preview of it.
final_df

Unnamed: 0,company_name,company_about,founded,business model,employees,product stage,status,funding stage,suceeded,suceeded.1,...,core_technology_Quantum Computing,core_technology_Data Storage_Flash,core_technology_Machinery & Robotics_Cobots,core_technology_Sensing_SWIR,core_technology_Materials & Substances_Adhesive,core_technology_Materials & Substances_Nonwoven Fabric,core_technology_Materials & Substances_Insulation,core_technology_Sensing_Scent,core_technology_Machinery & Robotics_Exoskeleton,core_technology_Materials & Substances_Implants
0,Tastewise,Tastewise is an AI platform designed to help f...,2017,0,4,5,1,5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Wilk Technologies,Wilk is dedicated to revolutionizing the dairy...,2018,1,1,4,1,4,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Eco Pack Green Box,Eco Pack Green Box has developed and patented ...,2008,0,1,5,0,8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BeeHero,BeeHero has developed a platform that can pred...,2017,0,0,5,1,5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Cham Foods,Cham Foods is a multinational company with man...,1970,1,1,5,1,4,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10065,Intelligent Smart Ideas,Intelligent Smart Ideas is the creator of iSaf...,2017,6,0,4,1,1,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10066,MappMakers,MappMakers has an algorithm that can take the ...,2016,9,0,6,0,10,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10067,LYNX Smartcars,LYNX is developing software for connected and ...,2016,0,0,4,0,1,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10068,Deeyook Location Technologies,Deeyook seeks to redefine location technology ...,2017,0,1,5,1,9,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
