In [1]:
import pandas as pd


import numpy as np
from sklearn import preprocessing


from lib.m import *
from lib.geo_to_vector import vectorize_geo
from lib.eda_visualization import *

Loading the dataframe

In [2]:
# Read the complete dataset from disk, then keep everything from the fourth
# column onwards (the first three columns are skipped by position).
# NOTE(review): presumably those three are leftover index columns from earlier
# CSV round-trips — confirm against 'df_complete.csv'.
df = pd.read_csv('df_complete.csv')
df = df.iloc[:, 3:]
df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(13048, 2868)

Removing duplicated rows

In [3]:
# De-duplicate the raw frame with the project helper `remove_duplicates`
# (brought in by one of the star imports above), passing 'company_name' as the key column list.
# NOTE(review): presumably one row is kept per company name — confirm in the helper.
df_no_dups = remove_duplicates(df,['company_name'])
# Show the (rows, columns) shape after de-duplication as the cell output.
df_no_dups.shape

(10070, 2868)

Sorting all the columns by groups

In [4]:
def _cols_with_prefix(frame, prefix):
    """Return the column labels of ``frame`` that begin with ``prefix``, in column order."""
    return [label for label in frame.columns if label.startswith(prefix)]


# Hand-maintained groups: columns to discard, categorical columns, numeric columns.
drop_cols = ['funding stage', 'products', 'raised']
cat_cols = [
    'company_name',
    'company_about',
    'business model',
    'employees',
    'product stage',
    'status',
    'geographical markets',
    'fund_stage',
]
num_cols = ['founded', 'total_raised', 'total_rounds', 'investors', 'ipo_price']

# Groups derived from the column-name prefix of the de-duplicated frame.
tag_cols = _cols_with_prefix(df_no_dups, 'tag_')
targetmarket_cols = _cols_with_prefix(df_no_dups, 'targetmarket_')
sector_list = _cols_with_prefix(df_no_dups, "sector_")
target_ind_list = _cols_with_prefix(df_no_dups, "target_industry_")
technology_list = _cols_with_prefix(df_no_dups, "core_technology_")


<h1>Repairing columns</h1>
<p>Before removing NaN values, we first identify which values each of these columns is expected to hold and convert the columns accordingly.</p>

<h2> Column 'founded'  </h2>
<p>We will convert the string representation of 'founded' ("month/year") into an integer year.<br>
A regular expression is used to extract the four-digit year.</p>

In [5]:
# Fixing 'founded' column
import re  # kept in this cell so it stays runnable on its own; ideally belongs in the top import cell


def _parse_founded_year(value, pattern=r"(\d{4})"):
    """Return the founding year as an int, or NaN when it cannot be determined.

    The text after the last '/' is searched for the first run of four digits,
    so "5/2017" and "2017" both give 2017. Missing values, and text without
    such a run, give NaN instead of raising or being silently skipped.
    """
    if pd.isna(value):
        return np.nan
    found = re.search(pattern, str(value).split('/')[-1])
    return int(found.group(0)) if found else np.nan


print(f'type of "founded" col is : {df_no_dups.founded.dtype}')
print('\nRunning script..')
founded_pattern = r"(\d{4})"
df_founded = df_no_dups.copy()

# map() yields exactly one result per row, so the new column always lines up
# with the frame. (Appending only the matches to a list made the list shorter
# than the frame whenever a value failed to parse.)
# The column is int64 when every row parses and float64 (with NaN) otherwise.
df_founded['founded'] = df_founded['founded'].map(
    lambda value: _parse_founded_year(value, founded_pattern)
)

# Only report when something could not be parsed, so a clean run prints as before.
unparsed_count = int(df_founded['founded'].isna().sum())
if unparsed_count:
    print(f'\n{unparsed_count} "founded" values could not be parsed and were set to NaN')

print('\nOperation succesfull!')
print(f'type of "founded" col is : {df_founded.founded.dtype}')

df_founded.shape
        

type of "founded" col is : object

Running script..

Operation succesfull!
type of "founded" col is : int64


(10070, 2868)

<h2> Column 'geographical markets'</h2>
<p>The column contains a string listing the geographical markets the company aims for.<br>
The script lib/geo_to_vector.py calculates the percentage of the market and adds it to the dataframe.</p>

In [6]:
# Fixing 'geographical markets' column

print("Geographical markets col:")
print(df_founded['geographical markets'].head())
print('----------------------------------------')

print('\nRunning the script...\n')
# vectorize_geo comes from lib/geo_to_vector.py; the result is expected to carry
# a new 'geo_market_per' column (it is read right below).
# NOTE(review): the meaning of the second argument 'c' is defined in the helper — confirm.
# NOTE(review): the saved output reports 10070 input rows but an 11182-long
# 'geo_market_per'; confirm the helper keeps rows aligned with the input index.
df_geo_market = vectorize_geo(df_founded, 'c')

print('\n----------------------------------------')
print('\nOperation succesfull!')
print("Geographical percentage col:")
print(df_geo_market['geo_market_per'].head())

# Register the new numeric column only once, so re-running this cell does not
# keep appending duplicate entries to num_cols.
if 'geo_market_per' not in num_cols:
    num_cols.append('geo_market_per')
# The raw text column is no longer needed once the percentage has been derived.
df_geo_market = df_geo_market.drop(['geographical markets'], axis=1)
# Discard rows that have no company name.
df_geo_market = df_geo_market.dropna(subset=['company_name'])

Geographical markets col:
0    australia, canada, france, india, united kingd...
1                                                  NaN
2                 canada, mexico, spain, united states
3                                global, united states
4    north america, europe, global, france, germany...
Name: geographical markets, dtype: object
----------------------------------------

Running the script...





Current google-chrome version is 101.0.4951
Get LATEST chromedriver version for 101.0.4951 google-chrome
Driver [C:\Users\matan\.wdm\drivers\chromedriver\win32\101.0.4951.41\chromedriver.exe] found in cache


shape of df['geographical markets']: (10070,)
len of countries list: 161
len of country_pop list: 235
22 supposed to equal to 22
north america 373182759
europe 748490877
africa 1404117104
asia 4716763186
central america 183663150
south america 437549336
southeast asia 681251186
northern europe 107195976
oceania 43733336
americas 1038295491
southern africa 69181287
west asia 287946147
east asia 1688461673
western europe 197327365
southern europe 151779605
eastern europe 292201534
western africa 422257672
eastern africa 467911256
central asia 76509631
south asia 1982820548
northern africa 254844957
middle africa 189996556
shape of df['geo_market_per']: (11182,)

----------------------------------------

Operation succesfull!
Geographical percentage col:
0    0.239985
1         NaN
2    0.068499
3    1.000000
4    1.000000
Name: geo_market_per, dtype: float64


<h1> # Checkpoint : save the new df # </h1>

In [13]:
# Display the processed frame as the cell output (the saved rendering below is truncated with "..." rows/columns).
df_geo_market


Unnamed: 0,company_name,company_about,founded,business model,employees,funding stage,raised,product stage,status,products,...,targetmarket_public-transportation,targetmarket_smart-mobility,tag_hydraulic-drive,tag_hud,tag_simulation-software,tag_luggage,tag_traffic-violations,tag_car-audio,tag_trip,geo_market_per
0,Tastewise,Tastewise is an AI platform designed to help f...,2017.0,B2B,51-200,ROUND A,$21.5M,Released,active,tastewise,...,,,,,,,,,,0.239985
1,Wilk Technologies,Wilk is dedicated to revolutionizing the dairy...,2018.0,"B2B, B2B2C",11-50,Public,$4.69M,R&D,active,,...,,,,,,,,,,
2,Eco Pack Green Box,Eco Pack Green Box has developed and patented ...,2008.0,B2B,11-50,Revenue Financed,,Released,not_active,,...,,,,,,,,,,0.068499
3,BeeHero,BeeHero has developed a platform that can pred...,2017.0,B2B,1-10,ROUND A,$24M,Released,active,,...,,,,,,,,,,1.000000
4,Cham Foods,Cham Foods is a multinational company with man...,1970.0,"B2B, B2B2C",11-50,Public,,Released,active,,...,,,,,,,,,,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13041,Intelligent Smart Ideas,Intelligent Smart Ideas is the creator of iSaf...,2017.0,"B2B, B2G",1-10,Bootstrapped,,R&D,active,"isafecross, ibabysafe+, ibabycrytranslator",...,,,,,,,,,,
13042,MappMakers,MappMakers has an algorithm that can take the ...,2016.0,B2C,1-10,,,,not_active,,...,,,,,,,,,,
13044,LYNX Smartcars,LYNX is developing software for connected and ...,2016.0,B2B,1-10,Bootstrapped,,R&D,not_active,,...,,,,,,,,,,
13045,Deeyook Location Technologies,Deeyook seeks to redefine location technology ...,2017.0,B2B,11-50,Seed,,Released,active,,...,,,,,,,,,,


In [None]:
# Checkpoint: write the processed frame to 'cp1.csv' in the working directory.
# to_csv is called with its defaults, so the index is written as the first column of the file.
df_geo_market.to_csv('cp1.csv')
# To resume from the checkpoint instead of re-running the cells above, uncomment
# the two lines below; .iloc[:,1:] skips the first column of the file on reload.
# df_geo_market = pd.read_csv('cp1.csv').iloc[:,1:]
# df_geo_market

<h1>Handling NaN values</h1>

<p>First, we will check how many null values are in each column of the non-binary columns</p>

In [None]:
# Count nulls per non-binary column on the processed frame (df_geo_market),
# i.e. the frame the following cells actually clean.
print("missing values:")
for col in num_cols + cat_cols:
    # Some listed columns may no longer exist (e.g. 'geographical markets' is
    # dropped once 'geo_market_per' has been derived); report them instead of
    # failing with a KeyError.
    if col not in df_geo_market.columns:
        print(f'\t{col}: column not present')
        continue
    s = df_geo_market[col].isnull().sum()
    print(f'\t{col}: {s} missing values')

In [None]:

# Report the shape of the frame that is actually being cleaned, so the
# before/after comparison refers to the same data lineage.
print(f'before :{df_geo_market.shape}')
new_df = df_geo_market.drop(drop_cols, axis =1)

# cat_cols still lists 'geographical markets', which was dropped once
# 'geo_market_per' was derived; pass only the categorical columns that exist.
present_cat_cols = [col for col in cat_cols if col in new_df.columns]

# Project helpers (star-imported); names suggest: drop rows missing string
# values, then fill categorical gaps, then fill numeric gaps with zero.
# NOTE(review): confirm exact semantics in the lib modules.
new_df = remove_missing_str_val_rows(new_df, present_cat_cols)
new_df = repair_categorical_missing_vals(new_df, present_cat_cols)
new_df = repair_numeric_missing_vals_zero(new_df, num_cols)


print(f'after :{new_df.shape}')

In [None]:
# Replace missing entries with 0 in every prefix-derived column group,
# one group at a time and in the same order as before.
for group_cols in (tag_cols, targetmarket_cols, sector_list, target_ind_list, technology_list):
    new_df[group_cols] = new_df[group_cols].fillna(0)

In [None]:
# Run every numeric column through the project helper `conv_to_float`
# and store the result back under the same column name.
for numeric_col in num_cols:
    converted = conv_to_float(new_df[numeric_col])
    new_df[numeric_col] = converted