In [1]:
import pandas as pd


import numpy as np
from sklearn import preprocessing


from lib.m import *
from lib.geo_to_vector import vectorize_geo
from lib.eda_visualization import *

Loading the dataframe

In [2]:
df = pd.read_csv('df_complete.csv').iloc[:,3:]
df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(13048, 2868)

Removing duplicated rows

In [3]:
df_no_dups = remove_duplicates(df,['company_name'])
df_no_dups.shape

(10070, 2868)

Sorting all the columns by groups

In [4]:
drop_cols = ['funding stage','products','raised']
cat_cols = ['company_name','company_about','business model','employees','product stage','status','geographical markets','fund_stage',]
num_cols = ['founded','total_raised','total_rounds', 'investors','ipo_price']
tag_cols = [col for col in df_no_dups.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in df_no_dups.columns if col.startswith('targetmarket_')]
sector_list = [col for col in df_no_dups.columns if col.startswith("sector_")]
target_ind_list = [col  for col in df_no_dups.columns if col.startswith("target_industry_")]
technology_list = [col  for col in df_no_dups.columns if col.startswith("core_technology_")]


<h1>Repairing columns</h1>
<p> Before removing NaN values, we will first identify what values we expect to have in these columns and we will implement it</p>

<h2> Column 'founded'  </h2>
<p>We will change the string representation of founded - "month/year" to an integer "year"</br>
Using REGEX to find the year</p>

In [5]:
# Fixing 'founded' column

print(f'type of "founded" col is : {df.founded.dtype}')

import re
founded_pattern = r"(\d{4})"

for i, val in enumerate(df.founded) :

    year = val.split('/')[-1]
    m = re.search(founded_pattern, year)
    if m:
        df_no_dups.loc[i, 'founded'] = int(m.group(0))
    else:
        df_no_dups.loc[i, 'founded'] = np.nan


df_no_dups.founded = df_no_dups.founded.astype('int')

print('Operation succesfull')
print(f'type of "founded" col is : {df_no_dups.founded.dtype}')

        

type of "founded" col is : object
Operation succesfull
type of "founded" col is : int32


<h2> Column 'geographical markets'</h2>
<p>The column contains a string representing the geographical markets the company aims for</br>
The script lib/geo_to_vector.py calculates the precentage of the market and add it to the dataframe. </p>

In [6]:
# Fixing 'geographical markets' column

print("Geographical markets col:")
print(df_no_dups['geographical markets'].head())
print('----------------------------------------')

print('\nRunning the script...\n')
df_geo_market = vectorize_geo(df_no_dups, 'c')

print('----------------------------------------')
print("Geographical percentage col:")
print(df_geo_market['geo_market_per'].head())

num_cols.append('geo_market_per')
df_geo_market = df_geo_market.drop(['geographical markets'], axis=1)






Geographical markets col:
0    australia, canada, france, india, united kingd...
1                                                  NaN
2                 canada, mexico, spain, united states
3                                global, united states
4    north america, europe, global, france, germany...
Name: geographical markets, dtype: object
----------------------------------------

Running the script...



Current firefox version is 100.0
Get LATEST geckodriver version for 100.0 firefox
Driver [C:\Users\matan\.wdm\drivers\geckodriver\win64\v0.31.0\geckodriver.exe] found in cache


----------------------------------------
Geographical percentage col:
0    0.239989
1         NaN
2    0.068500
3    1.000000
4    1.000000
Name: geo_market_per, dtype: float64


KeyError: "['geographic markets'] not found in axis"

<h1>Handling NaN values</h1>

<p>First, we will check how many null values are in each column</p>

In [16]:
print("missing values:")
for col in df_geo_market.columns[:15]:
    s = df[col].isnull().sum()
    print(f'\t{col}: {s} missing values') 

missing values:
	company_name: 0 missing values
	company_about: 3 missing values
	founded: 0 missing values
	business model: 85 missing values
	employees: 39 missing values
	funding stage: 374 missing values
	raised: 8363 missing values
	product stage: 202 missing values
	status: 0 missing values
	products: 6308 missing values
	fund_stage: 7382 missing values
	total_raised: 7291 missing values
	total_rounds: 7291 missing values
	investors: 7291 missing values
	ipo_price: 12858 missing values


In [None]:

print(f'before :{df.shape}')
new_df = df_geo_market.drop(drop_cols, axis =1)
new_df = remove_missing_str_val_rows(new_df, cat_cols)
new_df = repair_categorical_missing_vals(new_df, cat_cols)
new_df = repair_numeric_missing_vals_zero(new_df, num_cols)


print(f'after :{new_df.shape}')

In [None]:
new_df[tag_cols] = new_df[tag_cols].fillna(0)
new_df[targetmarket_cols] = new_df[targetmarket_cols].fillna(0)
new_df[sector_list] = new_df[sector_list].fillna(0)
new_df[target_ind_list] = new_df[target_ind_list].fillna(0)
new_df[technology_list] = new_df[technology_list].fillna(0)

In [None]:
for col in num_cols:
    new_df[col] = conv_to_float(new_df[col])