In [7]:
import pandas as pd


import numpy as np
from sklearn import preprocessing


from lib.m import *
from lib.geo_to_vector import vectorize_geo
from lib.eda_visualization import *

Loading the dataframe

In [8]:
df = pd.read_csv('df_complete.csv').iloc[:,3:]
df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(13048, 2868)

Removing duplicated rows

In [9]:
new_df = remove_duplicates(df,['company_name'])
new_df.shape

(10070, 2868)

Sorting all the columns by groups

In [10]:
drop_cols = ['funding stage','products','raised']
cat_cols = ['company_name','company_about','founded','business model','employees','product stage','status','geographical markets','fund_stage',]
num_cols = ['total_raised','total_rounds', 'investors','ipo_price']
tag_cols = [col for col in new_df.columns if col.startswith('tag_')]
targetmarket_cols = [col for col in new_df.columns if col.startswith('targetmarket_')]
sector_list = [col for col in new_df.columns if col.startswith("sector_")]
target_ind_list = [col  for col in new_df.columns if col.startswith("target_industry_")]
technology_list = [col  for col in new_df.columns if col.startswith("core_technology_")]


<h1>Repairing columns</h1>
<p> Before removing NaN values, we will first identify what values we expect to have in these columns and we will implement it</p>

<h2> Column 'founded'  </h2>
<p>We will change the string representation of founded - "month/year" to an integer "year"</br>
Using REGEX to find the year</p>

In [11]:
# Fixing founded column

print(f'type of "founded" col is : {df.founded.dtype}')

import re
founded_pattern = r"(\d{4})"

for i, val in enumerate(df.founded) :

    year = val.split('/')[-1]
    m = re.search(founded_pattern, year)
    if m:
        new_df.loc[i, 'founded'] = int(m.group(0))
    else:
        new_df.loc[i, 'founded'] = np.nan


new_df.founded = new_df.founded.astype('int')

print('Operation succesfull')
print(f'type of "founded" col is : {new_df.founded.dtype}')

        

type of "founded" col is : object
Operation succesfull
type of "founded" col is : int32


<h1> Column 'geographical markets'</h1>
<p>The column contains a string representing the geographical markets the company aims for</br>
The script lib/geo_to_vector.py calculates the precentage of the markets share of the company. </p>

In [13]:
print("Geographical markets col:")
print(new_df['geographical markets'].head())
print('----------------------------------------')

print('\nRunning the script...\n')
new_df = vectorize_geo(new_df)

print('----------------------------------------')
print("Geographical percentage col:")
print(new_df['geo_market_per'].head())






Geographical markets col:
----------------------------------------

Running the script...



Current google-chrome version is 101.0.4951
Get LATEST chromedriver version for 101.0.4951 google-chrome
Driver [C:\Users\matan\.wdm\drivers\chromedriver\win32\101.0.4951.41\chromedriver.exe] found in cache


----------------------------------------
Geographical percentage col:
----------------------------------------


Handling NaN values

In [None]:
for col in df.columns[2:16]:
    s = df[col].isnull().sum()
    print(f'{col}: {s} missing values') 

In [None]:

print(f'before :{df.shape}')
new_df = new_df.drop(drop_cols, axis =1)
new_df = remove_missing_str_val_rows(new_df, cat_cols)
new_df = repair_categorical_missing_vals(new_df, cat_cols)
new_df = repair_numeric_missing_vals_zero(new_df, num_cols)


print(f'after :{new_df.shape}')

In [None]:
new_df[tag_cols] = new_df[tag_cols].fillna(0)
new_df[targetmarket_cols] = new_df[targetmarket_cols].fillna(0)
new_df[sector_list] = new_df[sector_list].fillna(0)
new_df[target_ind_list] = new_df[target_ind_list].fillna(0)
new_df[technology_list] = new_df[technology_list].fillna(0)

In [None]:
for col in num_cols:
    new_df[col] = conv_to_float(new_df[col])

In [None]:
new_df.founded

In [None]:
for col in df.columns[2:16]:
    print(get_frequent_elements(df,col,5))
    print("----------------")

In [None]:
df.iloc[:,:13].info()
