In [1]:
# import modules
import pandas as pd
import numpy as np

# this function is used later on to format data. e.g. 75k converted to 75000 etc.
def convert_to_number(x):
    val = 0
    if x is np.nan:
        return np.nan
    elif 'k' in x or 'K' in x:
        val = round(float(x.replace('k', '').replace('K','')) * 1000, 2) # convert k or K  to a thousand
    elif 'm' in x or 'M' in x:            
        val = round(float(x.replace('m', '').replace('M','')) * 1000000, 2) # convert m or M to a million
    elif 'b' in x or 'B' in x:
        val = round(float(x.replace('b', '').replace('B','')) * 1000000000, 2) # convert b or B to a Billion
    else:
        val = round(float(x), 2) 
    print(val)
    return val

In [2]:
# read csv file data into a dataframe
df = pd.read_csv('vc1.csv') 

In [3]:
# initial preview of first five rows
df.head()

Unnamed: 0,Continent,Country,Region,State,Name of VC,Year of Establishment,Type of Investor,Website,Postal Address,Pincode,...,Sectors dealt with,Investment Stage,Investment Amount Minimum,Investment Amount Maximum,Previously Invested Total Amount,Total number of companies funded so far,Latitude,Longitude,Code,Unnamed: 22
0,North America,USA,South-Eastern,Alabama,Accomplice,2015,Private Equity Firm,www.accomplice.co,"25 FIRST STREET, SUITE 303 CAMBRIDGE, MA 02141...",2141.0,...,"cybersecurity,\n eSports\n, data analytics, \n...",Startups,785K,150M,405M,180.0,32.31823,-86.902298,AL,
1,North America,USA,South,Alabama,Bonaventure Capital,1998,Venture Capital,http://www.bonaventurecapital.net/,820 Shades Creek Parkway\n \n Suite 1200\n \n ...,35209.0,...,"internet services,\n internet advertising, \n...",Early Stage,4.4M,33M,83M,8.0,32.31823,-86.902298,AL,
2,North America,United States of America,South,Argentina,Kaszek Ventures,2011,Venture Capital,http://www.kaszek.com/,,,...,,Seed \nEarly Stage Venture\nLate Stage Venture,1.7M,50M,165.2M,68.0,-34.603722,-58.381592,AR,
3,North America,USA,West,Arizona,Beechtree Capital,1994,private,http://www.beechtreecapital.com/,34522 North scottsddale road suite Arizona 85266,85266.0,...,"entertainment \n,sports,\necological,\nports,\...",Early Stage,4.4M,41.6M,83.5M,4.0,34.048927,-111.093731,AZ,
4,North America,USA,West,Arizona,"Diamond State Ventures, L.P.",1999,Venture Capital,http://new.diamondstateventures.com/,Diamond State Ventures\n Suite 400\n Little Ro...,72201.0,...,"manufacturing, \n business services,\n media,...",Early Stage,4M,15M,26M,4.0,34.048928,-111.093732,AZ,


In [4]:
# remove last unnamed column which is not part of our dataset
df.drop(df.columns[len(df.columns)-1], axis=1, inplace=True) 

In [5]:
# replace spaces and empty fields with NaN
df.replace(r'\s+', np.nan, regex=True).replace('',np.nan)

# dataframe after dropping last column
df.head()

Unnamed: 0,Continent,Country,Region,State,Name of VC,Year of Establishment,Type of Investor,Website,Postal Address,Pincode,...,Email Address,Sectors dealt with,Investment Stage,Investment Amount Minimum,Investment Amount Maximum,Previously Invested Total Amount,Total number of companies funded so far,Latitude,Longitude,Code
0,North America,USA,South-Eastern,Alabama,Accomplice,2015,Private Equity Firm,www.accomplice.co,"25 FIRST STREET, SUITE 303 CAMBRIDGE, MA 02141...",2141.0,...,hello@accomplice.co,"cybersecurity,\n eSports\n, data analytics, \n...",Startups,785K,150M,405M,180.0,32.31823,-86.902298,AL
1,North America,USA,South,Alabama,Bonaventure Capital,1998,Venture Capital,http://www.bonaventurecapital.net/,820 Shades Creek Parkway\n \n Suite 1200\n \n ...,35209.0,...,info@bonaventurecapital.net,"internet services,\n internet advertising, \n...",Early Stage,4.4M,33M,83M,8.0,32.31823,-86.902298,AL
2,North America,United States of America,South,Argentina,Kaszek Ventures,2011,Venture Capital,http://www.kaszek.com/,,,...,\ninfo@kaszek.com,,Seed \nEarly Stage Venture\nLate Stage Venture,1.7M,50M,165.2M,68.0,-34.603722,-58.381592,AR
3,North America,USA,West,Arizona,Beechtree Capital,1994,private,http://www.beechtreecapital.com/,34522 North scottsddale road suite Arizona 85266,85266.0,...,,"entertainment \n,sports,\necological,\nports,\...",Early Stage,4.4M,41.6M,83.5M,4.0,34.048927,-111.093731,AZ
4,North America,USA,West,Arizona,"Diamond State Ventures, L.P.",1999,Venture Capital,http://new.diamondstateventures.com/,Diamond State Ventures\n Suite 400\n Little Ro...,72201.0,...,jhays@dsvlp.com,"manufacturing, \n business services,\n media,...",Early Stage,4M,15M,26M,4.0,34.048928,-111.093732,AZ


In [6]:
# number of row entries and columns in our dataframe
df.shape

(1949, 22)

In [7]:
# convert all strings to lower case
df = df.applymap(lambda s:s.lower() if type(s) == str else s)

In [8]:
# see if strings are converted to lowercase
df.head()

Unnamed: 0,Continent,Country,Region,State,Name of VC,Year of Establishment,Type of Investor,Website,Postal Address,Pincode,...,Email Address,Sectors dealt with,Investment Stage,Investment Amount Minimum,Investment Amount Maximum,Previously Invested Total Amount,Total number of companies funded so far,Latitude,Longitude,Code
0,north america,usa,south-eastern,alabama,accomplice,2015,private equity firm,www.accomplice.co,"25 first street, suite 303 cambridge, ma 02141...",2141.0,...,hello@accomplice.co,"cybersecurity,\n esports\n, data analytics, \n...",startups,785k,150m,405m,180.0,32.31823,-86.902298,al
1,north america,usa,south,alabama,bonaventure capital,1998,venture capital,http://www.bonaventurecapital.net/,820 shades creek parkway\n \n suite 1200\n \n ...,35209.0,...,info@bonaventurecapital.net,"internet services,\n internet advertising, \n...",early stage,4.4m,33m,83m,8.0,32.31823,-86.902298,al
2,north america,united states of america,south,argentina,kaszek ventures,2011,venture capital,http://www.kaszek.com/,,,...,\ninfo@kaszek.com,,seed \nearly stage venture\nlate stage venture,1.7m,50m,165.2m,68.0,-34.603722,-58.381592,ar
3,north america,usa,west,arizona,beechtree capital,1994,private,http://www.beechtreecapital.com/,34522 north scottsddale road suite arizona 85266,85266.0,...,,"entertainment \n,sports,\necological,\nports,\...",early stage,4.4m,41.6m,83.5m,4.0,34.048927,-111.093731,az
4,north america,usa,west,arizona,"diamond state ventures, l.p.",1999,venture capital,http://new.diamondstateventures.com/,diamond state ventures\n suite 400\n little ro...,72201.0,...,jhays@dsvlp.com,"manufacturing, \n business services,\n media,...",early stage,4m,15m,26m,4.0,34.048928,-111.093732,az


In [9]:
# missing values in each column - How sparse is our data ?
df.isnull().sum() 

Continent                                   22
Country                                     13
Region                                      17
State                                        0
Name of VC                                   0
Year of Establishment                      239
Type of Investor                            90
Website                                     56
Postal Address                             158
Pincode                                    233
LinkedIn Details                           696
Phone Details                              355
Email Address                              640
Sectors dealt with                         181
Investment Stage                           229
Investment Amount Minimum                  342
Investment Amount Maximum                  342
Previously Invested Total Amount           683
Total number of companies funded so far    253
Latitude                                     0
Longitude                                    0
Code         

In [10]:
# data type of each column
df.dtypes

Continent                                   object
Country                                     object
Region                                      object
State                                       object
Name of VC                                  object
Year of Establishment                       object
Type of Investor                            object
Website                                     object
Postal Address                              object
Pincode                                     object
LinkedIn Details                            object
Phone Details                               object
Email Address                               object
Sectors dealt with                          object
Investment Stage                            object
Investment Amount Minimum                   object
Investment Amount Maximum                   object
Previously Invested Total Amount            object
Total number of companies funded so far    float64
Latitude                       

In [11]:
# the entry at index 778 is a duplicate of above entry at index 777
# also this entry contains garbage values in "Investment Amount Minimum" and some other entries.
# delete this entry in next step
print(df.loc[[778]])

         Continent Country   Region     State        Name of VC  \
778  north america     usa  midwest  illinois  chicago ventures   

    Year of Establishment Type of Investor Website          Postal Address  \
778                  1979  venture capital    http  //chicagoventures.com/   

                                               Pincode  ... Email Address  \
778  222 west merchandise mart plaza\nsuite 1212\nc...  ...           NaN   

    Sectors dealt with                                   Investment Stage  \
778                 \n  ict                                           ...   

         Investment Amount Minimum Investment Amount Maximum  \
778  early stage\nlate stage\ndebt                     0.75m   

    Previously Invested Total Amount Total number of companies funded so far  \
778                               1m                                     NaN   

    Latitude  Longitude       Code   
778      105  41.881832  -87.623177  

[1 rows x 22 columns]


In [12]:
# drop the above entry
df = df.drop([778,], axis=0)
# reindex the entries
df.index = range(len(df))
df.shape

(1948, 22)

In [13]:
# new entry at 778 index
print(df.loc[[778]])

         Continent                   Country   Region     State   Name of VC  \
778  north america  united states of america  midwest  illinois  cid capital   

    Year of Establishment Type of Investor                 Website  \
778                   NaN  venture capital  http://www.cidcap.com/   

                                        Postal Address Pincode  ...  \
778  10201 north illinois street\n suite 200\n indi...   46290  ...   

       Email Address                                 Sectors dealt with  \
778  adam@cidcap.com  manufacturing, \n power generation components,...   

                            Investment Stage Investment Amount Minimum  \
778  early stage venture, late stage venture                        1m   

    Investment Amount Maximum Previously Invested Total Amount  \
778                       75m                              NaN   

    Total number of companies funded so far   Latitude   Longitude Code   
778                                     9.0  41

In [14]:
# Clean data for Minimum, Maximum and Previously Invested Total Amount
# Remove Dollar Symbol 
# df['Investment Amount Minimum'] = df['Investment Amount Minimum'].str.replace('$','').str.replace(',','')
# Above steps not needed as it can be done using find and replace in excel for simplicity

# Replace Million or Billion figures with actual floating numbers
df['Investment Amount Minimum'] = df['Investment Amount Minimum'].apply(lambda x: convert_to_number(x))
df['Investment Amount Maximum'] = df['Investment Amount Maximum'].apply(lambda x: convert_to_number(x))
df['Previously Invested Total Amount'] = df['Previously Invested Total Amount'].apply(lambda x: convert_to_number(x))

785000.0
4400000.0
1700000.0
4400000.0
4000000.0
1500000.0
12000000.0
1000000.0
200000.0
500000.0
800000.0
322000.0
120000.0
500000.0
350000.0
500000.0
120000.0
200000.0
1500000.0
4000000.0
10000.0
5000000.0
1000000.0
2000000.0
750000.0
100000.0
2500000.0
6700000.0
100000.0
10000000.0
1000000.0
1000000.0
7500000.0
2000000.0
250000.0
350000.0
7000000.0
2800000.0
1200000.0
2700000.0
1500000.0
5900000.0
15000000.0
400000.0
700000.0
1000000.0
5000000.0
680000.0
100000.0
1500000.0
6000000.0
320000.0
3000000.0
10000.0
85000.0
500000.0
1000000.0
1500000.0
4300000.0
2000000.0
2000000.0
1000000.0
1000000.0
850000.0
1000000.0
2000000.0
17000000.0
600000.0
1000000.0
5000000.0
500000.0
3000000.0
3000000.0
3000000.0
1000000.0
1000000.0
1100000.0
750000.0
800000.0
9000000.0
500000.0
665000.0
3500000.0
350000000.0
100000.0
5000000.0
1000000.0
500000.0
5000000.0
5000000.0
3200000.0
450000.0
6000000.0
100000.0
100000.0
2300000.0
1000000.0
2000000.0
2000000.0
280000.0
250000.0
5000000.0
11500000.0
57000

250000000.0
14000000.0
80000000.0
38000000.0
90000000.0
49000000.0
160000000.0
30000000.0
45000000.0
3000000.0
50000000.0
50000000.0
200000.0
20000000.0
80000000.0
80000000.0
132000000.0
60000000.0
15500000.0
5000000.0
1000000000.0
60000000.0
75000000.0
33500000.0
25000000.0
20000000.0
175000000.0
50000000.0
30000000.0
435000000.0
200000000.0
70000000.0
15500000.0
700000000.0
100000000.0
100000000.0
44000000.0
30000000.0
175000000.0
107000000.0
25000000.0
100000000.0
65000000.0
220000000.0
55000000.0
63000000.0
63000000.0
45000000.0
45000000.0
9000000.0
111000000.0
10400000.0
108000000.0
115000000.0
200000000.0
20000000.0
65000000.0
150000000.0
30000000.0
200000000.0
165000000.0
8000000.0
1000000000.0
100000000.0
154000000.0
100000000.0
50000000.0
154000000.0
100000000.0
30000000.0
400000000.0
36000000.0
105000000.0
200000000.0
47000000.0
17000000.0
15000000.0
1000000000.0
27000000.0
1000000000.0
60000000.0
175000000.0
109000000.0
160000000.0
21500000.0
21000000.0
120000000.0
132000000

897400000.0
4600000000.0
33000000.0
8200000000.0
1900000000.0
1500000000.0
2600000000.0
969000000.0
7800000.0
493400000.0
900000.0
1570000000.0
5700000000.0
3610000000.0
74400000.0
1300000000.0
3000000000.0
3200000000.0
13600000.0
39700000000.0
50500000.0
44940000.0
470000000.0
11490000000.0
7600000000.0
300000000.0
356400000.0
252500000.0
2300000000.0
200000000.0
1000000000.0
74500000.0
256900000.0
256900000.0
66000000.0
150000000.0
136000000.0
48000000.0
35000000.0
267000000.0
213000000.0
6600000000.0
340200000.0
780000000.0
248500000.0
2000000000.0
329200000.0
240280000.0
274070000.0
178000000.0
284000000.0
1300000000.0
6400000000.0
75000000.0
42250000.0
1600000000.0
12000000000.0
175000000.0
1290000000.0
2300000000.0
885500000.0
885500000.0
1200000000.0
3000000000.0
433500000.0
2000000000.0
183300000.0
10000000.0
79600000.0
150000000.0
130000000.0
489500000.0
576000000.0
1600000000.0
896000000.0
1200000000.0
30000000.0
1900000000.0
224000000.0
136000000.0
1800000000.0
434000000.0
7

In [15]:
print(df['Investment Amount Minimum'].dtype)
print(df['Investment Amount Maximum'].dtype)
print(df['Previously Invested Total Amount'].dtype)
df.tail()

float64
float64
float64


Unnamed: 0,Continent,Country,Region,State,Name of VC,Year of Establishment,Type of Investor,Website,Postal Address,Pincode,...,Email Address,Sectors dealt with,Investment Stage,Investment Amount Minimum,Investment Amount Maximum,Previously Invested Total Amount,Total number of companies funded so far,Latitude,Longitude,Code
1943,north america,united states of america,west,washington,wrf capital,1981,venture capital,http://www.wrfcapital.com/,2815 eastlake avenue east\n suite 300\n seattl...,98102,...,,"life sciences, information technology and phys...","early stage venture, late stage venture",1200000.0,55000000.0,593440000.0,45.0,47.751076,-120.740135,wa
1944,north america,united states of america,south,west virginia,harbert management corporation,repeat,private equity firm,http://www.harbert.net/,"1210 east cary street\n suite 400\n richmond, ...",23219,...,,"real estate investment products\n , private eq...",early stage,33000000.0,40000000.0,73000000.0,2.0,38.349819,-81.632622,wv
1945,north america,united states of america,south,west virginia,mountaineer capital l.p.,,venture capital,http://www.mtncap.com/,"107 capitol street\n suite 300\n charleston, w...",25301,...,info@mountaineercapital.com,"information and communications technology\n , ...",early stage venture,750000.0,1000000.0,1750000.0,3.0,38.349819,-81.632622,wv
1946,north america,united states of america,south,west virginia,novitas capital,1997,venture capital,http://www.novitascapital.com/,"435 devon park drive\n suite 801\n wayne, pa 1...",19087,...,info@novitascapital.com,early stage technology\n life sciences,"early stage venture, late stage venture",1400000.0,45000000.0,235700000.0,39.0,38.349819,-81.632622,wv
1947,north america,usa,midwest,wisconsin,american family ventures,1995,venture capital,http://amfamventures.com/,"6000 american parkway\n madison, wi 53783\n un...",53783,...,,home automation \n vehicles \n data analytics.,early stage,100000.0,2000000.0,132000000.0,20.0,43.038902,-87.906471,wi


In [16]:
# remove exact duplicates
df.drop_duplicates()
# shape of dataframe remains same as there are no exact duplicates
df.shape

(1948, 22)

In [17]:
# All duplicate entries by "Name of VC"
df[df.duplicated(['Name of VC'], keep='first')]

Unnamed: 0,Continent,Country,Region,State,Name of VC,Year of Establishment,Type of Investor,Website,Postal Address,Pincode,...,Email Address,Sectors dealt with,Investment Stage,Investment Amount Minimum,Investment Amount Maximum,Previously Invested Total Amount,Total number of companies funded so far,Latitude,Longitude,Code
58,north america,united states of america,west,california,andreessen horowitz,2009,private,http://a16z.com/,"menlo park, california",,...,businessplans@a16z.com,"enterprise software, \nmobile","seed , startups, early stage, growth stage",85000.0,9.500000e+08,1.960000e+10,595.0,36.778259,-119.417931,ca
106,north america,united states of america,west,california,bluerun ventures,1998,venture capital,http://www.brv.com/,"545 middlefield road\n suite 250\n menlo park,...",94025,...,ventures@brv.com,"internet and media, \n enterprise software, \n...","early stage venture, late stage venture",,,7.750000e+08,151.0,36.778259,-119.417931,ca
124,north america,usa,west,california,canaan partners,1987,private firm,https://canaan.com,2765 sand hill road menlo park ca 94025,94025,...,ihahn@canaan.com,"biopharma\n,fintech,\nhealthcare",early stage,50000.0,8.000000e+07,2.800000e+09,510.0,36.778259,-119.417931,ca
125,north america,united states of america,west,california,canaan partners,1987,private,http://canaan.com/,"san francisco\n27 south park, ste. 201\nsan fr...",94107,...,lhahn@canaan.com,"enterprise applications, \nfintech,\nhealth ca...","later/mature stage, early stage, startups, seed",179000.0,1.320000e+08,8.900000e+09,515.0,36.778259,-119.417931,ca
203,north america,united states of america,west,california,el dorado ventures,1986,private firm,http://www.eldorado.com/,"702 oak grove avenue\n menlo park, ca 94025\n ...",94025,...,info@eldorado.com,cloud computing,"early stage venture, late stage venture",,,2.000000e+08,118.0,36.778259,-119.417931,ca
222,north america,united states of america,west,california,focus ventures,1997,private firm,http://www.focusventures.com/,"525 university avenue,2nd floor\n suite 225\n ...",94301,...,info@charterventures.com,internet,"early stage venture, late stage venture, priva...",3000000.0,8.000000e+06,2.500000e+08,131.0,36.778259,-119.417931,ca
229,north america,united states of america,west,california,forward ventures,1993,private firm,http://www.forwardventures.com/,la jolla commons\n 4747 executive drive\n suit...,92121,...,info@forwardventures.com,"healthcare, \n pharmaceuticals, \n medical dev...","early stage venture, late stage venture",1000000.0,1.500000e+07,,52.0,36.778259,-119.417931,ca
233,north america,united states of america,west,california,foundation capital,1995,private firm,http://www.foundationcapital.com/,"250 middlefield road\n menlo park, ca 94025\n ...",94025,...,info@foundationcapital.com,information technology sectors,"early stage venture, late stage venture, priva...",,,3.100000e+09,467.0,36.778259,-119.417931,ca
242,north america,united states of america,west,california,ftventures,1998,,http://www.ftventures.com/,555 california street\n suite 2850\n san franc...,94104,...,,"software, \n business services companies.","early stage venture, late stage venture",,,5.120000e+08,34.0,36.778259,-119.417931,ca
246,north america,united states of america,west,california,gabriel venture partners,1999,private firm,http://www.gabrielvp.com/,"999 baker way\n suite 400\n san mateo, ca 9440...",94404,...,info@gabrielvp.com,information technology,"early stage venture, late stage venture",3000000.0,6.000000e+06,,59.0,36.778259,-119.417931,ca


In [18]:
# remove duplicates by "Name of VC" and keep those which have highest "Previously Invested Total Amount"
df.sort_values('Previously Invested Total Amount', ascending=False).drop_duplicates('Name of VC').sort_index().reset_index(drop=True)
df

Unnamed: 0,Continent,Country,Region,State,Name of VC,Year of Establishment,Type of Investor,Website,Postal Address,Pincode,...,Email Address,Sectors dealt with,Investment Stage,Investment Amount Minimum,Investment Amount Maximum,Previously Invested Total Amount,Total number of companies funded so far,Latitude,Longitude,Code
0,north america,usa,south-eastern,alabama,accomplice,2015,private equity firm,www.accomplice.co,"25 first street, suite 303 cambridge, ma 02141...",2141,...,hello@accomplice.co,"cybersecurity,\n esports\n, data analytics, \n...",startups,785000.0,1.500000e+08,4.050000e+08,180.0,32.31823,-86.902298,al
1,north america,usa,south,alabama,bonaventure capital,1998,venture capital,http://www.bonaventurecapital.net/,820 shades creek parkway\n \n suite 1200\n \n ...,35209,...,info@bonaventurecapital.net,"internet services,\n internet advertising, \n...",early stage,4400000.0,3.300000e+07,8.300000e+07,8.0,32.31823,-86.902298,al
2,north america,united states of america,south,argentina,kaszek ventures,2011,venture capital,http://www.kaszek.com/,,,...,\ninfo@kaszek.com,,seed \nearly stage venture\nlate stage venture,1700000.0,5.000000e+07,1.652000e+08,68.0,-34.603722,-58.381592,ar
3,north america,usa,west,arizona,beechtree capital,1994,private,http://www.beechtreecapital.com/,34522 north scottsddale road suite arizona 85266,85266,...,,"entertainment \n,sports,\necological,\nports,\...",early stage,4400000.0,4.160000e+07,8.350000e+07,4.0,34.048927,-111.093731,az
4,north america,usa,west,arizona,"diamond state ventures, l.p.",1999,venture capital,http://new.diamondstateventures.com/,diamond state ventures\n suite 400\n little ro...,72201,...,jhays@dsvlp.com,"manufacturing, \n business services,\n media,...",early stage,4000000.0,1.500000e+07,2.600000e+07,4.0,34.048928,-111.093732,az
5,north america,usa,west,arizona,pacific partners,,private equity firm,http://www.pacificpartnerslp.com/,7150 east camelback road\n suite 444\n scottsd...,85251,...,,financial services,early stage,1500000.0,1.010000e+07,1.900000e+07,3.0,‎34.048929,‎-111.093733,az
6,north america,united states of america,west,arizona,sustainable conversion ventures,2014,venture capital,http://www.scvco.com/,"9532 east riggs road\n\nsun lakes, az",85248,...,jason@scvco.com,,,12000000.0,1.200000e+07,1.200000e+07,1.0,34.04893,-111.093734,az
7,north america,usa,west,arizona,tallwave capital,2009,private equity firm,http://tallwavecapital.com/,6263 north scottsdale road\n suite 180\n scott...,85250,...,info@tallwavecapital.com,saas \n e-commerce\n b2b software \n software ...,early stage,1000000.0,1.000000e+07,5.300000e+07,21.0,34.048931,-111.093735,az
8,north america,usa,southern,arkansas,flatiron investors,2015,individual,www.flatironinvestors.com/,"43-45 east 19th street, floor 8 new york, ny 1...",10003,...,hello@flatironinvestors.com,various,early stage,200000.0,7.000000e+06,2.350000e+08,10.0,34.746483,-92.289597,ar
9,north america,united states of america,northeast,boston,bolt,2013,venture capital,http://bolt.io/,,,...,\ninfo@bolt.io,finance\nfinancial services\nventure capital,early stage venture\nseed,500000.0,3.500000e+06,4.000000e+06,58.0,42.361145,-71.057083,ma


In [19]:
# reindex the entries
df.index = range(len(df))

df.shape

(1948, 22)

In [20]:
df.head()

Unnamed: 0,Continent,Country,Region,State,Name of VC,Year of Establishment,Type of Investor,Website,Postal Address,Pincode,...,Email Address,Sectors dealt with,Investment Stage,Investment Amount Minimum,Investment Amount Maximum,Previously Invested Total Amount,Total number of companies funded so far,Latitude,Longitude,Code
0,north america,usa,south-eastern,alabama,accomplice,2015,private equity firm,www.accomplice.co,"25 first street, suite 303 cambridge, ma 02141...",2141.0,...,hello@accomplice.co,"cybersecurity,\n esports\n, data analytics, \n...",startups,785000.0,150000000.0,405000000.0,180.0,32.31823,-86.902298,al
1,north america,usa,south,alabama,bonaventure capital,1998,venture capital,http://www.bonaventurecapital.net/,820 shades creek parkway\n \n suite 1200\n \n ...,35209.0,...,info@bonaventurecapital.net,"internet services,\n internet advertising, \n...",early stage,4400000.0,33000000.0,83000000.0,8.0,32.31823,-86.902298,al
2,north america,united states of america,south,argentina,kaszek ventures,2011,venture capital,http://www.kaszek.com/,,,...,\ninfo@kaszek.com,,seed \nearly stage venture\nlate stage venture,1700000.0,50000000.0,165200000.0,68.0,-34.603722,-58.381592,ar
3,north america,usa,west,arizona,beechtree capital,1994,private,http://www.beechtreecapital.com/,34522 north scottsddale road suite arizona 85266,85266.0,...,,"entertainment \n,sports,\necological,\nports,\...",early stage,4400000.0,41600000.0,83500000.0,4.0,34.048927,-111.093731,az
4,north america,usa,west,arizona,"diamond state ventures, l.p.",1999,venture capital,http://new.diamondstateventures.com/,diamond state ventures\n suite 400\n little ro...,72201.0,...,jhays@dsvlp.com,"manufacturing, \n business services,\n media,...",early stage,4000000.0,15000000.0,26000000.0,4.0,34.048928,-111.093732,az


In [21]:
df1 = df.fillna("NaN")
df1.to_csv('cleaned_vc_data.csv', index=False)

# removed some further duplicates which were somehow left out (around 139) from this file using excel