## Data Cleaning

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import json
import datetime
import time

In [2]:
# Read the organizations JSON dump into `orgs`; the next cell treats it as a
# dict (it uses .keys() / .values()).
orgs_path = "tmp/organizations_dict.json"
with open(orgs_path) as orgs_file:
    orgs = json.load(orgs_file)

In [3]:
# One row per organization, indexed by its key in `orgs`.
org_keys = list(orgs.keys())
org_records = [orgs[key] for key in org_keys]
df = pd.DataFrame(org_records, index=org_keys)

The helper below is copied from HW5; it extracts the nouns and adjectives from each company's `short_description`.

In [4]:
# scikit-learn's built-in English stop word collection; get_parts uses it to
# drop common words.
from sklearn.feature_extraction import text 
stopwords=text.ENGLISH_STOP_WORDS
import re
# regex1 matches a run of two or more periods, regex2 a run of two or more
# hyphens; get_parts replaces each such run with a single space before parsing.
regex1=re.compile(r"\.{2,}")
regex2=re.compile(r"\-{2,}")
# `parse` is the tagger/lemmatizer called by get_parts.
from pattern.en import parse
# NOTE(review): pprint, stem, PORTER and LEMMA are not referenced in the cells
# visible here -- confirm whether later cells need them.
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
# Characters treated as punctuation when filtering tokens in get_parts.
# The literal is double-quoted on purpose: inside a single-quoted string the
# `''` pair ends one literal and starts another, so implicit concatenation
# silently dropped the apostrophe from the list.
punctuation = list(".,;:!?()[]{}`''\"@#$^&*+-|=~_")
# taken from HW5
def get_parts(thetext):
    """Extract lemmatized nouns and adjectives from a text, sentence by sentence.

    Runs of periods and runs of hyphens are replaced with a space, the text is
    parsed with pattern's `parse`, and for every sentence the lemmas tagged as
    nouns (NN, NNS) or adjectives (JJ, JJR, JJS) are collected. A lemma is
    skipped when it is empty, a stop word, a single character, or starts/ends
    with a punctuation character.

    Returns a pair (nouns, adjectives) of parallel lists with one inner list
    per sentence. Only sentences that yielded at least one noun AND at least
    one adjective are kept.
    """
    adjective_tags = ['JJ', 'JJR', 'JJS']
    noun_tags = ['NN', 'NNS']

    cleaned = regex2.sub(' ', regex1.sub(' ', thetext))

    nouns2 = []
    descriptives2 = []
    for sentence in parse(cleaned, tokenize=True, lemmata=True).split():
        sentence_nouns = []
        sentence_adjectives = []
        for token in sentence:
            # token[1] is read as the POS tag and token[4] as the lemma.
            tag, lemma = token[1], token[4]
            if len(lemma) == 0:
                continue
            if tag in adjective_tags:
                bucket = sentence_adjectives
            elif tag in noun_tags:
                bucket = sentence_nouns
            else:
                continue
            rejected = (lemma in stopwords
                        or lemma[0] in punctuation
                        or lemma[-1] in punctuation
                        or len(lemma) == 1)
            if not rejected:
                bucket.append(lemma)
        # Keep the sentence only when both kinds of words were found.
        if sentence_nouns and sentence_adjectives:
            nouns2.append(sentence_nouns)
            descriptives2.append(sentence_adjectives)
    return nouns2, descriptives2


In [5]:
# add column for description nouns and adjectives
def _description_parts(description):
    """Return get_parts(description), or None when there is no usable text.

    None and float values are both treated as missing, matching the NaN guards
    used by the other cells in this notebook.
    """
    if description is None or isinstance(description, float):
        return None
    return get_parts(description)

# Parse every description exactly once (the parse is the expensive step), then
# split the (nouns, adjectives) pair into the two columns.
_parsed_descriptions = [_description_parts(d) for d in df.short_description]
df['text_nouns']=pd.Series([None if p is None else p[0] for p in _parsed_descriptions], index=df.index)
df['text_adjectives']=pd.Series([None if p is None else p[1] for p in _parsed_descriptions], index=df.index)

In [6]:
# add indicator columns for each category
# Per-company list of category names; None when the field is None or a float (NaN).
catlist=df.categories.map(lambda x: None if x is None or isinstance(x,float) else [a['name'] for a in x]).tolist()
catset=set(item for sublist in catlist if sublist is not None for item in sublist)
# Build the aligned Series once instead of once per category column.
cat_series=pd.Series(catlist, index=df.index)
for category in catset:
    # isinstance guard: None (or a NaN placeholder) yields False instead of a membership error.
    df["c_"+category+"_i"]=cat_series.map(lambda x: isinstance(x,list) and category in x)

In [7]:
# add indicator columns for investors
# Per-company list of investor permalinks; None when the field is None or a float (NaN).
investorslist=df.investors.map(lambda x: None if x is None or isinstance(x,float) else [a['permalink'] for a in x]).tolist()
investorset=set(item for sublist in investorslist if sublist is not None for item in sublist)
# Build the aligned Series once instead of once per investor column.
investor_series=pd.Series(investorslist, index=df.index)
for inv in investorset:
    # isinstance guard: None (or a NaN placeholder) yields False instead of a membership error.
    df["i_"+inv+"_i"]=investor_series.map(lambda x: isinstance(x,list) and inv in x)

In [8]:
# add indicator columns for cities (from offices)
# Per-company list of office cities; None when the field is empty or a float (NaN).
citylist=df.offices.map(lambda x: None if not x or isinstance(x,float) else [a['city'] for a in x]).tolist()
cityset=set(item for sublist in citylist if sublist is not None for item in sublist)
# Build the aligned Series once instead of once per city column.
city_series=pd.Series(citylist, index=df.index)
for city in cityset:
    # An office may carry city=None; there is no column to build for it.
    if city is not None:
        df["city_"+city+"_i"]=city_series.map(lambda x: isinstance(x,list) and city in x)

After adding indicator columns, we now get some more data from the fields that currently exist as dictionaries/lists

In [9]:
def _announcement_dates(rounds):
    """Return each round's 'announced_on' (YYYY-MM-DD) as a datetime at midnight.

    An empty value or a float yields an empty list.
    """
    if not rounds or isinstance(rounds, float):
        return []
    return [datetime.datetime.strptime(r['announced_on'], '%Y-%m-%d') for r in rounds]

df['funding_dates']=df.funding_rounds.map(_announcement_dates)

In [10]:
def _usd_raised(funding_round):
    """USD raised in one round as a float; a None amount is recorded as 0."""
    raised = funding_round['money_raised_usd']
    return 0 if raised is None else float(raised)

# An empty value or a float in `funding_rounds` yields an empty list.
df['funding_amounts']=df.funding_rounds.map(lambda rounds: [_usd_raised(r) for r in rounds] if rounds and not isinstance(rounds,float) else [])

In [11]:
# Number of funding rounds = length of each company's funding_amounts list.
df['num_funding_rounds']=df.funding_amounts.map(len)

In [12]:
# Mean amount raised per round. Companies with no rounds get NaN explicitly:
# np.mean([]) also returns NaN but emits a "Mean of empty slice" RuntimeWarning.
df['funding_mean']=df.funding_amounts.map(lambda x: np.mean(x) if len(x) else np.nan)



In [13]:
# True only when `ipo` holds a non-empty value. A float NaN is truthy in
# Python, so floats are excluded explicitly -- the same NaN guard the other
# cells in this notebook apply -- otherwise such rows would count as IPOs.
df['went_ipo']=df.ipo.map(lambda x: bool(x) and not isinstance(x,float))

In [14]:
# True only when `acquired_by` holds a non-empty value. A float NaN is truthy
# in Python, so floats are excluded explicitly -- the same NaN guard the other
# cells in this notebook apply -- otherwise such rows would count as acquired.
df['was_acquired']=df.acquired_by.map(lambda x: bool(x) and not isinstance(x,float))

In [15]:
def _founder_permalinks(founders):
    """Return the 'permalink' of each founder; [] for an empty value or a float."""
    if not founders or isinstance(founders, float):
        return []
    return [founder['permalink'] for founder in founders]

df['founder_names']=df.founders.map(_founder_permalinks)

In [16]:
def _age_in_years(born_on, today):
    """Completed years between the 'YYYY-MM-DD' string `born_on` and the date `today`.

    Uses calendar arithmetic rather than days/365, which ignores leap days and
    therefore rolls the age over several days before the actual birthday.
    """
    born = datetime.datetime.strptime(born_on, '%Y-%m-%d').date()
    # The tuple comparison is True (== 1) when this year's birthday is still ahead.
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

# Evaluate "today" once so every founder is aged against the same date.
_today = datetime.date.today()
# Founders whose 'born_on' is not a unicode string are skipped; an empty value
# or a float in `founders` yields an empty list.
df['founder_ages']= df.founders.map(lambda x:[] if not x or isinstance(x,float) else [_age_in_years(a['born_on'], _today) for a in x if isinstance(a['born_on'],unicode)])

In [18]:
# Preview the first rows to sanity-check the raw fields and the derived columns.
df.head()

Unnamed: 0,acquired_by,acquisitions,board_members_and_advisors,categories,competitors,founders,funding_rounds,headquarters,investments,investors,ipo,offices,owned_by,short_description,text_nouns,text_adjectives,c_Marketplaces_i,c_Crowdsourcing_i,c_Anything Capital Intensive_i,c_B2B_i,c_Corporate IT_i,c_Health and Insurance_i,c_Crowdfunding_i,c_Corporate Training_i,c_Incentives_i,c_Communications Infrastructure_i,c_Tourism_i,c_Cloud Data Services_i,c_Training_i,c_Automated Kiosk_i,c_Social Media Marketing_i,c_Restaurants_i,c_Portals_i,c_Clinical Trials_i,c_Testing_i,c_Water Purification_i,c_Health Services Industry_i,c_SNS_i,c_Non Profit_i,c_Telecommunications_i,c_Ticketing_i,c_Language Learning_i,c_Video Streaming_i,c_Mobile Payments_i,c_3D_i,c_Automotive_i,c_Collaboration_i,c_Interior Design_i,c_M2M_i,c_Game_i,...,city_Bucharest_i,city_Beijing_i,city_Fuzhou Shi_i,city_Caparica_i,city_Westmount_i,city_Preston_i,city_Pleasanton_i,city_Runcorn_i,city_Kanata_i,city_Gaithersburg_i,city_Beverly Hills_i,city_Guayabo_i,city_Upper Heyford_i,city_Vancouver_i,city_Chandigarh_i,city_Reading_i,city_Geneva_i,city_Cherry Hill_i,city_Princeton_i,city_Elkridge_i,city_Plymouth Meeting_i,city_Malvern_i,city_Mount Vernon_i,city_Miass_i,city_Stanford_i,city_Wilmington_i,city_Arnold_i,city_Melbourne_i,city_Plymouth_i,city_Longwood_i,city_Palo Alto_i,city_Almaty_i,city_Baltimore_i,city_Brentwood_i,city_Abingdon_i,city_New Haven_i,city_New Berlin_i,city_Amstelveen_i,city_Easley_i,city_Tourcoing_i,city_Oakland_i,city_Bra_i,funding_dates,funding_amounts,num_funding_rounds,funding_mean,went_ipo,was_acquired,founder_names,founder_ages
24me,[],[],[],"[{u'name': u'Artificial Intelligence'}, {u'nam...",[],[{u'bio': u'Gilad is the Co-Founder and CEO of...,"[{u'announced_on': u'2014-05-02', u'funding_ty...",[],[],[{u'permalink': u'iangels'}],[],[],[],24me is a tech company building the next gener...,"[[tech, company, generation, assistant]]","[[digital, personal]]",True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,[2014-05-02 00:00:00],[0],1,0,False,False,"[gilad-hertanu, liat-mordechay-hertanu]","[39, 37]"
achieve-financial-services,[],[],[],"[{u'name': u'Finance'}, {u'name': u'P2P Money ...",[],[],"[{u'announced_on': u'2012-03-20', u'funding_ty...","[{u'city': u'Austin', u'country': u'United Sta...",[],[{u'permalink': u'escalate-capital-partners'}],[],"[{u'city': u'Austin', u'country': u'United Sta...",[],Achieve Financial Services is a marketer of ge...,"[[debit, card, service]]","[[general-purpose, reloadable, prepaid, relate...",False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,[2012-03-20 00:00:00],[12500000.0],1,12500000,False,False,[],[]
4tiitoo,[],[],[],"[{u'name': u'Augmented Reality'}, {u'name': u'...",[],[{u'bio': u'Tore Meyer is Founder & CEO at 4ti...,"[{u'announced_on': u'2014-08-18', u'funding_ty...","[{u'city': u'Munich', u'country': u'Germany'},...",[],[{u'permalink': u'intel-capital'}],[],"[{u'city': u'Munich', u'country': u'Germany'}]",[],Software platform that allows people to intera...,"[[platform, person, device, application, voice...",[[everyday]],False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"[2014-08-18 00:00:00, 2013-12-10 00:00:00, 201...","[0, 0, 0]",3,0,False,False,"[tore-meyer, stephan-odoerfer]",[]
128-technology,[],[],[],"[{u'name': u'Technology'}, {u'name': u'Service...",[],[],"[{u'announced_on': u'2014-12-16', u'funding_ty...","[{u'city': u'Burlington', u'country': u'United...",[],[],[],"[{u'city': u'Burlington', u'country': u'United...",[],128 Technology,[],[],False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,[2014-12-16 00:00:00],[11999347.0],1,11999347,False,False,[],[]
-the-one-of-them-inc-,[],[],[],"[{u'name': u'Apps'}, {u'name': u'Mobile'}, {u'...",[],[],"[{u'announced_on': u'2014-01-30', u'funding_ty...",[],[],[],[],[],[],smartphone application business,[],[],False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,[2014-01-30 00:00:00],[3406878.0],1,3406878,False,False,[],[]


# EDA

First let's take a look at how many companies match each of our different success metrics.

In [51]:
# Count how many companies in the sample satisfy each candidate success metric.
success_metrics = [
    ("Number of companies that were acquired: %s", df.was_acquired == True),
    ("Number of companies with at least 1 funding round: %s", df.num_funding_rounds > 0),
    ("Number of companies that IPOed: %s", df.went_ipo == True),
]
for message, mask in success_metrics:
    # Parenthesised single-argument print keeps the output identical.
    print(message % len(df[mask]))

Number of companies that were acquired: 91
Number of companies with at least 1 funding round: 976
Number of companies that IPOed: 46


Clearly, being acquired and going public (IPO) are good indicators of success, but having at least one funding round appears to be a prerequisite for making it into the CrunchBase system. Perhaps we can be more discerning about which companies that raised funds are considered successful, for example by requiring a specific amount raised or a minimum number of rounds. Let's take a closer look at funding rounds.