### Intro: find missing data

In [12]:
import pandas as pd
import numpy as np
import math

In [13]:
# Here's building a function using def: 

def square_root(x): return math.sqrt(x)

In [14]:
# Here's building the same function using lambda
x = lambda x: math.sqrt(x)

In [15]:
# Use the isnull() method to detect the missing values. The output
# shows True when the value is missing. By adding an index into 
# the dataset, you obtain just the entries that are missing.
# A dataset could represent missing data in several ways. In this 
# example, you see missing data represented as np.NaN (NumPy Not 
# a Number) and the Python None value.

### Intro: fill in missing data

In [16]:
# To fill in missing data use fillna(). For fillna() you need to 
# provide a number. Usually, the mean, median, or mode is used. 
# Let's use the same data set and this time let's fill in missing 
# values with the mean. 

In [17]:
# We could also just drop all the NAs, by using dropna()

### Write the equivalent lambda function

In [18]:
# Write the equivalent lambda function for the following def 
# function: 

def f (x): return(x**2)
print(f(8))

64


In [19]:
# equivalent lambda function

g = lambda x = 8: x ** 2
g(8)

64

### Lab 3.4

In [20]:
# read in the raw data from the github url

data = pd.read_csv("https://raw.githubusercontent.com/suneel0101/lesson-plan/master/crunchbase_monthly_export.csv")

In [21]:
# find missing values
missing_data = data[data.isnull()]
missing_data

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at,Unnamed: 18
0,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,,,,,,
8,,,,,,,,,,,,,,,,,,,
9,,,,,,,,,,,,,,,,,,,


In [22]:
# fill in missing data with the mean
filled_data = data[data.notnull()]

In [23]:
filled_data.head()

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at,Unnamed: 18
0,/organization/canal-do-credito,Canal do Credito,http://www.canaldocredito.com.br,|Credit|Technology|Services|Finance|,Credit,750000,,BRA,,Rio de Janeiro,Belo Horizonte,1,,,,,1/1/10,1/1/10,
1,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,Entertainment,1750000,acquired,USA,NY,New York City,New York,1,6/1/12,2012-06,2012-Q2,2012.0,6/30/12,6/30/12,
2,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000,operating,USA,CA,Los Angeles,Los Angeles,2,,,,,6/4/10,9/23/10,
3,/organization/rock-your-paper,'Rock' Your Paper,http://www.rockyourpaper.org,|Publishing|Education|,Education,40000,operating,EST,,Tallinn,Tallinn,1,10/26/12,2012-10,2012-Q4,2012.0,8/9/12,8/9/12,
4,/organization/in-touch-network,(In)Touch Network,http://www.InTouchNetwork.com,|Electronics|Guides|Coffee|Restaurants|Music|i...,Apps,1500000,operating,GBR,,London,London,1,4/1/11,2011-04,2011-Q2,2011.0,4/1/11,4/1/11,


In [24]:
# Set status to categorical
statuses = pd.DataFrame(data[data['status'].notnull()]['status'], dtype = 'category')

# Find mode status
print(pd.get_dummies(statuses, prefix = "", prefix_sep = "").mode())
statuses['status'].value_counts()

   acquired  closed  operating
0       0.0     0.0        1.0


operating    36135
acquired      3194
closed        2879
Name: status, dtype: int64

In [25]:
# Therefore we can set any empty statuses to 'operating' without losing much accuracy
data['status'] = data['status'].fillna('operating')

print(data['status'].head())

0    operating
1     acquired
2    operating
3    operating
4    operating
Name: status, dtype: object


In [26]:
#find matching regions to fill in state
regions_dataframe = data[['region','state_code']].dropna(axis = 0)#.drop_duplicates()
regions_dataframe.head()

Unnamed: 0,region,state_code
1,New York City,NY
2,Los Angeles,CA
6,Ft. Lauderdale,FL
9,"Springfield, Illinois",IL
10,SF Bay Area,CA


In [27]:
empty_states = data[data['state_code'].isnull()]

Unnamed: 0,region,state_code_x,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code_y,city,funding_rounds,founded_at,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at,Unnamed: 18
0,New York City,NY,/organization/black-rhino-group,Black Rhino Group,http://www.blackrhinogroup.com/,,,-,operating,,,New York,1.0,,,,,7/30/14,7/30/14,
1,New York City,NY,/organization/commonbond,CommonBond,http://commonbond.co,|Finance|,Finance,103500000,operating,,,New York,2.0,11/1/11,2011-11,2011-Q4,2011.0,11/29/12,9/4/13,
2,New York City,NY,/organization/prospect-accelerator,Prospect Accelerator,http://www.prospectaccelerator.com,|Advertising|,Advertising,-,operating,,,Manhattan,1.0,10/1/12,2012-10,2012-Q4,2012.0,10/1/12,10/1/12,
3,Los Angeles,CA,/organization/geocities,GeoCities,,|Web Hosting|,Web Hosting,40000000,closed,,,Marina Del Rey,4.0,1/1/96,1996-01,1996-Q1,1996.0,1/1/95,1/1/98,
4,Los Angeles,CA,/organization/orbitera-inc,"Orbitera, Inc.",http://www.orbitera.com,|Cloud Computing|Marketing Automation|SaaS|,Cloud Computing,-,operating,,,West Hollywood,1.0,5/7/11,2011-05,2011-Q2,2011.0,5/7/11,5/7/11,
5,Los Angeles,CA,/organization/peggd,Pegg'd,http://peggd.com/,|Events|Apps|,Apps,65000,operating,,,Los Angeles,1.0,11/27/12,2012-11,2012-Q4,2012.0,5/15/14,5/15/14,
6,Ft. Lauderdale,FL,,,,,,,,,,,,,,,,,,
7,"Springfield, Illinois",IL,,,,,,,,,,,,,,,,,,
8,SF Bay Area,CA,/organization/kloudless,Kloudless,https://kloudless.com,|Developer APIs|Cloud Data Services|Software|,Cloud Data Services,1300000,operating,,,Berkeley,1.0,12/1/11,2011-12,2011-Q4,2011.0,1/1/13,1/1/13,
9,SF Bay Area,CA,/organization/krowdpad,KrowdPad,http://www.krowdpad.com,,,-,operating,,,Redwood City,1.0,2/1/13,2013-02,2013-Q1,2013.0,8/1/14,8/1/14,
