# Project: Kickstarter Project

## Exploring Kickstarter Data and Identifying Key Factors for Project Success

In [1]:
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from patsy import dmatrices
import statsmodels.api as sm;
from datetime import *
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline

In [2]:
# load the raw Kickstarter export; df_og is kept as loaded and wrangling is done on a copy
df_og = pd.read_csv(filepath_or_buffer='ks-projects-201801.csv')

## Introduction:

## Data Assessment:

In [3]:
# visual assessment: preview the first rows of the raw data
df_og.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


* The `ID` column will not be needed for the analysis
* The USD conversion is already done for us, so `currency` is not needed for the amounts either (the project's `country` is still recorded)
* Derive a campaign-length feature, `n_days`, as `deadline - launched`
* Drop `goal` as well, since `usd_goal_real` already holds the goal in USD
* `usd pledged` and `usd_pledged_real` disagree (e.g. the second row, index 1: 100.0 vs 2421.0) - we will use `usd_pledged_real` (converted via the fixer.io API) rather than Kickstarter's own conversion, which contains errors

In [5]:
# row count for each value of the (sub-)category column
df_og['category'].value_counts()

Product Design       22314
Documentary          16139
Music                15727
Tabletop Games       14180
Shorts               12357
Video Games          11830
Food                 11493
Film & Video         10108
Fiction               9169
Fashion               8554
Nonfiction            8318
Art                   8253
Apparel               7166
Theater               7057
Technology            6930
Rock                  6758
Children's Books      6756
Apps                  6345
Publishing            6018
Webseries             5762
Photography           5752
Indie Rock            5657
Narrative Film        5188
Web                   5153
Comics                4996
Crafts                4664
Country & Folk        4451
Design                4199
Hip-Hop               3912
Hardware              3663
                     ...  
Couture                275
Blues                  268
Animals                255
Fabrication Tools      250
Makerspaces            238
Printing               238
M

In [6]:
# row count for each top-level category
df_og['main_category'].value_counts()

Film & Video    63585
Music           51918
Publishing      39874
Games           35231
Technology      32569
Design          30070
Art             28153
Food            24602
Fashion         22816
Theater         10913
Comics          10819
Photography     10779
Crafts           8809
Journalism       4755
Dance            3768
Name: main_category, dtype: int64

* The naming of `main_category` and `category` is confusing. Rename `category` to `sub_cat` (and `main_category` to `main_cat`) so the hierarchy is explicit

In [7]:
# row count for each project state
df_og['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [8]:
# row count for each currency code
df_og['currency'].value_counts()

USD    295365
GBP     34132
EUR     17405
CAD     14962
AUD      7950
SEK      1788
MXN      1752
NZD      1475
DKK      1129
CHF       768
NOK       722
HKD       618
SGD       555
JPY        40
Name: currency, dtype: int64

In [21]:
# row count for each country code; also surfaces any malformed codes
df_og['country'].value_counts()

US      292627
GB       33672
CA       14756
AU        7839
DE        4171
N,0"      3797
FR        2939
IT        2878
NL        2868
ES        2276
SE        1757
MX        1752
NZ        1447
DK        1113
IE         811
CH         761
NO         708
HK         618
BE         617
AT         597
SG         555
LU          62
JP          40
Name: country, dtype: int64

In [50]:
# the malformed country code that showed up in the country value counts
string = 'N,0"'
# keep only the rows carrying that code (boolean mask; same rows as the query form)
df_no = df_og[df_og['country'] == string]

In [52]:
# inspect the last rows that carry the malformed country value
df_no.tail()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
378233,997971307,"EQUUS International Film Festival®, all-equine...",Film & Video,Film & Video,USD,2015-02-03,7500.0,2014-12-05 04:19:14,10.0,undefined,0,"N,0""",,10.0,7500.0
378303,998319149,Emily 2050 - Short Film,Film & Video,Film & Video,CAD,2014-05-23,3000.0,2014-04-08 00:30:09,3102.0,undefined,0,"N,0""",,2845.61,2752.04
378434,9988744,Matthew Stephens Music,Music,Music,USD,2016-02-05,5000.0,2016-01-06 21:59:23,235.0,undefined,0,"N,0""",,235.0,5000.0
378585,999610349,Lady Vendredi: Afrofuturist concept 12 inch EP,Music,Music,GBP,2015-10-19,2000.0,2015-09-21 22:33:18,2125.0,undefined,0,"N,0""",,3273.36,3080.81
378588,999621278,Jeune auteur a besoin de vous!,Publishing,Publishing,EUR,2015-11-20,1600.0,2015-10-21 21:28:13,0.0,undefined,0,"N,0""",,0.0,1710.08


* Fix the malformed country value: `N,0"`
  * After taking a closer look at the rows whose country is listed as `N,0"`, and at each of those projects on kickstarter.com, I found no single country that all of these projects belong to, so I assume the value is how a missing country (NA) was recorded. Not all hope is lost, though: the currency is a strong indicator of which country a project belongs to (with the caveat that EUR is shared by several countries in this dataset, so EUR projects cannot be pinned to one country from currency alone). Therefore, we need to keep the currency column to fill in the `N,0"` values and drop it later on

In [3]:
# column dtypes and non-null counts, to spot missing values and columns stored with the wrong type
df_og.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID                  378661 non-null int64
name                378657 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null object
goal                378661 non-null float64
launched            378661 non-null object
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [4]:
# check how deadline values are stored, using the first row as a sample
type(df_og['deadline'].iloc[0])

str

In [5]:
# check how launched values are stored, using the first row as a sample
type(df_og['launched'].iloc[0])

str

* Convert both `deadline` and `launched` from strings to datetime objects

## Wrangling

In [54]:
# wrangle on an independent (deep) copy so df_og stays exactly as loaded
df = df_og.copy(deep=True)

In [55]:
# preview the working copy before any wrangling
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [56]:
# Keep only the columns used for the analysis. 'currency' stays for now
# because it is needed to fill in the malformed country values; it is
# dropped later.
# The selection is taken from df_og rather than from df itself so this cell
# can be re-run after the columns have been renamed (selecting the original
# names from an already-renamed df raises KeyError). .copy() gives df its own
# data, so later column assignments do not raise SettingWithCopyWarning.
df = df_og[[
    'name',
    'main_category',
    'category',
    'currency',
    'country',
    'launched',
    'deadline',
    'state',
    'backers',
    'usd_pledged_real',
    'usd_goal_real',
]].copy()

In [57]:
# preview the frame after the column selection
df.head()

Unnamed: 0,name,main_category,category,currency,country,launched,deadline,state,backers,usd_pledged_real,usd_goal_real
0,The Songs of Adelaide & Abullah,Publishing,Poetry,GBP,GB,2015-08-11 12:12:28,2015-10-09,failed,0,0.0,1533.95
1,Greeting From Earth: ZGAC Arts Capsule For ET,Film & Video,Narrative Film,USD,US,2017-09-02 04:43:57,2017-11-01,failed,15,2421.0,30000.0
2,Where is Hank?,Film & Video,Narrative Film,USD,US,2013-01-12 00:20:50,2013-02-26,failed,3,220.0,45000.0
3,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,US,2012-03-17 03:24:11,2012-04-16,failed,1,1.0,5000.0
4,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,US,2015-07-04 08:35:03,2015-08-29,canceled,14,1283.0,19500.0


In [59]:
# Rename columns through an explicit old -> new mapping instead of assigning
# df.columns positionally: a positional list silently mislabels columns if the
# selection order or length ever changes. Columns not listed keep their names,
# and keys that are absent are ignored, so re-running this cell is harmless.
df = df.rename(columns={
    'main_category': 'main_cat',
    'category': 'sub_cat',
    'launched': 'launch_date',
    'usd_pledged_real': 'usd_pledged',
    'usd_goal_real': 'usd_goal',
})

In [61]:
# confirm the new column names
df.head()

Unnamed: 0,name,main_cat,sub_cat,currency,country,launch_date,deadline,state,backers,usd_pledged,usd_goal
0,The Songs of Adelaide & Abullah,Publishing,Poetry,GBP,GB,2015-08-11 12:12:28,2015-10-09,failed,0,0.0,1533.95
1,Greeting From Earth: ZGAC Arts Capsule For ET,Film & Video,Narrative Film,USD,US,2017-09-02 04:43:57,2017-11-01,failed,15,2421.0,30000.0
2,Where is Hank?,Film & Video,Narrative Film,USD,US,2013-01-12 00:20:50,2013-02-26,failed,3,220.0,45000.0
3,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,US,2012-03-17 03:24:11,2012-04-16,failed,1,1.0,5000.0
4,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,US,2015-07-04 08:35:03,2015-08-29,canceled,14,1283.0,19500.0
