# Exploratory Data Analysis: Kickstarter Project 


**Importing Packages and Tools**

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Suppress scientific notation: render every float with five decimal places.
def _fixed_five_decimals(value):
    """Format a float with exactly five digits after the decimal point."""
    return '%.5f' % value


pd.set_option('display.float_format', _fixed_five_decimals)

In [2]:
# Load the Kickstarter dataset into `kick`.
# Prefer a copy of the CSV in the notebook's working directory so the notebook
# can run on any machine; fall back to the original author's absolute path so
# the existing local setup keeps working unchanged.
DATA_FILE = Path("kickstarter_final.csv")
if not DATA_FILE.exists():
    DATA_FILE = Path("C:/Users/mayaa/OneDrive/Documents/project_BA/Kickstarter-Project-/kickstarter_final.csv")

kick = pd.read_csv(DATA_FILE)

Note: The original Kickstarter dataset also contained `state` values other than success or failure, namely "canceled" and "undefined". We decided to ignore those values for now.

In [3]:
# Preview the first five rows as a quick sanity check of the loaded data.
kick.head(n=5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,10/9/2015,1000.0,8/11/2015 12:12,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,11/1/2017,30000.0,9/2/2017 4:43,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2/26/2013,45000.0,1/12/2013 0:20,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,4/16/2012,5000.0,3/17/2012 3:24,1.0,failed,1,US,1.0,1.0,5000.0
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,4/1/2016,50000.0,2/26/2016 13:38,52375.0,successful,224,US,52375.0,52375.0,50000.0


In [24]:
# Count the missing (NA) values in each column.
# Other quick checks used while exploring (left disabled):
#   kick['state'].value_counts()      -> count of successes and failures
#   kick['currency'].value_counts()
missing_per_column = kick.isna().sum()
missing_per_column

ID                    0
name                  3
category              0
main_category         0
currency              0
deadline              0
goal                  0
launched              0
pledged               0
state                 0
backers               0
country               0
usd pledged         210
usd_pledged_real      0
usd_goal_real         0
deadline_date         0
launched_date         0
diff_days             0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
kick.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real
count,331675.0,331675.0,331675.0,331675.0,331465.0,331675.0,331675.0
mean,1074270646.28906,44251.57306,10584.00355,116.38158,7684.56901,9943.46126,41510.00154
std,619192392.79953,1117916.69906,101591.73182,965.42727,83209.37721,96732.93417,1108929.66305
min,5971.0,0.01,0.0,0.0,0.0,0.0,0.01
25%,537172480.0,2000.0,50.0,2.0,25.0,50.0,2000.0
50%,1074573261.0,5000.0,782.0,15.0,502.0,788.0,5000.0
75%,1609843616.5,15000.0,4657.0,63.0,3421.0,4608.0,15000.0
max,2147476221.0,100000000.0,20338986.27,219382.0,20338986.27,20338986.27,166361390.7


In [12]:
# Add calendar-date versions of the deadline and launch columns.
# (The imports below are not used by the cells visible in this part of the
# notebook; they are kept in case later cells rely on them.)
import datetime
from dateutil.relativedelta import relativedelta
from datetime import date

# Parse each raw string column and keep only the date part (any time of day
# is discarded): deadline -> deadline_date, launched -> launched_date.
for raw_column, date_column in (('deadline', 'deadline_date'),
                                ('launched', 'launched_date')):
    kick[date_column] = pd.to_datetime(kick[raw_column]).dt.date

In [20]:
# Campaign length: deadline date minus launch date.
# Approach taken from:
# https://medium.com/@bramtunggala/a-simple-way-to-finding-the-difference-between-two-dates-in-pandas-179d2714b6c
# diff_days is the difference in DAYS between the deadline and the launch date.
deadline_dates = kick['deadline_date']
launch_dates = kick['launched_date']
kick['diff_days'] = deadline_dates - launch_dates

# We originally considered converting this to months, but days work better
# because the maximum is only 92 days (see the summary below).
kick.describe()

Unnamed: 0,ID,goal,pledged,backers,usd pledged,usd_pledged_real,usd_goal_real,diff_days
count,331675.0,331675.0,331675.0,331675.0,331465.0,331675.0,331675.0,331675
mean,1074270646.28906,44251.57306,10584.00355,116.38158,7684.56901,9943.46126,41510.00154,33 days 22:55:01.156855
std,619192392.79953,1117916.69906,101591.73182,965.42727,83209.37721,96732.93417,1108929.66305,12 days 17:07:11.920645
min,5971.0,0.01,0.0,0.0,0.0,0.0,0.01,1 days 00:00:00
25%,537172480.0,2000.0,50.0,2.0,25.0,50.0,2000.0,30 days 00:00:00
50%,1074573261.0,5000.0,782.0,15.0,502.0,788.0,5000.0,30 days 00:00:00
75%,1609843616.5,15000.0,4657.0,63.0,3421.0,4608.0,15000.0,36 days 00:00:00
max,2147476221.0,100000000.0,20338986.27,219382.0,20338986.27,20338986.27,166361390.7,92 days 00:00:00


In [27]:
# Express the USD amounts in today's dollars.
# Source: https://www.in2013dollars.com/us/inflation/2018?amount=1
# $1 in 2018 is worth $1.03 today.
INFLATION_FACTOR_SINCE_2018 = 1.03
USD_COLUMNS = ['usd_goal_real', 'usd_pledged_real']

# The adjustment overwrites the columns in place, so running this cell twice
# would compound the factor (1.03 * 1.03 ...). Record on the frame itself that
# the adjustment has been applied and skip it on a re-run. Reloading the CSV
# creates a fresh frame without the marker, so the adjustment is applied again.
if not kick.attrs.get('inflation_adjusted', False):
    kick[USD_COLUMNS] = kick[USD_COLUMNS] * INFLATION_FACTOR_SINCE_2018
    kick.attrs['inflation_adjusted'] = True

In [34]:
# Keep only the columns we might want to use.
# Columns are dropped by name rather than by position, so the selection does
# not depend on column order, and errors='ignore' makes re-running this cell a
# no-op instead of dropping further columns.
#
# Excluded:
#   - deadline, launched: superseded by deadline_date and launched_date.
#   - usd pledged: has 210 missing values (see the NA check); this was
#     Kickstarter's automatic conversion, which is replaced by usd_pledged_real.
#   - country: the original positional selection also left this column out.
#     NOTE(review): confirm that dropping country is intentional.
COLUMNS_TO_DROP = ['deadline', 'launched', 'country', 'usd pledged']
kick = kick.drop(columns=COLUMNS_TO_DROP, errors='ignore')