# KICKSTARTER INTERMEDIATE DATA EXPLORATION

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the intermediate Kickstarter dataset, parsing the five timestamp
# columns as datetimes.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk; chunked inference is what produces the
# mixed-type DtypeWarning on columns that combine NaN with other values
# (presumably the warning previously emitted by this cell -- confirm on re-run).
kick_inter = pd.read_csv('../../data/02_intermediate/kick_inter.csv',
                         parse_dates=['created_at', 'deadline',
                                      'last_update_published_at',
                                      'launched_at', 'state_changed_at'],
                         low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [36]:
# Share of rows where `friends` is missing (mean of the boolean null mask).
kick_inter['friends'].isna().mean()

0.995106623930982

In [35]:
# List every column label in the intermediate frame.
kick_inter.columns

Index(['backers_count', 'blurb', 'converted_pledged_amount', 'country',
       'created_at', 'currency', 'currency_symbol', 'currency_trailing_code',
       'current_currency', 'deadline', 'disable_communication', 'friends',
       'fx_rate', 'goal', 'id', 'is_backing', 'is_starrable', 'is_starred',
       'last_update_published_at', 'launched_at', 'name', 'permissions',
       'pledged', 'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'unread_messages_count',
       'unseen_activity_count', 'urls', 'usd_pledged', 'usd_type',
       'sub_category', 'overall_category', 'city', 'country_loc', 'state_loc',
       'creator_name', 'creator_slug'],
      dtype='object')

**NOTE**: Let's think about the difference between an analysis problem and building a model. For instance, the fact that there are both names and IDs in the dataset is repetitive information. If we really want to keep the fact that some campaigns were relaunched/renamed, we need to make a boolean feature that is yes or no on that front (though we may not want to do this because it could be leaking future information).

In [48]:
# Peek at the first rows of the creator slug column.
kick_inter['creator_slug'].head()

0              dima01
1            quempire
2    giulianoclothing
3        drankthegold
4        aperiodpiece
Name: creator_slug, dtype: object

In [47]:
# Peek at the first rows of the creator display-name column.
kick_inter['creator_name'].head()

0                      Dima01
1             quEmpire Gaming
2           Giuliano Clothing
3              Drank The Gold
4    Anthony Stephen Hamilton
Name: creator_name, dtype: object

In [45]:
# Share of rows where `usd_type` is missing (mean of the boolean null mask).
kick_inter['usd_type'].isna().mean()

0.41907305218699964

## FEATURE ENGINEERING TO-DO LIST

**COLUMNS TO DROP:**
* backers_count: contains future information 
* blurb: good for NLP, but not for decision trees (every blurb is unique) 
* currency_symbol: This information is repeated in the currency column, and since these are not words it's difficult to interpret
* currency_trailing_code - this column is redundant with currency column 
* current_currency - this column is redundant with currency column 
* friends - This column is ~ 99% empty 
* id/ name - we need to drop one of these columns (the information contained in them is redundant). Redundant information can make your model less effective.
* is_backing - this column is ~ 99% empty 
* is_starrable - This looks like it could be leaking future data. Need to look into this column more
* permissions - this column is ~99% empty 
* slug - this column is redundant with name/ blurb
* source_url - this is not needed for analysis
* spotlight - this is leaking future information (not available when the campaign first comes online) 
* staff_pick - this is leaking future info
* unread_messages_count - this column is completely empty 
* unseen_activity_count - this column is completely empty 
* urls - this is not helpful for analysis 
* [column name missing; presumably usd_type - TODO confirm] - this seems to be redundant with currency column (and contains less information) 
* country_loc/ country - once it is established that these columns are the same, one needs to be dropped (should drop the country_loc column) 
* creator_name/ creator_slug - repetitive information (one needs to be dropped). 


## EXPLORING OUR DATASET

In [10]:
# Preview the first three rows of the frame.
kick_inter.head(n=3)

Unnamed: 0,backers_count,blurb,converted_pledged_amount,country,created_at,currency,currency_symbol,currency_trailing_code,current_currency,deadline,disable_communication,friends,fx_rate,goal,id,is_backing,is_starrable,is_starred,last_update_published_at,launched_at,name,permissions,pledged,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,unread_messages_count,unseen_activity_count,urls,usd_pledged,usd_type,sub_category,overall_category,city,country_loc,state_loc,creator_name,creator_slug
0,0,"I'm just going to say it, I'm not special. I'm...",0.0,US,2019-07-15 02:59:36,USD,$,True,USD,2019-08-17 05:04:48,False,,1.0,5000.0,1893102245,,True,,NaT,2019-07-18 05:04:48,Shirt and hat,,0.0,shirt-and-hat,https://www.kickstarter.com/discover/categorie...,False,False,live,2019-07-18 05:04:48,1.0,,,"{""web"":{""project"":""https://www.kickstarter.com...",0.0,domestic,Apparel,fashion/apparel,Wasilla,US,AK,Dima01,dima01
1,568,for Tabletop Role Playing Games like Dungeons ...,18969.0,US,2019-06-02 21:06:55,USD,$,True,USD,2019-07-18 03:55:00,False,,1.0,5000.0,1175125319,,False,,NaT,2019-06-16 02:20:41,RPG Minimalist Creature Dice & Status / Condit...,,18969.0,rpg-minimalist-creature-dice-and-status-condit...,https://www.kickstarter.com/discover/categorie...,True,False,successful,2019-07-18 03:55:01,1.0,,,"{""web"":{""project"":""https://www.kickstarter.com...",18969.0,domestic,Tabletop Games,games/tabletop games,Holland,US,MI,quEmpire Gaming,quempire
2,0,Giuliano Clothing is on a mission to reinvent ...,0.0,CA,2019-07-17 23:13:13,CAD,$,True,USD,2019-08-17 03:50:07,False,,0.766388,5000.0,1290757180,,True,,NaT,2019-07-18 03:50:07,Giuliano Clothing: Modern Fashion,,0.0,giuliano-clothing-modern-fashion,https://www.kickstarter.com/discover/categorie...,False,False,live,2019-07-18 03:50:07,0.766254,,,"{""web"":{""project"":""https://www.kickstarter.com...",0.0,domestic,Fashion,fashion,Toronto,CA,ON,Giuliano Clothing,giulianoclothing


In [5]:
# List every column label in the frame (same check as the earlier
# `kick_inter.columns` cell in this notebook).
kick_inter.columns

Index(['backers_count', 'blurb', 'converted_pledged_amount', 'country',
       'created_at', 'currency', 'currency_symbol', 'currency_trailing_code',
       'current_currency', 'deadline', 'disable_communication', 'friends',
       'fx_rate', 'goal', 'id', 'is_backing', 'is_starrable', 'is_starred',
       'last_update_published_at', 'launched_at', 'name', 'permissions',
       'pledged', 'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'unread_messages_count',
       'unseen_activity_count', 'urls', 'usd_pledged', 'usd_type',
       'sub_category', 'overall_category', 'city', 'country_loc', 'state_loc',
       'creator_name', 'creator_slug'],
      dtype='object')

In [6]:
# Print the per-column summary: non-null counts and dtypes.
kick_inter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332899 entries, 0 to 332898
Data columns (total 42 columns):
backers_count               332899 non-null int64
blurb                       332889 non-null object
converted_pledged_amount    195183 non-null float64
country                     332899 non-null object
created_at                  332899 non-null datetime64[ns]
currency                    332899 non-null object
currency_symbol             332899 non-null object
currency_trailing_code      332899 non-null bool
current_currency            195183 non-null object
deadline                    332899 non-null datetime64[ns]
disable_communication       332899 non-null bool
friends                     1629 non-null object
fx_rate                     185035 non-null float64
goal                        332899 non-null float64
id                          332899 non-null int64
is_backing                  1629 non-null object
is_starrable                206127 non-null object
is_starred   

In [11]:
# Summary statistics for the numeric columns, one row per column.
kick_inter.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
backers_count,332899.0,139.8791,1036.208,0.0,3.0,18.0,74.0,219382.0
converted_pledged_amount,195183.0,13308.94,96142.74,0.0,105.0,1405.0,6289.0,11385450.0
fx_rate,185035.0,1.042613,2.324627,0.006597,1.0,1.0,1.0,150.7221
goal,332899.0,46665.33,1085352.0,0.01,2000.0,5000.0,15000.0,100000000.0
id,332899.0,1074464000.0,619328700.0,8624.0,536884623.5,1076563000.0,1609570000.0,2147476000.0
pledged,332899.0,15504.32,292807.0,0.0,57.0,1024.0,5525.0,98863820.0
static_usd_rate,332899.0,1.010866,0.2191453,0.008771,1.0,1.0,1.0,1.716408
unread_messages_count,0.0,,,,,,,
unseen_activity_count,0.0,,,,,,,
usd_pledged,332899.0,12091.19,106823.6,0.0,56.0,1005.0,5359.0,20338990.0


In [14]:
# Most frequent country code.
kick_inter['country'].mode()

0    US
dtype: object

In [18]:
# Number of campaigns per country code, most common first.
kick_inter['country'].value_counts()

US    244692
GB     32424
CA     14335
AU      7556
DE      5087
FR      3897
IT      3715
MX      3201
ES      3144
NL      2780
SE      2030
HK      1513
NZ      1414
DK      1252
SG      1087
CH       987
IE       924
BE       851
NO       752
AT       742
JP       435
LU        81
Name: country, dtype: int64

In [26]:
# Count of rows with a missing `fx_rate`.
kick_inter['fx_rate'].isna().sum()

147864

In [28]:
# Count of rows with a missing `is_backing`.
kick_inter['is_backing'].isna().sum()

331270

In [31]:
# Count of rows with a missing `unread_messages_count`.
kick_inter['unread_messages_count'].isna().sum()

332899

In [34]:
# Count of rows with a missing `unseen_activity_count`.
kick_inter['unseen_activity_count'].isna().sum()

332899