### Transforming data into useful form for modeling and for explorative data analysis ###

In [235]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [236]:
# Data came from the following link: https://webrobots.io/kickstarter-datasets/

In [237]:
kickstarter = pd.read_csv('kickstarter.csv')


Columns (28) have mixed types. Specify dtype option on import or set low_memory=False.



In [238]:
kickstarter.shape

(207049, 37)

In [239]:
kickstarter.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_pledged', 'usd_type'],
      dtype='object')

In [242]:
kickstarter = kickstarter.drop(columns = ['creator', 'currency_symbol', 'currency_trailing_code', 'disable_communication', 'friends', 'is_backing', 'is_starrable', 'is_starred', 'permissions', 'photo', 'profile', 'source_url', 'urls'])

In [243]:
kickstarter.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'currency', 'current_currency', 'deadline',
       'fx_rate', 'goal', 'id', 'launched_at', 'location', 'name', 'pledged',
       'slug', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'usd_pledged', 'usd_type'],
      dtype='object')

In [244]:
kickstarter = pd.get_dummies(columns = ['state'], drop_first = True, data = kickstarter)

In [247]:
g = kickstarter['category']
from ast import literal_eval
g = g.apply(literal_eval)

In [249]:
child_category = []
for i, name in enumerate(d['name'] for d in g): 
    child_category.append(name)

In [250]:
kickstarter['main_category'] = parent_category

In [251]:
kickstarter['child_category'] = child_category

In [179]:
#kickstarter = kickstarter.drop(columns = 'category')

In [253]:
kickstarter['main_category'].value_counts()

music           30147
film & video    28848
technology      24382
art             22350
publishing      18071
food            16297
fashion         14051
games           13099
comics           9372
theater          7050
photography      6316
design           6268
crafts           4725
journalism       3777
dance            2296
Name: main_category, dtype: int64

In [181]:
kickstarter = pd.get_dummies(columns = ['main_category'], drop_first = True, data = kickstarter)

In [183]:
kickstarter.to_csv('kickstarter_04_24_19_8_48_AM.csv', index = False)

In [254]:
kickstarter['country'].value_counts()

US    156814
GB     19697
CA      8420
AU      4364
DE      2664
FR      2043
IT      1919
NL      1682
MX      1632
ES      1513
SE      1137
NZ       832
DK       748
HK       545
CH       519
IE       519
SG       493
BE       448
NO       435
AT       385
JP       197
LU        43
Name: country, dtype: int64

In [255]:
kickstarter = kickstarter.replace('SE', 'OTHER')
kickstarter = kickstarter.replace('NZ', 'OTHER')
kickstarter = kickstarter.replace('DK', 'OTHER')
kickstarter = kickstarter.replace('HK', 'OTHER')
kickstarter = kickstarter.replace('CH', 'OTHER')
kickstarter = kickstarter.replace('IE', 'OTHER')
kickstarter = kickstarter.replace('SG', 'OTHER')
kickstarter = kickstarter.replace('BE', 'OTHER')
kickstarter = kickstarter.replace('NO', 'OTHER')
kickstarter = kickstarter.replace('AT', 'OTHER')
kickstarter = kickstarter.replace('JP', 'OTHER')
kickstarter = kickstarter.replace('LU', 'OTHER')

In [256]:
kickstarter['country'].value_counts()

US       156814
GB        19697
CA         8420
OTHER      6301
AU         4364
DE         2664
FR         2043
IT         1919
NL         1682
MX         1632
ES         1513
Name: country, dtype: int64

In [189]:
kickstarter = pd.get_dummies(columns = ['country'], drop_first = True, data = kickstarter)

In [193]:
kickstarter = pd.get_dummies(columns = ['staff_pick'], drop_first = True, data = kickstarter)

In [194]:
kickstarter = pd.get_dummies(columns = ['spotlight'], drop_first = True, data = kickstarter)

In [196]:
kickstarter['state_successful'].value_counts()

1    112120
0     94929
Name: state_successful, dtype: int64

In [197]:
kickstarter.to_csv('kickstarter_with_all_money_04_24_19_11_16_AM.csv', index = False)

In [198]:
kickstarter = kickstarter.drop(columns = ['currency', 'current_currency', 'fx_rate',  'pledged', 'static_usd_rate', 'usd_pledged', 'usd_type'])

In [200]:
kickstarter = kickstarter.drop(columns = 'name')

In [274]:
kickstarter[['created_at', 'deadline', 'launched_at', 'state_changed_at']].head(3)

Unnamed: 0,created_at,deadline,launched_at,state_changed_at
0,2016-09-11 22:05:51,2016-12-05 19:42:23,2016-11-05 18:42:23,2016-12-05 19:42:23
1,2015-07-10 14:59:32,2015-08-24 12:00:34,2015-07-21 12:00:34,2015-08-24 12:00:34
2,2015-03-24 17:41:14,2015-05-15 16:22:34,2015-04-15 16:22:34,2015-05-15 16:22:34


In [261]:
deadline_list = kickstarter['deadline']

In [262]:
from datetime import datetime
new_list = []
for n in deadline_list:

# if you encounter a "year is out of range" error the timestamp
# may be in milliseconds, try `ts /= 1000` in that case
    new_date = datetime.utcfromtimestamp(n).strftime('%Y-%m-%d %H:%M:%S')
    new_list.append(new_date)

In [263]:
kickstarter['deadline'] = new_list

In [264]:
created_at_list = kickstarter['created_at']

In [265]:
new_list = []
for n in created_at_list:

# if you encounter a "year is out of range" error the timestamp
# may be in milliseconds, try `ts /= 1000` in that case
    new_date = datetime.utcfromtimestamp(n).strftime('%Y-%m-%d %H:%M:%S')
    new_list.append(new_date)

In [266]:
kickstarter['created_at'] = new_list

In [267]:
launched_at_list = kickstarter['launched_at']

In [268]:
new_list = []
for n in launched_at_list:

# if you encounter a "year is out of range" error the timestamp
# may be in milliseconds, try `ts /= 1000` in that case
    new_date = datetime.utcfromtimestamp(n).strftime('%Y-%m-%d %H:%M:%S')
    new_list.append(new_date)

In [269]:
kickstarter['launched_at'] = new_list

In [271]:
launched_at_list = kickstarter['state_changed_at']

In [272]:
new_list = []
for n in launched_at_list:

# if you encounter a "year is out of range" error the timestamp
# may be in milliseconds, try `ts /= 1000` in that case
    new_date = datetime.utcfromtimestamp(n).strftime('%Y-%m-%d %H:%M:%S')
    new_list.append(new_date)

In [273]:
kickstarter['state_changed_at'] = new_list

In [276]:
#kickstarter.to_csv('kickstarter_features_04_25_19_12_59_pm.csv', index = False)

In [227]:
cols = kickstarter.columns.tolist()

In [228]:
cols = ['id',
        'blurb',
        'slug', 
        'state_successful',
        'converted_pledged_amount', 
        'goal', 
        'backers_count', 
        'created_at', 
        'launched_at',
        'deadline', 
        'staff_pick_True',
        'spotlight_True', 
        'child_category',
         'main_category_comics',
         'main_category_crafts',
         'main_category_dance',
         'main_category_design',
         'main_category_fashion',
         'main_category_film & video',
         'main_category_food',
         'main_category_games',
         'main_category_journalism',
         'main_category_music',
         'main_category_photography',
         'main_category_publishing',
         'main_category_technology',
         'main_category_theater',
         'country_CA',
         'country_DE',
         'country_FR',
         'country_GB',
         'country_IT',
         'country_MX',
         'country_NL',
         'country_OTHER',
         'country_US',
        'location']

In [229]:
kickstarter = kickstarter[cols]

In [232]:
#kickstarter.to_csv('kickstarter_cleaned_04_24_19_1_08_pm.csv', index = False)

In [233]:
#import pickle
#pickling notes
#with open('kickstarter_cleaned_04_24_19_1_08_pm.pkl', 'wb') as f:
    #pickle.dump(kickstarter, f)