In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the scraped Kickstarter records (pickle produced by an earlier step).
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
raw_pickle_path = 'kickstarter_desc.pkl'
df = pd.read_pickle(raw_pickle_path)

In [6]:
# Step one: the scraped "description" column actually holds the whole parsed
# record, so rename it to "parsed" to free the name for the real text column.
parsed_rename = {"description": "parsed"}
df = df.rename(columns=parsed_rename)

In [7]:
# Step two: split each parsed record into its four component columns.
# Assumes every entry of df['parsed'] is a 4-item sequence -- TODO confirm.
parsed_fields = ['description', 'img_count', 'vid_count', 'rewards']
parsed_frame = pd.DataFrame(df['parsed'].tolist(), index=df.index)
df[parsed_fields] = parsed_frame

In [8]:
## df[df['description'].str.contains('This Error:')].info()
## There are 184 scrape errors (with several different causes) out of roughly
## 200,000 records. Pulling those URLs again doesn't seem worth it, so they
## may simply be dropped.


## Step three: remove the raw parsed column now that it is no longer needed
df = df.drop(columns='parsed')

In [9]:
## Step four: discard rows missing any of the scraped count / reward fields
required_fields = ['img_count', 'vid_count', 'rewards']
df.dropna(subset=required_fields, inplace=True)

In [10]:
## Step five: turn literal "N/A" strings into real NaN values.
## Uses np.nan because the np.NaN alias was removed in NumPy 2.0.
df.replace('N/A', np.nan, inplace=True)

In [11]:
## Step six: mark empty descriptions as missing.
## Series.replace('', None) is unsafe: on older pandas a value of None means
## "no value given", so blanks get pad-filled with the previous row's
## description. An inplace call on the df.description accessor is also not
## guaranteed to write back to df. Replace with an explicit NaN and assign
## the column back instead.
df['description'] = df['description'].replace('', np.nan)

## It looks like some descriptions are a long string of null characters,
## which we may be able to ignore during analysis.

In [12]:
## Step seven, clean up categories

# df.loc[10]['category']
# df.groupby(["category_name", "category_slug"]).size()
def cat_core(str):
    """Return the part of a category slug before its first "/".

    Slugs that contain no "/" are returned unchanged.
    (The parameter name shadows the builtin; kept so callers are unaffected.)
    """
    # Guard clause: nothing to strip when there is no separator.
    if "/" not in str:
        return str
    head, _sep, _rest = str.partition('/')
    return head
    
# Derive the top-level category from each slug.
category_slugs = df['category_slug']
df['category_core'] = category_slugs.map(cat_core)

In [13]:
## Step eight: convert the date columns to pandas datetimes

## df.loc[0]['created_at_date']

date_columns = (
    'created_at_date',
    'deadline_date',
    'launched_at_date',
    'state_changed_at_date',
)
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col])

In [14]:
## Step nine: scale the goal by the static USD rate

goal_amount = df['goal']
df['usd_goal'] = goal_amount.mul(df['static_usd_rate'])

In [15]:
## Step ten: pledged amount as a percentage of the goal

pledged_ratio = df['pledged'].div(df['goal'])
df['percent_goal'] = pledged_ratio.mul(100.00)

In [16]:
## Step eleven: video usage flag.
## Video usage is sparse, so in addition to how many videos were used we would
## rather record whether a project used any video at all.

df['vid_usage'] = df['vid_count'].ge(1.0)

In [17]:
## Step twelve: boolean flag for whether the project reached its goal.

df['reach_goal'] = df['percent_goal'].ge(100.00)

In [18]:
## Step thirteen: keep only the first row for each (id, name) pair
duplicate_keys = ['id', 'name']
df.drop_duplicates(subset=duplicate_keys, inplace=True)

In [19]:
## Step fourteen, rewards translated
def convert_tool(values, rate):
    """Multiply every entry of `values` by `rate`, rounded to 2 decimals.

    Returns a new list; `values` itself is not modified.
    """
    return [round(amount * rate, 2) for amount in values]

def _row_usd_rewards(row):
    """Convert one row's rewards with that row's static USD rate."""
    return convert_tool(row['rewards'], row['static_usd_rate'])

df['usd_rewards'] = df.apply(_row_usd_rewards, axis=1)

In [20]:
## Step fifteen: boolean (rather than quantitative) "did they reach the goal?"
## NOTE(review): this is the same computation as step twelve.
df['reach_goal'] = df['percent_goal'].ge(100.00)

In [40]:
## Step sixteen: character lengths of the description, blurb and slug

for text_col in ('description', 'blurb', 'slug'):
    df[text_col + '_len'] = df[text_col].str.len()

In [None]:
## Step seventeen, handling the "state" of a project.
## We want to drop the 'live' state since it may not accurately represent a project's final status;
## it would not be fair to keep only the 'live' projects that have already passed their goal and not the ones that have not.
## TODO: this filter has not been implemented yet.

In [22]:
# Persist the cleaned frame for later use.
clean_pickle_path = 'kickstarter_clean.pkl'
df.to_pickle(clean_pickle_path)

In [23]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182905 entries, 0 to 210086
Data columns (total 58 columns):
backers_count                182905 non-null int64
blurb                        182897 non-null object
category                     182905 non-null object
converted_pledged_amount     182905 non-null int64
country                      182905 non-null object
created_at                   182905 non-null int64
creator                      182905 non-null object
currency                     182905 non-null object
currency_trailing_code       182905 non-null bool
current_currency             182905 non-null object
deadline                     182905 non-null int64
fx_rate                      182905 non-null float64
goal                         182905 non-null float64
id                           182905 non-null int64
launched_at                  182905 non-null int64
location                     182688 non-null object
name                         182905 non-null object
pledged    

In [56]:
## Cross-check the final state against the reach_goal flag.
## Only the last expression of a cell is rendered, so the group sizes are
## printed explicitly; .size() gives one row count per (state, reach_goal)
## pair instead of a non-null count for every column.
print(df.groupby(['state', 'reach_goal']).size())

## df[(df['state'] == 'successful') & (df['reach_goal'] == False)]['web_url'].loc[47894]
## I explored this one and can't figure out why Kickstarter approved the
## project even without it meeting the goal.
## df[(df['state'] == 'failed') & (df['reach_goal'] == True)][['slug', 'usd_goal', 'usd_pledged']]

## Projects that were canceled even though they had reached their goal.
canceled_but_funded = (df['state'] == 'canceled') & df['reach_goal']
df.loc[canceled_but_funded, ['slug', 'usd_goal', 'usd_pledged']]

Unnamed: 0,slug,usd_goal,usd_pledged
805,maria-bonita-joyeria-maria-bonita-jewelry,260.249100,4.533539e+02
2624,on-the-ideology-of-berlin-trans-formation,2000.000000,2.025000e+03
3084,e1-charge-free-anc-earbuds-for-superior-audio-...,2549.313200,5.774067e+03
3276,custom-printed-rear-car-window-decal-color-non...,100.000000,6.720000e+02
3930,unbreakable-mythological-rings-for-titans-and-...,559.582685,5.845401e+03
9597,johnny-segura-presents-the-fandom-years-volume-1,400.000000,6.480000e+02
10488,making-baby-food-for-the-first-time,10.000000,1.000000e+01
13714,plusboard-an-a-prototyping-experience-for-makers,5877.296720,7.352498e+03
15614,the-last-patch,433.760445,4.337604e+02
18349,hardwood-clocks,50.000000,1.000000e+02


In [None]:
## The original `.loc[]` had an empty indexer, which is a syntax error.
## Show the URLs of projects that were canceled after reaching their goal so
## they can be inspected by hand.
df.loc[(df['state'] == 'canceled') & df['reach_goal'], 'web_url'].head(10)

In [25]:
## Columns judged not useful for analysis.
## Fixed the 'converted_pledge_amount' typo: the actual column is
## 'converted_pledged_amount', so dropping with the old name would raise KeyError.
## NOTE(review): this list is only defined here; nothing in this notebook
## applies it to df yet.
drop_list = [
    'backers_count', 'category', 'converted_pledged_amount',
    'created_at', 'creator', 'currency_trailing_code',
    'current_currency', 'deadline', 'fx_rate', 'goal', 'id', 'launched_at', 'location',
    'name', 'pledged', 'profile', 'state_changed_at', 'urls', 'usd_type', 'profile_id',
    'profile_project_id', 'profile_state', 'urls_web', 'rewards',
]

Additional Stuff to Do
- figure out how to handle "state"