In [2]:
import pandas as pd

from os import path
import pathlib ##Getting to work in Notebooks
import glob

In [5]:
## Access folder where csv data is stored

## Code from previous project; altered to work for jupyter notebooks
## For code outside of notebooks, use: 
##     basepath = path.dirname(__file__)
##     filepath = path.abspath(path.join(basepath, "data"))

## Notebooks have no __file__, so resolve the kernel's working directory instead.
basepath = pathlib.Path().resolve()
filepath = path.abspath(path.join(basepath, "data"))


## https://stackoverflow.com/questions/20906474
## /import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe
## Pulling from csvs when they're in a different location

## glob returns matches in arbitrary (filesystem-dependent) order; sorting keeps
## the concatenated row order, and therefore the regenerated index, stable
## from one run to the next.
files = sorted(glob.glob(path.join(filepath, "*.csv")))

## pd.concat fails with an opaque "No objects to concatenate" on an empty list,
## so stop early with a message that names the folder that was searched.
if not files:
    raise FileNotFoundError("No csv files found in " + filepath)

## One DataFrame per csv file; the first row of each file is its header.
sets = [pd.read_csv(filename, index_col = None, header = 0) for filename in files]

## Stack the per-file frames vertically and renumber the rows 0..n-1.
df = pd.concat(sets, axis = 0, ignore_index = True)

In [6]:
# Peek at the first two rows of the combined frame.
df.head(n=2)

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,creator,currency,currency_symbol,currency_trailing_code,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,23,Ginger KICK! is back for the holidays with ama...,"{""id"":313,""name"":""Small Batch"",""slug"":""food/sm...",825,US,1509883503,"{""id"":990359968,""name"":""Danielle Ackley-McPhai...",USD,$,True,...,ginger-kick-holiday-cheer,https://www.kickstarter.com/discover/categorie...,True,False,successful,1510518809,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",825.0,domestic
1,322,Let's go to the movies on the L.A. River! Help...,"{""id"":298,""name"":""Movie Theaters"",""slug"":""film...",10545,US,1361914696,"{""id"":860373786,""name"":""L.A. River Revitalizat...",USD,$,True,...,la-river-bike-in-movie-theater,https://www.kickstarter.com/discover/categorie...,True,True,successful,1365700816,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",10545.0,domestic


In [7]:
# Summary statistics for the numeric columns, with the quartiles spelled out.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,backers_count,converted_pledged_amount,created_at,deadline,fx_rate,goal,id,launched_at,pledged,state_changed_at,static_usd_rate,usd_pledged
count,210088.0,210088.0,210088.0,210088.0,210088.0,210088.0,210088.0,210088.0,210088.0,210088.0,210088.0,210088.0
mean,145.346417,12859.93,1457802000.0,1464764000.0,0.992695,49843.99,1073280000.0,1461933000.0,18385.93,1464569000.0,1.010243,12860.61
std,890.106978,90023.84,64196450.0,63863060.0,0.20914,1208255.0,619800100.0,63891010.0,306074.2,63711580.0,0.232424,90038.0
min,0.0,0.0,1240366000.0,1242468000.0,0.008932,0.01,8624.0,1240674000.0,0.0,1242468000.0,0.008771,0.0
25%,4.0,109.0,1414097000.0,1421827000.0,1.0,1500.0,535365800.0,1418785000.0,111.0,1421640000.0,1.0,110.0
50%,27.0,1549.0,1459286000.0,1466077000.0,1.0,5000.0,1074189000.0,1463243000.0,1566.0,1466013000.0,1.0,1550.0
75%,89.0,6591.0,1514907000.0,1522030000.0,1.0,15000.0,1610073000.0,1519245000.0,6980.0,1521988000.0,1.0,6598.0
max,105857.0,10469490.0,1555533000.0,1560744000.0,1.743276,100000000.0,2147476000.0,1555563000.0,81030740.0,1555563000.0,1.716408,10469490.0


## Exploration

There are 210,088 rows of Kickstarter project data pulled from https://webrobots.io/kickstarter-datasets/
All quantitative fields appear to be fully populated (no missing values), since each one has a count of 210,088.



Looking at the backers_count field, it's interesting that the 75th percentile is only 89 while the max is 105,857. Additionally, the mean (145) is well above the median (27), which tells us that backers_count is right-skewed. This makes sense given how Kickstarter projects work: incredibly popular projects far exceed their goals and reach a much wider audience, which pulls the mean upward.


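The fields created_at, deadline, id, launched_at, and state_changed_at are summarized as plain numbers by .describe(), but they look like they actually represent other kinds of data (an identifier and timestamps) rather than true quantities.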

In [10]:
# Show only the columns whose values look like timestamps.
epoch_columns = ['created_at', 'deadline', 'launched_at', 'state_changed_at']
df.head()[epoch_columns]

Unnamed: 0,created_at,deadline,launched_at,state_changed_at
0,1509883503,1510518804,1509914004,1510518809
1,1361914696,1365700816,1363108816,1365700816
2,1549877514,1554903476,1552657076,1554903476
3,1502707276,1505921026,1503329026,1505921027
4,1513448977,1534015191,1532805591,1534015191


It looks like these are datetime fields that the dataset stores as Unix epoch time (seconds since 1970-01-01 UTC).

In [20]:
# The stdlib time module turns an epoch value into a struct_time (gmtime for
# UTC, localtime for the machine's zone); strftime then renders it as text
# in whatever format we'd like.

import time

# Try the conversion on a single value first: the first row's created_at.
first_row = df.iloc[0]
epochtime = first_row['created_at']

struct = time.gmtime(epochtime)
time_string = time.strftime("%Y-%m-%d, %H:%M:%S", struct)

# Raw epoch value on one line, its formatted UTC equivalent on the next.
print(epochtime, time_string, sep="\n")

1509883503
2017-11-05, 12:05:03


In [23]:
def _format_epoch(epoch, fmt):
    """Render an epoch timestamp (in seconds) as UTC text using ``fmt``.

    Returns None when ``epoch`` is missing (None or a float NaN).
    """
    # time.gmtime(None) means "now", so a missing value would silently turn
    # into today's date; a NaN would raise instead. Treat both as missing.
    if epoch is None or (isinstance(epoch, float) and epoch != epoch):
        return None
    return time.strftime(fmt, time.gmtime(epoch))


def epoch_to_date(epoch):
    """Convert an epoch timestamp (seconds) to a UTC date string.

    Parameters
    ----------
    epoch : int or float
        Seconds since 1970-01-01 00:00:00 UTC.

    Returns
    -------
    str or None
        The date formatted as "YYYY-MM-DD", or None if ``epoch`` is missing.
    """
    return _format_epoch(epoch, "%Y-%m-%d")


def epoch_to_time(epoch):
    """Convert an epoch timestamp (seconds) to a UTC time-of-day string.

    Parameters
    ----------
    epoch : int or float
        Seconds since 1970-01-01 00:00:00 UTC.

    Returns
    -------
    str or None
        The time formatted as "HH:MM:SS", or None if ``epoch`` is missing.
    """
    return _format_epoch(epoch, "%H:%M:%S")

In [27]:
# Parse the epoch seconds once (interpreted as UTC), then derive both string
# columns with the vectorized .dt accessor instead of calling a Python
# function on every row.
created_at_utc = pd.to_datetime(df['created_at'], unit='s')
df['created_at_date'] = created_at_utc.dt.strftime("%Y-%m-%d")
df['created_at_time'] = created_at_utc.dt.strftime("%H:%M:%S")

Unnamed: 0,created_at,created_at_date
0,1509883503,2017-11-05
1,1361914696,2013-02-26
2,1549877514,2019-02-11
3,1502707276,2017-08-14
4,1513448977,2017-12-16
