In [1]:
import pandas as pd
import glob
import os
import numpy as np
import json

In [2]:
# Merge every raw CSV export found under ./data into one file.
path = r'./data'
# Files this notebook itself writes into ./data. They must be kept out of the
# merge: otherwise a second run would read its own output back in and
# duplicate every row.
generated_files = {'merged.csv', 'procesed.csv'}
# sorted() makes the row order of the merged file independent of the
# arbitrary order in which glob returns directory entries.
all_files = sorted(
    f for f in glob.glob(os.path.join(path, "*.csv"))
    if os.path.basename(f) not in generated_files
)
if not all_files:
    # pd.concat on an empty iterable fails with an unhelpful message.
    raise FileNotFoundError(f"No source CSV files found in {path!r}")
df_from_each_file = (pd.read_csv(f) for f in all_files)
pd.concat(df_from_each_file, ignore_index=True).to_csv('./data/merged.csv', index=False)

In [3]:
# Load the merged CSV back into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='./data/merged.csv',
)

In [4]:
# Display the column labels of the merged frame.
df.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_pledged', 'usd_type'],
      dtype='object')

In [5]:
# Columns whose first-row value is printed below for inspection.
json_like_columns = ['category', 'location', 'profile', 'urls']
df_json_col = df[json_like_columns]

# Print the value in row 0 of each of those columns.
for col in df_json_col.columns:
    print(df.loc[0, col])

{"id":266,"name":"Footwear","slug":"fashion/footwear","position":5,"parent_id":9,"color":16752598,"urls":{"web":{"discover":"http://www.kickstarter.com/discover/categories/fashion/footwear"}}}
{"id":2462429,"name":"Novato","slug":"novato-ca","short_name":"Novato, CA","displayable_name":"Novato, CA","localized_name":"Novato","country":"US","state":"CA","type":"Town","is_root":false,"urls":{"web":{"discover":"https://www.kickstarter.com/discover/places/novato-ca","location":"https://www.kickstarter.com/locations/novato-ca"},"api":{"nearby_projects":"https://api.kickstarter.com/v1/discover?signature=1552595066.49b64db66a5124f5831752d055cd09aff20cc652&woe_id=2462429"}}}
{"id":3508024,"project_id":3508024,"state":"inactive","state_changed_at":1541459205,"name":null,"blurb":null,"background_color":null,"text_color":null,"link_background_color":null,"link_text_color":null,"link_text":null,"link_url":null,"show_feature_image":false,"background_image_opacity":0.8,"should_show_feature_image_sect

In [6]:
# Column labels to be dropped from the frame.
remove = [
    'id',
    'permissions',
    'location',
    'photo',
    'disable_communication',
    'source_url',
    'creator',
    'friends',
    'profile',
    'urls',
    'is_backing',
    'is_starrable',
    'is_starred',
    'usd_type',
    'static_usd_rate',
    'currency_symbol',
    'currency_trailing_code',
    'current_currency',
    'fx_rate',
]

In [7]:
# Drop every column listed in `remove`.
df = df.drop(columns=remove)

In [8]:
# Decode the JSON text in `category` into one dict per row.
categories = df['category'].map(json.loads)

# Pull the 'name' and 'slug' entries of each dict out into their own columns.
df['category_name'] = categories.map(lambda parsed: parsed['name'])
df['category_slug'] = categories.map(lambda parsed: parsed['slug'])

In [9]:
# The raw JSON column is no longer needed once name/slug are extracted.
df = df.drop(columns=['category'])

In [10]:
# Summarise columns, non-null counts and dtypes of the trimmed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   backers_count             209222 non-null  int64  
 1   blurb                     209214 non-null  object 
 2   converted_pledged_amount  209222 non-null  int64  
 3   country                   209222 non-null  object 
 4   created_at                209222 non-null  int64  
 5   currency                  209222 non-null  object 
 6   deadline                  209222 non-null  int64  
 7   goal                      209222 non-null  float64
 8   launched_at               209222 non-null  int64  
 9   name                      209222 non-null  object 
 10  pledged                   209222 non-null  float64
 11  slug                      209222 non-null  object 
 12  spotlight                 209222 non-null  bool   
 13  staff_pick                209222 non-null  b

In [11]:
# Write the frame to disk as CSV, without the index column.
df.to_csv(
    path_or_buf='./data/procesed.csv',
    index=False,
)

In [12]:
# Column -> dtype mapping passed to pd.read_csv when the processed file is
# reloaded. The raw 'category' column is intentionally absent: it is dropped
# from the frame before the file is written, so only the derived
# 'category_name' / 'category_slug' columns exist in the CSV.
dtypes = {
    'backers_count': 'int64',
    'blurb': 'str',
    'country': 'str',
    'goal': 'float64',
    'name': 'str',
    'pledged': 'float64',
    'spotlight': 'bool',
    'staff_pick': 'bool',
    'state': 'str',
    'usd_pledged': 'float64',
    'category_name': 'str',
    'category_slug': 'str',
}

In [13]:
# Reload the processed CSV, applying the explicit dtypes declared in `dtypes`.
df = pd.read_csv(
    filepath_or_buffer='./data/procesed.csv',
    dtype=dtypes,
)

In [14]:
# Summarise columns, non-null counts and dtypes of the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   backers_count             209222 non-null  int64  
 1   blurb                     209214 non-null  object 
 2   converted_pledged_amount  209222 non-null  int64  
 3   country                   209222 non-null  object 
 4   created_at                209222 non-null  int64  
 5   currency                  209222 non-null  object 
 6   deadline                  209222 non-null  int64  
 7   goal                      209222 non-null  float64
 8   launched_at               209222 non-null  int64  
 9   name                      209222 non-null  object 
 10  pledged                   209222 non-null  float64
 11  slug                      209222 non-null  object 
 12  spotlight                 209222 non-null  bool   
 13  staff_pick                209222 non-null  b

In [15]:
# Interpret the timestamp columns as seconds since the Unix epoch and
# replace them with pandas datetimes.
for timestamp_column in ('deadline', 'launched_at', 'created_at', 'state_changed_at'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column], unit='s')

In [16]:
# Store these label columns with the pandas 'category' dtype.
for label_column in ('category_name', 'category_slug', 'state', 'country'):
    df[label_column] = df[label_column].astype('category')

In [17]:
# convert to string
# `blurb` contains missing values (df.info() reports fewer non-null entries
# than rows). astype('str') would turn those into the literal text 'nan';
# the nullable 'string' dtype keeps them as missing (<NA>) instead.
df['blurb'] = df['blurb'].astype('string')
df['name'] = df['name'].astype('string')

In [18]:
# Cast the two flag columns to bool.
for flag_column in ('spotlight', 'staff_pick'):
    df[flag_column] = df[flag_column].astype('bool')


In [19]:
# Write the final frame to data.csv, without the index column.
df.to_csv(
    path_or_buf='data.csv',
    index=False,
)