In [21]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
# Column -> dtype mapping passed to pd.read_csv so the listed columns are read
# with compact 32-bit numeric types, explicit booleans and plain strings.
dtypes = dict(
    backers_count='int32',
    blurb='str',
    category='str',
    converted_pledged_amount='int32',
    country='str',
    current_currency='str',
    goal='float32',
    location='str',
    name='str',
    pledged='float32',
    spotlight='bool',
    staff_pick='bool',
    state='str',
    usd_pledged='float32',
    category_name='str',
    category_slug='str',
)

In [23]:
# Location of the Kickstarter CSV export. Set the KICKSTARTER_DATA_CSV
# environment variable to point at your own copy of the file; the path used by
# the original author is kept as the fallback so existing setups keep working.
DATA_CSV = os.environ.get(
    'KICKSTARTER_DATA_CSV',
    '/Users/annelahann/neue-fische/kickstarter-ml-project/data/data.csv',
)

# Load the raw data, applying the explicit per-column dtypes declared in `dtypes`.
df = pd.read_csv(DATA_CSV, dtype=dtypes)

In [14]:
# Parse the four timestamp columns into pandas datetime values
for date_col in ('deadline', 'launched_at', 'created_at', 'state_changed_at'):
    df[date_col] = pd.to_datetime(df[date_col])

In [15]:
# Re-encode these columns with the pandas 'category' dtype
for cat_col in ('category_name', 'category_slug', 'country', 'current_currency', 'location'):
    df[cat_col] = df[cat_col].astype('category')

In [16]:
# Print a per-column summary of the full frame (non-null counts and dtypes)
# to check the result of the dtype conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 23 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   backers_count             209222 non-null  int32         
 1   blurb                     209214 non-null  object        
 2   converted_pledged_amount  209222 non-null  int32         
 3   country                   209222 non-null  category      
 4   created_at                209222 non-null  datetime64[ns]
 5   current_currency          209222 non-null  category      
 6   deadline                  209222 non-null  datetime64[ns]
 7   goal                      209222 non-null  float32       
 8   is_backing                300 non-null     object        
 9   is_starrable              209222 non-null  bool          
 10  is_starred                300 non-null     object        
 11  launched_at               209222 non-null  datetime64[ns]
 12  lo

In [17]:
# Preview the first five rows of the frame as a quick sanity check.
df.head()

Unnamed: 0,backers_count,blurb,converted_pledged_amount,country,created_at,current_currency,deadline,goal,is_backing,is_starrable,...,name,pledged,slug,spotlight,staff_pick,state,state_changed_at,usd_pledged,category_name,category_slug
0,315,Babalus Shoes,28645,US,2018-11-05 23:06:45,USD,2019-03-14 05:02:55,28000.0,,False,...,Babalus Children's Shoes,28645.0,babalus-childrens-shoes,False,False,live,2019-01-23 06:02:55,28645.0,Footwear,fashion/footwear
1,47,A colorful Dia de los Muertos themed oracle de...,1950,US,2017-08-02 14:28:13,USD,2017-09-09 17:00:59,1000.0,,False,...,The Ofrenda Oracle Deck,1950.0,the-ofrenda-oracle-deck,True,False,successful,2017-09-09 17:00:59,1950.0,Playing Cards,games/playing cards
2,271,"Electra's long awaited, eclectic Debut Pop/Roc...",22404,US,2012-09-30 06:45:33,USD,2013-06-12 05:03:15,15000.0,,False,...,"Record Electra's Debut Album (Pop, Rock, Class...",22404.0,record-electras-debut-album-pop-rock-classical,True,False,successful,2013-06-12 05:03:15,22404.0,Rock,music/rock
3,3,The Mist of Tribunal is a turn-based card game...,165,GB,2017-01-07 09:11:11,USD,2017-03-13 17:22:56,10000.0,,False,...,The Mist of Tribunal - A Card Game,136.0,the-mist-of-tribunal-a-card-game,False,False,failed,2017-03-13 17:22:56,165.384933,Playing Cards,games/playing cards
4,3,"Livng with a brain impairment, what its like t...",2820,US,2012-12-06 18:04:31,USD,2013-01-09 20:32:07,2800.0,,False,...,Help change the face of Brain Impairment,2820.0,help-change-the-face-of-brain-impairment,True,False,successful,2013-01-09 20:32:07,2820.0,Nonfiction,publishing/nonfiction


In [18]:
# Keep only the columns stored as 32-bit floats or 32-bit integers,
# then summarise the resulting sub-frame.
numeric_dtypes = ['float32', 'int32']
df_numeric = df.select_dtypes(include=numeric_dtypes)
df_numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 5 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   backers_count             209222 non-null  int32  
 1   converted_pledged_amount  209222 non-null  int32  
 2   goal                      209222 non-null  float32
 3   pledged                   209222 non-null  float32
 4   usd_pledged               209222 non-null  float32
dtypes: float32(3), int32(2)
memory usage: 4.0 MB


In [19]:
# Keep only the datetime64 columns, then summarise the resulting sub-frame.
datetime_dtypes = ['datetime64']
df_dates = df.select_dtypes(include=datetime_dtypes)

df_dates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   created_at        209222 non-null  datetime64[ns]
 1   deadline          209222 non-null  datetime64[ns]
 2   launched_at       209222 non-null  datetime64[ns]
 3   state_changed_at  209222 non-null  datetime64[ns]
dtypes: datetime64[ns](4)
memory usage: 6.4 MB


In [20]:
# Keep only the columns with the 'category' dtype, then summarise the
# resulting sub-frame.
categorical_dtypes = ['category']
df_category = df.select_dtypes(include=categorical_dtypes)

df_category.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   country           209222 non-null  category
 1   current_currency  209222 non-null  category
 2   location          0 non-null       category
 3   category_name     209222 non-null  category
 4   category_slug     209222 non-null  category
dtypes: category(5)
memory usage: 1.4 MB
