In [1]:
import numpy as np
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import os
from IPython.display import display
from datetime import datetime
pd.options.display.max_columns = None

RSEED=42

In [2]:
def loadData(RS=42):
    i=0
    df_raw = pd.DataFrame()
    for f in os.listdir("./data"):
        if f[:4] == "Kick":
            da = pd.read_csv("./data/" + str(f))
            df_raw = pd.concat([df_raw, da], axis = 0)
            i+=1
            print(i,f)
    df_raw = df_raw.drop_duplicates(subset='id', keep='first')
    df_raw.reset_index(drop=True, inplace=True)
    df, df_backUp = train_test_split(df_raw,test_size=0.1,stratify=df_raw.state,random_state=RS)
    return df, df_backUp

def cat2name(cat):
    replaceString = '}{"'
    for c in replaceString: cat = cat.replace(c,"")
    dic = {}
    d = cat.split(",")
    for i in d: dic[i.split(":")[0]] = i.split(":")[1]
    return dic["name"]

def cat2slug(cat):
    replaceString = '}{"'
    for c in replaceString: cat = cat.replace(c,"")
    dic = {}
    d = cat.split(",")
    for i in d: dic[i.split(":")[0]] = i.split(":")[1]
    return dic["slug"]

def catCleaner(df):
    name = df.category.apply(cat2name)
    name.name = "category_name"
    slug = df.category.apply(cat2slug)
    slug.name = "category_slug"
    df.drop("category", inplace=True, axis =1)
    df = pd.concat([df, name, slug], axis= 1)
    return df

def timeline(t, m="dt"):
    if m == "days":
        return int(t/24/60/60)
    elif m == "months":
        return int(t/24/60/60/30.4167)
    elif m == "year":
        return int(t/24/60/60/30.4167/12) + 1970
    elif m == "dt":
        return datetime.fromtimestamp(t)

def dropTrashCols(df):
    drop_cols = [
        'is_backing',
        'is_starred',
        'currency_symbol',
        'current_currency', 
        'friends',
        'id',
        'permissions',
        'photo',
        'urls',
        'spotlight'
    ]
    df.drop(drop_cols, inplace=True, axis = 1)
    
    return df
    
def categorizeObjects(df):
    for c in df.columns: 
        if df[c].dtype == "object": 
            df[c] = df[c].astype("category") 
    df.dropna(inplace=True)
    return df

def groupCountries(df):
    map_dictionary ={
        "DE" : "Europe",
        "FR" :"Europe",
        "IT" : "Europe",
        "ES":"Europe",
        "NL":"Europe",
        "SE": "Europe",
        "DK": "Europe",
        "BE": "Europe",
        "NO": "Europe",
        "AT": "Europe",
        "LU": "Europe", 
        "CH": "Europe", 
        "IE": "Europe", 
        "JP": "Asia", 
        "HK": "Asia",
        "SG": "Asia", 
        "MX": "Other",
        "NZ": "Other",
        "AU": "Other", 
        "US": "US",
        "GB": "GB",
        "CA": "CA"
    } 
    df['cgrouped']  = df['country'].map(map_dictionary)
    return df



In [3]:
df, df_backUp = loadData(RS=RSEED)

1 Kickstarter040.csv
2 Kickstarter054.csv
3 Kickstarter055.csv
4 Kickstarter041.csv
5 Kickstarter043.csv
6 Kickstarter042.csv
7 Kickstarter052.csv
8 Kickstarter046.csv
9 Kickstarter047.csv
10 Kickstarter053.csv
11 Kickstarter045.csv
12 Kickstarter051.csv
13 Kickstarter050.csv
14 Kickstarter044.csv
15 Kickstarter023.csv
16 Kickstarter037.csv
17 Kickstarter036.csv
18 Kickstarter022.csv
19 Kickstarter008.csv
20 Kickstarter034.csv
21 Kickstarter020.csv
22 Kickstarter021.csv
23 Kickstarter035.csv
24 Kickstarter009.csv
25 Kickstarter031.csv
26 Kickstarter025.csv
27 Kickstarter019.csv
28 Kickstarter018.csv
29 Kickstarter024.csv
30 Kickstarter030.csv
31 Kickstarter026.csv
32 Kickstarter032.csv
33 Kickstarter033.csv
34 Kickstarter027.csv
35 Kickstarter002.csv
36 Kickstarter016.csv
37 Kickstarter017.csv
38 Kickstarter003.csv
39 Kickstarter029.csv
40 Kickstarter015.csv
41 Kickstarter001.csv
42 Kickstarter000.csv
43 Kickstarter014.csv
44 Kickstarter028.csv
45 Kickstarter010.csv
46 Kickstarter004.c

In [4]:
df = catCleaner(df)
df = groupCountries(df)
df = dropTrashCols(df)
df = categorizeObjects(df)

In [5]:
print(df.columns)
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
df.info()

Index(['backers_count', 'blurb', 'converted_pledged_amount', 'country',
       'created_at', 'creator', 'currency', 'currency_trailing_code',
       'deadline', 'disable_communication', 'fx_rate', 'goal', 'is_starrable',
       'launched_at', 'location', 'name', 'pledged', 'profile', 'slug',
       'source_url', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'usd_pledged', 'usd_type', 'category_name',
       'category_slug', 'cgrouped'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 163618 entries, 52535 to 60087
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   backers_count             163618 non-null  int64   
 1   blurb                     163618 non-null  category
 2   converted_pledged_amount  163618 non-null  int64   
 3   country                   163618 non-null  category
 4   created_at                163618 non-null  int64   
 

In [6]:
df_backUp = catCleaner(df_backUp)
df_backUp = groupCountries(df_backUp)
df_backUp = dropTrashCols(df_backUp)
df_backUp = categorizeObjects(df_backUp)
df_backUp.drop(df_backUp.columns[df_backUp.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
df_backUp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18179 entries, 67317 to 65546
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   backers_count             18179 non-null  int64   
 1   blurb                     18179 non-null  category
 2   converted_pledged_amount  18179 non-null  int64   
 3   country                   18179 non-null  category
 4   created_at                18179 non-null  int64   
 5   creator                   18179 non-null  category
 6   currency                  18179 non-null  category
 7   currency_trailing_code    18179 non-null  bool    
 8   deadline                  18179 non-null  int64   
 9   disable_communication     18179 non-null  bool    
 10  fx_rate                   18179 non-null  float64 
 11  goal                      18179 non-null  float64 
 12  is_starrable              18179 non-null  bool    
 13  launched_at               18179 non-null  

In [7]:
df.to_csv("./data/df_clean.csv",index=False)
df_backUp.to_csv("./data/df_backUp.csv",index=False)