In [1]:
import pandas as pd

import glob
import json

import datetime
from datetime import datetime
from datetime import timedelta

In [2]:
# Load the raw Kickstarter inference set; edit the relative path below if the
# CSV lives somewhere else.
df = pd.read_csv(filepath_or_buffer="../Inference Data/Kickstarter_inference_data.csv")

In [3]:
def clean_date(column):
    """Turn an iterable of Unix timestamps (seconds) into naive datetimes.

    Each value is passed through ``int()`` (so fractional seconds are
    truncated and numeric strings are accepted) and added as an offset to
    1970-01-01 00:00:00.

    Args:
        column: Iterable of values convertible with ``int()``.

    Returns:
        list of ``datetime`` objects, one per input value, in input order.

    Raises:
        ValueError / TypeError: if a value cannot be converted by ``int()``.
    """
    epoch = datetime(year=1970, month=1, day=1)
    return [epoch + timedelta(seconds=int(raw_seconds)) for raw_seconds in column]

In [4]:
def parse_json(column, keyword):
    """Pull one field out of every JSON document in ``column``.

    Each entry is decoded with ``json.loads`` and indexed with ``keyword``.
    Entries that cannot supply the field yield ``None`` instead of raising,
    so the output always has the same length and order as the input.

    Args:
        column: Iterable of JSON strings (missing values such as ``None`` or
            NaN are tolerated).
        keyword: Key to look up in each decoded JSON document.

    Returns:
        list containing the extracted value for each entry, or ``None`` where
        the entry was not a string, was not valid JSON, or lacked ``keyword``.
    """
    new_column = []
    for r in column:
        try:
            value = json.loads(r)[keyword]
        except (TypeError, ValueError, KeyError, IndexError):
            # Only the expected data problems are treated as "no value":
            #   TypeError  - entry is not str/bytes (None, NaN) or the decoded
            #                JSON cannot be indexed by `keyword`
            #   ValueError - malformed JSON (json.JSONDecodeError is a subclass)
            #   KeyError / IndexError - `keyword` is absent from the document
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            value = None
        new_column.append(value)
    return new_column

In [5]:
# Replace each epoch-seconds column with the datetimes produced by clean_date
for date_column in ['created_at', 'state_changed_at', 'deadline', 'launched_at']:
    df[date_column] = clean_date(df[date_column])

In [6]:
# Category and creator are reduced to the 'name' field of their JSON payload
df['category'] = parse_json(df['category'], 'name')
df['creator'] = parse_json(df['creator'], 'name')
# The location payload feeds two new columns; the raw column itself is kept here
df['location_city'] = parse_json(df['location'], 'localized_name')
df['location_state'] = parse_json(df['location'], 'state')

In [7]:
# Rows agreeing on id, deadline and creator count as duplicates;
# only the first occurrence of each is retained.
df = df.drop_duplicates(subset=['id', 'deadline', 'creator'], keep='first')

In [8]:
# Remove the remaining raw JSON columns
df = df.drop(['location', 'photo', 'profile', 'urls'], axis='columns')

In [9]:
# Renumber rows 0..n-1 after de-duplication. drop=True discards the old index
# directly instead of materialising it as a temporary 'index' column, which
# would fail for a named index and would remove the wrong column if the data
# ever contained a real column called 'index'.
df = df.reset_index(drop=True)

In [10]:
# Restrict to projects whose state is 'successful' or 'failed' (for training
# purposes), then prune columns group by group; the reason for each group is
# given next to its drop call.
df = (
    df[df['state'].isin(['successful', 'failed'])]
    # No distinct values (found from profiling report)
    .drop(columns=['disable_communication', 'friends', 'is_backing', 'is_starred', 'permissions'])
    # Highly correlated with other columns (ex. currency and currency symbol)
    .drop(columns=['country_displayable_name', 'currency_symbol', 'currency_trailing_code',
                   'static_usd_rate', 'usd_exchange_rate', 'usd_type'])
    # Not known at the beginning of a project
    .drop(columns=['backers_count', 'converted_pledged_amount', 'pledged', 'spotlight',
                   'state_changed_at', 'usd_pledged'])
    # Irrelevant due to business knowledge
    .drop(columns=['currency', 'current_currency', 'fx_rate', 'slug', 'source_url', 'location_city'])
    # Difficult to model (potentially NLP future additions)
    .drop(columns=['blurb', 'creator', 'name'])
)

In [11]:
# Derive duration and launch-timing features from the three timestamp columns,
# then discard the raw timestamps because they are difficult to interpret for ML.
df = df.assign(
    campaign_length=lambda frame: (frame['deadline'] - frame['launched_at']).dt.days,
    prep_time=lambda frame: (frame['launched_at'] - frame['created_at']).dt.days,
    month_of_launch=lambda frame: frame['launched_at'].dt.month,
    weekday_of_launch=lambda frame: frame['launched_at'].dt.dayofweek,
    hour_of_launch=lambda frame: frame['launched_at'].dt.hour,
).drop(columns=['deadline', 'created_at', 'launched_at'])

In [12]:
# Encode Data
# Sets up the single encoder instance that encode() fits and applies.
from evalml.pipelines.components.transformers import OneHotEncoder

# Category cap: handed to the encoder as top_n and also reused when values
# outside the most frequent ones are relabelled as 'Other'.
number_of_categories = 10
# NOTE(review): top_n presumably limits how many categories per column receive
# their own indicator column - confirm against the evalml documentation.
ohe = OneHotEncoder(top_n=number_of_categories)



In [13]:
def encode(df, column_to_encode):
    """Fit the module-level encoder on the selected columns and return them encoded.

    Args:
        df: Frame that is indexed with ``column_to_encode`` to obtain the data.
        column_to_encode: Column label(s) used to select from ``df``; the
            notebook passes a list of column names.

    Returns:
        Whatever ``ohe.transform`` returns for the selected columns.

    Note:
        Relies on the global ``ohe`` and calls ``ohe.fit`` on every invocation,
        so the encoder is fitted on the same data it then transforms.
    """
    ohe.fit(df[column_to_encode])
    return ohe.transform(df[column_to_encode])

In [14]:
# Values that are not among the most frequent `number_of_categories` of a column
# are relabelled 'Other' before encoding.
columns_to_encode = ['location_state', 'country', 'category', 'staff_pick', 'is_starrable']

for column in columns_to_encode:
    frequencies = df[column].value_counts()
    top_cats = frequencies.head(number_of_categories).index.tolist()
    outside_top = ~df[column].isin(top_cats)
    df.loc[outside_top, column] = 'Other'

In [15]:
# Encode the categorical columns, then swap them out of df for their encoded form.
encoded_df = encode(df, columns_to_encode)
# Attach the project id as the join key.
# NOTE(review): assumes encode() returns a pandas DataFrame whose index matches
# df's, so this assignment aligns row for row - confirm.
encoded_df['id'] = df['id']
df = df.drop(columns_to_encode, axis='columns')
# NOTE(review): joining on 'id' presumes ids are unique per row; repeated ids
# would multiply rows in the result - verify.
inference_df = df.merge(encoded_df, on='id')

In [16]:
# Neither the project state nor the id used for joining is exported.
inference_df = inference_df.drop(columns=['state', 'id'])

In [17]:
# Persist the prepared inference table as JSON
inference_df.to_json(path_or_buf="../Tables/inference_data.json")