In [1]:
# modules we'll use

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

#mount Google Drive
#from google.colab import drive
#drive.mount('/content/drive')

# import sys
# sys.path.append('/content/drive/MyDrive/Praca Inżynierska/kickstarter/code')

from preprocessing.label_binarizer import DataFrameLabelBinarizer
from preprocessing.ngram_counter import NGramCounter

In [2]:
# Load the Kickstarter projects dataset into a DataFrame.
# Google Drive's path: /content/drive/MyDrive/Praca Inżynierska/kickstarter/data/ks-projects-201801.csv

# seed NumPy's global random generator for reproducibility
np.random.seed(0)

ks_projects_orig = pd.read_csv("../data/ks-projects-201801.csv")

# preview five randomly drawn rows of the ks_projects file
ks_projects_orig.sample(n=5)


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
338862,796196901,10G Christmas Tree,Art,Art,USD,2010-12-26,10526.0,2010-12-08 08:44:04,0.0,failed,0,US,0.0,0.0,10526.0
277871,483825010,Gliff,Gaming Hardware,Games,USD,2016-03-28,10000.0,2016-01-28 04:56:18,51.0,failed,5,US,51.0,51.0,10000.0
47000,123916947,STUFFED Food Truck,Food Trucks,Food,USD,2015-01-06,60000.0,2014-11-07 02:24:36,25.0,failed,1,US,25.0,25.0,60000.0
111338,1565733636,NeoExodus Adventure: Origin of Man for Pathfin...,Tabletop Games,Games,USD,2012-05-01,500.0,2012-03-15 01:16:10,585.0,successful,17,US,585.0,585.0,500.0
53743,1273544891,NAPOLEON IN NEW YORK! an original TV Series,Comedy,Film & Video,USD,2016-07-26,25000.0,2016-05-27 00:07:25,25.0,failed,1,US,25.0,25.0,25000.0


In [3]:
# Summarise the DataFrame: index range, column names, non-null counts, dtypes and memory usage
ks_projects_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                378661 non-null  int64  
 1   name              378657 non-null  object 
 2   category          378661 non-null  object 
 3   main_category     378661 non-null  object 
 4   currency          378661 non-null  object 
 5   deadline          378661 non-null  object 
 6   goal              378661 non-null  float64
 7   launched          378661 non-null  object 
 8   pledged           378661 non-null  float64
 9   state             378661 non-null  object 
 10  backers           378661 non-null  int64  
 11  country           378661 non-null  object 
 12  usd pledged       374864 non-null  float64
 13  usd_pledged_real  378661 non-null  float64
 14  usd_goal_real     378661 non-null  float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [4]:
# get info about DataFrame columns
# (same call as in the previous cell; the report is printed before the sampled rows)
ks_projects_orig.info()

# look at a few rows of the ks_projects file: five rows drawn at random
ks_projects_orig.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                378661 non-null  int64  
 1   name              378657 non-null  object 
 2   category          378661 non-null  object 
 3   main_category     378661 non-null  object 
 4   currency          378661 non-null  object 
 5   deadline          378661 non-null  object 
 6   goal              378661 non-null  float64
 7   launched          378661 non-null  object 
 8   pledged           378661 non-null  float64
 9   state             378661 non-null  object 
 10  backers           378661 non-null  int64  
 11  country           378661 non-null  object 
 12  usd pledged       374864 non-null  float64
 13  usd_pledged_real  378661 non-null  float64
 14  usd_goal_real     378661 non-null  float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
299667,595962034,Button Poetry Live!,Mixed Media,Art,USD,2015-09-18,10000.0,2015-08-19 19:34:20,18216.27,successful,455,US,18216.27,18216.27,10000.0
181674,1924707671,"C STREET 2012 : Tbilisi, Georgia",World Music,Music,USD,2012-06-07,5000.0,2012-05-08 18:22:59,7210.69,successful,82,US,7210.69,7210.69,5000.0
137583,1698707842,Dérive's Next Project,Punk,Music,USD,2014-07-06,1200.0,2014-06-08 17:58:37,1255.66,successful,33,US,1255.66,1255.66,1200.0
296861,581269566,Photo Book - World Santa Claus Congress,Photobooks,Photography,DKK,2017-04-14,110000.0,2017-03-14 23:45:35,462.0,failed,5,DK,0.0,66.46,15823.47
66362,1337585114,Kickstart CLE Brewing to greatness!,Drinks,Food,USD,2017-08-19,6500.0,2017-07-20 21:22:43,250.0,failed,5,US,75.0,250.0,6500.0


In [5]:
# One-hot encode the categorical columns, one encoder instance per column.
# Every encoder receives the frame returned by the previous one, so the
# result of all three steps ends up in `ks_projects_orig`.
# NOTE(review): the displayed sample shows integer column labels (..., 13 ... 22);
# confirm that the three encoders cannot produce clashing labels.

lb_category = DataFrameLabelBinarizer(
    column_to_encode='category',
    data_frame=ks_projects_orig,
)
ks_projects_orig = lb_category.encode()

lb_main_category = DataFrameLabelBinarizer(
    column_to_encode='main_category',
    data_frame=ks_projects_orig,
)
ks_projects_orig = lb_main_category.encode()

lb_country = DataFrameLabelBinarizer(
    column_to_encode='country',
    data_frame=ks_projects_orig,
)
ks_projects_orig = lb_country.encode()

# optional: inspect the resulting columns
# ks_projects_orig.info()

# preview five randomly drawn rows of the encoded frame
ks_projects_orig.sample(n=5)

Unnamed: 0,ID,name,currency,deadline,goal,launched,pledged,state,backers,usd pledged,...,13,14,15,16,17,18,19,20,21,22
284129,515317667,Three Zombies in Philadelphia,USD,2013-05-29,20000.0,2013-03-30 19:36:04,29.99,failed,4,29.99,...,0,0,0,0,0,0,0,0,0,1
187025,1952141049,Chroma Cafe and Bakery Hood and Expansion Project,USD,2014-05-30,9750.0,2014-04-25 03:13:43,9900.0,successful,134,9900.0,...,0,0,0,0,0,0,0,0,0,1
231103,24476326,The L Train,USD,2017-09-09,6000.0,2017-08-25 00:00:26,6065.0,successful,81,340.0,...,0,0,0,0,0,0,0,0,0,1
117045,1594525425,Slimo - Desktop Buddy,USD,2017-11-01,250.0,2017-10-02 20:10:09,22.0,failed,3,10.0,...,0,0,0,0,0,0,0,0,0,1
214293,2091909738,TheBoutSheet.Com Mobile Aps,USD,2015-06-24,25000.0,2015-05-18 23:40:29,0.0,failed,0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# Parsing dates and deriving the campaign duration

# convert the 'deadline' and 'launched' columns to datetime
ks_projects_orig['deadline'] = pd.to_datetime(ks_projects_orig['deadline'])
ks_projects_orig['launched'] = pd.to_datetime(ks_projects_orig['launched'])

# print the first few rows of the parsed 'deadline' column
print(ks_projects_orig['deadline'].head())

# The '*_parsed' columns are kept because a later cell drops them by name.
# Both source columns are already datetime here, so they are copied as-is
# instead of being run through pd.to_datetime(..., format=...) a second time.
ks_projects_orig['deadline_parsed'] = ks_projects_orig['deadline']
ks_projects_orig['launched_parsed'] = ks_projects_orig['launched']

# create a new column 'duration': whole days between launch and deadline, as float64.
# `.dt.days` is used instead of `.astype('timedelta64[D]')`, which pandas >= 2.0
# rejects (only the 's', 'ms', 'us' and 'ns' resolutions are supported there).
campaign_length = ks_projects_orig['deadline_parsed'] - ks_projects_orig['launched_parsed']
ks_projects_orig['duration'] = campaign_length.dt.days.astype('float64')

# show the first few rows
ks_projects_orig['duration'].head()

0   2015-10-09
1   2017-11-01
2   2013-02-26
3   2012-04-16
4   2015-08-29
Name: deadline, dtype: datetime64[ns]


0    58.0
1    59.0
2    44.0
3    29.0
4    55.0
Name: duration, dtype: float64

In [7]:
# Encode the 'name' column as a character-count matrix (n-grams of length 1).
# NOTE(review): the recorded run of this cell ended in a MemoryError
# (414 GiB requested for a (378661, 146920) int64 array); the encoder probably
# needs a sparse or narrower-dtype output before this cell can complete.
ngram_counter = NGramCounter(
    column_to_encode='name',
    ngram_length=1,
    data_frame=ks_projects_orig,
)
ks_projects_orig = ngram_counter.encode()

MemoryError: Unable to allocate 414. GiB for an array with shape (378661, 146920) and data type int64

In [None]:
# look at a few rows of the ks_projects file: display five randomly sampled rows
ks_projects_orig.sample(5)

In [None]:
# Convert 'state' to a numeric target:
#   'successful'    -> 1
#   any other state -> usd_pledged_real / usd_goal_real, capped at 1,
#                      minus FAILURE_PENALTY, floored at 0
# Note: the funding ratio is only clipped; no sigmoid is applied.
FAILURE_PENALTY = 0.2  # arbitrary penalty for failure

is_successful = ks_projects_orig['state'] == 'successful'
funding_ratio = ks_projects_orig['usd_pledged_real'] / ks_projects_orig['usd_goal_real']

# vectorised clipping replaces the row-wise apply(lambda ...) calls; NaN ratios stay NaN
penalised_ratio = (funding_ratio.clip(upper=1) - FAILURE_PENALTY).clip(lower=0)
ks_projects_orig['state_converted'] = np.where(is_successful, 1.0, penalised_ratio)

# show the first few rows
ks_projects_orig['state_converted'].head()

In [None]:
# Build the reduced frame by removing the columns treated as redundant:
# identifier, currency, raw/converted money fields, date columns and the
# textual 'state' label.
redundant_columns = [
    'ID',
    'currency',
    'goal',
    'pledged',
    'usd pledged',
    'usd_pledged_real',
    'deadline',
    'deadline_parsed',
    'launched',
    'launched_parsed',
    'state',
]
ks_projects_reduced = ks_projects_orig.drop(columns=redundant_columns)

In [None]:
# Summarise the reduced DataFrame (columns, non-null counts, dtypes)
ks_projects_reduced.info()

In [None]:
# look at a few rows of the reduced frame: display five randomly sampled rows
ks_projects_reduced.sample(5)

In [None]:
# convert the reduced DataFrame to a NumPy array
# NOTE(review): to_numpy() picks one common dtype for all columns; if any
# non-numeric column is still present the result will be an object array —
# confirm that every remaining column is numeric.
ks_projects_numpy = ks_projects_reduced.to_numpy()

In [None]:
# Split the array row-wise into train and test sets.
# Each row goes to the training set when its uniform random draw is below 0.8;
# the remaining rows form the test set. The last column is taken as the
# target (y) and every other column as a feature (X).

ks_projects_numpy_col_no = ks_projects_numpy.shape[1]
train_indices = np.random.rand(len(ks_projects_numpy)) < 0.8

train_set = ks_projects_numpy[train_indices]
test_set = ks_projects_numpy[~train_indices]

X_train, y_train = train_set[:, :-1], train_set[:, -1]
X_test, y_test = test_set[:, :-1], test_set[:, -1]

In [None]:
# Build the first simple model: a Keras Sequential network with two hidden
# blocks (Dense -> BatchNormalization -> Dropout) and a single linear output
# that is followed by one more BatchNormalization layer.

# hyper-parameters
n_inputs = train_set.shape[1] - 1  # every column of train_set except the last one
n_hidden1 = 800
n_hidden2 = 400
drop_rate = 0.5
momentum = 0.99  # used for BatchNormalization here and read again by the training cell
learning_rate = 0.1

# shared building blocks
he_init = keras.initializers.he_normal()
elu = keras.activations.elu
batch_normalization = layers.BatchNormalization
dropout = layers.Dropout


def _hidden_block(units, name):
    """Return the layers of one hidden block: Dense (ELU, He-normal init), BatchNormalization, Dropout."""
    return [
        layers.Dense(
            units,
            activation=elu,
            kernel_initializer=he_init,
            bias_initializer=he_init,
            name=name,
        ),
        batch_normalization(momentum=momentum),
        dropout(rate=drop_rate),
    ]


model = keras.Sequential(
    [keras.Input(shape=(n_inputs,))]
    + _hidden_block(n_hidden1, "hidden1")
    + _hidden_block(n_hidden2, "hidden2")
    + [
        layers.Dense(1, activation=keras.activations.linear, name="outputs"),
        batch_normalization(momentum=momentum),
    ]
)

# print model details
model.summary()

In [None]:
# Train the model with SGD + Nesterov momentum and an MSE loss.
# keras.optimizers.SGD(..., nesterov=True) is the native Keras counterpart of
# tf.compat.v1.train.MomentumOptimizer(..., use_nesterov=True), which is a
# TF1 compatibility API; the learning rate and momentum values are unchanged.
optimizer = keras.optimizers.SGD(learning_rate=learning_rate, momentum=momentum, nesterov=True)
model.compile(optimizer=optimizer, loss=keras.losses.mse, metrics=['mse', 'mae'])

# 20% of the training data is held out for validation during fitting
history = model.fit(X_train, y_train, epochs=30, batch_size=150, verbose=1, validation_split=0.2)

# list the names of the metrics recorded in the training history
print(history.history.keys())

In [None]:
# "Loss"
# draw the training curve first, then the validation curve, from the fit history
for history_key in ('loss', 'val_loss'):
    plt.plot(history.history[history_key])

# labelling: title, axes, and a legend in the same order as the curves
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()