# Kickstarter Preprocessing

## Preprocess the data and balance the dataset. Create three datasets: training, validation, and test. Save the newly created sets in a tensor-friendly format (e.g. `*.npz`).

In [1]:
#Import libraries 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers

# We will use the sklearn preprocessing library, as it will be easier to standardize the data.
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

## Visualize the data

In [2]:
# GENERAL: load 'ks-projects-201801-clean.csv' into a DataFrame and preview its first rows
kickstarter = pd.read_csv(filepath_or_buffer='ks-projects-201801-clean.csv')
kickstarter.head(5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_year,period,funded_ratio
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 00:00:00,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,2015,58 days 11:47:32.000000000,0.0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01 00:00:00,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,2017,59 days 19:16:03.000000000,0.0807
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:00:00,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,2013,44 days 23:39:10.000000000,0.004889
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 00:00:00,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,2012,29 days 20:35:49.000000000,0.0002
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 00:00:00,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015,55 days 15:24:57.000000000,0.065795


In [107]:
# Discard the columns that won't be used further on
unused_columns = [
    'category', 'ID', 'name', 'deadline', 'launched', 'launched_year',
    'currency', 'goal', 'pledged', 'usd pledged',
]
kickstarter = kickstarter.drop(columns=unused_columns)
kickstarter.head()

Unnamed: 0,main_category,state,backers,country,usd_pledged_real,usd_goal_real,period,funded_ratio
0,Publishing,failed,0,GB,0.0,1533.95,58 days 11:47:32.000000000,0.0
1,Film & Video,failed,15,US,2421.0,30000.0,59 days 19:16:03.000000000,0.0807
2,Film & Video,failed,3,US,220.0,45000.0,44 days 23:39:10.000000000,0.004889
3,Music,failed,1,US,1.0,5000.0,29 days 20:35:49.000000000,0.0002
4,Film & Video,canceled,14,US,1283.0,19500.0,55 days 15:24:57.000000000,0.065795


In [108]:
# Encode the state column as numbers: failed/canceled -> 0, successful -> 1
cleanup_state = {"state": {"failed": 0, "canceled": 0, "successful": 1}}
kickstarter = kickstarter.replace(cleanup_state)
kickstarter.head()

Unnamed: 0,main_category,state,backers,country,usd_pledged_real,usd_goal_real,period,funded_ratio
0,Publishing,0,0,GB,0.0,1533.95,58 days 11:47:32.000000000,0.0
1,Film & Video,0,15,US,2421.0,30000.0,59 days 19:16:03.000000000,0.0807
2,Film & Video,0,3,US,220.0,45000.0,44 days 23:39:10.000000000,0.004889
3,Music,0,1,US,1.0,5000.0,29 days 20:35:49.000000000,0.0002
4,Film & Video,0,14,US,1283.0,19500.0,55 days 15:24:57.000000000,0.065795


In [109]:
# Count the number of ones and zeros in the label column, as absolute
# counts and as percentages rounded to one decimal place
state_column = kickstarter['state']
state_counts = state_column.value_counts()
state_share = state_column.value_counts(normalize=True) * 100
state_percent = state_share.round(1).astype(str) + '%'
pd.DataFrame({'counts': state_counts, 'percent': state_percent})

Unnamed: 0,counts,percent
0,233247,63.8%
1,132266,36.2%


In [110]:
# Parse the whole-day component of the period (e.g. "58 days 11:47:32.000000000")
# into an integer number of days.
# The string is parsed as a timedelta instead of slicing its first two
# characters, so periods of 100 days or more are not truncated.
# The dtype guard keeps the cell safe to re-run: once the column is already
# integer it is left alone (to_timedelta would otherwise read the integers
# as nanoseconds and turn every period into 0 days).
if not pd.api.types.is_integer_dtype(kickstarter['period']):
    kickstarter['period'] = pd.to_timedelta(kickstarter['period']).dt.days.astype(int)

In [111]:
# Preview the first rows to check the result of the period conversion
kickstarter.head()

Unnamed: 0,main_category,state,backers,country,usd_pledged_real,usd_goal_real,period,funded_ratio
0,Publishing,0,0,GB,0.0,1533.95,58,0.0
1,Film & Video,0,15,US,2421.0,30000.0,59,0.0807
2,Film & Video,0,3,US,220.0,45000.0,44,0.004889
3,Music,0,1,US,1.0,5000.0,29,0.0002
4,Film & Video,0,14,US,1283.0,19500.0,55,0.065795


In [112]:
# Split the dataframe into train, validation, and test.
# 20% of all rows go to the test set; 20% of the remainder goes to validation.
# A fixed seed makes the split (and every metric computed from it) reproducible
# across kernel restarts, and stratifying on the label keeps the share of each
# 'state' class approximately the same in all three sets.
SPLIT_SEED = 42
train, test = train_test_split(
    kickstarter, test_size=0.2, random_state=SPLIT_SEED, stratify=kickstarter['state'])
train, val = train_test_split(
    train, test_size=0.2, random_state=SPLIT_SEED, stratify=train['state'])
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

233928 train examples
58482 validation examples
73103 test examples


In [113]:
# Helper that turns a pandas DataFrame into a tf.data dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Build a batched tf.data.Dataset of (features, label) pairs.

  The 'state' column is removed from a copy of `dataframe` and used as the
  label; the remaining columns become a dict of features keyed by column
  name. The caller's DataFrame is not modified.

  Args:
    dataframe: DataFrame that contains a 'state' column.
    shuffle: when true, shuffle with a buffer as large as the number of rows.
    batch_size: number of examples per batch.

  Returns:
    The batched dataset.
  """
  features = dataframe.copy()
  target = features.pop('state')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
  if shuffle:
    dataset = dataset.shuffle(buffer_size=len(features))
  return dataset.batch(batch_size)

In [114]:
# Choose which columns to use as model inputs.
# NOTE(review): in the previewed rows funded_ratio equals
# usd_pledged_real / usd_goal_real; if 'successful' means the goal was reached,
# funded_ratio, usd_pledged_real and backers reveal the label - confirm whether
# these should be model inputs.
# NOTE(review): the numeric columns are passed through without scaling - confirm
# this is intended.

# numeric cols
numeric_headers = ['backers', 'usd_pledged_real', 'usd_goal_real', 'period', 'funded_ratio']
feature_columns = [feature_column.numeric_column(header) for header in numeric_headers]

# embedding col: main_category vocabulary mapped to an 8-dimensional embedding
categories = [
    'Film & Video', 'Music', 'Publishing', 'Games', 'Technology', 'Design',
    'Art', 'Food', 'Fashion', 'Theater', 'Comics', 'Photography', 'Crafts',
    'Journalism', 'Dance',
]
main_category = feature_column.categorical_column_with_vocabulary_list(
    key='main_category', vocabulary_list=categories)
main_category_embedding = feature_column.embedding_column(main_category, dimension=8)

# embedding col: country vocabulary mapped to an 8-dimensional embedding
countries = [
    'US', 'GB', 'CA', 'AU', 'DE', 'FR', 'NL', 'IT', 'ES', 'SE', 'MX',
    'NZ', 'DK', 'IE', 'CH', 'NO', 'BE', 'AT', 'HK', 'SG', 'LU', 'JP',
]
country = feature_column.categorical_column_with_vocabulary_list(
    key='country', vocabulary_list=countries)
country_embedding = feature_column.embedding_column(country, dimension=8)

# Same order as before: numeric columns, then main_category, then country
feature_columns += [main_category_embedding, country_embedding]

In [115]:
# Wrap the feature columns in a DenseFeatures layer that serves as the model's input layer
feature_layer = layers.DenseFeatures(feature_columns)

In [116]:
# Transform the pandas DataFrames into tf.data datasets.
# Only the training set is shuffled; validation and test keep their row order.
batch_size = 32
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds, test_ds = (
    df_to_dataset(split, shuffle=False, batch_size=batch_size)
    for split in (val, test)
)

In [118]:
# Create, compile, and train the model
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dense(1)  # single raw logit; the loss applies the sigmoid (from_logits=True)
])

# The model outputs logits, so accuracy must threshold at 0.0 (logit 0 == probability 0.5).
# The plain 'accuracy' string would threshold the raw logit at 0.5 and misreport the metric.
# The metric is still named 'accuracy' so logs and evaluate() output keep the same keys.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

# Keep the History object so the training curves can be inspected afterwards
history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=5)

Train for 7311 steps, validate for 1828 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1cfc6859148>

In [119]:
# Evaluate on the test dataset; the result unpacks into the loss and the accuracy metric
test_metrics = model.evaluate(x=test_ds)
loss, accuracy = test_metrics
print("Accuracy", accuracy)

Accuracy 0.99659383


In [140]:
# Take the features (first element) of one training batch and show the model's predictions for it
first_batch = next(iter(train_ds))
example_batch = first_batch[0]
example_result = model.predict(example_batch)
example_result

array([[-6.0442173e+03],
       [-1.9660100e+02],
       [-1.5000263e+04],
       [ 4.6873484e+00],
       [-1.5003446e+04],
       [ 4.6873484e+00],
       [-1.1767678e+02],
       [-5.8219141e+02],
       [-1.7707749e+03],
       [ 4.6873484e+00],
       [-4.5294519e+02],
       [ 4.6873484e+00],
       [-2.4316616e+03],
       [-1.0518740e+03],
       [-6.3161621e+03],
       [-2.4333209e+05],
       [ 4.6873484e+00],
       [ 4.6873484e+00],
       [-3.8182605e+04],
       [-2.9336489e+03],
       [-1.5636982e+03],
       [ 4.6873484e+00],
       [-1.2690889e+04],
       [-4.9772139e+03],
       [ 4.6873484e+00],
       [-1.6119035e+04],
       [-1.7943389e+03],
       [-6.6647769e+03],
       [-9.7637914e+04],
       [-6.0621735e+02],
       [ 4.6873484e+00],
       [-2.6472303e+04]], dtype=float32)

In [156]:
# Display the feature tensors of the example batch
example_batch

{'main_category': <tf.Tensor: shape=(32,), dtype=string, numpy=
 array([b'Design', b'Crafts', b'Design', b'Design', b'Publishing',
        b'Games', b'Publishing', b'Games', b'Music', b'Art', b'Crafts',
        b'Publishing', b'Music', b'Publishing', b'Film & Video', b'Art',
        b'Music', b'Music', b'Technology', b'Games', b'Music',
        b'Film & Video', b'Games', b'Publishing', b'Dance', b'Fashion',
        b'Comics', b'Technology', b'Art', b'Film & Video', b'Design',
        b'Art'], dtype=object)>,
 'backers': <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([  2,   7,  48, 480,   0, 344,   0,   1,  18,  37,   2,  29,   1,
          4,  59,   1, 113,  54,  75,  36,   8,  53,   1,   9,  84,   4,
          0,   1,   0,   0, 109,  33])>,
 'country': <tf.Tensor: shape=(32,), dtype=string, numpy=
 array([b'US', b'US', b'GB', b'SG', b'CA', b'US', b'US', b'US', b'US',
        b'DE', b'US', b'US', b'US', b'US', b'US', b'NL', b'US', b'US',
        b'AU', b'US', b'US', b'US', b'US',

Literature:
- https://www.tensorflow.org/tutorials/keras/regression
- https://www.tensorflow.org/tutorials/structured_data/feature_columns