In [1]:
#import multiple files with names as Kickstarter*.csv into pandas dataframe (data used 15 Aug 2019)

from glob import glob
import pandas as pd

# for auto-reloading extenrnal modules
%load_ext autoreload
%autoreload 2

#just replace the path parts
#glob import all files starting with Kickstarter in the folder (the folder i used is the 15 Aug 2019)

filenames = glob(r"Kickstarter_2019-08-15T03_20_03_022Z/Kickstarter*.csv")

df = [pd.read_csv(f, encoding='utf-8') for f in filenames]

data = pd.concat(df, axis = 0, ignore_index = True).reset_index()
print(data.columns.tolist())
print(data.iloc[0])

['index', 'backers_count', 'blurb', 'category', 'converted_pledged_amount', 'country', 'created_at', 'creator', 'currency', 'currency_symbol', 'currency_trailing_code', 'current_currency', 'deadline', 'disable_communication', 'friends', 'fx_rate', 'goal', 'id', 'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location', 'name', 'permissions', 'photo', 'pledged', 'profile', 'slug', 'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged', 'usd_type']
index                                                                       0
backers_count                                                              10
blurb                       Buttons created to commemorate the January 21s...
category                    {"id":27,"name":"Graphic Design","slug":"desig...
converted_pledged_amount                                                  171
country                                                                    US
created_at   

In [2]:
from pandas.io.json import json_normalize
import json as json
import numpy as np

#data['category'].fillna(json.dumps('{}'), inplace = True)
data['location'].fillna('{}', inplace = True)
#data['creator'].fillna(json.dumps('{}'), inplace = True)

x1 = json_normalize(data['category'].apply(json.loads)).add_prefix('category_')
x3 = json_normalize(data['location'].apply(json.loads)).add_prefix('location_')
x5 = json_normalize(data['creator'].apply(lambda x: json.loads(x.replace('\n', '').replace('\r', '').replace('\\', '').replace('"', r'\"').replace(r'{\"', '{"').replace(r'\"}', '"}').replace(r':\"', ':"').replace(r'\":', '":').replace(r',\"', ',"').replace(r'\",', '",').replace(r'", ', r'\",')))).add_prefix('creator_')

data = pd.concat([data, x1, x3, x5], axis = 1)

In [3]:
from datetime import datetime
data['launched_at_utc'] = data['launched_at'].apply(datetime.utcfromtimestamp)
data['deadline_utc'] = data['deadline'].apply(datetime.utcfromtimestamp)
data['created_at_utc'] = data['created_at'].apply(datetime.utcfromtimestamp)
data['goal'] = round(data['goal'] * data['static_usd_rate'], 2)

In [4]:
#pick the relevant data and drop all irrelevant data 

df = data[['disable_communication', 'country', 'currency',
'goal', 'category_id', 'deadline_utc', 'launched_at_utc', 'state']]

#feature creation 
#feature 1: no. of projects for each creator 
# df1 = df.groupby('creator_id')['id'].count()
# df = pd.merge(df, pd.DataFrame(df1), how = 'left', on = 'creator_id')

#feature 2: duration of project 
df['duration'] = (df['deadline_utc'] - df['launched_at_utc']).dt.days

#feature 4: month of launch
df['launch_month'] = df['launched_at_utc'].dt.month

#change state to numerical 
df['state'] = df['state'].map({'successful': 1, 'failed': 0 })

#drop columns
df = df.drop(columns = ['launched_at_utc', 'deadline_utc'])
df = df.dropna() #drop null rows
columns = df.columns.tolist()

columns = columns[-2:] + columns[:-2]
df = df[columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

print(df.columns.tolist())
print(df.iloc[0].values)
X = np.array(df.iloc[:,:-1].values)
y = np.array(df.iloc[:,-1].values)

label_encoder_month_of_launch = LabelEncoder()
X[:, 1] = label_encoder_month_of_launch.fit_transform(X[:, 1])

label_encoder_disable_com = LabelEncoder()
X[:, 2] = label_encoder_disable_com.fit_transform(X[:, 2])

label_encoder_country = LabelEncoder()
X[:, 3] = label_encoder_country.fit_transform(X[:, 3])

label_encoder_currency = LabelEncoder()
X[:, 4] = label_encoder_currency.fit_transform(X[:, 4])

label_encoder_category = LabelEncoder()
X[:, 6] = label_encoder_category.fit_transform(X[:, 6])

one_hot_encoding = OneHotEncoder(categorical_features=[1, 2, 3, 4, 6])
X = one_hot_encoding.fit_transform(X).toarray()

['duration', 'launch_month', 'disable_communication', 'country', 'currency', 'goal', 'category_id', 'state']
[29 2 False 'US' 'USD' 500.0 27 0.0]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape)

(153860, 219) (38466, 219)


In [7]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [20]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

classifier = Sequential()

input_dim = int(X_train.shape[1])
output_dim = input_dim // 2

print('Input dim: ', input_dim)
print('Output dim: ', output_dim)

#input layer
classifier.add(Dense(units=input_dim, activation='relu', input_dim=input_dim))
classifier.add(Dropout(p=0.1))

# hidden layer
classifier.add(Dense(units=input_dim, activation='relu'))
classifier.add(Dropout(p=0.1))

# output layer
classifier.add(Dense(output_dim=1, activation='sigmoid'))

# compile ann
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# fit ANN
classifier.fit(X_train, y_train, batch_size=30, epochs=50)

Input dim:  219
Output dim:  109


  app.launch_new_instance()


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f7d04ca5a20>

In [21]:
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)
y_pred

array([[ True],
       [ True],
       [False],
       ...,
       [False],
       [False],
       [ True]])

In [22]:
from sklearn.metrics import accuracy_score
metrics = accuracy_score(y_test, y_pred)
metrics

0.7618156293869911