In [1]:
#import multiple files with names as Kickstarter*.csv into pandas dataframe (data used 15 Aug 2019)

from glob import glob
import pandas as pd
from pandas.io.json import json_normalize
import json as json
import numpy as np
from datetime import datetime

# for auto-reloading extenrnal modules
%load_ext autoreload
%autoreload 2

#just replace the path parts
#glob import all files starting with Kickstarter in the folder (the folder i used is the 15 Aug 2019)

filenames = glob(r"/mnt/CS3244_Proj_Data/Kickstarter_201*/Kickstarter*.csv")

df_chunks = []
for f in filenames:
    for chunk in pd.read_csv(f, encoding='utf-8', chunksize=100000):
        chunk['launched_at_utc'] = chunk['launched_at'].apply(datetime.utcfromtimestamp)
        chunk['deadline_utc'] = chunk['deadline'].apply(datetime.utcfromtimestamp)
        chunk['created_at_utc'] = chunk['created_at'].apply(datetime.utcfromtimestamp)
        chunk['goal'] = round(chunk['goal'] * chunk['static_usd_rate'], 2)
        x1 = json_normalize(chunk['category'].apply(json.loads)).add_prefix('category_')
        chunk = pd.concat([chunk, x1], axis = 1)
        df = chunk[['disable_communication', 'country', 'currency', 'goal', 'category_id', 'deadline_utc', 'launched_at_utc', 'state']]
        #pick the relevant data and drop all irrelevant data 

        #feature 1: duration of project 
        df['duration'] = (df['deadline_utc'] - df['launched_at_utc']).dt.days

        #feature 2: month of launch
        df['launch_month'] = df['launched_at_utc'].dt.month

        #change state to numerical 
        df['state'] = df['state'].map({'successful': 1, 'failed': 0 })

        #drop columns
        df = df.drop(columns = ['launched_at_utc', 'deadline_utc'])
        df = df.dropna() #drop null rows
        columns = df.columns.tolist()

        columns = columns[-2:] + columns[:-2]
        df_chunks.append(df[columns])
        

df = pd.concat(df_chunks, axis = 0, ignore_index = True).reset_index()
df = df.drop(columns = ['index'])
success_length = df.loc[df['state'] == 1].shape[0]
failure_length = df.shape[0] - success_length
sample_size = min(success_length, failure_length) // 5
print(sample_size)
# Get equal number of success and failure cases
df = pd.concat([df.loc[df['state'] == 1].sample(n=sample_size), df.loc[df['state'] == 0].sample(n=sample_size)])
print(df.columns.tolist())
print(df.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


328902
['duration', 'launch_month', 'disable_communication', 'country', 'currency', 'goal', 'category_id', 'state']
(657804, 8)


In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

print(df.columns.tolist())
print(df.iloc[0].values)
X = np.array(df.iloc[:,:-1].values)
y = np.array(df.iloc[:,-1].values)

label_encoder_month_of_launch = LabelEncoder()
X[:, 1] = label_encoder_month_of_launch.fit_transform(X[:, 1])

label_encoder_disable_com = LabelEncoder()
X[:, 2] = label_encoder_disable_com.fit_transform(X[:, 2])

label_encoder_country = LabelEncoder()
X[:, 3] = label_encoder_country.fit_transform(X[:, 3])

label_encoder_currency = LabelEncoder()
X[:, 4] = label_encoder_currency.fit_transform(X[:, 4])

label_encoder_category = LabelEncoder()
X[:, 6] = label_encoder_category.fit_transform(X[:, 6])

one_hot_encoding = OneHotEncoder(categorical_features=[1, 2, 3, 4, 6])
X = one_hot_encoding.fit_transform(X).toarray()

['duration', 'launch_month', 'disable_communication', 'country', 'currency', 'goal', 'category_id', 'state']
[24 7 False 'US' 'USD' 5000.0 34 1.0]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape)

(526243, 221) (131561, 221)


In [4]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [15]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

classifier = Sequential()

input_dim = int(X_train.shape[1])
output_dim = input_dim // 3

print('Input dim: ', input_dim)
print('Output dim: ', output_dim)

#input layer
classifier.add(Dense(units=output_dim, activation='relu', input_dim=input_dim))
classifier.add(Dropout(p=0.1))

# hidden layer
classifier.add(Dense(units=output_dim, activation='relu'))
classifier.add(Dropout(rate=0.1))
classifier.add(Dense(units=output_dim, activation='relu'))
classifier.add(Dropout(rate=0.1))

# output layer
classifier.add(Dense(units=1, activation='sigmoid'))

# compile ann
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# fit ANN
classifier.fit(X_train, y_train, batch_size=30, epochs=40)

Input dim:  221
Output dim:  73


  app.launch_new_instance()


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7fcea11cfc50>

In [16]:
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)
y_pred

array([[ True],
       [False],
       [ True],
       ...,
       [ True],
       [ True],
       [ True]])

In [17]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
accuracy_score = accuracy_score(y_test, y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred) / len(y_pred)
roc_score = roc_auc_score(y_test, y_pred)

print("Accuracy score of: ", accuracy_score)
print("False Positive rate of: ", confusion_matrix[0, 1])
print("False Negative rate of: ", confusion_matrix[1, 0])
print("ROC AUC Score of: ", roc_score)

Accuracy score of:  0.7833932548399601
False Positive rate of:  0.0738364712946846
False Negative rate of:  0.14277027386535524
ROC AUC Score of:  0.7833937787958144
