In [23]:
import pandas as pd

# Load the training CSV (path is relative to the working directory) and show
# its (rows, columns) shape as the cell output.
train_data = pd.read_csv("data/train_new_numeric.csv")
train_data.shape

(83096, 35)

In [70]:
# Work on a copy so the in-place column removals performed on train_copy in the
# following cells never touch the loaded train_data frame.
train_copy = train_data.copy()
train_copy.shape

(83096, 35)

In [25]:
from sklearn.model_selection import train_test_split


In [26]:
# Pull the target out of the frame in one step: pop() hands back the
# final_status column (0 = failed, 1 = successful) and removes it from
# train_copy, so only candidate feature columns are left behind.
label = train_copy.pop('final_status').values.tolist()


In [71]:
# Collect the names of every column stored with the generic 'object' dtype
# (free text, timestamps kept as strings, ...); these are not numeric features.
nonnumeric_columns = train_copy.select_dtypes(include=['object']).columns
print(nonnumeric_columns)

Index([u'project_id', u'name', u'desc', u'keywords', u'currency', u'deadline',
       u'state_changed_at', u'created_at', u'launched_at', u'photo', u'state',
       u'creator', u'category', u'profile'],
      dtype='object')


In [28]:
# Remove all object-typed columns in place, leaving only numeric columns.
train_copy.drop(labels=nonnumeric_columns, axis='columns', inplace=True)

In [29]:
# Sanity check: the column count should be smaller after the drop above.
train_copy.shape

(83096, 20)

In [30]:
# Convert the remaining frame into a plain list of row lists
# (one inner list of feature values per row).
features = train_copy.values.tolist()
# features[:2]  # uncomment to peek at the first two rows


In [31]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=40)


In [32]:
# Report the size of each partition: training rows first, then test rows.
print("%d\n%d" % (len(X_train), len(X_test)))


66476
16620


# Decision Tree


In [21]:
from sklearn import tree
import numpy as np

In [33]:
# Baseline: a fully grown tree (no depth limit). random_state pins the choice
# between equally good splits so that re-running the notebook rebuilds the same
# tree; the seed matches the one used for the depth-limited tree further down.
dtree_one = tree.DecisionTreeClassifier(random_state=1)
dtree_one = dtree_one.fit(X_train, y_train)  # fit() trains the estimator and returns it

# Normalised per-feature importance: the total reduction of the split criterion
# (Gini impurity by default, not entropy) attributed to each feature.
print(dtree_one.feature_importances_)

# Accuracy on the training rows themselves. An unpruned tree can memorise its
# training data, so a value near 1.0 here says nothing about generalisation.
print(dtree_one.score(X_train, y_train))

[  2.40768845e-03   8.22113114e-02   1.44898959e-01   9.85458608e-04
   2.70384059e-04   3.53173758e-01   1.04743481e-01   2.05638849e-03
   1.61320427e-03   2.25002422e-01   2.73065928e-04   0.00000000e+00
   1.05935916e-03   4.02366194e-02   1.35310230e-03   1.58989199e-03
   9.90138544e-05   0.00000000e+00   0.00000000e+00   3.80258924e-02]
1.0


In [34]:
# Predict a class for every held-out test row with the unpruned tree.
dtree_one_prediction = dtree_one.predict(X_test)

In [38]:
# print(dtree_one.score(X_train, y_train))  # training accuracy (currently disabled)
print(dtree_one.score(X_test, y_test))  # mean accuracy on the held-out test split

1.0
0.983092659446


In [41]:
# Settings for a deliberately shallow tree: at most 5 levels deep, and a node
# is only split further when it holds at least 5 samples.
max_depth = 5
min_samples_split = 5

dtree_three = tree.DecisionTreeClassifier(
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    random_state=1,  # fixed seed so the tree is reproducible
)

In [42]:
# Train the depth-limited tree (fit() updates the estimator in place), then
# report its test accuracy followed by the per-feature importances.
dtree_three.fit(X_train, y_train)
print(dtree_three.score(X_test, y_test))
print(dtree_three.feature_importances_)

0.880204572804
[  0.00000000e+00   4.42640633e-02   9.02721208e-02   2.09259636e-04
   0.00000000e+00   7.19518952e-02   1.06961454e-01   0.00000000e+00
   0.00000000e+00   9.71717036e-02   0.00000000e+00   0.00000000e+00
   0.00000000e+00   5.00855595e-02   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00   5.39083944e-01]


# Deep Neural Network

In [72]:
# Start from the untouched train_data frame. By this point train_copy has
# already lost final_status and every object column, so copying it would make
# the label extraction in the next cell fail on a fresh top-to-bottom run.
train_nn = train_data.copy()
train_nn.shape

(83096, 35)

In [78]:
# Detach the target column for the network experiment: pop() returns
# final_status (0 = failed, 1 = successful) and removes it from train_nn.
label_nn = train_nn.pop('final_status').values.tolist()


In [74]:
# Names of all columns held with the 'object' dtype in the network's frame.
nonnumeric_columns = train_nn.select_dtypes(include=['object']).columns
print(nonnumeric_columns)

Index([u'project_id', u'name', u'desc', u'keywords', u'currency', u'deadline',
       u'state_changed_at', u'created_at', u'launched_at', u'photo', u'state',
       u'creator', u'category', u'profile'],
      dtype='object')


In [75]:
# Strip the object-typed columns from train_nn in place.
train_nn.drop(labels=nonnumeric_columns, axis='columns', inplace=True)

In [76]:
# Network inputs as a plain list of row lists (one inner list per row).
features_nn = train_nn.values.tolist()


In [83]:
# One-hot encode the 0/1 labels into two columns (one per class) so that they
# line up with the two-unit softmax output layer of the network.
# Import from the public keras.utils namespace; the np_utils submodule is an
# implementation detail and is not available in newer Keras releases.
from keras.utils import to_categorical

categorical_labels = to_categorical(label_nn, 2)

# Same 80/20 proportion and seed as the decision-tree split.
# NOTE: this rebinds X_train / X_test / y_train / y_test, so the tree cells
# must be re-run before their data is available under these names again.
X_train, X_test, y_train, y_test = train_test_split(features_nn, categorical_labels, test_size=0.2, random_state=40)


In [43]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.layers import advanced_activations  # all the available activation functions e.g. relu, 


Using TensorFlow backend.


To obtain better accuracy we need to make specific choices for the number of 
- epochs of training, 
- the mini-batch size, and 
- the learning rate.

The above parameters are known as hyper-parameters of our neural network, to distinguish them from the parameters (weights and biases) learnt by the learning algorithm. If we choose the hyper-parameters poorly, we can get bad results. Suppose, for example, that we had chosen the learning rate to be $\eta = 0.001$: each update step would then be so small that training could progress far too slowly to reach a good accuracy within the chosen number of epochs.



In [84]:
# Basic layout parameters of the network.
FEATURE_NUM = 20   # input dimension: one input node per feature column
# Size of the output layer. Two nodes (one per class: failed / successful) are
# used here; a single-node output (CLASSES = 1) is the alternative for a binary problem.
CLASSES = 2
# Hidden layer widths. Larger alternative kept for reference: 100 and 50.
HIDDEN1_SIZE = 10
HIDDEN2_SIZE = 5
# Presumably the epoch count for a full-length run; the training cell
# currently hard-codes 10 epochs instead -- TODO confirm.
MAX_RANGE = 100



In [85]:
# The Sequential model is a linear stack of layers.
model = Sequential()
# First hidden layer; input_dim tells Keras how many features each sample has.
# kernel_initializer is the Keras 2 name of the deprecated `init` argument.
model.add(Dense(HIDDEN1_SIZE, input_dim=FEATURE_NUM, kernel_initializer='uniform'))


  from ipykernel import kernelapp as app


In [86]:
# ELU non-linearity applied to the output of the first hidden layer.
model.add(advanced_activations.ELU(alpha=1.0))
# Dropout with rate 0.6 for regularisation.
model.add(Dropout(0.6))
# Second hidden layer; after the first layer the input size no longer has to
# be specified. kernel_initializer replaces the deprecated `init` argument.
model.add(Dense(HIDDEN2_SIZE, kernel_initializer='uniform'))

  app.launch_new_instance()


In [87]:
# Activation and dropout for the second hidden layer, followed by the output
# layer: one softmax unit per class (CLASSES units in total), which matches the
# one-hot encoded labels.

model.add(advanced_activations.ELU(alpha=1.0))
model.add(Dropout(0.6))
# kernel_initializer is the Keras 2 name of the deprecated `init` argument.
model.add(Dense(CLASSES, kernel_initializer='uniform', activation='softmax'))



In [88]:
# Checkpoint the model to /tmp/weights.hdf5 during training. With
# save_best_only=True the file is only overwritten when the monitored quantity
# improves (the monitor argument is left at its Keras default).
checkpointer = ModelCheckpoint(filepath="/tmp/weights.hdf5",
                               verbose=1,
                               save_best_only=True)

In [89]:
# Log training under ./logs for inspection with TensorBoard: the model graph is
# written, while histograms (histogram_freq=0) and weight images are switched off.
tensorboard = TensorBoard(log_dir='./logs',
                          histogram_freq=0,
                          write_graph=True,
                          write_images=False)

In [90]:
# The output layer is a softmax over CLASSES units trained against one-hot
# labels, so the matching loss is categorical cross-entropy. binary_crossentropy
# is meant for independent sigmoid outputs and would also make the 'accuracy'
# metric be computed per output unit rather than per sample.
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

In [91]:
# Train for 10 epochs with mini-batches of 50 samples, using the optimizer and
# loss configured in compile(). The held-out split is passed as validation data,
# and the checkpoint and TensorBoard callbacks run during training.
# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
# For a longer run use epochs=MAX_RANGE (optionally with batch_size=1000).
model.fit(X_train, y_train,
          epochs=10,
          batch_size=50,
          validation_data=(X_test, y_test),
          callbacks=[checkpointer, tensorboard])

Train on 66476 samples, validate on 16620 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8e41185250>