<h1>Project Delivery #3 - Deep Learning based Text Classification - Predict Success</h1>

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import texthero as hero
from texthero import preprocessing

ks2018 = pd.read_csv("ks-projects-201801.csv")
#ks2016 = pd.read_csv("ks-projects-201612.csv")
#print(ks2016.info())

df=ks2018

# remove currently live campaigns
df = df[df.state != 'live']

# drop columns we don't need
df = df.drop('ID', axis=1)
df = df.drop('category', axis=1) #we keep main category
#df = df.drop('goal', axis=1)
df = df.drop('pledged', axis=1)
df = df.drop('usd pledged', axis=1)
df = df.drop('currency', axis=1)

# if successful then 1, else 0, this is our class label
df['state'] = df['state'].map(lambda x: 1 if x == "successful" else 0)
print(df.state.value_counts())


df = df[df['name'].notna()]
custom_pipeline = [preprocessing.fillna,
                   #preprocessing.lowercase,
                   preprocessing.remove_whitespace,
                   preprocessing.remove_diacritics
                   #preprocessing.remove_brackets
                  ]
df['name'] = hero.clean(df['name'], custom_pipeline)
df['name'] = [n.replace('{','') for n in df['name']]
df['name'] = [n.replace('}','') for n in df['name']]
df['name'] = [n.replace('(','') for n in df['name']]
df['name'] = [n.replace(')','') for n in df['name']]

X=df
X=X.drop(['state', 'deadline', 'launched', 'backers', 'usd_pledged_real'], axis=1)
y=df["state"]

#encoding category and country //nominal attributes
X["main_category"] = X["main_category"].astype('category')
X["main_category"] = X["main_category"].cat.codes
X["country"] = X["country"].astype('category')
X["country"] = X["country"].cat.codes

#https://github.com/sdevalapurkar/kickstarter-prediction
#https://towardsdatascience.com/how-to-vectorize-text-in-dataframes-for-nlp-tasks-3-simple-techniques-82925a5600db
#https://www.kaggle.com/stacykurnikova/using-glove-embedding
#https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
#https://djajafer.medium.com/multi-class-text-classification-with-keras-and-lstm-4c5525bef592
#https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
#https://realpython.com/python-keras-text-classification/#convolutional-neural-networks-cnn
#https://www.pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/

0    241906
1    133956
Name: state, dtype: int64


In [2]:
embeddings_index = {}
f = open('glove.6B.300d.txt',encoding='utf-8')
for line in f:
    values = line.split(' ')
    word = values[0] ## The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
    embeddings_index[word] = coefs
f.close()

print('GloVe data loaded')

GloVe data loaded


In [3]:
import re

from wordcloud import WordCloud
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
## Iterate over the data to preprocess by removing stopwords
lines_without_stopwords=[] 
for line in X['name'].values: 
    line = line.lower()
    line_by_words = re.findall(r'(?:\w+)', line, flags = re.UNICODE) # remove punctuation ans split
    new_line=[]
    for word in line_by_words:
        if word not in stop:
            new_line.append(word)
    lines_without_stopwords.append(new_line)
texts = lines_without_stopwords

In [4]:
## Code adapted from (https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py)
# Vectorize the text samples
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

X['name']=texts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=True)

MAX_NUM_WORDS = 1000
MAX_SEQUENCE_LENGTH = 100
tokenizertrain = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizertrain.fit_on_texts(X_train["name"])
sequences = tokenizertrain.texts_to_sequences(X_train["name"])

word_index = tokenizertrain.word_index
print('Found %s unique tokens.' % len(word_index))

X_train_text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

tokenizertest = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizertest.fit_on_texts(X_test["name"])
sequences = tokenizertest.texts_to_sequences(X_test["name"])

word_index = tokenizertest.word_index
print('Found %s unique tokens.' % len(word_index))

X_test_text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

X_train_nottext=X_train.drop('name', axis=1)
X_test_nottext=X_test.drop('name', axis=1)

y_train = to_categorical(np.asarray(y_train))
y_test = to_categorical(np.asarray(y_test))

print(X_train_text.shape)
print(X_test_text.shape)

Found 126185 unique tokens.
Found 51163 unique tokens.
(300686, 100)
(75172, 100)


In [5]:
## More code adapted from the keras reference (https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py)
# prepare embedding matrix 
from keras.layers import Embedding
from keras.initializers import Constant
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, GlobalMaxPooling1D, Dropout, Activation, MaxPooling1D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

## EMBEDDING_DIM =  ## seems to need to match the embeddings_index dimension
EMBEDDING_DIM = embeddings_index.get('a').shape[0]
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word) ## This references the loaded embeddings dictionary
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [21]:
def create_mlp(dim, regress=False): #Multi-layer perceptron for nominal, numeric data
	# define our MLP network
	model = Sequential()
	model.add(Dense(8, input_dim=dim, activation="relu"))
	model.add(Dense(4, activation="relu"))
	model.add(Dense(2, activation="sigmoid"))
	# check to see if the regression node should be added
	if regress:
		model.add(Dense(1, activation="linear"))
	# return our model
	return model

mlpModel=create_mlp(X_train_nottext.shape[1], regress=False)

In [22]:
## Code from: https://medium.com/@sabber/classifying-yelp-review-comments-using-cnn-lstm-and-pre-trained-glove-word-embeddings-part-3-53fcea9a17fa
## To create and visualize a model

modelcnn = Sequential()
modelcnn.add(Embedding(num_words, 300, input_length=100, weights= [embedding_matrix], trainable=False))
modelcnn.add(Conv1D(128, 5, activation='relu'))
modelcnn.add(GlobalMaxPooling1D())
modelcnn.add(Dense(10, activation='relu'))
modelcnn.add(Dense(2, activation='sigmoid'))
modelcnn.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
modelcnn.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 300)          300300    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 96, 128)           192128    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_21 (Dense)             (None, 2)                 22        
Total params: 493,740
Trainable params: 193,440
Non-trainable params: 300,300
_________________________________________________________________


In [23]:
combinedInput = concatenate([mlpModel.output, modelcnn.output])
# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(4, activation="relu")(combinedInput)
x = Dense(2, activation="sigmoid")(x)
# our final model will accept categorical/numerical data on the MLP
# input and texts on the CNN input, outputting a single value //successful or not
model = Model(inputs=[mlpModel.input, modelcnn.input], outputs=x)

In [24]:
opt = Adam(lr=1e-3, decay=1e-3)
model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])
# train the model
print("[INFO] training model...")

model.fit(
	x=[X_train_nottext, X_train_text], y=y_train,
	validation_split=0.1,
	epochs=1)
# make predictions on the testing data
print("[INFO] predicting success...")
model.evaluate([X_test_nottext, X_test_text], y=y_test)

[INFO] training model...
[INFO] predicting success...


[0.6399879455566406, 0.6450939178466797]

In [25]:
modellstm = Sequential()
modellstm.add(Embedding(num_words, 300, input_length=100, weights= [embedding_matrix], trainable=False))
modellstm.add(LSTM(300))
modellstm.add(Dense(2, activation='sigmoid'))
modellstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
modellstm.summary()

combinedInput = concatenate([mlpModel.output, modellstm.output])
# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(4, activation="relu")(combinedInput)
x = Dense(2, activation="sigmoid")(x)
# our final model will accept categorical/numerical data on the MLP
# input and images on the CNN input, outputting a single value (the
# predicted price of the house)
model = Model(inputs=[mlpModel.input, modellstm.input], outputs=x)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 300)          300300    
_________________________________________________________________
lstm_1 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_24 (Dense)             (None, 2)                 602       
Total params: 1,022,102
Trainable params: 721,802
Non-trainable params: 300,300
_________________________________________________________________


In [26]:
opt = Adam(lr=1e-3, decay=1e-3)
model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])
# train the model
print("[INFO] training model...")

model.fit(
	x=[X_train_nottext, X_train_text], y=y_train,
	validation_split=0.1,
	epochs=1)
# make predictions on the testing data
print("[INFO] predicting success...")
model.evaluate([X_test_nottext, X_test_text], y=y_test)

[INFO] training model...
[INFO] predicting success...


[0.6041766405105591, 0.6537407636642456]