<h1>Project Delivery #3 - Deep Learning based Text Classification - Predict Pledged Money</h1>

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import texthero as hero
from texthero import preprocessing
from sklearn.preprocessing import MinMaxScaler

ks2018 = pd.read_csv("ks-projects-201801.csv")
#ks2016 = pd.read_csv("ks-projects-201612.csv")
#print(ks2016.info())

df=ks2018
# remove currently live campaigns
df = df[df.state != 'live']

# drop columns we don't need
df = df.drop('ID', axis=1)
df = df.drop('category', axis=1) #we keep main category
df = df.drop('state', axis=1)
df = df.drop('usd pledged', axis=1)
df = df.drop("usd_pledged_real", axis=1)
df = df.drop("usd_goal_real", axis=1)
df = df.drop('currency', axis=1)


df = df[df['name'].notna()]
custom_pipeline = [preprocessing.fillna,
                   #preprocessing.lowercase,
                   preprocessing.remove_whitespace,
                   preprocessing.remove_diacritics
                   #preprocessing.remove_brackets
                  ]
df['name'] = hero.clean(df['name'], custom_pipeline)
df['name'] = [n.replace('{','') for n in df['name']]
df['name'] = [n.replace('}','') for n in df['name']]
df['name'] = [n.replace('(','') for n in df['name']]
df['name'] = [n.replace(')','') for n in df['name']]

print(df.dtypes)

X=df
X=X.drop([ 'deadline', 'launched', 'backers','pledged'], axis=1)
y=df["pledged"]/df["goal"]

#encoding category and country //nominal attributes
X["main_category"] = X["main_category"].astype('category')
X["main_category"] = X["main_category"].cat.codes
X["country"] = X["country"].astype('category')
X["country"] = X["country"].cat.codes

print(X.dtypes)
print(y.dtypes)

#Removing Outliers
y1= y.between(0, 1, inclusive=True) 
y1= y1.index[y1 == True].tolist()
print(len(y))
#print(y1)
y=y[y1]
X=X.loc[y1,:]
print(len(X))
print(len(y))

#https://github.com/sdevalapurkar/kickstarter-prediction
#https://towardsdatascience.com/how-to-vectorize-text-in-dataframes-for-nlp-tasks-3-simple-techniques-82925a5600db
#https://www.kaggle.com/stacykurnikova/using-glove-embedding
#https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
#https://djajafer.medium.com/multi-class-text-classification-with-keras-and-lstm-4c5525bef592
#https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
#https://realpython.com/python-keras-text-classification/#convolutional-neural-networks-cnn
#https://www.pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/

name              object
main_category     object
deadline          object
goal             float64
launched          object
pledged          float64
backers            int64
country           object
dtype: object
name              object
main_category       int8
goal             float64
country             int8
dtype: object
float64
375858
243441
243441


In [2]:
embeddings_index = {}
f = open('glove.6B.300d.txt',encoding='utf-8')
for line in f:
    values = line.split(' ')
    word = values[0] ## The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
    embeddings_index[word] = coefs
f.close()

print('GloVe data loaded')

GloVe data loaded


In [3]:
import re

from wordcloud import WordCloud
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
## Iterate over the data to preprocess by removing stopwords
lines_without_stopwords=[] 
for line in X['name'].values: 
    line = line.lower()
    line_by_words = re.findall(r'(?:\w+)', line, flags = re.UNICODE) # remove punctuation ans split
    new_line=[]
    for word in line_by_words:
        if word not in stop:
            new_line.append(word)
    lines_without_stopwords.append(new_line)
texts = lines_without_stopwords

In [4]:
## Code adapted from (https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py)
# Vectorize the text samples
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

X['name']=texts
scaler = MinMaxScaler()
scaler.fit(X.iloc[:,1:])
normalized_arr=scaler.transform(X.iloc[:,1:])
X.iloc[:,1:]=normalized_arr

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=True)

MAX_NUM_WORDS = 1000
MAX_SEQUENCE_LENGTH = 100
tokenizertrain = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizertrain.fit_on_texts(X_train["name"])
sequences = tokenizertrain.texts_to_sequences(X_train["name"])

word_index = tokenizertrain.word_index
print('Found %s unique tokens.' % len(word_index))

X_train_text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

tokenizertest = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizertest.fit_on_texts(X_test["name"])
sequences = tokenizertest.texts_to_sequences(X_test["name"])

word_index = tokenizertest.word_index
print('Found %s unique tokens.' % len(word_index))

X_test_text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

X_train_nottext=X_train.drop('name', axis=1)
X_test_nottext=X_test.drop('name', axis=1)

print(X_train_text.shape)
print(X_test_text.shape)

Found 94029 unique tokens.
Found 37918 unique tokens.
(194752, 100)
(48689, 100)


In [5]:
## More code adapted from the keras reference (https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py)
# prepare embedding matrix 
from keras.layers import Embedding
from keras.initializers import Constant
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, GlobalMaxPooling1D, Dropout, Activation, MaxPooling1D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

## EMBEDDING_DIM =  ## seems to need to match the embeddings_index dimension
EMBEDDING_DIM = embeddings_index.get('a').shape[0]
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word) ## This references the loaded embeddings dictionary
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [6]:
def create_mlp(dim, regress=False): #Multi-layer perceptron for nominal, numeric data
	# define our MLP network
	model = Sequential()
	model.add(Dense(8, input_dim=dim, activation="relu"))
	model.add(Dense(4, activation="relu"))
	# check to see if the regression node should be added
	if regress:
		model.add(Dense(1, activation="linear"))
	# return our model
	return model

mlpModel=create_mlp(X_train_nottext.shape[1], regress=False)

In [7]:
modelcnn = Sequential()
modelcnn.add(Embedding(num_words, 300, input_length=100, weights= [embedding_matrix], trainable=False))
modelcnn.add(Conv1D(128, 5, activation='relu'))
modelcnn.add(GlobalMaxPooling1D())
modelcnn.add(Dense(32, activation='relu'))
modelcnn.add(Dense(16, activation='relu'))
modelcnn.add(Dense(4, activation='relu'))
modelcnn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          300300    
_________________________________________________________________
conv1d (Conv1D)              (None, 96, 128)           192128    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_3 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 68        
Total params: 497,152
Trainable params: 196,852
Non-trainable params: 300,300
__________________________________________

In [8]:
from keras.constraints import nonneg

combinedInput = concatenate([mlpModel.output, modelcnn.output])
# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(4, activation="relu")(combinedInput)
x = Dense(1, activation="sigmoid")(x)
# our final model will accept categorical/numerical data on the MLP
# input and texts on the CNN input, outputting a single value //successful or not
model = Model(inputs=[mlpModel.input, modelcnn.input], outputs=x)

In [9]:
opt = Adam(lr=1e-5, decay=1e-5)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt, metrics=['mse', 'mae', 'mape'])
# train the model
print("[INFO] training model...")

model.fit(
	x=[X_train_nottext, X_train_text], y=y_train,
	validation_data=([X_test_nottext, X_test_text], y_test),
	epochs=1)
# make predictions on the testing data
print("[INFO] predicting pledged money...")
preds = model.predict([X_test_nottext, X_test_text])
axs=np.unique(preds)
print(len(axs))

[INFO] training model...
[INFO] predicting pledged money...
38450


In [10]:
from sklearn.metrics import mean_squared_error
import math
print("predicted minimum: {:.16f}".format(preds.min()))
print("predicted maximum: {:.16f}".format(preds.max()))
print("predicted mean: {:.16f}".format(preds.mean()))
print("test value mean: {:.16f}".format(y_test.mean()))
print("test value minimum: {:.16f}".format(y_test.min()))
print("test value maximum: {:.16f}".format(y_test.max()))
a=math.sqrt(mean_squared_error(y_test, preds))
print("mse:",math.sqrt(mean_squared_error(y_test, preds)))
from sklearn.metrics import mean_absolute_error
b=mean_absolute_error(y_test, preds)
print("mae:","{:.16f}".format(float(mean_absolute_error(y_test, preds))))

predicted minimum: 0.0000000062358554
predicted maximum: 0.3552994430065155
predicted mean: 0.0559774860739708
test value mean: 0.1037705110751163
test value minimum: 0.0000000000000000
test value maximum: 1.0000000000000000
mse: 0.23356446957152685
mae: 0.1336586111550642


In [11]:
modellstm = Sequential()
modellstm.add(Embedding(num_words, 300, input_length=100, weights= [embedding_matrix], trainable=False))
modellstm.add(LSTM(300))
modelcnn.add(Dense(32, activation='relu'))
modelcnn.add(Dense(16, activation='relu'))
modelcnn.add(Dense(4, activation='relu'))

combinedInput = concatenate([mlpModel.output, modellstm.output])
# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(4, activation="relu")(combinedInput)
x = Dense(1, activation="sigmoid")(x)
# our final model will accept categorical/numerical data on the MLP
# input and images on the CNN input, outputting a single value (the
# predicted price of the house)
model = Model(inputs=[mlpModel.input, modellstm.input], outputs=x)

In [12]:
opt = Adam(lr=1e-6, decay=1e-6)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt, metrics=['mse', 'mae', 'mape'])
# train the model
print("[INFO] training model...")

model.fit(
	x=[X_train_nottext, X_train_text], y=y_train,
	validation_data=([X_test_nottext, X_test_text], y_test),
	epochs=1)
# make predictions on the testing data
print("[INFO] predicting pledged money...")
preds = model.predict([X_test_nottext, X_test_text])
axs=np.unique(preds)
print(len(axs))

[INFO] training model...
[INFO] predicting pledged money...
33695


In [13]:
from sklearn.metrics import mean_squared_error
import math
print("predicted minimum: {:.16f}".format(preds.min()))
print("predicted maximum: {:.16f}".format(preds.max()))
print("predicted mean: {:.16f}".format(preds.mean()))
print("test value mean: {:.16f}".format(y_test.mean()))
print("test value minimum: {:.16f}".format(y_test.min()))
print("test value maximum: {:.16f}".format(y_test.max()))
a=math.sqrt(mean_squared_error(y_test, preds))
print("mse:",math.sqrt(mean_squared_error(y_test, preds)))
from sklearn.metrics import mean_absolute_error
b=mean_absolute_error(y_test, preds)
print("mae:","{:.16f}".format(float(mean_absolute_error(y_test, preds))))

predicted minimum: 0.0353583097457886
predicted maximum: 0.0471842586994171
predicted mean: 0.0410560891032219
test value mean: 0.1037705110751163
test value minimum: 0.0000000000000000
test value maximum: 1.0000000000000000
mse: 0.2017116543465942
mae: 0.1041053031337072
