<h1>Project Delivery #3 - Deep Learning based Text Classification - Predict Success</h1>

In [19]:
import pandas as pd
import numpy as np
from scipy import stats
import texthero as hero
from texthero import preprocessing

ks2018 = pd.read_csv("ks-projects-201801.csv")
#ks2016 = pd.read_csv("ks-projects-201612.csv")
#print(ks2016.info())

df=ks2018

# remove currently live campaigns
df = df[df.state != 'live']

# drop columns we don't need
df = df.drop('ID', axis=1)
df = df.drop('category', axis=1) #we keep main category
df = df.drop('goal', axis=1)
df = df.drop('pledged', axis=1)
df = df.drop('usd pledged', axis=1)
df = df.drop('currency', axis=1)

# if successful then 1, else 0, this is our class label
df['state'] = df['state'].map(lambda x: 1 if x == "successful" else 0)
print(df.state.value_counts())


df = df[df['name'].notna()]
custom_pipeline = [preprocessing.fillna,
                   #preprocessing.lowercase,
                   preprocessing.remove_whitespace,
                   preprocessing.remove_diacritics
                   #preprocessing.remove_brackets
                  ]
df['name'] = hero.clean(df['name'], custom_pipeline)
df['name'] = [n.replace('{','') for n in df['name']]
df['name'] = [n.replace('}','') for n in df['name']]
df['name'] = [n.replace('(','') for n in df['name']]
df['name'] = [n.replace(')','') for n in df['name']]

X=df
X=X.drop(['state', 'deadline', 'launched', 'backers', 'usd_pledged_real'], axis=1)
y=df["state"]

#encoding category and country //nominal attributes
X["main_category"] = X["main_category"].astype('category')
X["main_category"] = X["main_category"].cat.codes
X["country"] = X["country"].astype('category')
X["country"] = X["country"].cat.codes

print(X.dtypes)
print(y.dtypes)

print(X.head())
#https://github.com/sdevalapurkar/kickstarter-prediction
#https://towardsdatascience.com/how-to-vectorize-text-in-dataframes-for-nlp-tasks-3-simple-techniques-82925a5600db
#https://www.kaggle.com/stacykurnikova/using-glove-embedding
#https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
#https://djajafer.medium.com/multi-class-text-classification-with-keras-and-lstm-4c5525bef592
#https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
#https://realpython.com/python-keras-text-classification/#convolutional-neural-networks-cnn

0    241906
1    133956
Name: state, dtype: int64
name              object
main_category       int8
country             int8
usd_goal_real    float64
dtype: object
int64
                                                name  main_category  country  \
0                    The Songs of Adelaide & Abullah             12        9   
1      Greeting From Earth: ZGAC Arts Capsule For ET              6       22   
2                                     Where is Hank?              6       22   
3  ToshiCapital Rekordz Needs Help to Complete Album             10       22   
4  Community Film Project: The Art of Neighborhoo...              6       22   

   usd_goal_real  
0        1533.95  
1       30000.00  
2       45000.00  
3        5000.00  
4       19500.00  


In [20]:
embeddings_index = {}
f = open('glove.6B.300d.txt',encoding='utf-8')
for line in f:
    values = line.split(' ')
    word = values[0] ## The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
    embeddings_index[word] = coefs
f.close()

print('GloVe data loaded')

GloVe data loaded


In [21]:
import re

from wordcloud import WordCloud
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
## Iterate over the data to preprocess by removing stopwords
lines_without_stopwords=[] 
for line in X['name'].values: 
    line = line.lower()
    line_by_words = re.findall(r'(?:\w+)', line, flags = re.UNICODE) # remove punctuation ans split
    new_line=[]
    for word in line_by_words:
        if word not in stop:
            new_line.append(word)
    lines_without_stopwords.append(new_line)
texts = lines_without_stopwords

print(texts[0:5])

[['songs', 'adelaide', 'abullah'], ['greeting', 'earth', 'zgac', 'arts', 'capsule', 'et'], ['hank'], ['toshicapital', 'rekordz', 'needs', 'help', 'complete', 'album'], ['community', 'film', 'project', 'art', 'neighborhood', 'filmmaking']]


In [52]:
## Code adapted from (https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py)
# Vectorize the text samples
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

X['name']=texts
X_train, X_test, y_train, y_test = train_test_split(texts, y, test_size=0.2)

MAX_NUM_WORDS = 1000
MAX_SEQUENCE_LENGTH = 100
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(y_train))

print(data.shape)
print(labels.shape)

Found 126532 unique tokens.
(300686, 100)
(300686, 2)


In [53]:
## More code adapted from the keras reference (https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py)
# prepare embedding matrix 
from keras.layers import Embedding
from keras.initializers import Constant

## EMBEDDING_DIM =  ## seems to need to match the embeddings_index dimension
EMBEDDING_DIM = embeddings_index.get('a').shape[0]
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word) ## This references the loaded embeddings dictionary
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [54]:
## Code from: https://medium.com/@sabber/classifying-yelp-review-comments-using-cnn-lstm-and-pre-trained-glove-word-embeddings-part-3-53fcea9a17fa
## To create and visualize a model

from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, GlobalMaxPooling1D, Dropout, Activation, MaxPooling1D

modelcnn = Sequential()
modelcnn.add(Embedding(num_words, 300, input_length=100, weights= [embedding_matrix], trainable=False))
modelcnn.add(Conv1D(128, 5, activation='relu'))
modelcnn.add(GlobalMaxPooling1D())
modelcnn.add(Dense(10, activation='relu'))
modelcnn.add(Dense(2, activation='sigmoid'))
modelcnn.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
modelcnn.summary()

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_24 (Embedding)     (None, 100, 300)          300300    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 96, 128)           192128    
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 128)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_27 (Dense)             (None, 2)                 22        
Total params: 493,740
Trainable params: 193,440
Non-trainable params: 300,300
_________________________________________________________________


In [55]:
## Fit train data
modelcnn.fit(data, np.array(labels), validation_split=0.1, epochs = 1)



<tensorflow.python.keras.callbacks.History at 0x1963745df28>

In [57]:
tokenizer.fit_on_texts(X_test)
sequences = tokenizer.texts_to_sequences(X_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

test = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels_test = to_categorical(np.asarray(y_test))
print(test.shape)
print(labels_test.shape)

modelcnn.evaluate(test, np.array(labels_test)) #cnn test

Found 146149 unique tokens.
(75172, 100)
(75172, 2)


[0.6322205662727356, 0.6372851729393005]

In [58]:
modellstm = Sequential()
modellstm.add(Embedding(num_words, 300, input_length=100, weights= [embedding_matrix], trainable=False))
modellstm.add(LSTM(300))
modellstm.add(Dense(2, activation='sigmoid'))
modellstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
modellstm.summary()

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (None, 100, 300)          300300    
_________________________________________________________________
lstm_13 (LSTM)               (None, 300)               721200    
_________________________________________________________________
dense_29 (Dense)             (None, 2)                 602       
Total params: 1,022,102
Trainable params: 721,802
Non-trainable params: 300,300
_________________________________________________________________


In [59]:
## Fit train data
modellstm.fit(data, np.array(labels), validation_split=0.1, epochs = 1)



<tensorflow.python.keras.callbacks.History at 0x196458b1d30>

In [60]:
modellstm.evaluate(test, np.array(labels_test)) #lstm test



[0.6280394196510315, 0.6333076357841492]