# Carvana Take Home Interview Exercise

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("Apple-Twitter-Sentiment-DFE.csv", encoding = "ISO-8859-1", parse_dates=["date"])
df.shape

(3886, 12)

In [3]:
# study the target
df["sentiment"].value_counts()

3               2162
1               1219
5                423
not_relevant      82
Name: sentiment, dtype: int64

In [4]:
# based on the result I think we have three categories, so I drop the not_relevant 
df = df[df.sentiment != "not_relevant"]
df.shape

(3804, 12)

In [5]:
# Clean the text
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()

pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
www_pat = r'www.[^ ]+'
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def text_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        bom_removed = souped
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = [x for x  in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()

In [6]:
df["clean"] = df["text"].apply(lambda x: text_cleaner(x))

In [7]:
# I use date to create a new features
df["hour"] = df["date"].dt.hour
df["weekday"] = df["date"].dt.weekday

In [8]:
# create target
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df["target"] = le.fit_transform(df["sentiment"])

## GloVe  
I want to use the pre-trained GloVe vectors for prediction using  
- Gensim  
- Keras

## Gensim

In [9]:
import gensim.downloader as api
glove_twitter = api.load("glove-twitter-100") 

In [10]:
def get_w2v(tweet, size, vectors, aggregation='mean'):
    """convert words into vectors
    INPUT
    tweet: string
    size: int, size of the output
    vectors: pre-trained word embeddings
    aggregation: method to aggregate doublication
    OUTPUT
    a vector with a lenght of the sie"""
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tweet.split():
        try:
            vec += vectors[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if aggregation == 'mean':
        if count != 0:
            vec /= count
        return vec
    elif aggregation == 'sum':
        return vec

In [11]:
vecs_glove = np.concatenate([get_w2v(z, 100, glove_twitter,'sum') for z in df["clean"].values])
print("Shape of vector: ", vecs_glove.shape)

Shape of vector:  (3804, 100)


In [12]:
# merge embeding vectors and two features
data = np.concatenate([vecs_glove, df[["hour", "weekday"]].as_matrix()], axis=1)

  


In [13]:
# spilit data into train and test
np.random.seed(123)
index = np.random.rand(data.shape[0])  > 0.2

train = data[index]
test = data[~index]

print("Number of train:", train.shape[0])
print("Number of test:", test.shape[0])

Number of train: 3043
Number of test: 761


### Logistic Regression for prediction

In [14]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [15]:
# Train the model
clf.fit(train, df.loc[index, "target"])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
# Evaluation
from sklearn.metrics import log_loss

pred = clf.predict_proba(train)
print("log loss error of the train:", log_loss(df.loc[index, "target"], pred))

pred = clf.predict_proba(test)
print("log loss error of the test:", log_loss(df.loc[~index, "target"], pred))

log loss error of the train: 0.5859452568761673
log loss error of the test: 0.6372300676607052


### Improvement  
- optimize parameters  
- apply feature selection methods and regularization
- use no-linear models such as xgboost or deep learning

## Keras  
In this part, I only use text for prediction. In the next part, I will add two engineered features to the embedded vector.

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

Using TensorFlow backend.


In [10]:
t = Tokenizer()
t.fit_on_texts(df["clean"])

In [11]:
vocab_size = len(t.word_index) + 1
# integer encode the documents
encoded_docs = t.texts_to_sequences(df["clean"])

In [12]:
# pad documents to a max length of 20 words
# I select 20 to save the memory and computation but one can increase to improve the performance
max_length = 20
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [13]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('/Users/mrahimi/Downloads/glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [14]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [15]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=20, trainable=False)
model.add(e)
model.add(Flatten())
#model.add(Dense(50, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(3, activation='softmax'))
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 100)           545700    
_________________________________________________________________
flatten_1 (Flatten)          (None, 2000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                20010     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 33        
Total params: 565,743
Trainable params: 20,043
Non-trainable params: 545,700
_________________________________________________________________
None


In [16]:
# spilit data to train and test
np.random.seed(123)
index = np.random.rand(df.shape[0])  > 0.2
# create labels
labels = pd.get_dummies(df["target"])

In [17]:
# train the model
model.fit(padded_docs[index], labels[index], epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a4349e390>

In [18]:
# Evaluation
from sklearn.metrics import log_loss

pred = model.predict(padded_docs[index])
pred = pred[:2000]
y = df.loc[index, "target"].values
y = y[:2000]
print("log loss error of the train:", log_loss(y, pred))

pred = model.predict(padded_docs[~index])
print("log loss error of the test:", log_loss(df.loc[~index, "target"], pred))

log loss error of the train: 0.12046771376364764
log loss error of the test: 1.4625597579098981


## Keras 
concatenate embedded vectors with engineered features

In [19]:
from keras.layers import Input, Embedding, Dense
from keras.models import Model
from keras.layers import concatenate

embed_input = Input(shape=(20,), dtype='int32', name='embed_input')
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=20, trainable=False)(embed_input)
flatten = Flatten()(e)

engin_input = Input(shape=(2,), name='engin_input')
x = concatenate([flatten, engin_input])

# I stack a deep densely-connected network on top
x = Dense(64, activation='relu')(x)
x = Dense(32, activation='relu')(x)
x = Dense(16, activation='relu')(x)

# # And finally I add the main layer
output = Dense(3, activation='softmax')(x)

# create model
model = Model(inputs=[embed_input, engin_input], outputs=[output])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [20]:
model.fit([padded_docs[index], df.loc[index,["hour", "weekday"]].values], [labels[index]], epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a59711748>

In [22]:
# Evaluation
from sklearn.metrics import log_loss

pred = model.predict([padded_docs[index], df.loc[index,["hour", "weekday"]].values])
pred = pred[:2000]
y = df.loc[index, "target"].values
y = y[:2000]
print("log loss error of the train:", log_loss(y, pred))

pred = model.predict([padded_docs[~index], df.loc[~index,["hour", "weekday"]].values])
print("log loss error of the test:", log_loss(df.loc[~index, "target"], pred))

log loss error of the train: 0.10310982171697017
log loss error of the test: 1.8950074991264985


### Improvment  
the differnece between the train and test error shows that the model over fit to the train data, so I need to use some methods to avoid overfitting.  
- early stop
- regularization
- drop out