# Question 3

In [2]:
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import re
import numpy as np
from nltk.stem.porter import PorterStemmer
from gensim.models import KeyedVectors
import pandas as pd
import string
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.optimizers import SGD
from keras import regularizers
from keras.models import Sequential
from IPython.core.debugger import set_trace
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [9]:
# loading the dataset containing the labels

labels = pd.read_csv('./ds_technical_test_labels.csv')
original_df = pd.read_csv('./ds_technical_test_data.csv')
enriched_df = pd.read_csv('./Enriched_dataset.csv')
enriched_df = enriched_df.drop(columns=['Unnamed: 0'])
enriched_df['text'] = original_df['text']

In [10]:
enriched_df.head()

Unnamed: 0,text,Numbers_of_product,Negative,Null,Positive,Excellent
0,This would be a nice mouse if it didn't have n...,1,0,0,1,0
1,this item it put together well but the ball on...,1,0,1,0,0
2,I bought two of these cards for two different ...,2,0,0,1,0
3,This cable 1.6$ and price tell you what you bu...,1,0,1,0,0
4,it did not work on my 2012 tundra and Clarion ...,1,0,1,0,0


In [11]:
enriched_df = pd.concat([enriched_df,labels], axis=1)


In [12]:
enriched_df.head()

Unnamed: 0,text,Numbers_of_product,Negative,Null,Positive,Excellent,label_ids
0,This would be a nice mouse if it didn't have n...,1,0,0,1,0,1
1,this item it put together well but the ball on...,1,0,1,0,0,1
2,I bought two of these cards for two different ...,2,0,0,1,0,1
3,This cable 1.6$ and price tell you what you bu...,1,0,1,0,0,1
4,it did not work on my 2012 tundra and Clarion ...,1,0,1,0,0,1


# Pre processing

In [13]:
#pre_processing phase where :

# - making any word starts with lowercase 
enriched_df['text'] = enriched_df['text'].apply(lambda word: word.lower())

# - removing stopwords
stop = stopwords.words('english')
enriched_df['text'] = enriched_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

# - removing punctuation 
enriched_df['text'] = enriched_df['text'].apply(lambda row: row.translate(str.maketrans('','',string.punctuation)))

# - removing bad and useless chars (numbers @ and so on)
enriched_df['text'] = enriched_df['text'].apply(lambda row: re.sub('[/(){}\[\]\|@,;]', '', row))
enriched_df['text'] = enriched_df['text'].apply(lambda row: re.sub('[^0-9a-z #+_]', '', row))
enriched_df['text'] = enriched_df['text'].apply(lambda row: re.sub(r'\d+', '', row))

# performing stemming 
porter_stemmer = PorterStemmer()
enriched_df['text'] = enriched_df['text'].apply(lambda x:porter_stemmer.stem(x))

In [14]:
# the number of total words 
enriched_df['text'].apply(lambda x: len(x.split(' '))).sum()

1322

In [15]:
# shuffling the dataset 
enriched_df = enriched_df.sample(frac = 1)

In [16]:
enriched_df.head()

Unnamed: 0,text,Numbers_of_product,Negative,Null,Positive,Excellent,label_ids
35,used item xbox notice change clarity wayshape ...,1,0,1,0,0,2
63,item works advertised easy installation would ...,1,0,1,0,0,4
66,good price ties straighten whole house one fe...,1,0,0,1,0,4
52,thing loud spinning full speed would think day...,1,0,1,0,0,3
86,good quality light stands sturdy work well out...,2,0,0,0,1,4


# Feature Engineering

In [17]:
# Considering that I do not have a considerable amount of words in order to perform 
#a right embedding I will use the technique called  tranfer learning in order to get 
#a very good representation already trained and tested. 
#I will use the vector representation present in the model that I have previously 
#loaded and used ./wiki-news-300d-1M.vec'

# in this method, for each sentence , I will find all representations 
# of each word of the sentence in the pre-trained model
# summing up and calculating the average vector

def sent_vectorizer(sent, model):
    sent_vec =[]
    numw = 0
    for w in sent:
        try:
            if numw == 0:
                sent_vec = model[w]
            else:
                sent_vec = np.add(sent_vec, model[w])
            numw+=1
        except:
            pass
    
    return np.asarray(sent_vec) / numw

In [18]:

model = KeyedVectors.load_word2vec_format('./wiki-news-300d-1M.vec')

V=[]
for sentence in enriched_df['text']:
    V.append(sent_vectorizer(sentence, model))   

In [19]:
# each word is represented by a vector of 300 dimensions
print(len(V[0]))

300


In [20]:
embedding_text = pd.DataFrame(V)

In [21]:
embedding_text.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.087947,0.020667,-0.1041,-0.053849,0.034681,0.049691,0.009879,-0.057007,0.137291,0.034321,...,0.012884,0.026733,-0.012449,0.056414,0.037851,0.037716,-0.014835,-0.036888,-0.018656,0.006342
1,-0.105175,0.015656,-0.116204,-0.073106,0.036933,0.058558,0.022065,-0.047164,0.140024,0.056547,...,0.013775,0.010547,0.013869,0.054386,0.028128,0.030481,0.022136,-0.03944,-0.02454,0.023769
2,-0.116379,0.010431,-0.120437,-0.043281,0.048761,0.060336,0.03085,-0.035298,0.147069,0.034765,...,0.004626,0.012487,-0.018458,0.063113,0.036301,0.017113,0.026238,-0.042786,-0.033376,0.015224
3,-0.113641,0.024922,-0.096267,-0.037478,0.03959,0.03855,0.029522,-0.02894,0.1587,0.051969,...,0.009007,0.023576,0.006347,0.074329,0.045714,0.029452,0.025645,-0.028291,-0.02196,0.023878
4,-0.109616,0.013444,-0.104768,-0.044748,0.048805,0.061438,0.028794,-0.044738,0.146986,0.038207,...,0.007153,0.019109,0.002195,0.065937,0.029935,0.023584,0.01978,-0.040238,-0.030493,0.020916


In [22]:
# I will use two method to performa the classification. The deep learning LSTM and the logistic regression. 
# I choose this two techniques because I have tried almost 10 and these two gave me the highest results in terms of accuracy.

# LSTM

In [23]:
# Below there is the method that will perform an LSTM. 
# This method receives in input the number of features and 
# the dataset (enriched and not enrichd)



def lstm_method(dataset,features):
    # Instead of having a single colum containing the categorical label I will create dummy 
    # variables and so 7 more columns will be added. Each column contains 0 if that instance dose not belong
    # to that class, 1 otherwise
    dataset = pd.get_dummies(dataset, columns=['label_ids'])

    # I separete the input text and the information that the training set has to contain (number of product and rating)
    input_text = dataset.iloc[:,:-7]
    labels = dataset.iloc[:,-7:]

    # splitting in training and test, 70% and 30% respectively
    X_train, X_test, y_train, y_test = train_test_split(input_text, labels, test_size=0.3, random_state = 42)

    X_train_LSTM = X_train.values.reshape(84,1,features)
    X_test_LSTM = X_test.values.reshape(36,1,features)

    y_train_LSTM = y_train.values
    y_test_LSTM = y_test.values


    #   512 neurons
    #   7 output nodes


    model_lstm = Sequential()
    model_lstm.add(LSTM(512,input_shape=(X_train_LSTM.shape[1], X_train_LSTM.shape[2]), activation = 'softmax'))
    model_lstm.add(Dropout(0.7))
    model_lstm.add(Dense(7, activation='softmax', kernel_regularizer=regularizers.l2(0)))
    model_lstm.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])



    # fit network
    history = model_lstm.fit(X_train_LSTM, y_train_LSTM, epochs=10, batch_size=5, verbose=2, shuffle=False)


    score,acc = model_lstm.evaluate(X_test_LSTM, y_test_LSTM, batch_size=5)
    print('Test loss:', score)
    print('Test accuracy:', acc)

# LSTM using the enriched dataset

In [24]:
# For this first classification task using LSTM 
#I will use the enriched Dataset (including the information about the number of products and the approval rating of the review)

# I drop the old text, that will be replaced by the embedding one 
enriched_df_LSTM = enriched_df
enriched_df_LSTM = enriched_df_LSTM.drop(columns=['text'])
enriched_df_LSTM =  pd.concat([embedding_text,enriched_df_LSTM],axis=1)
enriched_df_LSTM.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,296,297,298,299,Numbers_of_product,Negative,Null,Positive,Excellent,label_ids
0,-0.087947,0.020667,-0.1041,-0.053849,0.034681,0.049691,0.009879,-0.057007,0.137291,0.034321,...,-0.014835,-0.036888,-0.018656,0.006342,1,0,0,1,0,1
1,-0.105175,0.015656,-0.116204,-0.073106,0.036933,0.058558,0.022065,-0.047164,0.140024,0.056547,...,0.022136,-0.03944,-0.02454,0.023769,1,0,1,0,0,1
2,-0.116379,0.010431,-0.120437,-0.043281,0.048761,0.060336,0.03085,-0.035298,0.147069,0.034765,...,0.026238,-0.042786,-0.033376,0.015224,2,0,0,1,0,1
3,-0.113641,0.024922,-0.096267,-0.037478,0.03959,0.03855,0.029522,-0.02894,0.1587,0.051969,...,0.025645,-0.028291,-0.02196,0.023878,1,0,1,0,0,1
4,-0.109616,0.013444,-0.104768,-0.044748,0.048805,0.061438,0.028794,-0.044738,0.146986,0.038207,...,0.01978,-0.040238,-0.030493,0.020916,1,0,1,0,0,1


In [25]:
# I will call noe the lstm_method

lstm_method(enriched_df_LSTM,len(enriched_df_LSTM.iloc[0]) - 1)

Epoch 1/10
 - 5s - loss: 1.9452 - acc: 0.1786
Epoch 2/10
 - 1s - loss: 1.9418 - acc: 0.2619
Epoch 3/10
 - 1s - loss: 1.9385 - acc: 0.2738
Epoch 4/10
 - 1s - loss: 1.9354 - acc: 0.2738
Epoch 5/10
 - 1s - loss: 1.9321 - acc: 0.2738
Epoch 6/10
 - 1s - loss: 1.9294 - acc: 0.2738
Epoch 7/10
 - 1s - loss: 1.9271 - acc: 0.2738
Epoch 8/10
 - 1s - loss: 1.9241 - acc: 0.2738
Epoch 9/10
 - 1s - loss: 1.9217 - acc: 0.2738
Epoch 10/10
 - 1s - loss: 1.9191 - acc: 0.2738
Test loss: 1.92022524939643
Test accuracy: 0.2222222255335914


# LSTM using the not enriched dataset

In [26]:
# Now I will just use the embedding text and the labels without the additional information

not_enriched_df = pd.concat([embedding_text,labels],axis=1)

In [27]:
not_enriched_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,label_ids
0,-0.087947,0.020667,-0.1041,-0.053849,0.034681,0.049691,0.009879,-0.057007,0.137291,0.034321,...,0.026733,-0.012449,0.056414,0.037851,0.037716,-0.014835,-0.036888,-0.018656,0.006342,1
1,-0.105175,0.015656,-0.116204,-0.073106,0.036933,0.058558,0.022065,-0.047164,0.140024,0.056547,...,0.010547,0.013869,0.054386,0.028128,0.030481,0.022136,-0.03944,-0.02454,0.023769,1
2,-0.116379,0.010431,-0.120437,-0.043281,0.048761,0.060336,0.03085,-0.035298,0.147069,0.034765,...,0.012487,-0.018458,0.063113,0.036301,0.017113,0.026238,-0.042786,-0.033376,0.015224,1
3,-0.113641,0.024922,-0.096267,-0.037478,0.03959,0.03855,0.029522,-0.02894,0.1587,0.051969,...,0.023576,0.006347,0.074329,0.045714,0.029452,0.025645,-0.028291,-0.02196,0.023878,1
4,-0.109616,0.013444,-0.104768,-0.044748,0.048805,0.061438,0.028794,-0.044738,0.146986,0.038207,...,0.019109,0.002195,0.065937,0.029935,0.023584,0.01978,-0.040238,-0.030493,0.020916,1


In [28]:
# I will call noe the lstm_method

lstm_method(not_enriched_df,len(not_enriched_df.iloc[0]) - 1)

Epoch 1/10
 - 2s - loss: 1.9449 - acc: 0.2262
Epoch 2/10
 - 1s - loss: 1.9414 - acc: 0.2738
Epoch 3/10
 - 1s - loss: 1.9384 - acc: 0.2738
Epoch 4/10
 - 1s - loss: 1.9352 - acc: 0.2738
Epoch 5/10
 - 1s - loss: 1.9319 - acc: 0.2738
Epoch 6/10
 - 1s - loss: 1.9297 - acc: 0.2738
Epoch 7/10
 - 1s - loss: 1.9266 - acc: 0.2738
Epoch 8/10
 - 1s - loss: 1.9242 - acc: 0.2738
Epoch 9/10
 - 1s - loss: 1.9215 - acc: 0.2738
Epoch 10/10
 - 1s - loss: 1.9187 - acc: 0.2738
Test loss: 1.920167002413008
Test accuracy: 0.2222222255335914


# Conclusion on LSTM

# Logistic Regression

In [29]:
def logistic_method(dataset):
    X = dataset.iloc[:,:-1]
    y = dataset.iloc[:,-1:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

# Logistic Regression not enriched dataset

In [30]:
# Now I will just use the embedding text and the labels without the additional information

not_enriched_df = pd.concat([embedding_text,labels],axis=1)
not_enriched_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,label_ids
0,-0.087947,0.020667,-0.1041,-0.053849,0.034681,0.049691,0.009879,-0.057007,0.137291,0.034321,...,0.026733,-0.012449,0.056414,0.037851,0.037716,-0.014835,-0.036888,-0.018656,0.006342,1
1,-0.105175,0.015656,-0.116204,-0.073106,0.036933,0.058558,0.022065,-0.047164,0.140024,0.056547,...,0.010547,0.013869,0.054386,0.028128,0.030481,0.022136,-0.03944,-0.02454,0.023769,1
2,-0.116379,0.010431,-0.120437,-0.043281,0.048761,0.060336,0.03085,-0.035298,0.147069,0.034765,...,0.012487,-0.018458,0.063113,0.036301,0.017113,0.026238,-0.042786,-0.033376,0.015224,1
3,-0.113641,0.024922,-0.096267,-0.037478,0.03959,0.03855,0.029522,-0.02894,0.1587,0.051969,...,0.023576,0.006347,0.074329,0.045714,0.029452,0.025645,-0.028291,-0.02196,0.023878,1
4,-0.109616,0.013444,-0.104768,-0.044748,0.048805,0.061438,0.028794,-0.044738,0.146986,0.038207,...,0.019109,0.002195,0.065937,0.029935,0.023584,0.01978,-0.040238,-0.030493,0.020916,1


In [31]:
# I will call now the logistic_method

logistic_method(not_enriched_df)

Accuracy of logistic regression classifier on test set: 0.22


  y = column_or_1d(y, warn=True)


# Logistic Regression enriched dataset 


In [32]:
enriched_df_regr = enriched_df
enriched_df_regr = enriched_df_regr.drop(columns=['text'])
enriched_df_regr =  pd.concat([embedding_text,enriched_df_regr],axis=1)
enriched_df_regr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,296,297,298,299,Numbers_of_product,Negative,Null,Positive,Excellent,label_ids
0,-0.087947,0.020667,-0.1041,-0.053849,0.034681,0.049691,0.009879,-0.057007,0.137291,0.034321,...,-0.014835,-0.036888,-0.018656,0.006342,1,0,0,1,0,1
1,-0.105175,0.015656,-0.116204,-0.073106,0.036933,0.058558,0.022065,-0.047164,0.140024,0.056547,...,0.022136,-0.03944,-0.02454,0.023769,1,0,1,0,0,1
2,-0.116379,0.010431,-0.120437,-0.043281,0.048761,0.060336,0.03085,-0.035298,0.147069,0.034765,...,0.026238,-0.042786,-0.033376,0.015224,2,0,0,1,0,1
3,-0.113641,0.024922,-0.096267,-0.037478,0.03959,0.03855,0.029522,-0.02894,0.1587,0.051969,...,0.025645,-0.028291,-0.02196,0.023878,1,0,1,0,0,1
4,-0.109616,0.013444,-0.104768,-0.044748,0.048805,0.061438,0.028794,-0.044738,0.146986,0.038207,...,0.01978,-0.040238,-0.030493,0.020916,1,0,1,0,0,1


In [33]:
# I will call now the logistic_method

logistic_method(enriched_df_regr)

Accuracy of logistic regression classifier on test set: 0.28


  y = column_or_1d(y, warn=True)


# Using Smote and enriched dataset repeating logistic regression


In [34]:
# Noticing the distribution of the labels it is clear that the dataset is unbalanced. 
# Almost the 60% of the total instances belong to the class 4 and 1. 
# In order to improve the performance we can try to balance the dataset using the 
# technique calle Smote

labels['label_ids'].value_counts()

4    31
1    25
3    17
2    14
5    13
7    10
6    10
Name: label_ids, dtype: int64

In [35]:

sm = SMOTE(random_state=2)
X_balanced, y_balanced = sm.fit_sample(enriched_df_regr.iloc[:,:-1], enriched_df_regr.iloc[:,-1:])

  y = column_or_1d(y, warn=True)


In [36]:
y_balanced = pd.Series(y_balanced)

In [37]:
y_balanced.value_counts()

# after using the smote it is noticing that the labels are equally distributed across all instances of the dataset.

7    31
6    31
5    31
4    31
3    31
2    31
1    31
dtype: int64

In [38]:
balanced_df = pd.concat([pd.DataFrame(X_balanced),y_balanced], axis=1)
balanced_df = balanced_df.sample(frac = 1)

In [39]:
def smote_regression_log(dataset):

    # Using the enriched Dataset

    X = dataset.iloc[:,:-1]
    y = dataset.iloc[:,-1:]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    y_pred_smote_regression = classifier.predict(X_test)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))
    return y_pred_smote_regression

In [40]:
y_pred = smote_regression_log(balanced_df)

Accuracy of logistic regression classifier on test set: 0.38


  y = column_or_1d(y, warn=True)


# Consideration

In [41]:
pd.Series(y_pred).value_counts()

1    19
6    15
3    15
4     8
7     4
2     3
5     2
dtype: int64

In [287]:
enriched_df[enriched_df['label_ids']==6]

Unnamed: 0,text,Numbers_of_product,Negative,Null,Positive,Excellent,label_ids
101,couldnt keep alive battery terrible hand camer...,2,0,1,0,0,6
106,camera ok hand battery awful days able keep al...,1,1,0,0,0,6
105,buy risk battery bad hand camera good way get ...,2,0,1,0,0,6
109,battery disappointing hand camera fin,2,0,1,0,0,6
104,couldnt keep alive camera good hand battery ba...,2,0,1,0,0,6
100,mixed bag camera great hand battery terrible s...,2,0,1,0,0,6
107,battery awful hand camera ok would people this...,2,0,1,0,0,6
103,why battery bad hand camera good sure would re...,2,0,1,0,0,6
108,camera fine hand battery disappoint,2,0,1,0,0,6
102,camera good hand battery bad would people this...,2,0,1,0,0,6


# Question 4