In [169]:
import pandas as pd
import os
import numpy as np

In [170]:
df = pd.read_csv('/Users/marikhomeriki/code/marikhomeriki/product_review_analysis/raw_data/train_data/train.csv', header=None)

In [171]:
df_test = pd.read_csv('/Users/marikhomeriki/code/marikhomeriki/product_review_analysis/raw_data/test_data/test.csv', header=None)

In [172]:
df_test = df.rename({0: 'label', 1: 'text'}, axis = 1)

In [173]:
df = df.rename({0: 'label', 1: 'text'}, axis = 1)

In [174]:
df.head()

Unnamed: 0,label,text
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [178]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def load_data(data, percentage_of_sentences=None):
    
    train_sentences = data['text']
    y_train = data['label']
    
    
    # Take only a given percentage of the entire data
    if percentage_of_sentences is not None:
        assert(percentage_of_sentences> 0 and percentage_of_sentences<=100)
        
        len_train = int(percentage_of_sentences/100*len(train_sentences))
        train_sentences, y_train = train_sentences[:len_train], y_train[:len_train]
    X_train = [text_to_word_sequence(_) for _ in train_sentences]
  
    return X_train, y_train



In [179]:
X_train, y_train = load_data(df, percentage_of_sentences=10)

In [181]:
X_test, y_test = load_data(df_test, percentage_of_sentences=10)

In [184]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.stem import WordNetLemmatizer

In [185]:
def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    
    tokenized_sentence = word_tokenize(sentence) ## tokenize 
    stop_words = set(stopwords.words('english')) ## define stopwords
    
    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]

    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v") 
        for word in tokenized_sentence_cleaned
    ]
    
    cleaned_sentence = ' '.join(word for word in lemmatized)
    
    return cleaned_sentence

In [144]:
X_train_cleaned = X_train["text"].apply(cleaning)
# X_train_cleaned.head()
X_train_cleaned = pd.DataFrame(X_train_cleaned)

In [145]:
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import CountVectorizer

In [146]:
X_train_cleaned.head()

Unnamed: 0,text
0,unfortunately frustration dr goldbergs patient...
1,go dr goldberg years think one st patients sta...
2,dont know dr goldberg like move arizona let te...
3,im write review give head see doctor office st...
4,food great best thing wing wing simply fantast...


In [76]:
vectorizer = TfidfVectorizer(ngram_range = (1,1), 
                             min_df=0.01, 
                             max_df = 0.05).fit(X_train_cleaned.text)

In [77]:
vectors = pd.DataFrame(vectorizer.transform(X_train_cleaned.text).toarray(),
                       columns = vectorizer.get_feature_names_out())
vectors.head()

Unnamed: 0,able,absolutely,accommodate,across,act,add,afternoon,ago,agree,ahead,...,write,wrong,yeah,year,yelp,yes,yet,youll,young,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.125923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.113901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
sum_tfidf = vectors.sum(axis = 0)
sum_tfidf

able           323.373608
absolutely     343.480369
accommodate    142.288906
across         248.486920
act            157.513087
                  ...    
yes            310.276774
yet            318.089017
youll          218.869533
young          163.071594
yummy          236.132563
Length: 729, dtype: float64

In [79]:
tfidf_list = [(word, sum_tfidf[word]) 
              for word, idx in vectorizer.vocabulary_.items() 
              if word in vectorizer.vocabulary_.keys() ]
tfidf_list

[('unfortunately', 215.84020316400546),
 ('dr', 351.7822085608716),
 ('doctor', 258.85758615241264),
 ('terrible', 444.70495079477826),
 ('simply', 181.14432263811526),
 ('answer', 228.2311689041434),
 ('phone', 431.3113306920039),
 ('usually', 510.5971049188142),
 ('hours', 451.153690185051),
 ('deal', 467.5403156045202),
 ('problem', 375.52289278398763),
 ('office', 385.61625246884233),
 ('isnt', 426.05844520342237),
 ('anyone', 369.376395467726),
 ('st', 139.97732911510514),
 ('picture', 168.30073293021783),
 ('options', 313.0829185638113),
 ('understand', 295.8777654615506),
 ('doesnt', 439.99804557261115),
 ('question', 273.707854251599),
 ('life', 264.35142583911494),
 ('move', 451.434909155999),
 ('arizona', 200.8219967639064),
 ('interest', 243.15411875532666),
 ('refill', 221.5076416352615),
 ('month', 222.34116719762358),
 ('less', 429.7112241594372),
 ('days', 397.3714654667481),
 ('joke', 153.07696924366775),
 ('matter', 200.51105715888815),
 ('worse', 229.47701760754984),


In [80]:
sorted_tfidf_list =sorted(tfidf_list, key = lambda x: x[1], reverse=True)
sorted_tfidf_list

[('pizza', 1010.654473421609),
 ('sandwich', 788.0112228275459),
 ('car', 756.3892822094797),
 ('awesome', 650.3367924107396),
 ('sushi', 618.5488697651995),
 ('breakfast', 579.372908135774),
 ('roll', 577.7485499114933),
 ('excellent', 570.6810714240773),
 ('waitress', 568.5803431341825),
 ('park', 567.0592737808101),
 ('selection', 562.2919435873062),
 ('bread', 559.5227051655908),
 ('kid', 558.132128030541),
 ('server', 554.5738221554574),
 ('charge', 548.2265876166904),
 ('worst', 538.0090884285909),
 ('rude', 534.4631350786751),
 ('family', 532.4989167745755),
 ('horrible', 528.4132340374431),
 ('special', 525.859540204336),
 ('soup', 525.8149402738993),
 ('flavor', 525.7404147655166),
 ('atmosphere', 522.4803384908088),
 ('line', 519.4788430894604),
 ('busy', 515.0096252883282),
 ('free', 512.3671981999864),
 ('fast', 511.04387406472443),
 ('usually', 510.5971049188142),
 ('rice', 508.55574808667046),
 ('friends', 500.97194571057395),
 ('coffee', 497.5800711306208),
 ('pick', 490

In [81]:
y_train.value_counts()

1    30232
2    25768
Name: label, dtype: int64

In [202]:
from gensim.models import Word2Vec

word2vec = Word2Vec(sentences=X_train, vector_size=60, min_count=10, window=10)

In [203]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

In [204]:
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)

In [205]:
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=200)
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post', maxlen=200)

In [217]:
from sklearn.metrics import accuracy_score

unique, counts = np.unique(y_train, return_counts=True)
counts = dict(zip(unique, counts))
print('Number of labels in train set', counts)

y_pred = 1 if counts[1] > counts[2] else 2

print('Baseline accuracy: ', accuracy_score(y_test, [y_pred]*len(y_test)))

Number of labels in train set {1: 30232, 2: 25768}
Baseline accuracy:  0.5398571428571428


In [220]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers

def init_model():
    model = Sequential()
    model.add(layers.Masking())
    model.add(layers.LSTM(20, activation='tanh'))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    
    return model

model = init_model()

In [223]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5, restore_best_weights=True)

model.fit(X_train_pad, y_train, 
          batch_size = 32,
          epochs=100,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

KeyboardInterrupt: 

In [224]:
import gensim.downloader as api
print(list(api.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [225]:
word2vec_transfer = api.load("glove-wiki-gigaword-50")



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [226]:
print(len(word2vec_transfer.key_to_index))

400000


In [228]:
def embed_sentence_with_TF(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec:
            embedded_sentence.append(word2vec[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence_with_TF(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

# Embed the training and test sentences
X_train_embed_2 = embedding(word2vec_transfer, X_train)
X_test_embed_2 = embedding(word2vec_transfer, X_test)

In [229]:
X_train_pad_2 = pad_sequences(X_train_embed_2, dtype='float32', padding='post', maxlen=200)
X_test_pad_2 = pad_sequences(X_test_embed_2, dtype='float32', padding='post', maxlen=200)

In [231]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5, restore_best_weights=True)

model = init_model()

model.fit(X_train_pad_2, y_train, 
          batch_size = 32,
          epochs=10,
          validation_split=0.3,
          callbacks=[es],
          verbose = 1
         )

Epoch 1/10
Epoch 2/10
Epoch 3/10
  70/1225 [>.............................] - ETA: 45s - loss: -436.5373 - accuracy: 0.5196

KeyboardInterrupt: 

In [232]:
res = model.evaluate(X_test_pad_2, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

The accuracy evaluated on the test set is of 53.986%
