In [74]:
import pandas as pd
import os
import numpy as np

In [75]:
df = pd.read_csv('/Users/marikhomeriki/code/marikhomeriki/product_review_analysis/raw_data/train_data/train.csv', header=None)

In [76]:
df_test = pd.read_csv('/Users/marikhomeriki/code/marikhomeriki/product_review_analysis/raw_data/test_data/test.csv', header=None)

In [77]:
df_test = df.rename({0: 'label', 1: 'text'}, axis = 1)

In [78]:
df = df.rename({0: 'label', 1: 'text'}, axis = 1)

In [79]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def load_data(data, percentage_of_sentences=None):
    
    train_data = data['text']
    label_data = data['label']

    train_sentences = train_data

    y_train = label_data
    
    
    # Take only a given percentage of the entire data
    if percentage_of_sentences is not None:
        assert(percentage_of_sentences> 0 and percentage_of_sentences<=100)
        
        len_train = int(percentage_of_sentences/100*len(train_sentences))
        train_sentences, y_train = train_sentences[:len_train], y_train[:len_train]
  
    return train_sentences, y_train



In [37]:
X_train, y_train = load_data(df, percentage_of_sentences=10)

In [70]:
X_test, y_test = load_data(df_test, percentage_of_sentences=10)

In [39]:
X_train = pd.DataFrame(X_train)

In [40]:
X_train.head()

Unnamed: 0,text
0,"Unfortunately, the frustration of being Dr. Go..."
1,Been going to Dr. Goldberg for over 10 years. ...
2,I don't know what Dr. Goldberg was like before...
3,I'm writing this review to give you a heads up...
4,All the food is great here. But the best thing...


In [26]:
X_train.shape

(56000,)

In [41]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.stem import WordNetLemmatizer

In [42]:
def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    
    tokenized_sentence = word_tokenize(sentence) ## tokenize 
    stop_words = set(stopwords.words('english')) ## define stopwords
    
    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]

    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v") 
        for word in tokenized_sentence_cleaned
    ]
    
    cleaned_sentence = ' '.join(word for word in lemmatized)
    
    return cleaned_sentence

In [43]:
X_train.head()

Unnamed: 0,text
0,"Unfortunately, the frustration of being Dr. Go..."
1,Been going to Dr. Goldberg for over 10 years. ...
2,I don't know what Dr. Goldberg was like before...
3,I'm writing this review to give you a heads up...
4,All the food is great here. But the best thing...


In [44]:
X_train_cleaned = X_train["text"].apply(cleaning)
X_train_cleaned.head()

0    unfortunately frustration dr goldbergs patient...
1    go dr goldberg years think one st patients sta...
2    dont know dr goldberg like move arizona let te...
3    im write review give head see doctor office st...
4    food great best thing wing wing simply fantast...
Name: text, dtype: object

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.feature_extraction.text import CountVectorizer

In [51]:

vectorizer = TfidfVectorizer(ngram_range = (2,2), 
                             min_df=0.01, 
                             max_df = 0.05).fit(df.text)

In [52]:
vectors = pd.DataFrame(vectorizer.transform(X_train.text).toarray(),
                       columns = vectorizer.get_feature_names_out())
vectors.head()

Unnamed: 0,10 minutes,15 minutes,20 minutes,30 minutes,able to,about it,about this,across the,after the,after we,...,you know,you like,you ll,you need,you should,you to,you want,you will,you would,your money
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
sum_tfidf = vectors.sum(axis = 0)
sum_tfidf

10 minutes    203.816533
15 minutes    170.246774
20 minutes    182.173124
30 minutes    154.332473
able to       348.151945
                 ...    
you to        179.680721
you want      424.804616
you will      317.303867
you would     163.047953
your money    140.719652
Length: 926, dtype: float64

In [55]:
tfidf_list = [(word, sum_tfidf[word]) 
              for word, idx in vectorizer.vocabulary_.items() 
              if word in vectorizer.vocabulary_.keys() ]
tfidf_list

[('the experience', 131.72130775804456),
 ('ve had', 343.49743370338024),
 ('so many', 224.83189950476833),
 ('it seems', 145.68736521523996),
 ('the phone', 245.7569883860526),
 ('for that', 194.50868419793602),
 ('with it', 151.15048697778604),
 ('and just', 223.1745602914998),
 ('get it', 219.88253180833274),
 ('you have', 455.5253013983304),
 ('and not', 440.0601339552372),
 ('it with', 128.7632338302373),
 ('that have', 182.49563552362832),
 ('to give', 314.7191265688911),
 ('was one', 120.93203304091442),
 ('over the', 384.7197686766323),
 ('and is', 181.50991777578474),
 ('is really', 212.48233431987265),
 ('because of', 283.1866487897507),
 ('is very', 475.3669242670273),
 ('the right', 185.38608123305664),
 ('of your', 152.22973402307593),
 ('don know', 299.86483056376744),
 ('know what', 252.7746313561113),
 ('was like', 128.2786922489175),
 ('let me', 210.51986787984217),
 ('tell you', 165.60826923383544),
 ('away from', 213.49591775685528),
 ('from this', 185.6742994047958)

In [56]:
sorted_tfidf_list =sorted(tfidf_list, key = lambda x: x[1], reverse=True)
sorted_tfidf_list

[('customer service', 702.147265483018),
 ('to this', 607.4258230164288),
 ('ve been', 573.1995721105594),
 ('service is', 549.2849076760285),
 ('for my', 541.8361610900085),
 ('this location', 528.0234762970313),
 ('is great', 523.3665569019604),
 ('and have', 511.58326888016705),
 ('on my', 509.25859396123263),
 ('food and', 508.67114090067764),
 ('at all', 508.26775263953056),
 ('go back', 504.4959045159648),
 ('used to', 495.3513304533479),
 ('my favorite', 492.7765929708395),
 ('do not', 490.01818569134815),
 ('will be', 489.77137956418744),
 ('is always', 487.4822275620763),
 ('here for', 484.46525662149475),
 ('to find', 483.1504904172855),
 ('is very', 475.3669242670273),
 ('friendly and', 472.794092872362),
 ('at least', 470.4417172772417),
 ('for me', 470.2086715547312),
 ('very good', 467.59514126128477),
 ('that is', 467.32096012429776),
 ('to see', 466.0542455584581),
 ('was good', 463.94378478092597),
 ('went to', 463.6722288326513),
 ('to eat', 457.3967847792017),
 ('you

In [57]:
y_train.value_counts()

1    30232
2    25768
Name: label, dtype: int64

In [60]:
from gensim.models import Word2Vec

word2vec = Word2Vec(sentences=X_train.text)

In [61]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

In [63]:
X_embed = embedding(word2vec, X_train)


In [65]:
X_pad = pad_sequences(X_embed, dtype='float32', padding='post', maxlen=200)

In [73]:
print(np.unique(y_test, return_counts=True))
baseline_accuracy = 30232/(25768+30232)
len(y_test)
baseline_accuracy

(array([1, 2]), array([30232, 25768]))


0.5398571428571428