In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

import re
import nltk
import string
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
%matplotlib inline

# Fetch the NLTK resources needed by word_tokenize, the stop-word list,
# pos_tag and the WordNet lemmatizer used further down.
# Newer NLTK releases load the tokenizer and tagger models under the names
# 'punkt_tab' and 'averaged_perceptron_tagger_eng', so both the legacy and the
# current identifiers are requested. Requesting a resource the installed
# version does not need is harmless.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
):
    nltk.download(nltk_resource)


## Data Cleaning

In [None]:
# Load the first 10,000 reviews.
data = pd.read_csv('Reviews.csv', nrows=10000)
# Scores 1-2 are bad, 3 is neutral, 4-5 are good feedback: drop the neutral rows.
# .copy() detaches the filtered frame from the one returned by read_csv, so the
# column assignment in the following cell does not write to a slice (which
# raises SettingWithCopyWarning, currently hidden by the global warning filter).
data = data[data.Score != 3].copy()
data.head()

In [None]:
# Keep a reference to the raw scores, then derive the binary review label:
# 0 (negative) for scores below 4, 1 (positive) otherwise.
actualScore = data['Score']
data['flag'] = data['Score'].map(lambda score: 1 if not score < 4 else 0)

In [None]:
# Drop duplicate entries: rows sharing user, profile name, timestamp and text
# are treated as the same review and only the first one (by ProductId) is kept.
# - `subset` is passed as a list, the documented "sequence of labels" form,
#   instead of a set.
# - A stable sort keeps rows with equal ProductId in their original order, so
#   keep='first' retains the same row on every run.
data = (
    data.sort_values('ProductId', kind='mergesort')
        .drop_duplicates(subset=["UserId", "ProfileName", "Time", "Text"], keep='first')
)
# Keep only rows whose helpfulness numerator does not exceed the denominator.
# .copy() makes the result an independent frame so later cells can add columns
# to it without writing to a slice.
data = data[data.HelpfulnessNumerator <= data.HelpfulnessDenominator].copy()

In [None]:
# Class distribution of the scores. The data is imbalanced, so sampling will be
# needed while building the model.
data['Score'].value_counts()

## Text Preprocessing

NLP text preprocessing
- Remove the html tags first.
- Remove punctuation and a small set of special characters, such as `,`, `.`, `#`, etc.
- Verify that the term is composed of English letters alone and not any other characters.
- Keep only terms longer than two characters (it has been observed that there are no two-letter adjectives).
- Change the word's case to lowercase.
- Eliminate Stopwords
- Snowball Stemming (it was observed to be better than Porter Stemming)

In [None]:
# English stop-word list with 'not' and 'but' taken out of it:
# 'not' is a useful word because it changes the meaning of a sentence,
# and 'but' is kept in the text as well.
stop = stopwords.words('english')
for kept_word in ('not', 'but'):
    stop.remove(kept_word)

# Snowball stemmer for English.
snoStem = nltk.stem.SnowballStemmer(language='english')

In [None]:
# Patterns and the lemmatizer are created once here instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_SEPARATOR_PUNCT_RE = re.compile(r'[.,()/]')  # replaced by a space
_REMOVED_PUNCT_RE = re.compile(r'[?!"#|]')    # deleted outright
_LEMMATIZER = WordNetLemmatizer()


def _strip_markup(text):
    """Replace HTML tags with a space, then blank out URLs."""
    return _URL_RE.sub(' ', _HTML_TAG_RE.sub(' ', text))


def _strip_punctuation(text):
    """Delete ? ! " # | and turn . , ( ) / into spaces; apostrophes are left in place."""
    return _SEPARATOR_PUNCT_RE.sub(' ', _REMOVED_PUNCT_RE.sub('', text))


def _drop_stopwords(tokens, stop_words):
    """Return the tokens that are not stop words (surrounding apostrophes are ignored)."""
    return [token for token in tokens if token.strip("'") not in stop_words]


def _lemmatize(tokens):
    """Lemmatize tokens with WordNet, using each token's POS tag (noun when unmapped)."""
    wordnet_pos = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return [
        _LEMMATIZER.lemmatize(token, wordnet_pos.get(tag[:1], wordnet.NOUN))
        for token, tag in nltk.pos_tag(tokens)
    ]


def preprocess(text):
    """Clean one raw review and return it as a space-separated string of stems.

    Steps, in order:
      1. lowercase;
      2. replace HTML tags with a space and remove URLs (tags first, so a URL
         inside an attribute disappears together with its tag);
      3. strip punctuation, except apostrophes;
      4. remove stop words using the module-level ``stop`` list -- done after
         punctuation stripping so that "the," or "it." are recognised, and
         before apostrophes are removed so contractions such as "don't" still
         match the list;
      5. remove apostrophes;
      6. tokenize, POS-tag and lemmatize with WordNet;
      7. Snowball-stem every token with the module-level ``snoStem``.

    Lemmatization runs before stemming because POS tagging needs the surface
    word forms, and stemming is applied per token (stemming the whole sentence
    in one call treats it as a single word).

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. a missing value) yields ''.

    Returns
    -------
    str
        The cleaned review.
    """
    if not isinstance(text, str):
        return ''

    text = _strip_punctuation(_strip_markup(text.lower()))
    kept_tokens = _drop_stopwords(text.split(), set(stop))
    text = ' '.join(kept_tokens).replace("'", '')
    lemmas = _lemmatize(word_tokenize(text))
    return ' '.join(snoStem.stem(lemma) for lemma in lemmas)

In [None]:
# Run the cleaning pipeline over every review text.
data['clean_text'] = data['Text'].map(preprocess)

In [None]:
# x: cleaned review texts, y: binary sentiment flags.
x, y = (data[column].values for column in ('clean_text', 'flag'))

## BoW (Bag of Words)

In [None]:
# Unigram bag-of-words: fit the vocabulary on the cleaned reviews and build the
# document-term count matrix in one pass.
count_vect = CountVectorizer(ngram_range=(1, 1))
final_counts = count_vect.fit_transform(raw_documents=x)

In [None]:
# Check the number of features we got (second entry of the shape).
final_counts.shape

## Bi-Grams 

For simplicity, we only look at how the number of features grows.

In [None]:
# Unigrams + bigrams with scikit-learn; note that this rebinds `count_vect`.
count_vect = CountVectorizer(ngram_range=(1, 2))
final_bigram_counts = count_vect.fit_transform(raw_documents=x)

In [None]:
# Adding bigrams alone makes the feature count grow sharply; consider what
# would happen with 3- or 4-grams.
final_bigram_counts.shape

## TF-IDF

In [None]:
# TF-IDF over unigrams and bigrams, tokenised with NLTK. The text is already
# lowercased by the cleaning step, so the vectorizer's own lowercasing is off.
# Terms must occur in at least 5 documents and the vocabulary is capped at 10,000.
tfidf_vect = TfidfVectorizer(
    tokenizer=word_tokenize,
    lowercase=False,
    ngram_range=(1, 2),
    min_df=5,
    max_features=10000,
)
x_tf = tfidf_vect.fit_transform(raw_documents=x)

In [None]:
# Shape of the TF-IDF matrix built in the previous cell.
x_tf.shape

## Avg. Word2Vec

In [None]:
from gensim.models import Word2Vec

In [None]:
# Tokenise every cleaned review into a list of words for Word2Vec.
x_token = list(map(word_tokenize, x))

In [None]:
# Word2Vec model; no vector size is passed, so the library default applies.
model_w2v = Word2Vec(
    min_count=8,
    window=3,
    sample=6e-5,
    alpha=0.02,
    min_alpha=0.0005,
    negative=15,
)
# Build the vocabulary from the tokenised reviews.
model_w2v.build_vocab(x_token)

In [None]:
# Train for 20 epochs on the tokenised reviews, reporting progress every second.
model_w2v.train(
    x_token,
    epochs=20,
    total_examples=model_w2v.corpus_count,
    report_delay=1,
)

In [None]:
# # Save the model and reload it for future use
# model_w2v.save("Aw2v.model")
# model_w2v = Word2Vec.load("Aw2v.model")

In [None]:
# Sanity check of the embeddings: the ten words most similar to 'food'.
model_w2v.wv.most_similar('food', topn=10)

In [None]:
# mean vector calculation
def AW2V_eachWord(model_w2v, words):
    """Return the average Word2Vec embedding of one tokenised review.

    Parameters
    ----------
    model_w2v : trained Word2Vec-like model
        Must expose its keyed vectors as ``model_w2v.wv``.
    words : iterable of str
        Tokens of a single review.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model_w2v.wv.vector_size``: the mean of the
        vectors of the in-vocabulary tokens, or all zeros when no token is in
        the vocabulary.
    """
    keyed_vectors = model_w2v.wv
    # Membership and lookup go through the KeyedVectors object itself, which
    # avoids the `wv.vocab` attribute and `model[...]` indexing that newer
    # gensim releases no longer provide.
    known_words = [word for word in words if word in keyed_vectors]
    if not known_words:
        # The fallback must have the model's dimensionality, not a hard-coded
        # size, or the per-review vectors cannot be stacked into one matrix.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(keyed_vectors[known_words], axis=0)

In [None]:
# One averaged Word2Vec vector per tokenised review.
x_w2v = list(map(lambda review_tokens: AW2V_eachWord(model_w2v, review_tokens), x_token))