In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import seaborn as sb
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from numba import vectorize
import unidecode
from nltk.corpus import stopwords
import time
import spacy

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer



import string,lxml,bs4,nltk
from warnings import simplefilter

# 0- Functions & Definitions

#### Variables

In [2]:
# Lookup table used by expand_words: lower-cased contraction -> expanded form.
# Keys must stay lower-case because the lookup is done on word.lower().
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "can not",
"can't've": "can not have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "I had",
"i'd've": "I would have",
"i'll": "I will",
"i'll've": "I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that had",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they had",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

#### Function: to load file

In [3]:
#Load a CSV file into a DataFrame and report what was loaded
def loadFile(fileName):
    """Read ``fileName`` with ``pd.read_csv`` and print a short load summary.

    Parameters
    ----------
    fileName : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(frame, column_names)`` where ``column_names`` is the list of the
        frame's column labels.
    """
    print('Loading File...',end='')
    frame = pd.read_csv(fileName)
    print('[ok]')

    column_names = frame.columns.tolist()
    row_count = len(frame)
    print('Loaded {:,} reviews' .format(row_count))
    print('Column Names: ', column_names)
    return frame, column_names

#### Function: to drop unused columns

In [4]:
#Keep only the requested columns of a dataframe
def neededColumnsOnly(df, columns_needed):
    """Drop every column of ``df`` that is not listed in ``columns_needed``.

    The drop is done in place, so the frame passed in is modified and the
    very same object is returned.
    """
    unwanted = [name for name in df.columns.tolist() if name not in columns_needed]
    df.drop(columns=unwanted, inplace=True)
    return df

#### Function: to find columns containing nulls

In [5]:
#This function reports which columns contain null values
def columns_with_nulls(df, index_column):
    """Return the names of the columns of ``df`` holding at least one null.

    Parameters
    ----------
    df : pandas.DataFrame
    index_column : label
        Column that is never reported, even when it contains nulls.

    Returns
    -------
    list
        Column labels, in the frame's column order.

    The check is done directly with ``isnull().any()``. The previous
    groupby/positional-lookup version relied on a bare ``except`` and would
    silently report "no nulls" whenever the lookup itself failed.
    """
    return [
        column
        for column in df.columns
        if column != index_column and df[column].isnull().any()
    ]

#### Function: to remove all null values from columns

In [6]:
#This function replaces the nulls in the given columns with empty strings
def remove_nulls(df, columns):
    """Fill nulls in ``columns`` of ``df`` with ``''`` and return ``df``.

    The frame is updated in place (the same object is returned). The filled
    column is assigned back explicitly: calling ``fillna(inplace=True)`` on
    ``df[column]`` operates on an intermediate object and does not update
    ``df`` when pandas Copy-on-Write is active.
    """
    for column in columns:
        df[column] = df[column].fillna('')
    return df

#### Function: to find columns containing html tags

In [7]:
#This function reports which text columns appear to contain html tags
def columns_with_html(df):
    """Return the names of the columns where some value contains ``'<'``.

    This is a heuristic: the presence of a literal ``'<'`` is taken as a sign
    of HTML markup; nothing is parsed.

    Columns that do not support the ``.str`` accessor (e.g. numeric ones) are
    skipped. Null values are treated as "no tag" - with ``str.find`` a null
    yields NaN, and ``NaN != -1`` is true, which flagged columns that merely
    contained nulls.
    """
    html_columns = []
    for column in df.columns:
        try:
            has_tag = df[column].str.contains('<', regex=False, na=False)
        except AttributeError:
            # Not a text column: there is nothing to scan.
            continue
        if has_tag.any():
            html_columns.append(column)
    return html_columns

#### Function: to remove html tags from data

In [8]:
#Strip html markup from a piece of text
def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's ``html.parser`` and return its text.

    The text fragments are joined with a single space as separator.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ")

#### Function: to merge Summary and Text in one column and get rid of productId column

In [9]:
#Combine Summary and Text into a single 'review' column
def merge_summary_text(df):
    """Add ``review`` = ``Summary + ' ' + Text`` and drop the source columns.

    ``Summary``, ``Text`` and ``ProductId`` are removed in place; the frame
    that was passed in is returned.
    """
    summary_with_gap = df['Summary'] + ' '
    df['review'] = summary_with_gap + df['Text']
    df.drop(columns=['Summary', 'Text', 'ProductId'], inplace=True)
    return df

#### Function: to correct accent in letters

In [10]:
#Replace accented letters with their plain equivalents, e.g. nescafé becomes nescafe
def correct_accent(text):
    """Return ``text`` transliterated by ``unidecode.unidecode``."""
    return unidecode.unidecode(text)

#### Function: to visualize score count

In [11]:
def visualize_score_count(df):
    """Draw a bar plot of the number of reviews per ``Score`` value.

    Expects ``df`` to have ``Score`` and ``review`` columns; a new 8x6 figure
    is created and the bars are drawn with seaborn.
    """
    # count the non-null reviews for each score
    counts_by_score = (
        df[['Score', 'review']]
        .groupby('Score')
        .agg('count')
        .reset_index()
    )
    plt.subplots(figsize=(8, 6))
    sb.barplot(x='Score', y="review", data=counts_by_score)

#### Function: to remove punctuations

In [12]:
#Blank out punctuation, keeping the apostrophe for the contraction step
def remove_punctuations(text):
    """Replace every ``string.punctuation`` character except ``'`` by a space."""
    to_blank = string.punctuation.replace("'","")
    blanking_table = str.maketrans(to_blank, " " * len(to_blank))
    return text.translate(blanking_table)

#### Function: to remove extra spaces

In [13]:
#Collapse runs of whitespace into single spaces
def remove_extra_space(text):
    """Return ``text`` with whitespace runs reduced to one space and the ends trimmed."""
    words = text.split()
    return " ".join(words)

#### Function: to expand short words

In [14]:
#This function will expand short words such as isn't => is not
def expand_words(text, mapping=None):
    """Expand contractions in ``text`` one whitespace-delimited token at a time.

    Parameters
    ----------
    text : str
    mapping : dict, optional
        Lower-cased contraction -> expansion. Defaults to the notebook-level
        ``contractions`` table.

    Returns
    -------
    str
        ``text`` with every token whose lower-cased form is a key of
        ``mapping`` replaced by its expansion. Whitespace is left untouched.

    Each token is matched as a whole. A plain ``text.replace(word, ...)``
    also rewrites occurrences inside longer tokens, e.g. expanding "can't"
    first turns "can't've" into "can not've", which is then never expanded.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    if mapping is None:
        mapping = contractions

    def _expand(match):
        token = match.group(0)
        return mapping.get(token.lower(), token)

    return re.sub(r"\S+", _expand, text)

#### Function: to replace remaining punctuation '

In [15]:
#Replace every remaining apostrophe with a space
def remove_apostrophe(text):
    """Return ``text`` with each ``'`` character turned into a single space."""
    return text.replace("'", " ")

#### Function: to remove english stop words

In [16]:
# Stop words are removed except "no" and "not", since those affect the classification.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words minus 'no'/'not', built once and reused."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english')) - {'no', 'not'}
    return _STOP_WORD_CACHE['english']


def remove_stop_words(text):
    """Drop English stop words (except 'no' and 'not') from ``text``.

    The text is split on whitespace and every token that exactly equals a
    stop word is removed; the remaining tokens are joined with single spaces.
    Matching is case-sensitive against NLTK's lower-case list, so the text is
    expected to be lower-cased beforehand.

    Filtering whole tokens also removes stop words at the very start or end
    of the text and directly repeated ones ("the the"), which a
    ``replace(" word ", " ")`` pass leaves behind. The stop-word set is cached
    so it is not rebuilt for every row.
    """
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

#### Function: to transform all to lower case

In [17]:
def toLower(text):
    """Return the lower-cased version of ``text``."""
    return text.lower()

#### Function: to tokenize text

In [18]:
def tokeniz(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

#### Function: to stem text

In [19]:
def stemm(text):
    """Apply NLTK's Porter stemmer to each whitespace-separated word of ``text``.

    Returns the stemmed words joined back together with single spaces.
    """
    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in text.split())

#### Function: to lemmatize text

In [20]:
def lemm(text):
    """Lemmatize each whitespace-separated word of ``text`` with WordNet.

    Returns the lemmas joined back together with single spaces.
    """
    wordnet = WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in text.split())

#### Function: to lemm text using Spacy

In [21]:
_SPACY_PIPELINE_CACHE = {}


def _spacy_pipeline():
    """Load the English spaCy pipeline (parser and NER disabled) once and reuse it."""
    if 'nlp' not in _SPACY_PIPELINE_CACHE:
        try:
            nlp = spacy.load('en', disable=['parser', 'ner'])
        except OSError:
            # NOTE(review): spaCy 3 dropped shortcut names such as 'en'; falling back
            # to the small English package - confirm this is the model installed here.
            nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        _SPACY_PIPELINE_CACHE['nlp'] = nlp
    return _SPACY_PIPELINE_CACHE['nlp']


def spacy_lemm(text):
    """Lemmatize ``text`` with spaCy and return the lemmas joined by spaces.

    The pipeline is loaded on first use and cached: loading it inside the
    function body meant the model was reloaded for every row when the
    function is used with ``Series.apply``.
    """
    doc = _spacy_pipeline()(text)
    return ' '.join(token.lemma_ for token in doc)

#### Function: to Represent Vector Count Model

In [22]:
# Build a bag-of-words representation of a text column.
# vectorizer selects the model: "count" (CountVectorizer) or "tfidf" (TfidfVectorizer).

def VCM(vectorizer, df, colum_name, class_column, ngram_min=1, ngram_max=1):
    """Vectorize ``df[colum_name]`` and return the pieces needed for modelling.

    Parameters
    ----------
    vectorizer : {"count", "tfidf"}
    df : pandas.DataFrame
    colum_name : label of the text column to vectorize.
    class_column : label of the target column.
    ngram_min, ngram_max : int
        Bounds passed as ``ngram_range=(ngram_min, ngram_max)``.

    Returns
    -------
    tuple
        ``(x_Train, y_Train, vect, feature_names, vectDF)``: the sparse
        document-term matrix, the target column, the fitted vectorizer, the
        list of feature names and a dense DataFrame view of the matrix.

    Raises
    ------
    ValueError
        If ``vectorizer`` is neither "count" nor "tfidf".
    """
    # Validate first: an unknown value used to fall through both branches and
    # fail later with an unbound local variable.
    if vectorizer not in ("count", "tfidf"):
        raise ValueError(
            'vectorizer must be "count" or "tfidf", got {!r}'.format(vectorizer)
        )

    ngram_range = (ngram_min, ngram_max)
    if vectorizer == "count":
        vect = CountVectorizer(ngram_range=ngram_range)
    else:
        vect = TfidfVectorizer(ngram_range=ngram_range)

    x_Train = vect.fit_transform(df[colum_name])
    y_Train = df[class_column]

    # get_feature_names() was removed from newer scikit-learn releases;
    # prefer get_feature_names_out() and keep returning a plain list.
    if hasattr(vect, "get_feature_names_out"):
        feature_names = vect.get_feature_names_out().tolist()
    else:
        feature_names = vect.get_feature_names()

    # Dense copy for inspection only; this materialises the full matrix.
    vectDF = pd.DataFrame(x_Train.toarray(), columns=feature_names)
    return x_Train, y_Train, vect, feature_names, vectDF

# 1- Data Exploration and Visualization:

## 1.1 Data Preparation and Cleaning

#### 1.1.1 Load Data File

In [23]:
# Load Data Set, this will return our main Data set and list of column names

original_df, column_names = loadFile('train.csv')

Loading File...[ok]
Loaded 426,340 reviews
Column Names:  ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']


#### 1.1.2 Extract Needed Columns

In [24]:
# Keep only the columns used downstream (ProductId, Score, Summary, Text).
# neededColumnsOnly drops the other columns from original_df in place and
# returns that same frame, so df_required_columns is original_df itself.

df_required_columns = neededColumnsOnly(original_df, ['ProductId','Score','Summary','Text'])

In [25]:
df_required_columns.head(10)

Unnamed: 0,ProductId,Score,Summary,Text
0,B0034EDLS2,5,Very Good,I received this product early from the seller!...
1,B001I7HJE4,5,"Organic, Kosher, Tasty Assortment of Premium T...",*****<br />Numi's Collection Assortment Melang...
2,B000LKTB90,5,"excellent gluten-free spaghetti: great taste, ...","I was very careful not to overcook this pasta,..."
3,B001HXJPS2,5,Lindt is Lindt,Buying this multi-pack I was misled by the pic...
4,B006H34CUS,5,YUM!!!!!,These bars are so good! I loved them warmed up...
5,B004728MI4,5,Delicious,"I love these chips, I buy the 24 pack once a m..."
6,B001BZ5EFE,5,Tastes great and it's organic!!,I'm a huge fan of eating cereal for breakfast....
7,B0001AVRQK,2,Poor taste,I was really disappointed with the Sorghum we ...
8,B005GV9RZC,3,Better than US Instant Coffee,A friend who has gone to Korea gave me a coupl...
9,B004FEN3GK,4,Hard not to like!,No need for plastic baggies or sloppy tin foil...


#### 1.1.3 Remove Nulls

In [26]:
# Get list of columns containing null values

list_columns_with_nulls = columns_with_nulls(df_required_columns, 'ProductId')

In [27]:
list_columns_with_nulls

['Summary']

In [28]:
# Replace null values with empty strings in the columns found above.
# remove_nulls returns the frame it was given, so df_without_nulls is the
# same object as df_required_columns.

df_without_nulls = remove_nulls(df_required_columns, list_columns_with_nulls)

In [29]:
df_without_nulls.head()

Unnamed: 0,ProductId,Score,Summary,Text
0,B0034EDLS2,5,Very Good,I received this product early from the seller!...
1,B001I7HJE4,5,"Organic, Kosher, Tasty Assortment of Premium T...",*****<br />Numi's Collection Assortment Melang...
2,B000LKTB90,5,"excellent gluten-free spaghetti: great taste, ...","I was very careful not to overcook this pasta,..."
3,B001HXJPS2,5,Lindt is Lindt,Buying this multi-pack I was misled by the pic...
4,B006H34CUS,5,YUM!!!!!,These bars are so good! I loved them warmed up...


In [30]:
# double check if removed

columns_with_nulls(df_without_nulls, 'ProductId')

[]

#### 1.1.3 Remove Html Tags

In [31]:
# Get list of columns containing html tags

list_columns_with_html = columns_with_html(df_without_nulls)

In [32]:
list_columns_with_html

['Text']

In [33]:
# example of html tag in text column

df_without_nulls.loc[1]['Text']

'*****<br />Numi\'s Collection Assortment Melange includes:<br />5 Herbal Teas (caffeine-free, also called "teasans"):<br />* Dry Desert Lime: Lime Herbal Teasan<br />* Fields of Gold: Lemongrass Herbal Teasan<br />* Red Mellow Bush: Rooibos Herbal Teasan<br />* Bushmen\'s Brew: Honeybush Herbal Teasan<br />* Simply Mint: Moroccan Mint<br /><br />2 Green Teas (lower in caffeine):<br />* Temple of Heaven: Gunpowder Green Tea<br />* Monkey King: Jasmine Green Tea<br /><br />2 Black Teas (contain caffeine):<br />* Chinese Breakfast: Yunnan Black Tea<br />* Smoky Tarry: Lapsang Souchong Black Tea<br /><br />This is a total of nine different teas, two tea bags of each one in each box. Numi teas are known for their high-quality, organic and kosher ingredients, and in my opinion, are some of the tastiest and best teas I have ever tried. They do not include artificial ingredients or flavorings.<br /><br />On the box, the manufacturer writes: "From mist-covered mountains to sun-drenched deserts

In [34]:
# Work on a copy so the HTML-stripping step below leaves df_without_nulls unchanged.
df_without_html = df_without_nulls.copy()

In [35]:
# Get rid of html tags 
df_without_html['Text'] = df_without_html['Text'].apply(remove_html_tags)

In [None]:
df_without_html.loc[1]['Text']

#### 1.1.4 Merge Summary and Text in review column

In [None]:
# Final dataframe preparation: merge_summary_text adds the 'review' column and
# drops Summary, Text and ProductId in place, so final_df is df_without_html itself.

final_df = merge_summary_text(df_without_html)

In [None]:
final_df.head(10)

##### 1.1.5 Correct accented letters

In [None]:
#check if accent exist
text_html = final_df['review'].str.find('é')
text_html = text_html[text_html != -1]

In [None]:
text_html.head(10)

In [None]:
final_df['review'].iloc[15712] #nescafé  is found

In [None]:
#remove accents
final_df['review'] = final_df['review'].apply(correct_accent)

In [None]:
#double check if removed
final_df['review'].iloc[15712]

##### 1.1.6 Remove Punctuations

In [None]:
#remove punctuations
final_df['review'] = final_df['review'].apply(remove_punctuations)

In [None]:
final_df.iloc[1]["review"]

##### 1.1.7 Expand Short Words

In [None]:
final_df['review'] = final_df['review'].apply(expand_words)

In [None]:
final_df.iloc[1]["review"]

##### 1.1.8 Remove remaining apostrophe

In [None]:
final_df['review'] = final_df['review'].apply(remove_apostrophe)

In [None]:
final_df.iloc[1]["review"]

#### 1.1.9 Remove Extra Spaces

In [None]:
final_df['review'] = final_df['review'].apply(remove_extra_space)

In [None]:
final_df.iloc[1]["review"]

In [None]:
final_df.head(10)

#### 1.1.10 transform to lower case

In [None]:
final_df['review'] = final_df['review'].apply(toLower)

In [None]:
final_df.iloc[1]["review"]

#### 1.1.11 remove stop words

In [None]:
final_df['review'] = final_df['review'].apply(remove_stop_words)

In [None]:
final_df.iloc[1]["review"]

#### 1.1.12 remove extra spaces again

In [None]:
final_df['review'] = final_df['review'].apply(remove_extra_space)

In [None]:
final_df.head(10)

## 1.2 Data Explore and Visualize

#### 1.2.1 Visualize score count

In [None]:
visualize_score_count(final_df)

#### 1.2.2 Describe Score Data

In [None]:
final_df.groupby('Score').describe()

#### 1.2.3 Analyze finding

#### We found that not all reviews are unique, so we investigate the duplicates further

In [None]:
final_df.groupby('review').agg('count')

#### visualize one of the duplicate data

In [None]:
# Locate the rows whose review contains a known duplicated sentence;
# str.find returns -1 where the phrase is absent, so those rows are filtered out.
# NOTE(review): by this point 'review' has been lower-cased (1.1.10) and had stop
# words removed (1.1.11), so this mixed-case phrase may no longer match any row -
# confirm that `t` is non-empty on a fresh Restart & Run All.
t = final_df['review'].str.find('I only used two maybe three tea bags and got pregnan')
t = t[t != -1]

In [None]:
t

In [None]:
final_df.loc[43935]['review']

In [None]:
final_df.loc[187578]['review']

In [None]:
final_df.loc[306678]['review']

#### check initial row count

In [None]:
final_df.count()

#### 1.2.4 Remove Duplicates

In [None]:
# Drop rows that are identical in every column, keeping the first occurrence.
# inplace=True modifies final_df itself.
final_df.drop_duplicates(keep="first",inplace=True)

#### check new row count

In [None]:
final_df.count()

In [None]:
final_df.groupby('Score').describe()

In [None]:
final_df.head(10)

In [None]:
final_df.iloc[124]['review']

# 2- Text Processing and Normalization:

#### 2.1 Extract part of final data

In [None]:
# Take the first 10 rows as an independent copy; the tokenize/stem/lemmatize
# experiments below add columns to this sample only.
final_df_sub = final_df.head(10).copy()

In [None]:
final_df_sub.head(10)

#### 2.2 Tokenize Data

In [None]:
final_df_sub['token'] = final_df_sub['review'].apply(tokeniz)

In [None]:
final_df_sub.head(10)

#### 2.3 Stemming Data

In [None]:
# Stem every review of the sample into a new 'stem' column and print the
# elapsed wall-clock time (difference of two time.time() readings, in seconds).
start_time = time.time()
final_df_sub['stem'] = final_df_sub['review'].apply(stemm)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
final_df_sub.head(10)

#### 2.4 Lemmatizing Data

In [None]:
# Lemmatize every review of the sample (WordNet) into a new 'lemm' column and
# print the elapsed wall-clock time in seconds.
start_time = time.time()
final_df_sub['lemm'] = final_df_sub['review'].apply(lemm)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
final_df_sub.head(10)

#### 2.5 Lemmatizing Data using spacy

In [None]:
# Lemmatize every review of the sample with spaCy into a new 'spacy_lemm'
# column and print the elapsed wall-clock time in seconds.
start_time = time.time()
final_df_sub['spacy_lemm'] = final_df_sub['review'].apply(spacy_lemm)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
final_df_sub.head(10)

#### 2.6 Compare Results

In [None]:
final_df_sub['stem'].iloc[0]

In [None]:
final_df_sub['lemm'].iloc[0]

In [None]:
final_df_sub['review'].iloc[0]

In [None]:
final_df_sub['spacy_lemm'].iloc[0]

# 3- Vector Space Model and Feature Representation:

#### 3.1 Count Vector

In [None]:
xTrain, y_Train, count_vect, feature_names, count_vectDF = VCM("count", final_df_sub, "stem", "Score")

In [None]:
count_vectDF.head(10)

In [None]:
#visualize important features 
count_vectDF.mean(axis=0).sort_values()

#### 3.2 TFIDF Vector

In [None]:
xTrain2, y_Train2, tfidf_vect, feature_names2, tfidf_vectDF = VCM("tfidf", final_df_sub, "stem", "Score")

In [None]:
tfidf_vectDF.head(10)

In [None]:
#visualize important features 
tfidf_vectDF.mean(axis=0).sort_values()

### <font color='red'> Here we can note that some features, although important, are ambiguous as single words (1-grams) and may need bigrams to carry a clear meaning. Example: "taste" can describe either a good or a bad taste. </font>

#### 3.3 Count Vector WITH BIGRAM

In [None]:
xTrain_ngram_2, y_Train_ngram_2, count_vect_ngram_2, feature_names_ngram_2, count_vectDF_ngram_2 = VCM("count", final_df_sub, "stem", "Score",2,2)

In [None]:
count_vectDF_ngram_2.head(10)

In [None]:
count_vectDF_ngram_2.mean(axis=0).sort_values()

### <font color="red"> Here we can see how combining "taste" with a neighbouring word gives it a clear meaning. </font>

In [None]:
# Cross-validate a logistic regression on the stemmed count-vector features
# built in section 3.1 (xTrain / y_Train).
# The cell previously referenced x_Train_1 / y_Train_1, which are never defined
# in this notebook and fail with NameError on a fresh Restart & Run All.
# NOTE(review): xTrain covers only the 10-row sample; swap in features built
# from the full dataset for a meaningful accuracy estimate.
# LogisticRegression is already imported in the first cell.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LogisticRegression(max_iter = 10000), xTrain, y_Train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))