In [96]:
import spacy
import re
import joblib
from nltk.corpus import stopwords
import en_core_web_lg
import pandas as pd
import nltk
import heapq

In [97]:
def normalize_text(text):
    """Strip code blocks, HTML tags and newlines from a piece of text.

    Processing order:
      1. drop every ``<pre>...</pre>`` block together with its content,
      2. drop every ``<code>...</code>`` block together with its content,
      3. drop any remaining HTML tag (the tag itself only, its text is kept),
      4. delete newline characters (they are removed, not replaced by a space).

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML markup.

    Returns
    -------
    str
        The text with markup, code content and newlines removed.
    """
    without_pre = re.sub('<pre>.*?</pre>', '', text, flags=re.DOTALL)
    without_code = re.sub('<code>.*?</code>', '', without_pre, flags=re.DOTALL)
    # Chain from `without_code`: stripping tags from the earlier intermediate
    # would throw away step 2 and leave inline code content in the result.
    without_tags = re.sub('<[^>]+>', '', without_code, flags=re.DOTALL)
    return without_tags.replace("\n", "")

In [98]:
punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'
def cleanup_text(docs, logging=False):
    """Lemmatise one text and drop pronouns, stop words and punctuation.

    The text is run through the module-level spaCy pipeline ``nlp`` with the
    parser and NER components disabled. Every token's lemma is lower-cased
    and stripped; a lemma is discarded when it is spaCy's ``-PRON-``
    placeholder, an NLTK English stop word, or a substring of
    ``punctuations`` (which also discards empty strings). Note that ``.`` is
    not part of ``punctuations``, so full stops are kept.

    Parameters
    ----------
    docs : str
        The text to clean.
    logging : bool, optional
        Accepted but not used by this function.

    Returns
    -------
    pandas.Series
        A one-element Series holding the surviving lemmas joined by spaces.
    """
    english_stop_words = stopwords.words('english')
    parsed = nlp(docs, disable=['parser', 'ner'])

    kept_lemmas = []
    for token in parsed:
        if token.lemma_ == '-PRON-':
            continue
        lemma = token.lemma_.lower().strip()
        # `in punctuations` is a substring test against the punctuation string.
        if lemma in english_stop_words or lemma in punctuations:
            continue
        kept_lemmas.append(lemma)

    return pd.Series([' '.join(kept_lemmas)])


In [99]:
def generate_summary(text_without_removing_dot, cleaned_text,
                     max_sentences=7, max_sentence_words=30):
    """Build an extractive summary by scoring sentences on word frequency.

    Sentences are taken from ``text_without_removing_dot`` using the
    module-level spaCy pipeline ``nlp``; punctuation is stripped from each
    sentence afterwards. Word weights come from ``cleaned_text``: every
    non-stop-word token is counted and the counts are divided by the largest
    count. A sentence's score is the sum of the weights of its (lower-cased)
    tokens. The highest scoring sentences are joined with single spaces,
    ordered by score rather than by position in the text.

    Parameters
    ----------
    text_without_removing_dot : str
        Text that still contains sentence punctuation, used for sentence
        splitting.
    cleaned_text : str
        Pre-processed version of the same text, used to compute word weights.
    max_sentences : int, optional
        Maximum number of sentences in the summary (default 7).
    max_sentence_words : int, optional
        Sentences with this many or more space-separated pieces are ignored
        (default 30).

    Returns
    -------
    str
        The summary, or an empty string when ``cleaned_text`` yields no
        usable words or no sentence receives a score.
    """
    doc = nlp(text_without_removing_dot)
    # spaCy handles sentence segmentation; everything except word characters
    # and whitespace is then removed from each sentence.
    sentence_list = [re.sub(r'[^\w\s]', '', str(sentence)) for sentence in doc.sents]

    # A set gives constant-time membership checks; the name avoids shadowing
    # the `stopwords` corpus object imported at the top of the notebook.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    word_frequencies = {}
    for word in nltk.word_tokenize(cleaned_text):
        if word not in stop_words:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Nothing to score against (e.g. empty or stop-word-only text): max() on
    # an empty collection would raise ValueError, so return an empty summary.
    if not word_frequencies:
        return ''

    maximum_frequency = max(word_frequencies.values())
    word_frequencies = {
        word: count / maximum_frequency
        for word, count in word_frequencies.items()
    }

    sentence_scores = {}
    for sent in sentence_list:
        # The length filter depends only on the sentence, so test it once
        # instead of once per token.
        if len(sent.split(' ')) >= max_sentence_words:
            continue
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

    summary_sentences = heapq.nlargest(max_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(summary_sentences)

In [100]:
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code - only load this from a trusted source.
df = joblib.load("data/dataframe_nearest_1day.pkl")
df = df.rename(columns={'text_x': 'tip', 'text_y': 'review'})

# TEST: restrict the run to the first 1000 rows.
# .copy() makes the sample an independent frame, so the column assignments in
# the following cell do not operate on a slice of the loaded frame
# (which would emit SettingWithCopyWarning).
df = df[:1000].copy()

# spaCy pipeline used by cleanup_text and generate_summary.
nlp = en_core_web_lg.load()

In [None]:
# Step 1: remove markup and newlines from every raw review.
df['review_cleaned_1'] = df['review'].apply(normalize_text)

# Step 2: lemmatise the stripped review and drop stop words / punctuation.
df['review_cleaned'] = df['review_cleaned_1'].apply(
    lambda stripped_review: cleanup_text(stripped_review, logging=False)
)

# Step 3: summarise each row; the stripped text supplies the sentences and
# the cleaned text supplies the word weights.
df['review_answer'] = df.apply(
    lambda row: generate_summary(row['review_cleaned_1'], row['review_cleaned']),
    axis=1,
)

1_DONE
2_DONE
3_DONE
9
4_DONE
Original Text::::::::::::

Just finished a good meal here. Clean, friendly, effecient service. Not a place for varieties of pho but if you're looking for out of the norm Vietnamese dishes then this is the place to try. We had Bun Thang , Cha Ca + extra noodles, and combo C (house soup, imperial roll, chicken with peanut/coconut milk sauce, rice and tea), Vietnamese style iced-coffee which came to about $48 with tax and tip for 3 people.  Our waitress spoke Vietnamese, French and a decent amount of English.  It also participates in BYOW.


Summarized text::::::::

chicken with peanutcoconut milk sauce rice and tea Vietnamese style icedcoffee which came to about 48 with tax and tip for 3 people   Cha Ca  extra noodles and combo C house soup imperial roll
