# Sentiment Analysis, Text Summarization & Visualisation

## Setup
To prepare your environment, you need to install some packages.

### Install the necessary packages

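You need the following packages (note that `watson-developer-cloud` is pinned to version 1.5, and the `gensim.summarization` module used below is only available in gensim releases before 4.0):<br>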

In [None]:
!pip install gensim

In [None]:
!pip install watson-developer-cloud==1.5

In [None]:
!pip install pyldavis

In [None]:
!pip install wordcloud

In [None]:
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
import watson_developer_cloud
# import ibm_boto3
# from botocore.client import Config
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import urllib
from bs4 import BeautifulSoup
import requests
import nltk
nltk.download('all')

## 1. Summarization & keywords extraction

### 1a. Helper functions to extract summary and keywords

In [None]:
'''Get the summary of the text'''

def get_summary(text, pct):
    """Summarize ``text`` with gensim and return the result of ``summarize``.

    ``pct`` is forwarded as the ``ratio`` argument. ``split=True`` requests the
    selected sentences as a list instead of one joined string.
    """
    return summarize(text, ratio=pct, split=True)

def complete_summary(summary):
    """Join the summary sentences into a single space-separated string.

    The type of the joined value is printed before it is returned.
    """
    joined = " ".join(summary)
    print(type(joined))
    return joined

'''Get the keywords of the text'''

def get_keywords(text):
    """Extract keywords from ``text`` and return them as a list of strings.

    gensim's ``keywords`` is called with ``split=False``, so it hands back one
    newline-separated string; that string is split on ``'\\n'`` here.
    """
    newline_joined = keywords(
        text,
        ratio=0.1,
        words=None,
        split=False,
        scores=False,
        pos_filter=('NN', 'JJ'),
        lemmatize=False,
        deacc=False,
    )
    return newline_joined.split('\n')

'''Tokenize the sentence into words & remove punctuation'''

def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    for raw_sentence in sentences:
        as_text = str(raw_sentence)
        yield gensim.utils.simple_preprocess(as_text, deacc=True)
        
def split_sentences(text):
    """Split text into sentence-like fragments.

    The text is cut at every ``[``, ``]``, newline, ``.``, ``!`` and ``?``.
    The delimiters themselves are dropped. Fragments are returned untrimmed,
    and an empty string appears wherever delimiters are adjacent or the text
    starts or ends with one.

    Args:
        text: The string to split.

    Returns:
        A list of fragments in their original order.
    """
    # Imported locally because the notebook's import cell never imports ``re``;
    # without this the function raised NameError on its first call.
    import re

    sentence_delimiters = re.compile(r'[\[\]\n.!?]')
    return sentence_delimiters.split(text)

def split_into_tokens(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the result."""
    return nltk.word_tokenize(text)
    
def POS_tagging(text):
    """Return the part-of-speech tags produced by ``nltk.tag.pos_tag``.

    NOTE(review): ``text`` is handed to ``pos_tag`` unchanged; presumably
    callers pass an already tokenized sequence rather than a raw string --
    confirm against the call sites.
    """
    return nltk.tag.pos_tag(text)

def extract_title_text(url):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the text of the page's
        ``<title>`` element, or an empty string when the page has none.
        ``text`` is the text of every ``<p>`` element joined with single
        spaces.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly instead of relying on
    # another package having done so.
    import urllib.request

    # The context manager closes the HTTP response even if reading or
    # decoding fails.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode('utf8')

    soup = BeautifulSoup(page, 'lxml')
    text = ' '.join(paragraph.text for paragraph in soup.find_all('p'))
    # Pages without a <title> make ``soup.title`` None; fall back to ''.
    title = soup.title.text if soup.title is not None else ''
    return title, text

### 1b. Summarization & keywords extraction

In [None]:

import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

# Placeholder method attached to the storage response body further down.
# It simply returns 0; it does not produce a real iterator.
def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
# NOTE(review): the API key below is stored in plain text in the notebook --
# consider rotating it and loading it from configuration kept outside the file.
client_6b4ef4db85984d6cb9f05ef0da73c427 = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='Mfkrien7_WF2rNDAEHoJEHHrKvXHZANIcPRqsHeuZPFW',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

# Request the JSON object from the bucket and keep the 'Body' entry of the response.
body = client_6b4ef4db85984d6cb9f05ef0da73c427.get_object(Bucket='submissionanalysis-donotdelete-pr-jdnlxbx7mat2e1',Key='ProblemSolutionDBData.json')['Body']
# Add the missing __iter__ method, so pandas accepts body as a file-like object.
# The placeholder is only bound when body has no __iter__ attribute of its own.

if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# Since JSON data can be semi-structured and contain additional metadata, it is possible that you might face an error during data loading.
# Please read the documentation of 'pandas.read_json()' and 'pandas.io.json.json_normalize' to learn more about the possibilities to adjust the data loading.
# pandas documentation: http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader
# and http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html

#print(body)
#df_data_1 = pd.read_json(body, orient='values')
# Parse the downloaded JSON body into a DataFrame.
df_data_1 = pd.read_json(body)
#df_data_1.head()

# Text of the second upload (index 1) for the problem and for the solution.
problem = df_data_1["PROBLEM"]["uploads"][1]["text"]
solution = df_data_1["SOLUTION"]["uploads"][1]["text"]

# Build one string from all solution comments: a leading space, then every
# comment followed by a single space.
comments = df_data_1['SOLUTION']["comments"]
rawtext = ' ' + ''.join(entry['comment'] + ' ' for entry in comments)
#print(rawtext)

# ---- Summary of the review comments ----
print('-------------------------------------------------------------------------------------------------------------------')
print('Printing Summary of review comments')
print('--------------------------')

# 0.13 is passed through to get_summary as the summarization ratio.
summary = get_summary(rawtext, 0.13)
print(summary)

# ---- Keywords rendered as hashtags ----
print('-------------------------------------------------------------------------------------------------------------------')
print('-------------------------------------------------------------------------------------------------------------------')
print('Printing Keywords')
print('--------------------------')

# Prefix every extracted keyword with '#', then join them with single spaces.
hashtag_list = ['#' + kw for kw in get_keywords(rawtext)]
hashtag_string = " ".join(hashtag_list)
print(hashtag_string)
print('-------------------------------------------------------------------------------------------------------------------')
print ('-------------------------------------------------------------------------------------------------------------------')

## 2. Topic Modelling

### 2a. Start the preprocessing for Topic Modelling

Topic Modelling is an approach for finding topics in large amounts of text. Topic modeling is great for document clustering, information retrieval from unstructured text, and feature selection.
 
Topic Modeling with Latent Dirichlet Allocation technique.

Why Latent Dirichlet Allocation? This technique can create a model that generalizes easily to any new text corpus and helps us identify the important topics in the corpus.

Some of the advantages are:

Training documents may come in sequentially, no random access required.

Runs in constant memory w.r.t. the number of documents: size of the training corpus does not affect memory footprint, can process corpora larger than RAM.

Is distributed & makes use of a cluster of machines, if available, to speed up model estimation.

In [None]:
# article_text = summary
# Prepare the review text for the word cloud and the topic model:
# tokenize, drop stop words, then lemmatize.
stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

# Extra words to drop on top of NLTK's English stop words. They are matched
# against whole tokens. The previous chained str.replace() calls removed them
# as substrings, which corrupted longer words (e.g. "they" -> "y",
# "other" -> "or") and left replace('maybe', '') with nothing to match once
# 'may' had already been stripped.
extra_stop_words = {'the', 'It', 'may', 'maybe', 'wish'}

# Kept as a named copy of the raw text; filtering now happens per token below.
filteredtext = rawtext

word_tokens = word_tokenize(str(filteredtext))
filtered_sentence = [
    w for w in word_tokens
    if w not in stop_words and w not in extra_stop_words
]
normalized = " ".join(lemma.lemmatize(word) for word in filtered_sentence)

## 3. Visualization

In [None]:
from wordcloud import WordCloud
#wordcloud = WordCloud(background_color='white',max_font_size=60).generate(normalized)

# Generate the word cloud from the normalized (lemmatized) text.
wordcloud = WordCloud(
    background_color="white",
    max_words=70,
    contour_width=10,
    contour_color='firebrick',
).generate(normalized)

# Plot the word cloud in matplotlib on a 16x12 figure with the axes hidden.
plt.figure(figsize=(16, 12))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## 4. Create Topic Model

In [None]:
# Materialize the token lists produced by sent_to_words.
# NOTE(review): filtered_sentence is a flat list of word tokens, so every
# "document" fed to the dictionary holds at most the tokens of one word --
# confirm this is the intended document granularity.
tokenized_sents = [*sent_to_words(filtered_sentence)]
print(tokenized_sents)

# Term dictionary of the corpus: every unique term is assigned an index.
dictionary = corpora.Dictionary(tokenized_sents)

# Document-term matrix: one bag-of-words per document, built with the dictionary above.
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_sents))
print(doc_term_matrix)

In [None]:
# Shorthand for gensim's LDA model class.
Lda = gensim.models.ldamodel.LdaModel

# Train a single-topic LDA model on the document-term matrix (30 passes).
ldamodel = Lda(
    corpus=doc_term_matrix,
    num_topics=1,
    id2word=dictionary,
    passes=30,
)

# Keep the formatted topic descriptions, ten words per topic.
topics = ldamodel.print_topics(num_words=10)



In [None]:
# Show the topic descriptions returned by ldamodel.print_topics(num_words=10).
print(topics)


In [None]:
# For every topic entry, print the part of its description (element 1) that
# comes before the first double quote.
# NOTE(review): if the description has the form weight*"word" + ..., this
# prints the leading weight rather than the word -- confirm that is intended.
for topic in topics:
    print(topic[1].partition('"')[0])

In [None]:
print('---------------------------------------')
# Tweet-style text: the joined summary, one space, then the hashtags.
tweet_with_summary_hashtags = " ".join((complete_summary(summary), hashtag_string))
print(len(tweet_with_summary_hashtags))
print(tweet_with_summary_hashtags)


#### The coherence score is a 'higher is better' metric; given the score of 0.86, we can be assured that we have selected the right number of topics for this corpus.

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound (a log-scale value
# where higher is better), not the perplexity itself. The perplexity is
# 2 ** (-bound), and for that number lower is better. The previous code
# printed the bound under the "Perplexity" label.
per_word_bound = ldamodel.log_perplexity(doc_term_matrix)
print('\nPer-word likelihood bound: ', per_word_bound)
print('\nPerplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score (c_v measure); higher is better.
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=tokenized_sents,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### We have seen how to summarize and visualize the review comments of a submitted solution and how to create hashtags that give quick information about the data. This methodology can be applied to a lot of use cases to extract insights from unstructured data.