In [1]:
import pandas as pd

In [4]:
df = pd.read_json('data_ccrit.json')

# get proposal wise comments
ideas = df.idea.unique()
proposal_wise_comments = []

for idea in ideas:
    proposal_wise_comments.append(list(df.loc[df['idea'] == idea, 'comment']))

In [9]:
import re
from pprint import pprint

import numpy as np
from nltk import sent_tokenize, word_tokenize

from nltk.cluster.util import cosine_distance

MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)


def normalize_whitespace(text):
    """
    Translates multiple whitespace into single space character.
    If there is at least one new line character chunk is replaced
    by single LF (Unix new line) character.
    """
    return MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)


def _replace_whitespace(match):
    text = match.group()

    if "\n" in text or "\r" in text:
        return "\n"
    else:
        return " "


def is_blank(string):
    """
    Returns `True` if string contains only white-space characters
    or is empty. Otherwise `False` is returned.
    """
    return not string or string.isspace()


def get_symmetric_matrix(matrix):
    """
    Get Symmetric matrix
    :param matrix:
    :return: matrix
    """
    return matrix + matrix.T - np.diag(matrix.diagonal())


def core_cosine_similarity(vector1, vector2):
    """
    measure cosine similarity between two vectors
    :param vector1:
    :param vector2:
    :return: 0 < cosine similarity value < 1
    """
    return 1 - cosine_distance(vector1, vector2)


'''
Note: This is not a summarization algorithm. This Algorithm pics top sentences irrespective of the order they appeared.
'''


class TextRank4Sentences():
    def __init__(self):
        self.damping = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 100  # iteration steps
        self.text_str = None
        self.sentences = None
        self.pr_vector = None

    def _sentence_similarity(self, sent1, sent2, stopwords=None):
        if stopwords is None:
            stopwords = []

        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            if w in stopwords:
                continue
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            if w in stopwords:
                continue
            vector2[all_words.index(w)] += 1

        return core_cosine_similarity(vector1, vector2)

    def _build_similarity_matrix(self, sentences, stopwords=None):
        # create an empty similarity matrix
        sm = np.zeros([len(sentences), len(sentences)])

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:
                    continue

                sm[idx1][idx2] = self._sentence_similarity(sentences[idx1], sentences[idx2], stopwords=stopwords)

        # Get Symmeric matrix
        sm = get_symmetric_matrix(sm)

        # Normalize matrix by column
        norm = np.sum(sm, axis=0)
        sm_norm = np.divide(sm, norm, where=norm != 0)  # this is to ignore the 0 element in norm

        return sm_norm

    def _run_page_rank(self, similarity_matrix):

        pr_vector = np.array([1] * len(similarity_matrix))

        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr_vector = (1 - self.damping) + self.damping * np.matmul(similarity_matrix, pr_vector)
            if abs(previous_pr - sum(pr_vector)) < self.min_diff:
                break
            else:
                previous_pr = sum(pr_vector)

        return pr_vector

    def _get_sentence(self, index):

        try:
            return self.sentences[index]
        except IndexError:
            return ""

    def get_top_sentences(self, number=5):

        top_sentences = {}

        if self.pr_vector is not None:

            sorted_pr = np.argsort(self.pr_vector)
            sorted_pr = list(sorted_pr)
            sorted_pr.reverse()

            index = 0
            for epoch in range(number):
                print (str(sorted_pr[index]) + " : " + str(self.pr_vector[sorted_pr[index]]))
                sent = self.sentences[sorted_pr[index]]
                sent = normalize_whitespace(sent)
                top_sentences[sent] = self.pr_vector[sorted_pr[index]]
                index += 1

        return top_sentences

    def analyze(self, text, stop_words=None):
        self.text_str = text
        self.sentences = sent_tokenize(self.text_str)

        tokenized_sentences = [word_tokenize(sent) for sent in self.sentences]

        similarity_matrix = self._build_similarity_matrix(tokenized_sentences, stop_words)

        self.pr_vector = self._run_page_rank(similarity_matrix)
        print(self.pr_vector)

In [10]:
text_str = '''
    Those Who Are Resilient Stay In The Game Longer
    “On the mountains of truth you can never climb in vain: either you will reach a point higher up today, or you will be training your powers so that you will be able to climb higher tomorrow.” — Friedrich Nietzsche
    Challenges and setbacks are not meant to defeat you, but promote you. However, I realise after many years of defeats, it can crush your spirit and it is easier to give up than risk further setbacks and disappointments. Have you experienced this before? To be honest, I don’t have the answers. I can’t tell you what the right course of action is; only you will know. However, it’s important not to be discouraged by failure when pursuing a goal or a dream, since failure itself means different things to different people. To a person with a Fixed Mindset failure is a blow to their self-esteem, yet to a person with a Growth Mindset, it’s an opportunity to improve and find new ways to overcome their obstacles. Same failure, yet different responses. Who is right and who is wrong? Neither. Each person has a different mindset that decides their outcome. Those who are resilient stay in the game longer and draw on their inner means to succeed.
    '''

tr4sh = TextRank4Sentences()
tr4sh.analyze(text_str)
pprint(tr4sh.get_top_sentences(5), width=1, depth=2)

[1.27124764 1.09149529 0.49907963 1.24689674 1.08644157 1.24013595
 1.24151475 0.9964572  0.6098623  0.81096533 0.8501204  1.05578319]
0 : 1.271247636683798
3 : 1.2468967412932273
6 : 1.2415147486210913
5 : 1.2401359537858996
1 : 1.0914952900297563
{'\nThose Who Are Resilient Stay In The Game Longer\n“On the mountains of truth you can never climb in vain: either you will reach a point higher up today, or you will be training your powers so that you will be able to climb higher tomorrow.” — Friedrich Nietzsche\nChallenges and setbacks are not meant to defeat you, but promote you.': 1.271247636683798,
 'However, I realise after many years of defeats, it can crush your spirit and it is easier to give up than risk further setbacks and disappointments.': 1.0914952900297563,
 'However, it’s important not to be discouraged by failure when pursuing a goal or a dream, since failure itself means different things to different people.': 1.2401359537858996,
 'To a person with a Fixed Mindset failur

In [18]:
text_str = '.'.join(proposal_wise_comments[1])

text_str

'This could be an obelisk or major art piece authored by Barrio artists..It is envisioned as a small traffic roundabout with a major art piece in the center in the spirit of Tijuana or Mexico City..Reach out to the School system (i.e. FIDM, NewSchool, Community College, soon-to-be UCSD).I think it\'s a great idea. Will provide a space for exchanging ideas and cultural conversations..Relevant and great..Public art always enlivens the area and sets a positive tone.  Just think of Millennial Park in Chicago..AGAIN---COPYING ANOTHER IDEA....I think it should be something NEVER DONE BEFORE.Like this idea, but you will need to \\curate\\" or make sure the art is appropriate.".I think its feasible, there is a lot of local artists.It can bring in tourist so it may impact the families who live there..By beautifying the neighborhood, and having the community participate in the \\making\\" will increase pride and care for all.".I dont think so..more activity, people, likely to be safer.TRUE---any

In [19]:
tr4sh = TextRank4Sentences()
tr4sh.analyze(text_str)
pprint(tr4sh.get_top_sentences(5), width=1, depth=2)

[0.96930672 0.84734092 0.7343535  1.02086526 1.21447309 1.00588615
 1.20777436]
4 : 1.214473085964252
6 : 1.2077743584676508
3 : 1.0208652630424895
5 : 1.0058861489689706
0 : 0.9693067237297542
{'".I dont think so..more activity, people, likely to be safer.TRUE---any of the ideas only work if the homeless population is no longer there!': 1.0058861489689706,
 '".I think its feasible, there is a lot of local artists.It can bring in tourist so it may impact the families who live there..By beautifying the neighborhood, and having the community participate in the \\making\\" will increase pride and care for all.': 1.214473085964252,
 'It\'s a nightmare every where, #1 issue to be dealt with first..I like art a lot.I think a half-a-million dollar condo next to some ugly mural is, in fact, going to have a negative impact on quality of life..It would beautify a blight area; allow the opportunity to involve the schools/universities into the project.It wont because its art.No one is going to car