In [1]:
import json
import spacy
import re
import urllib.parse as ulp
import requests
from textblob import TextBlob
from nltk.corpus import stopwords
import pandas as pd

NLP = spacy.load('en')
stopwords = stopwords.words('english')

In [2]:
def clean_article_title(title):
    """Take an article-source URL and return a cleanly formatted title

    Args:
        title (string): a source URL
    Return:
        article_title (string): A cleanly formatted title to include in response
    """
    article_title = ulp.unquote(title)
    article_title = re.split('title=', article_title)[1].replace('+', ' ')
    title_len = len(article_title)
    if (article_title[title_len - 5:] in ['&amp', '&amp;']):
        article_title = article_title[:title_len - 5]
    return article_title

In [3]:
class Corpus(object):
    """A lightweight class that provides a consistent interface to copora

    Attributes:
        es_host (string): The URL and Port of the ES cluster
        author (string): The desired corpus, currently supports either 'jones' or 'ebert'

    """
    def __init__(self, author, es_host='http://localhost:9200/'):
        self.es_host = es_host

        if author in ['jones', 'ebert']:
            self.author = author
        else:
            raise ValueError('author must be one of "jones" or "ebert", you gave {}'.format(author))

    def retrieve_articles(self, keyword_string):
        """Retrieve articles with text relevant to the keyword string

        Args:
            keyword_string (string): A string with keywords, e.g. 'Tracy Letts' or 'Apocalypse Now'

        Return:
            A list of tuples: (title, article text)
        """

        if self.author == 'jones':
            q = {
                "_source": ["Full text:", "ProQ:"],
                "from": 0, "size": 10000,
                "query": {
                    "bool" : {
                        "must": [{"match":{"Full text:": i}} for i in keyword_string.split(' ')]
                    }
                }
            }
            r = json.loads(requests.post(self.es_host+"flattened-articles/_search", json.dumps(q)).content.decode('utf-8', 'ignore'))
            return [Article(clean_article_title(i['_source']['ProQ:']), i['_source']['Full text:']) for i in r['hits']['hits']]
        elif self.author == 'ebert':
            q = {
                "_source": ["text", "title"],
                "from": 0, "size": 10000,
                "query": {
                    "bool" : {
                        "must": [{"match":{"text": i}} for i in keyword_string.split(' ')]
                    }
                }
            }
            r = json.loads(requests.post(self.es_host+"ebert-reviews/_search", json.dumps(q)).content.decode('utf-8', 'ignore'))
            return [Article(i['_source']['title'], i['_source']['text']) for i in r['hits']['hits']]
        else:
            raise ValueError('author must be one of "jones" or "ebert", you gave {}'.format(author))

    def filter_articles_by_person(self, name):
        """Retrieve articles with a persons name

        Args:
            name (string): A string with a name e.g. 'Tracy Letts'

        Return:
            A list of tuples: (title, article text)
        """

        if self.author == 'jones':
            q = {
                "_source": ["Full text:", "ProQ:"],
                "from": 0, "size": 10000,
                "query": {
                    "bool" : {
                        "must": [{"match":{"Full text:": i}} for i in name.split(' ')]
                    }
                }
            }
            r = json.loads(requests.post(self.es_host+"flattened-articles/_search", json.dumps(q)).content.decode('utf-8', 'ignore'))
            return [Article(clean_article_title(i['_source']['ProQ:']), i['_source']['Full text:']) for i in r['hits']['hits']]
        elif self.author == 'ebert':
            q = {
                "_source": ["text", "title"],
                "from": 0, "size": 10000,
                "query": {
                    "bool" : {
                        "must": [{"match":{"text": i}} for i in name.split(' ')]
                    }
                }
            }
            r = json.loads(requests.post(self.es_host+"ebert-reviews/_search", json.dumps(q)).content.decode('utf-8', 'ignore'))
            return [Article(i['_source']['title'], i['_source']['text']) for i in r['hits']['hits']]
        else:
            raise ValueError('author must be one of "jones" or "ebert", you gave {}'.format(author))

    def retrieve_article(self, article_string):
        """Retreive a single article with a title relevant to article_string

        Args:
            article_string (string): A string with relevant keywords to an article title

        Returns:
            An Article corresponding to the article_string
        """
        if self.author == 'jones':
            q = {
                "_source": ["Full text:", "ProQ:"],
                "from": 0, "size": 1,
                "query": {
                    "bool" : {
                        "must": [{"match":{"Full text:": i}} for i in article_string.split(' ')]
                    }
                }
            }
            r = json.loads(requests.post(self.es_host+"flattened-articles/_search", json.dumps(q)).content.decode('utf-8', 'ignore'))
            return [Article(clean_article_title(i['_source']['ProQ:']), i['_source']['Full text:']) for i in r['hits']['hits']][0]
        elif self.author == 'ebert':
            q = {
                "_source": ["text", "title"],
                "from": 0, "size": 1,
                "query": {
                    "bool" : {
                        "must": [{"match":{"title": i}} for i in article_string.split(' ')]
                    }
                }
            }
            r = json.loads(requests.post(self.es_host+"ebert-reviews/_search", json.dumps(q)).content.decode('utf-8', 'ignore'))
            return [Article(i['_source']['title'], i['_source']['text']) for i in r['hits']['hits']][0]
        else:
            raise ValueError('author must be one of "jones" or "ebert", you gave {}'.format(author))
        
        
    def slurp_articles(self):
        q = {
            "size":10000,
            "query" : {
                "match_all" : {}
        }
}
        r = json.loads(requests.post(self.es_host+"ebert-reviews/_search", json.dumps(q)).content.decode('utf-8', 'ignore'))
        return [i["_source"]["text"] for i in r['hits']['hits']]



In [22]:
#c = Corpus('ebert')
#a = c.slurp_articles()

#all_sents = []
#for i in a:
#    si = NLP(i)
#    for x in si.sents:
#        all_sents.append(x.text)
        
#with open('all_ebert_sents.txt', 'w') as f:
#    for i in all_sents:
#        f.write(i+'\n')


In [21]:
#allsentslist = list(all_sents)
#allsentslist[8]

"Will Smith plays Spooner, a Chicago Police Department detective who doesn't think it's suicide."

In [4]:

class Article(object):
    """A class to containt an article

    Attributes:
        text (string): The full text of the article
        sentences (list of Content): The sentences of the article

    """
    def __init__(self, title, text):
        self.text = str(text.encode('utf-8', 'ignore').decode('ascii', 'ignore'))
        self.title = str(title.encode('utf-8', 'ignore').decode('ascii', 'ignore'))
        for i in ["[Tt]h(e|(is)) [Ff]ilm", "[Tt]h(e|(is)) [Mm]ovie"]:
            self.text = re.sub(i, self.title, self.text)
        self.paragraphs = self.text.splitlines()

        self.sentences = []
        for p in self.paragraphs:
            self.sentences += [Content(i, title) for i in NLP(p).sents]

    def get_paragraph(self, c):
        """Return the full paragraph that a Content, c, came from

        Args:
            c (Content): a content object

        Returns:
            A full text paragraph containing the content arg
        """

        for p in self.paragraphs:
            if re.search(c.text.text.split('(')[0], p) is not None:
                return p
        raise ValueError('Content must be in the Article, no match found for {}'.format(c.text.text))


In [5]:

class Content(object):
    """A lightweight class to hold content and metadata

    Attributes:
        text (string): The text
        title (string): The title of the article the text came from
        theater (string): The name of the theater the article is talking about
        director (string): The name of the director

    """

    def __init__(self, text, title=None, theater=None, director=None, names=[]):
        self.text = text
        textblob_sentiment = TextBlob(text.text).sentiment
        self.sentiment = textblob_sentiment[0]
        self.subjectivity = textblob_sentiment[1]
        self.title = title
        self.theater = theater
        self.director = director
        self.names = names
        self.filter_depth = 0


In [6]:

def parse_articles(article_list):
    """Parse the sentence content out of the full text of an article

    Args:
        article_list (list of tuples): a list of tuples returned from Corpus.retrieve_articles

    Returns:
        a list of content ojects, with text and title fields populated
    """
    results = []
    for i in article_list:
        for j in NLP(i[1]).sents:
            results.append(Content(j, title=i[0]))
    return results

def compose_filters(content_list, filter_list, **kwargs):

    l = content_list
    for filt in filter_list:
        tmp_l = [filt(i, **kwargs) for i in l]
        tmp_l = [i for i in tmp_l if i is not None]
        for i in tmp_l:
            i.filter_depth += 1
        if len(tmp_l) > 0:
            l = tmp_l
    return l

def filter_opinion(content, **kwargs):
    return content if content.subjectivity > 0 else None

def filter_name(content, **kwargs):
    return content if re.search(kwargs['name'], content.text.text) is not None else None

def filter_name_parentheses(content, **kwargs):
    return content if re.search("\([a-zA-Z]* *"+kwargs['name']+"\)", content.text.text) is None else None

def filter_fragments(content, **kwargs):
    return content if re.search("[a-z]", content.text.text[0]) is None \
        and re.search("[\.\?!]", content.text.text[-1]) is not None else None

def filter_stop_chars(content, **kwargs):
    return content if not any(i in content.text.text for i in ["$", "@", "- -"]) else None

def filter_person_subject_object(content, **kwargs):
    return content if len([1 for j in content.text if j.dep_ in ['nsubj', 'pobj'] and j.ent_type_ == 'PERSON']) > 0 else None

def filter_person_subject(content, **kwargs):
    return content if len([1 for j in content.text if j.dep_ in ['nsubj'] and j.ent_type_ == 'PERSON']) > 0 else None

def filter_name_subject_object(content, **kwargs):
    # return content if len([1 for j in content.text if j.dep_ in ['nsubj', 'pobj'] and j.ent_type_ == 'PERSON']) > 0 else None
    subject_object_tokens = [j.text for j in content.text if j.dep_ in ['nsubj', 'pobj']]
    name_tokens = kwargs['name'].split(' ')
    for i in name_tokens:
        if i in subject_object_tokens:
            return content
    return None


def filter_name_subject(content, **kwargs):
    subject_tokens = [j.text for j in content.text if j.dep_ in ['nsubj']]
    name_tokens = kwargs['name'].split(' ')
    for i in name_tokens:
        if i in subject_tokens:
            return content
    return None

In [7]:

def always_like_name(corpus, name):
    c = Corpus(corpus)
    a = c.retrieve_articles(name)
    results = []
    for article in a:
        tmp_c_l = sorted([i for i in compose_filters(article.sentences,
                                                    [filter_opinion,
                                                     filter_stop_chars,
                                                     filter_name,
                                                     filter_name_subject_object,
                                                     filter_name_subject],
                                                     name=name) if i.filter_depth >=3],
                        key=lambda x: x.sentiment,
                         reverse=False)
        if len(tmp_c_l) > 0:
            results.append((article, tmp_c_l[0]))

    sorted_results = sorted(results, key=lambda x: x[1].sentiment, reverse=False)

    for i in sorted_results:
        try:
            return i[0].get_paragraph(i[1])
        except ValueError:
            pass
    return "I'm sorry, I don't have anything to say about {}".format(name)


def always_dislike_name(corpus, name):
    c = Corpus(corpus)
    a = c.retrieve_articles(name)
    results = []
    for article in a:
        tmp_c_l = sorted([i for i in compose_filters(article.sentences,
                                                    [filter_opinion,
                                                     filter_stop_chars,
                                                     filter_name,
                                                     filter_name_subject_object,
                                                     filter_name_subject],
                                                     name=name) if i.filter_depth >=3],
                        key=lambda x: x.sentiment,
                         reverse=True)
        if len(tmp_c_l) > 0:
            results.append((article, tmp_c_l[0]))

    sorted_results = sorted(results, key=lambda x: x[1].sentiment, reverse=True)

    for i in sorted_results:
        try:
            return i[0].get_paragraph(i[1])
        except ValueError:
            pass
    return "I'm sorry, I don't have anything to say about {}".format(name)

def favorite_person_in_article(corpus, article):
    c = Corpus(corpus)
    a = c.retrieve_article(article)

    sorted_results = sorted([i for i in compose_filters(a.sentences,
                                                [filter_opinion,
                                                 filter_stop_chars,
                                                 filter_person_subject_object,
                                                 filter_person_subject]) if i.filter_depth >=3],
                    key=lambda x: x.sentiment,
                     reverse=True)

    for i in sorted_results:
        try:
            return a.get_paragraph(i)
        except ValueError:
            pass
    return "I'm sorry, I don't have anything to say about {}".format(article)


def least_favorite_in_article(corpus, article):
    c = Corpus(corpus)
    a = c.retrieve_article(article)

    results = []
    sorted_results = sorted([i for i in compose_filters(a.sentences,
                                                [filter_opinion,
                                                 filter_stop_chars,
                                                 ]) if i.filter_depth >=2],
                    key=lambda x: x.sentiment,
                            reverse=False)

    for i in sorted_results:
        try:
            return a.get_paragraph(i)
        except ValueError:
            pass
    return "I'm sorry, I don't have anything to say about {}".format(article)


def compare_person_works(corpus, name, title):
    c = Corpus(corpus)

    final_results = []
    a = c.retrieve_article(title)
    sorted_results = sorted([i for i in compose_filters(a.sentences,
                                                [filter_opinion,
                                                 filter_stop_chars,
                                                 filter_name,
                                                 filter_name_subject_object,
                                                 filter_name_subject],
                                                        name=name) if i.filter_depth >=3],
                    key=lambda x: x.subjectivity,
                     reverse=True)

    for i in sorted_results:
        try:
           final_results.append((a, i))
           break
        except ValueError:
            pass



    a2 = c.retrieve_articles(name)
    a2 = [i for i in a2 if i is not a]
    results = []
    for article in a2:
        tmp_c_l = sorted([i for i in compose_filters(article.sentences,
                                                    [filter_opinion,
                                                     filter_stop_chars,
                                                     filter_name,
                                                     filter_name_subject_object,
                                                     filter_name_subject],
                                                     name=name) if i.filter_depth >=5],
                        key=lambda x: x.subjectivity,
                         reverse=True)
        if len(tmp_c_l) > 0:
            results += [(article, i) for i in tmp_c_l]

    sorted_results = sorted(results, key=lambda x: x[1].sentiment, reverse=True)

    for i in sorted_results:
        try:
            final_results.append((i[0], i[1]))
            break
        except ValueError:
            pass
    for i in sorted_results[::-1]:
        try:
            final_results.append((i[0], i[1]))
            break
        except ValueError:
            pass

    for i in final_results:
        print(i[0].get_paragraph(i[1]))

# How do you feel about ___person___ in ___show___
def opinion_person_in_article(corpus, name, article):
    c = Corpus(corpus)
    a = c.retrieve_article(article)

    sorted_results = sorted([i for i in compose_filters(a.sentences,
                                                        [filter_opinion,
                                                         filter_stop_chars,
                                                         filter_name,
                                                         filter_name_subject_object,
                                                         filter_name_subject],
                                                        name=name) if i.filter_depth >=3],
                            key=lambda x: x.subjectivity,
                            reverse=True)

    for i in sorted_results:
        try:
            return a.get_paragraph(i)
        except ValueError:
            pass
    return "I'm sorry, I don't have anything to say about {}".format(article)


In [14]:
    #    print('\n')
    #    print('Roger Ebert, do you always like Francis Ford Coppola?')
    #    print(always_like_name('ebert', 'Francis Ford Coppola'))
    #    print('\n')
    #    print('Roger Ebert, do you always like Viggo Mortensen?')
    #    print(always_like_name('ebert', 'Viggo Mortensen'))
    #    print('\n')
    #    print('Chris Jones, do you always like Aaron Sorkin?')
    #    print(always_like_name('jones', 'Aaron Sorkin'))
    #    print('\n')
    #    print('Chris Jones, do you always like Aaron Todd Douglas?')
    #    print(always_like_name('jones', 'Aaron Todd Douglas'))
    #    print('\n')
    #    print('Chris Jones, do you always dislike Aaron Todd Douglas?')
    #    print(always_dislike_name('jones', 'Aaron Todd Douglas'))
    #    print('\n')
    #    print('Chris Jones, do you always like Tracy Letts?')
    #    print(always_like_name('jones', 'Tracy Letts'))
    #    print('\n')
    #    print('Chris Jones, do you always dislike Steppenwolf?')
    #    print(always_dislike_name('jones', 'Steppenwolf'))

    #    print('\n')
    #    print('Roger Ebert, who was your favorite person in Apocalypse Now?')
    #    print(favorite_person_in_article('ebert', 'Apocalypse Now'))
    #    print('\n')
    #    print('Chris Jones, who was your favorite person in Killer Joe?')
    #    print(favorite_person_in_article('jones', 'Killer Joe'))
    #    print('\n')
    #    print('Roger Ebert, what was the worst thing in Apocalypse Now Redux?')
    #    print(least_favorite_in_article('ebert', 'Apocalypse Now Redux'))

    #    print('\n')
    #    # One of the original questions that started this endeavor was
    #    # "Chris, well doy just always dislike mexican directors?". The approach
    #    # that I favor is to actually use an external databse, knowledge graph, etc.
    #    # to get a list of related directors according to whatever the characteristic
    #    # in question may be (in this case, country of origin), and then execute the
    #    # same dislike_name function we've been calling before for each of them.

    #    # Shockingly, if we call it for Alfonso Cuaron (who would very likely be
    #    # near the top of any list of prominent Mexican filmmakers, especially
    #    # after Children of Men and Gravity), we get the following:

    #    # print('Roger Ebert, do you always dislike Alejandro Gonzalez Inarritu?')
    #    # print(always_dislike_name('ebert', 'Alejandro Gonzalez Inarritu'))
    #    print('Roger Ebert, do you always dislike Mexican Filmmakers?')
    #    print(always_dislike_name('ebert', 'Alfonso Cuaron'))

    #    print('\n')
    #    compare_person_works('jones', 'Tracy Letts', 'August Osage County')

    #    print('\n')
    #    compare_person_works('jones', 'Steppenwolf Theatre', 'Buried Child')

In [17]:
print('\n')
print('Ebert, what did you think of Jennifer Lawrence in Winters Bone?')
print(opinion_person_in_article('ebert', 'Jennifer Lawrence', "Winter's Bone"))

print('\n')
print('Jones, what did you think of Tom Bateman in AmeriKafka?')
print(opinion_person_in_article('jones', 'Bateman', "AmeriKafka"))


c = Corpus('jones')
arts = c.filter_articles_by_person("Bateman's")
for a in arts:
    print(a.title)



Ebert, what did you think of Jennifer Lawrence in Winters Bone?
Ree is played by Jennifer Lawrence, a 19-year-old newcomer who has already starred in Jodie Foster's next film. Lawrence embodies a fierce, still center that is the source of her heroism. She makes no boasts, issues no threats, depends on a dogged faith that people will do the right thing  even when no one we meet seems to deserve that faith. "Don't ask for what ought to be offered," she tells her little brother, although the lives of her parents seem to be an exercise in asking and not offering. Did she raise herself?


Jones, what did you think of Tom Bateman in AmeriKafka?
And there are three excellent -- really excellent -- lead performances. Bateman's droopy-eyed dead (or dying) Franz Kafka has just the right note of pained craziness. As his younger alter ego (the theatrical version of the man, under the rules of this play), K.K. Dodds oozes vulnerability. She's (yes, it's a she) nothing short of superb. And as Lowy

In [20]:
compare_person_works('ebert', 'Marlon Brando', 'Apocalypse Now')

Apocalypse Now has one of the most haunting endings in cinema, a poetic evocation of what Kurtz has discovered, and what we hope not to discover for ourselves. The river journey creates enormous anticipation about Kurtz, and Brando fulfills it. When Apocalypse Now was released in 1979, his casting was criticized and his enormous paycheck of $1 million was much discussed, but it's clear he was the correct choice, not only because of his stature as an icon, but because of his voice, which enters Apocalypse Now from darkness or half-light, repeating the words of T.S. Eliot's despairing "The Hollow Men." That voice sets the final tone of Apocalypse Now.
There have been a lot of movies where stars have repeated the triumphs of their parts - but has any star ever done it more triumphantly than Marlon Brando does in "The Freshman"? He is doing a reprise here of his most popular character, Don Vito Corleone of "The Godfather," and he does it with such wit, discipline and seriousness that it's 

In [8]:
def like_person(corpus, name):
    c = Corpus(corpus)

    final_results = []
    
    # Get all articles that have the person's name
    article_list = c.retrieve_articles(name)
    last_name = name.split()[-1]
    
    filtered_sentences = []
    
    for article in article_list:
        filtered_sentences += [(article, i) for i in compose_filters(article.sentences,
                                       [filter_opinion,
                                        filter_person_subject,
                                        filter_name_parentheses,
                                        filter_name],
                                       name=last_name) if i.filter_depth > 3]
        
    print([i[1].sentiment for i in filtered_sentences])
    print(sum([i[1].sentiment for i in filtered_sentences])/ len(filtered_sentences))
    
    for i in sorted(filtered_sentences, key=lambda x: x[1].sentiment, reverse=True):
        if i[1].filter_depth > 3:
            print(i[1].sentiment)
            print(i[1].text.text)
            try:
                print(i[0].get_paragraph(i[1]))
            except ValueError:
                print('PARAGRAPH NOT FOUND')
            print('\n')
            
    
        
    

In [9]:
like_person('ebert', 'Steven Spielberg')

[0.040624999999999994, 0.07500000000000001, 0.0, -0.30000000000000004, -0.15555555555555559, 0.7, 0.5, 0.05, 0.27, 0.0925, 0.17222222222222222, 0.3625, 0.35000000000000003, 0.05, 0.038888888888888876, 0.17777777777777778, 0.1, 0.125, 0.16666666666666666, -0.05681818181818182, 0.5, 0.1787878787878788, 0.12333333333333332, 0.3, 0.3, 0.175, 0.5, -0.3, -0.041666666666666664, 0.125, 0.175, 0.3, 0.25, 0.2520833333333333, 0.5, -0.09583333333333333, -0.05833333333333332, 0.0, 0.2228571428571428, 0.2333333333333333, 0.0, 0.5, 0.2813492063492064, 0.3, 0.039999999999999994, 0.2, 0.22428571428571428, 0.24404761904761907, -0.12380952380952379, -0.2, 1.0, 0.45, 0.05, 0.06666666666666667, 0.2, 0.34375, 0.6, 0.4, 0.4, 0.5, -0.39375, 0.5, 0.3444444444444444, -0.4, 0.14375, 0.19166666666666665, 0.00416666666666667, 0.3, 0.02, 0.095, 0.15, 0.3375, 0.39444444444444443, -0.0625, -0.17916666666666664, 0.08333333333333333, 0.15833333333333335, 0.1, 0.4230769230769231, 0.0, 0.4, 0.5285714285714286, 0.425, 0.3

In [94]:
like_person('ebert', 'Marlon Brando')

[-0.4166666666666666, -0.125, 0.2, 0.4, 0.13636363636363635, 0.5, 0.25, -0.1125, 0.3, 0.0, 0.26666666666666666, -0.5, 0.0, -0.6]
0.021347402597402605
0.5
There have been a lot of movies where stars have repeated the triumphs of their parts - but has any star ever done it more triumphantly than Marlon Brando does in "The Freshman"?
There have been a lot of movies where stars have repeated the triumphs of their parts - but has any star ever done it more triumphantly than Marlon Brando does in "The Freshman"? He is doing a reprise here of his most popular character, Don Vito Corleone of "The Godfather," and he does it with such wit, discipline and seriousness that it's not a ripoff and it's not a cheap shot, it's a brilliant comic masterstroke.


0.4
Marlon Brando didn't win the Academy Award in 1951 for his acting in "A Streetcar Named Desire."
Marlon Brando didn't win the Academy Award in 1951 for his acting in "A Streetcar Named Desire." The Oscar went to Humphrey Bogart, for "The Afri

Notes:

- Sentiment and opinion gets messed up when he's describing qualities of characters. For example:

> -0.4166666666666666
In one class, Vince creates a spot-on imitation, not of Marlon Brando, but of bad Brando imitators.
Vince is a man with a dream. Marlon Brando is his god. He would like to become an actor. This is not likely. He's well into his 40s, a prison guard living with his family on City Island, a bucolic outcrop of the Bronx known mostly to its residents. Telling his wife he's going to a poker game, Vince attends acting classes in Manhattan. In one class, Vince creates a spot-on imitation, not of Marlon Brando, but of bad Brando imitators.

In [9]:


def like_person_2(corpus, name):
    c = Corpus(corpus)

    final_results = []
    
    # Get all articles that have the person's name
    article_list = c.retrieve_articles(name)
    
    last_name = name.split()[-1]
    
    filtered_sentences = []
    
    for article in article_list:
        filtered_sentences += [(article, i) for i in compose_filters(article.sentences,
                                       [filter_opinion,
                                        filter_person_subject,
                                        filter_name_parentheses,
                                        filter_name],
                                       name=last_name) if i.filter_depth > 3]
        
    print([i[1].sentiment for i in filtered_sentences])
    print(sum([i[1].sentiment for i in filtered_sentences])/ len(filtered_sentences))
    
    ctr = 0
    sorted_results = sorted(filtered_sentences, key=lambda x: x[1].sentiment, reverse=True)
    for i in sorted_results:
        if ctr < 2:
            if i[1].filter_depth > 3:
                #print(i[1].sentiment)
                #print(i[1].text.text)
                try:
                    print(i[0].get_paragraph(i[1]))
                    ctr += 1
                except ValueError:
                    pass#print('PARAGRAPH NOT FOUND')
                #print('\n')
    try:
        print("However, " + sorted_results[-1][0].get_paragraph(sorted_results[-1][1]))
    except ValueError:
        pass

In [102]:
like_person_2('ebert', 'Marlon Brando')

[-0.4166666666666666, -0.125, 0.2, 0.4, 0.13636363636363635, 0.5, 0.25, -0.1125, 0.3, 0.0, 0.26666666666666666, -0.5, 0.0, -0.6]
0.021347402597402605
There have been a lot of movies where stars have repeated the triumphs of their parts - but has any star ever done it more triumphantly than Marlon Brando does in "The Freshman"? He is doing a reprise here of his most popular character, Don Vito Corleone of "The Godfather," and he does it with such wit, discipline and seriousness that it's not a ripoff and it's not a cheap shot, it's a brilliant comic masterstroke.
Marlon Brando didn't win the Academy Award in 1951 for his acting in "A Streetcar Named Desire." The Oscar went to Humphrey Bogart, for "The African Queen." But you could make a good case that no performance had more influence on modern film acting styles than Brando's work as Stanley Kowalski, Tennessee Williams' rough, smelly, sexually charged hero.
However,  "Superman" pointed the way for a B picture genre of earlier decades

In [103]:
like_person_2('ebert', 'Viggo Mortensen')

[0.3020833333333333, 0.26666666666666666, -0.5, 0.6]
0.1671875
So persuasive are these Indiana scenes that, despite A History of Violences opening moments, we wonder if Cronenberg has abandoned his own history of violence and decided to make a small-town slice of life: A Capra picture, perhaps, with Viggo Mortensen as Jimmy Stewart. Then all hell breaks loose. Two tough guys enter the diner to try a stickup. They have guns, mean business, threaten the customers and a waitress. Moving so quickly he seems to have been practicing the scene as choreography, Tom Stall takes out the two guys and ends up on the local front pages as a hero.
So the underlying strength of the story is there. Unfortunately, the casting and some of the romantic scenes sabotage it. Liev Schreiber is a good actor, and I have admired him in many movies, but put him beside Viggo Mortensen and the Blouse Man wins; you can hardly blame Pearl for surrendering. (I am reminded of a TV news interview about that movie where 

In [105]:
like_person_2('ebert', 'Stanley Kubrick')

[0.2857142857142857, 0.11666666666666665, 0.2857142857142857, 0.0062500000000000056, 0.8, 0.0, 0.1, -0.05, 0.1, 0.39]
0.20343452380952382
"Paths of Glory" was Paths of Glory by which Stanley Kubrick entered the ranks of great directors, never to leave them. When I interviewed Kirk Douglas in 1969, he recalled it as the summit of his acting career: "There's a picture that will always be good, years from now. I don't have to wait 50 years to know that; I know it now." It has an economy of expression that is almost brutal; it is one of the few narrative films in which you sense the anger in the telling. Samuel Fuller, who fought all the way through World War II, remembered it in "The Big Red One" with nostalgia for the camaraderie of his outfit. There is no nostalgia in "Paths of Glory." Only nightmare.
The 10 films are not philosophical abstractions but personal stories that involve us immediately; I hardly stirred during some of them. After seeing the series, Stanley Kubrick observed th

In [106]:
like_person('ebert', "Stanley Kubrick")

[0.2857142857142857, 0.11666666666666665, 0.2857142857142857, 0.0062500000000000056, 0.8, 0.0, 0.1, -0.05, 0.1, 0.39]
0.20343452380952382
0.8
"Paths of Glory" was Paths of Glory by which Stanley Kubrick entered the ranks of great directors, never to leave them.
"Paths of Glory" was Paths of Glory by which Stanley Kubrick entered the ranks of great directors, never to leave them. When I interviewed Kirk Douglas in 1969, he recalled it as the summit of his acting career: "There's a picture that will always be good, years from now. I don't have to wait 50 years to know that; I know it now." It has an economy of expression that is almost brutal; it is one of the few narrative films in which you sense the anger in the telling. Samuel Fuller, who fought all the way through World War II, remembered it in "The Big Red One" with nostalgia for the camaraderie of his outfit. There is no nostalgia in "Paths of Glory." Only nightmare.


0.39
After seeing the series, Stanley Kubrick observed that Ki

In [107]:
like_person_2('jones', 'Tracy Letts')

[0.25, 0.13636363636363635, 0.25, 0.13636363636363635, 0.0, 0.35, 0.35, -0.15, 0.7, 0.7, 0.5681818181818181, 0.8, 0.5681818181818181, 0.8, 0.3444444444444444, 0.3444444444444444, 0.1, 0.7, 0.04545454545454545, 0.1632727272727273, 0.04545454545454545, 0.1875, 0.25833333333333336, 0.31666666666666665, 0.21733766233766233, 1.0, 0.0, 1.0, -0.1666666666666667, 0.0, 0.06666666666666667, 0.44000000000000006, 0.44000000000000006, 0.09166666666666666, 0.09166666666666666, 0.0, 0.0, 0.0, -0.1666666666666667, 0.04, 1.0, 0.0, 0.1125, 0.0, 0.3497727272727273, 0.15, 0.45, 0.0, 0.25833333333333336, 0.25833333333333336, 0.3, -0.30625, 0.0, 0.0, 0.16666666666666666, 0.2, 0.3, -0.11666666666666665, 0.08409090909090909, 0.0, 0.10416666666666667, 0.0, 0.10000000000000002, -0.1666666666666667, 0.42083333333333334, -0.2, 0.125, 0.39999999999999997, 0.08333333333333333, -0.1666666666666667, 0.2833333333333333, 0.0, 0.03409090909090909, -0.1666666666666667, 0.39999999999999997, 0.39999999999999997, -0.0535714

In [108]:
like_person('jones', 'Tracy Letts')

[0.25, 0.13636363636363635, 0.25, 0.13636363636363635, 0.0, 0.35, 0.35, -0.15, 0.7, 0.7, 0.5681818181818181, 0.8, 0.5681818181818181, 0.8, 0.3444444444444444, 0.3444444444444444, 0.1, 0.7, 0.04545454545454545, 0.1632727272727273, 0.04545454545454545, 0.1875, 0.25833333333333336, 0.31666666666666665, 0.21733766233766233, 1.0, 0.0, 1.0, -0.1666666666666667, 0.0, 0.06666666666666667, 0.44000000000000006, 0.44000000000000006, 0.09166666666666666, 0.09166666666666666, 0.0, 0.0, 0.0, -0.1666666666666667, 0.04, 1.0, 0.0, 0.1125, 0.0, 0.3497727272727273, 0.15, 0.45, 0.0, 0.25833333333333336, 0.25833333333333336, 0.3, -0.30625, 0.0, 0.0, 0.16666666666666666, 0.2, 0.3, -0.11666666666666665, 0.08409090909090909, 0.0, 0.10416666666666667, 0.0, 0.10000000000000002, -0.1666666666666667, 0.42083333333333334, -0.2, 0.125, 0.39999999999999997, 0.08333333333333333, -0.1666666666666667, 0.2833333333333333, 0.0, 0.03409090909090909, -0.1666666666666667, 0.39999999999999997, 0.39999999999999997, -0.0535714

In [114]:
like_person('ebert', 'Francis Ford Coppola')

[0.8, 0.3666666666666667, 0.6666666666666666, 0.25, 0.11363636363636363, -0.1875, -0.5, 0.08333333333333333, 0.34761904761904766, 0.1, 0.05, 0.6333333333333334, 0.4, 0.31212121212121213, 0.37124999999999997, 0.31212121212121213, 0.37124999999999997, -0.3625, 0.3333333333333333, 0.5, 0.375, 0.4642857142857143, 0.5, 0.25, 0.5, 0.0, 0.23666666666666666, 0.5, 0.1377551020408163, 0.4, 0.1, 0.1825, 0.25, -0.05, 0.2, 0.25, -0.3333333333333333, 0.2357142857142857, 0.25, 0.16193181818181818, -0.06944444444444443, 0.125, -0.09375, -0.225, 0.375, 0.1, 0.21428571428571427, 0.16666666666666666, 0.3, 0.5, 0.4375, 0.29545454545454547, 0.25, -0.06666666666666667, -0.15555555555555559]
0.21318803058413438
0.8
Francis Ford Coppola fills every frame with great billowing clouds of smoke, steam, and fog.
During the course of Rumble Fish, the characters meet, fight, talk, philosophize, make out, drink, and sometimes die. Rumble Fish treats these activities in a highly stylized way; one death scene is follow

In [10]:
def like_person_3(corpus, name):
    c = Corpus(corpus)

    final_results = []
    
    # Get all articles that have the person's name
    article_list = c.retrieve_articles(name)
    
    last_name = name.split()[-1]
    
    filtered_sentences = []
    
    for article in article_list:
        filtered_sentences += [(article, i) for i in compose_filters(article.sentences,
                                       [filter_opinion,
                                        filter_person_subject,
                                        filter_name_parentheses,
                                        filter_fragments,
                                        filter_name],
                                       name=last_name) if i.filter_depth > 4]
        
    #print([i[1].sentiment for i in filtered_sentences])
    #print(sum([i[1].sentiment for i in filtered_sentences])/ len(filtered_sentences))
    
    ctr = 0
    sorted_results = sorted(filtered_sentences, key=lambda x: x[1].sentiment, reverse=True)
    
    first_last_candidates = []
    
    for i in sorted_results:
        if ctr < 300:
            if i[1].filter_depth > 3:
                #print(i[1].sentiment)
                #print(i[1].text.text)
                try:
                    matched_paragraph = i[0].get_paragraph(i[1])
                    paragraph_sentences = list(NLP(matched_paragraph).sents)
                    if i[1].text.text in paragraph_sentences[0].text:
                        #print('FIRST SENTENCE')
                        if i[1] not in [i[1] for i in first_last_candidates]:
                            first_last_candidates.append(i)
                    if i[1].text.text in paragraph_sentences[-1].text:
                        #pass#print('LAST SENTENCE')
                        if i[1] not in [i[1] for i in first_last_candidates]:
                            first_last_candidates.append(i)
                    #print(matched_paragraph)
                    ctr += 1
                except ValueError:
                    pass#print('PARAGRAPH NOT FOUND')
                #print('\n')
    #try:
    #    matched_paragraph = sorted_results[-1][0].get_paragraph(sorted_results[-1][1])
    #    print("However, " + matched_paragraph)
    #except ValueError:
    #    pass
    
    
    #for i in first_last_candidates:
    #    print(i[1].sentiment, i[0].title, i[1].text.text)
    
    grouped = {}    
    for i in first_last_candidates:
        if grouped.get(i[0].title) is not None:
            grouped[i[0].title].append(i[1])
        else:
            grouped[i[0].title] = [i[1]]
            
    # sort groups by length
    sorted_by_length = sorted(grouped.items(), key=lambda x: len(x[1]), reverse=True)
    ctr = 0
    previous_average_sentiment = None
    running_response = []
    for s in sorted_by_length:
        if ctr < 4:
            curr_par = ""
            cas = sum(i.sentiment for i in s[1]) / len(s)
            if previous_average_sentiment is not None and (cas < 0 and previous_average_sentiment > 0):
                curr_par += "However, "
            previous_average_sentiment = cas
            curr_par += " ".join(i.text.text for i in s[1])
            ctr += 1
            running_response.append(curr_par)
            
    return "\n".join(running_response)
        

In [163]:
like_person_3('ebert', 'Marlon Brando')

There have been a lot of movies where stars have repeated the triumphs of their parts - but has any star ever done it more triumphantly than Marlon Brando does in "The Freshman"? Think of the criticism Brando risked, from those ready to attack him for cashing in on his most famous performance by reprising it for a comedy. Brando must have known the dangers, but he must have had confidence in himself, too - enough to go ahead anyway, and to win a considerable gamble. When Brando finished filming "The Freshman" last September, he attacked The Freshman in a notorious interview in the Toronto Globe and Mail, claiming it was trash and that he was retiring from acting. The other actors, perhaps aware of the chances Brando was taking, seem a little in awe of him - which is as it should be.
However, Brando doesn't so much walk through Don Juan DeMarco as coast, in a gassy, self-indulgent performance no one else could have gotten away with. In a new book about how he makes movies, director Sidn

In [164]:
like_person_3('ebert', 'Stanley Kubrick')

Why he likes Beethoven is never explained, but my notion is that Alex likes Beethoven in the same way that Kubrick likes to load his sound track with familiar classical music -- to add a cute, cheap, dead-end dimension. I don't know quite how to explain my disgust at Alex (whom Kubrick likes very much, as his visual style reveals and as we shall see in a moment). No, I think Kubrick is being too modest: Alex is all his. When Kubrick shows us Alex, however, he either places him in the center of a wide-angle shot (so Alex alone has normal human dimensions,) or uses a standard lens that does not distort. Kubrick has used visuals to alter the book's point of view and to nudge us toward a kind of grudging pal-ship with Alex. But that isn't what Kubrick is saying, He actually seems to be implying something simpler and more frightening: that in a world where society is criminal, the citizen might as well be a criminal, too. Kubrick uses the wide-angle lens almost all the time when he is showi

Questions and next steps:
- How to deal with things like "In this scene and others, Brando regains the peak of his magnificent talent."
    - the "In this scene and others" doesn't make sense out of context.
    - I think for now what I'm going to do is 

In [165]:
like_person_3('ebert', 'Martin Scorsese')

Cape Fear is impressive moviemaking, showing Scorsese as a master of a traditional Hollywood genre who is able to mold it to his own themes and obsessions. The way he sees the character of Sam Bowden is the key to why Martin Scorsese wanted to remake the 1962 thriller Cape Fear. This is the first film in a production deal Scorsese has with Universal Studios and Steven Spielberg's Amblin Entertainment, and it represents his access to budgets much larger than those he has worked with in the past. And with this new version of Max Cady, Scorsese gives us not simply a bad man, but an evil one - whose whole purpose is to show Sam Bowden he's a criminal, too. Unlike the simplistic version of this scene we have seen in a hundred thrillers, what Scorsese gives us is a villain who has been wronged, seeking to harm a hero who has sinned.
Remarkably, writer-director Martin Scorsese is only 25 years old, and this is his first film. It is possible that with more experience and maturity Scorsese will

In [166]:
like_person_3('ebert', 'Jennifer Lawrence')

The Hunger Games is an effective entertainment, and Jennifer Lawrence is strong and convincing in the central role.


In [169]:
like_person_3('ebert', 'Quentin Tarantino')

Now that we know Quentin Tarantino can make a movie like "Reservoir Dogs," it's time for him to move on and make a better one. Tarantino himself is also interesting as an actor; he could play great crazy villains. Reservoir Dogs feels like it's going to be terrific, but Tarantino's script doesn't have much curiosity about these guys. Having created the characters and fashioned the outline, Tarantino doesn't do much with his characters except to let them talk too much, especially when they should be unconscious from shock and loss of blood.
But this is all one film, and now that we see it whole, it's greater than its two parts; Tarantino remains the most brilliantly oddball filmmaker of his generation, and this is one of the best films of the year. Put the two parts together, and Tarantino has made a masterful saga that celebrates the martial arts genre while kidding it, loving it, and transcending it. Kill Bill, Volume 2 is a distillation of the countless grind house kung-fu movies Tar

In [172]:
like_person_3('ebert', 'Tom Cruise')

I'd sorta rather see Diaz and Cruise in action scenes on a human scale, rather than have it rubbed in that for long stretches, they're essentially replaced by animation. Knight and Day knows this sequence is monumentally silly, and so do Cruise and Diaz, and Cruise keeps up a reassuring line of patter all during it, even while trying to crash-land safely.
The archetypal Tom Cruise Movie is "Top Gun," in which the young fighter pilot, a natural, was tutored by a once-great pilot and emotionally nurtured by an older female flight instructor before testing his wings against the hot dogs of his unit, in preparation for a final showdown. And Cruise is so efficiently packaged in this product that he plays the same role as a saint in a Mexican village's holy day procession: It's not what he does that makes him so special; it's the way he manifests everybody's faith in him.
However, Tom Cruise, who initially seemed to many people an unlikely choice to play Lestat, is never less than convincing

# Try and Convert Filtering to Scores

In [11]:
def like_person_4(corpus, name):
    c = Corpus(corpus)

    final_results = []
    
    # Get all articles that have the person's name
    article_list = c.retrieve_articles(name)
    
    last_name = name.split()[-1]
    
    filtered_sentences = []
    
    for article in article_list:
        filtered_sentences += [(article, i) for i in compose_filters(article.sentences,
                                       [filter_opinion,
                                        filter_person_subject,
                                        filter_name_parentheses,
                                        filter_fragments,
                                        filter_name],
                                       name=last_name) if i.filter_depth > 4]
    
    ctr = 0
    sorted_results = sorted(filtered_sentences, key=lambda x: x[1].sentiment, reverse=True)
    
    first_last_candidates = []
    
    for i in sorted_results:
        if ctr < 300:
            if i[1].filter_depth > 0:

                try:
                    matched_paragraph = i[0].get_paragraph(i[1])
                    paragraph_sentences = list(NLP(matched_paragraph).sents)
                    if i[1].text.text in paragraph_sentences[0].text:
                        #print('FIRST SENTENCE')
                        if i[1] not in [j[1] for j in first_last_candidates]:
                            tmp = i
                            tmp[1].filter_depth = tmp[1].filter_depth * 1.5
                            first_last_candidates.append(tmp)
                    elif i[1].text.text in paragraph_sentences[-1].text:
                        #pass#print('LAST SENTENCE')
                        if i[1] not in [j[1] for j in first_last_candidates]:
                            tmp = i
                            tmp[1].filter_depth = tmp[1].filter_depth * 1.5
                            first_last_candidates.append(tmp)
                    #print(matched_paragraph)
                    else:
                        first_last_candidates.append(i)
                    ctr += 1
                except ValueError:
                    pass#print('PARAGRAPH NOT FOUND')
                #print('\n')

    grouped = {}    
    for i in first_last_candidates:
        
        if any(j in i[1].text.text.lower() for j in ['here', 'it']):
               i[1].filter_depth = i[1].filter_depth * 0.9
        
        if grouped.get(i[0].title) is not None:
            grouped[i[0].title].append(i[1])
        else:
            grouped[i[0].title] = [i[1]]
    
    for k in grouped:
        
        v = sorted(grouped[k], key=lambda x: x.filter_depth, reverse=True)
        avg_sim = []
        for i in range(len(v)):
            for j in range(1, len(v)):
                avg_sim += [v[i].text.similarity(v[j].text)]
        if len(avg_sim) > 0:
            avg_sim = sum(avg_sim)/len(avg_sim)
        else:
            avg_sim = 0.0
        for i in v:
            i.filter_depth = i.filter_depth * avg_sim
        
        grouped[k] = v
        
    
    # sort groups by score
    sorted_by_length = sorted(grouped.items(), key=lambda x: max(a.filter_depth for a in x[1]), reverse=True)
    ctr = 0
    previous_average_sentiment = None
    running_response = []
    for s in sorted_by_length:
        if ctr < 4:
            curr_par = ""
            cas = sum(i.sentiment for i in s[1]) / len(s)
            if previous_average_sentiment is not None and (cas < 0 and previous_average_sentiment > 0):
                curr_par += "However, "
            previous_average_sentiment = cas
            curr_par += " ".join(i.text.text for i in s[1])
            ctr += 1
            running_response.append(curr_par)
            
    return "\n".join(running_response)
        

In [72]:
like_person_4('ebert', 'Martin Scorsese')

This sounds like an entertaining story, I suppose, but Scorsese doesn't direct a single scene for a payoff. Scorsese and De Niro are the most creative, productive director/actor team in The King of Comedys right now, and the fact that they feel the freedom to make such an odd, stimulating, unsatisfyng movie is good news, I guess. It's hard to believe Scorsese made it; instead of the big-city life, the violence and sexuality of his movies like "Taxi Driver" and "Mean Streets," what we have here is an agonizing portrait of lonely, angry people with their emotions all tightly bottled up.
Scorsese seems so much in command of his gift in GoodFellas. So says Henry Hill in the opening moments of Martin Scorseses GoodFellas, a movie about the tradecraft and culture of organized crime in New York.
However, The way he sees the character of Sam Bowden is the key to why Martin Scorsese wanted to remake the 1962 thriller Cape Fear. Unlike the simplistic version of this scene we have seen in a hundr

In [73]:
like_person_4('ebert', 'Marlon Brando')

Brando must have known the dangers, but he must have had confidence in himself, too - enough to go ahead anyway, and to win a considerable gamble. There have been a lot of movies where stars have repeated the triumphs of their parts - but has any star ever done it more triumphantly than Marlon Brando does in "The Freshman"? Think of the criticism Brando risked, from those ready to attack him for cashing in on his most famous performance by reprising it for a comedy. When Brando finished filming "The Freshman" last September, he attacked The Freshman in a notorious interview in the Toronto Globe and Mail, claiming it was trash and that he was retiring from acting. The other actors, perhaps aware of the chances Brando was taking, seem a little in awe of him - which is as it should be.
Brando even tested for the role, but he fell out, Bergman was troubled in her controversial marriage to Roberto Rossellini, and Count Luchino Visconti settled for Valli, an important European star, and Gran

In [74]:
like_person_4('ebert', 'Francis Ford Coppola')

Coppola had just finished a successful mainstream production for Warner, "Finian's Rainbow," and his proposal sounded good in that era of youth films, bike films, trip films and other high hopes. Francis Ford Coppola, a young director of promise, persuaded Warner Bros. to help finance and distribute a group of features by the bright new filmmakers he'd gathered around him.
Coppola had just finished a successful mainstream production for Warners, "Finian's Rainbow," and his proposal sounded good in that era of youth films, bike films, trip films, and other high hopes. Francis Ford Coppola, a young director of promise, persuaded Warner Bros. to help finance and distribute a group of features by the bright new filmmakers he'd gathered around him.
However, Coppola says he has been fascinated by the Tucker legend ever since he first saw a Tucker car in the late 40s, and he has owned a rare collectors model during the 10 years he has been trying to get this dream film off the ground. The par

In [75]:
like_person_4('ebert', 'Tom Cruise')

The archetypal Tom Cruise Movie is "Top Gun," in which the young fighter pilot, a natural, was tutored by a once-great pilot and emotionally nurtured by an older female flight instructor before testing his wings against the hot dogs of his unit, in preparation for a final showdown. And Cruise is so efficiently packaged in this product that he plays the same role as a saint in a Mexican village's holy day procession: It's not what he does that makes him so special; it's the way he manifests everybody's faith in him.
However, I'd sorta rather see Diaz and Cruise in action scenes on a human scale, rather than have it rubbed in that for long stretches, they're essentially replaced by animation. Knight and Day knows this sequence is monumentally silly, and so do Cruise and Diaz, and Cruise keeps up a reassuring line of patter all during it, even while trying to crash-land safely. Roy Miller, the Cruise character, has something, and there are bad guys who want it.
They find true love, which 

# Determine an Independent Objective Function

## Desirable Qualities of a Summary
- Favor longer paragraphs -> MAYBE NOT
- Thesis statement is at beginning of paragraph
- Thesis statement is about the desired subject (person in this case)
- If the person is an actor/actress, the sentences are about the person, not a character
- Thesis statement does not have any unresolved coreferences
- Thesis statement sentiment is representative of the paragraph sentiment
- Intra-paragraph sentiment is similar
- There are no duplicate sentences
- All sentences are subjective
- Reward summaries that have things like "I like", "I dislike", etc. => Personalization
- Loop in a generalized readability measure (or multiple) as a way of capturing coherence
    - Will start with https://pypi.python.org/pypi/readability , though a good port of Coh-Metrix may be better since it seems to pull in coherence more directly

## Other Ideas
We know that sentences in the beginning and ends of paragraphs (and likely even articles e.g. intros/conclusions) tend to work well. My thought is that this function should be able to evaluate using only the paragraph itself. We just would end up preferring those sentences at construction time since they exhibit these qualities.

## Other Evaluation Techniques
ROUGE (Recall-Oriented Understudy for Gisting Evaluation) metrics are used in most major conferences/competitions. The idea here is to compare against a set of human-written summaries across a few different features.
- ROUGE-N: N-gram[3] based co-occurrence statistics.
- ROUGE-L: Longest Common Subsequence (LCS)[4] based statistics. Longest common subsequence problem takes into account sentence level structure similarity naturally and identifies longest co-occurring in sequence n-grams automatically.
- ROUGE-W: Weighted LCS-based statistics that favors consecutive LCSes .
- ROUGE-S: Skip-bigram[5] based co-occurrence statistics. Skip-bigram is any pair of words in their sentence order.
- ROUGE-SU: Skip-bigram plus unigram-based co-occurrence statistics.

This is absolutely an area for further development. Though, likely out of scope for this project.



In [76]:
import readability

In [12]:
sample_string = """
Brando must have known the dangers, but he must have had confidence in himself, too - enough to go ahead anyway, and to win a considerable gamble. There have been a lot of movies where stars have repeated the triumphs of their parts - but has any star ever done it more triumphantly than Marlon Brando does in "The Freshman"? Think of the criticism Brando risked, from those ready to attack him for cashing in on his most famous performance by reprising it for a comedy. When Brando finished filming "The Freshman" last September, he attacked The Freshman in a notorious interview in the Toronto Globe and Mail, claiming it was trash and that he was retiring from acting. The other actors, perhaps aware of the chances Brando was taking, seem a little in awe of him - which is as it should be.
Brando even tested for the role, but he fell out, Bergman was troubled in her controversial marriage to Roberto Rossellini, and Count Luchino Visconti settled for Valli, an important European star, and Granger, a lesser but well-known star from America. Brando was an imposing masculine presence, and Mahler needs to be a faithless coward.
And, yes, Brando is good as the old Brando. "Reflections in a Golden Eye" was proof that Brando is far from dead as an actor, although he may be finished as a box office draw. But Brando has not done any acting in the last ten years to equal his magnificent performance in "Reflections in a Golden Eye" (1967) a box office flop. But the question remains: Should Brando really be wasting his time on this sort of movie? Miss Moreno has a difficult scene with Brando (she's on dope and keeps going to sleep in a bathtub) that she handles with quiet conviction.
However, In a new book about how he makes movies, director Sidney Lumet speculates that Marlon Brando has a way of testing his directors. If the director fails the test, Brando walks through the rest of Don Juan DeMarco. Brando doesn't so much walk through Don Juan DeMarco as coast, in a gassy, self-indulgent performance no one else could have gotten away with. Brando does a scene twice, he writes, once really putting his soul into it, the second time using only technique. And there are moments so bizarre we wonder if the director actually suggested them, or whether Brando had a brainstorm, and everybody was afraid to tell him it wasn't such a hot idea.
"""

In [81]:
readability.getmeasures(sample_string, "en")

OrderedDict([('readability grades',
              OrderedDict([('Kincaid', 41.764043062200955),
                           ('ARI', 51.92485645933014),
                           ('Coleman-Liau', 10.262860600478472),
                           ('FleschReadingEase', -18.23919856459328),
                           ('GunningFogIndex', 45.14928229665072),
                           ('LIX', 123.16028708133972),
                           ('SMOGIndex', 19.20185174601965),
                           ('RIX', 19.5),
                           ('DaleChallIndex', 13.730465550239236)])),
             ('sentence info',
              OrderedDict([('characters_per_word', 4.480861244019139),
                           ('syll_per_word', 1.4066985645933014),
                           ('words_per_sentence', 104.5),
                           ('sentences_per_paragraph', 4.0),
                           ('type_token_ratio', 0.5789473684210527),
                           ('characters', 1873),
             

- Kincaid: low - hard to read, high - easy to read
- Automatic Readability Index (ARI): low - easy to read, high - hard to read

In [13]:
# Going to use textacy since it still sits on top of spacy and consistency is nice
from textacy import text_stats
import textacy

In [14]:
t = text_stats.TextStats(NLP(sample_string))

In [15]:
t.readability_stats



{'automated_readability_index': 11.938512605042014,
 'coleman_liau_index': 9.236950795238098,
 'flesch_kincaid_grade_level': 10.481008403361347,
 'flesch_readability_ease': 63.92281512605044,
 'gulpease_index': 56.52380952380952,
 'gunning_fog_index': 13.596638655462186,
 'lix': 43.03921568627451,
 'smog_index': 11.78182255692154,
 'wiener_sachtextformel': 5.10251400560224}

In [17]:
# flesch_readability_ease HIGH => easy, LOW => hard

In [18]:
# lets get a baseline for Ebert:

apocalypse_now_review = """
Francis Ford Coppola's film "Apocalypse Now" was inspired by Heart of Darkness, a novel by Joseph Conrad about a European named Kurtz who penetrated to the farthest reaches of the Congo and established himself like a god. A boat sets out to find him, and on the journey the narrator gradually loses confidence in orderly civilization; he is oppressed by the great weight of the jungle all around him, a pitiless Darwinian testing ground in which each living thing tries every day not to be eaten.

What is found at the end of the journey is not Kurtz so much as what Kurtz found: that all of our days and ways are a fragile structure perched uneasily atop the hungry jaws of nature that will thoughtlessly devour us. A happy life is a daily reprieve from this knowledge.

A week ago I was in Calcutta, where I saw mile upon square mile of squatter camps in which hundreds of thousands live generation after generation in leaky huts of plastic, cardboard and scrap metal, in poverty so absolute it is impossible to see any hope of escape. I do not mean to equate the misery of those hopeless people with a movie; that would be indecent. But I was deeply shaken by what I saw, and realized how precious and precarious is a happy life. And in such a mood I watched "Apocalypse Now" and came to the scene where Col. Kurtz (Marlon Brando) tells Capt. Willard (Martin Sheen) about "the horror."

Kurtz is a decorated hero, one of the best soldiers in the Army, who has created a jungle sanctuary upriver inside enemy territory, and rules Montagnard tribesmen as his private army. He tells Willard about a day when his Special Forces men inoculated the children of a village against polio: "This old man came running after us and he was crying, he couldn't see. We went back there, and they had come and hacked off every inoculated arm. There they were in a pile, a pile of little arms. . . ."

What Kurtz learned is that the Viet Cong were willing to go to greater lengths to win: "Then I realized they were stronger than we. They have the strength, the strength to do that. If I had 10 divisions of those men, then our troubles here would be over very quickly. You have to have men who are moral and at the same time who are able to utilize their primordial instincts to kill without feeling, without passion, without judgment." This is the "horror" that Kurtz has found, and it threatens to envelop Willard, too.

The whole movie is a journey toward Willard's understanding of how Kurtz, one of the Army's best soldiers, penetrated the reality of war to such a depth that he could not look any longer without madness and despair.

The film has one of the most haunting endings in cinema, a poetic evocation of what Kurtz has discovered, and what we hope not to discover for ourselves. The river journey creates enormous anticipation about Kurtz, and Brando fulfills it. When the film was released in 1979, his casting was criticized and his enormous paycheck of $1 million was much discussed, but it's clear he was the correct choice, not only because of his stature as an icon, but because of his voice, which enters the film from darkness or half-light, repeating the words of T.S. Eliot's despairing "The Hollow Men." That voice sets the final tone of the film.

Another crucial element in the ending is the photojournalist (Dennis Hopper) who has somehow found Kurtz's camp and stayed there, stoned, as a witness. He blathers to Willard that Kurtz is "a poet-warrior in the classic sense" and "we're all his children." In the photographer's spaced-out ravings we hear disconnected snatches of the poetry he must have heard Kurtz reciting: If you can keep your head when all about you . . . I should have been a pair of ragged claws, scuttling across the floor of a silent sea. . . ." The photographer is the guide, the clown, the fool, providing the balance between Willard and Kurtz.

Why has "Apocalypse Now" been so long bedeviled by rumors that Coppola was not happy with this ending? At the film's premiere at Cannes, I saw the confusion begin. Coppola originally intended to show the movie as a 70mm roadshow with no credits (they would be printed in a booklet). But the 35mm release would need end titles. After he was finished filming on the huge set of the Kurtz compound, Coppola was required by the Philippine government to destroy it, and he photographed it being blown up. He decided to use this footage over his closing 35mm credits, even though (this is crucial) he did not intend the destruction of the compound as an alternative "ending" to the film. Alas, confusion about the endings spread from Cannes into movie folklore, and most people thought that by "ending" he meant all of the material involving Kurtz. In the 20th anniversary DVD release, Coppola patiently explains all of this once again.

In any event, seen again now at a distance of 20 years, "Apocalypse Now" is more clearly than ever one of the key films of the century. Most films are lucky to contain a single great sequence. "Apocalypse Now" strings together one after another, with the river journey as the connecting link. The best is the helicopter attack on a Vietnam village, led by Col. Kilgore (Robert Duvall), whose choppers use loudspeakers at top volume to play Wagner's "Ride of the Valkyries" as they swoop down on a yard full of schoolchildren. Duvall won an Oscar nomination for his performance and its unforgettable line, "I love the smell of napalm in the morning." His emptiness is frightening: A surfing fanatic, he agrees to the attack only to liberate a beach said to offer great waves ("Charlie don't surf").

There is also the sequence where the patrol boat stops a small fishing boat with a family on board. A little girl makes a sudden dash, and the jumpy machine-gunner (a young Laurence Fishburne) opens fire, wiping out the entire family. It turns out the girl was running for her puppy. The mother is not quite dead. The boat chief (Albert Hall) wants to take her for medical treatment. Willard puts a bullet into her; nothing can delay his mission. He and "Chief" are the only two seasoned military men on the boat, trying to do things by the book; later, in a scene with peculiar power, the chief is astonished to be killed by a spear.

For me the most remarkable visuals in the film occur when Chef (Fredric Forrest), one of Willard's crew members, insists on venturing into the forest in search of mangos. Willard can't stop him, so he joins him. The great cinematographer Vittorio Storaro shows them as little human specks at the foot of towering trees, and this is a Joseph Conrad moment, showing how nature dwarfs us.

The rock 'n' roll soundtrack opens and closes with "The End" by the Doors, and includes disc jockeys on transistor radios ("Good morning, Vietnam!"). The music underlines surrealistic moments, as when Lance (Sam Bottoms), one of Willard's crew, water-skis behind the boat. It also shows how the soldiers try to use the music of home, and booze and drugs, to ease their loneliness and apprehension.

Other important films such as "Platoon," "The Deer Hunter," "Full Metal Jacket" and "Casualties of War" take their own approaches to Vietnam. Once at the Hawaii Film Festival I saw five North Vietnamese films about the war. (They never mentioned "America," only "the enemy," and one director told me, "It is all the same--we have been invaded by China, France, the U.S. . . .") But "Apocalypse Now" is the best Vietnam film, one of the greatest of all films, because it pushes beyond the others, into the dark places of the soul. It is not about war so much as about how war reveals truths we would be happy never to discover.

In a way I cannot quite explain, my thoughts since Calcutta prepared me to understand the horror that Kurtz found. If we are lucky, we spend our lives in a fool's paradise, never knowing how close we skirt the abyss. What drives Kurtz mad is his discovery of this.
"""

In [19]:
t = text_stats.TextStats(NLP(apocalypse_now_review))
t.readability_stats

{'automated_readability_index': 9.938535250405238,
 'coleman_liau_index': 8.7505075641749,
 'flesch_kincaid_grade_level': 8.916074984737808,
 'flesch_readability_ease': 68.83445834999895,
 'gulpease_index': 59.042313117066286,
 'gunning_fog_index': 11.850721007094288,
 'lix': 41.121865987411326,
 'smog_index': 10.77446382781758,
 'wiener_sachtextformel': 4.4982910847735935}

In [20]:
godfather_review = """
“The Godfather” is told entirely within a closed world. That’s why we sympathize with characters who are essentially evil. The story by Mario Puzo and Francis Ford Coppola is a brilliant conjuring act, inviting us to consider the Mafia entirely on its own terms. Don Vito Corleone (Marlon Brando) emerges as a sympathetic and even admirable character; during the entire film, this lifelong professional criminal does nothing of which we can really disapprove.

During the movie we see not a single actual civilian victim of organized crime. No women trapped into prostitution. No lives wrecked by gambling. No victims of theft, fraud or protection rackets. The only police officer with a significant speaking role is corrupt.

The story views the Mafia from the inside. That is its secret, its charm, its spell; in a way, it has shaped the public perception of the Mafia ever since. The real world is replaced by an authoritarian patriarchy where power and justice flow from the Godfather, and the only villains are traitors. There is one commandment, spoken by Michael (Al Pacino): “Don’t ever take sides against the family.”

It is significant that the first shot is inside a dark, shuttered room. It is the wedding day of Vito Corleone’s daughter, and on such a day a Sicilian must grant any reasonable request. A man has come to ask for punishment for his daughter’s rapist. Don Vito asks why he did not come to him immediately.

“I went to the police, like a good American,” the man says. The Godfather’s reply will underpin the entire movie: “Why did you go to the police? Why didn’t you come to me first? What have I ever done to make you treat me so disrespectfully? If you’d come to me in friendship, then this scum that ruined your daughter would be suffering this very day. And if, by chance, an honest man like yourself should make enemies . . . then they would become my enemies. And then they would fear you.”

As the day continues, there are two more scenes in the Godfather’s darkened study, intercut with scenes from the wedding outside. By the end of the wedding sequence, most of the main characters will have been introduced, and we will know essential things about their personalities. It is a virtuoso stretch of filmmaking: Coppola brings his large cast onstage so artfully that we are drawn at once into the Godfather’s world.

The screenplay of “The Godfather” follows no formulas except for the classic structure in which power passes between the generations. The writing is subtly constructed to set up events later in the film. Notice how the request by Johnny Fontane, the failing singer, pays off in the Hollywood scenes; how his tears set up the shocking moment when a mogul wakes up in bed with what is left of his racehorse. Notice how the undertaker is told “someday, and that day may never come, I will ask a favor of you. . .” and how when the day comes the favor is not violence (as in a conventional movie) but Don Vito’s desire to spare his wife the sight of their son’s maimed body. And notice how a woman’s “mistaken” phone call sets up the trap in which Sonny (James Caan) is murdered: It’s done so neatly that you have to think back through the events to figure it out.

Now here is a trivia question: What is the name of Vito’s wife? She exists in the movie as an insignificant shadow, a plump Sicilian grandmother who poses with her husband in wedding pictures but plays no role in the events that take place in his study. There is little room for women in “The Godfather.” Sonny uses and discards them, and ignores his wife. Connie (Talia Shire), the Don’s daughter, is so disregarded that her husband is not allowed into the family business. He is thrown a bone--”a living”--and later, when he is killed, Michael coldly lies to his sister about what happened.

The irony of the title is that it eventually comes to refer to the son, not the father. As the film opens Michael is not part of the family business, and plans to marry a WASP, Kay Adams (Diane Keaton). His turning point comes when he saves his father’s life by moving his hospital bed, and whispers to the unconscious man: “I’m with you now.”

After he shoots the corrupt cop, Michael hides in Sicily, where he falls in love with and marries Appolonia (Simonetta Stefanelli). They do not speak the same language; small handicap for a Mafia wife. He undoubtedly loves Appolonia, as he loved Kay, but what is he thinking here: that he can no longer marry Kay because he has chosen a Mafia life? After Appolonia’s death and his return to America, he seeks out Kay and eventually they marry. Did he tell her about Appolonia? Such details are unimportant to the story.

What is important is loyalty to the family. Much is said in the movie about trusting a man’s word, but honesty is nothing compared to loyalty. Michael doesn’t even trust Tom Hagen (Robert Duvall) with the secret that he plans to murder the heads of the other families. The famous “baptism massacre” is tough, virtuoso filmmaking: The baptism provides him with an airtight alibi, and he becomes a godfather in both senses at the same time.

Vito Corleone is the moral center of the film. He is old, wise and opposed to dealing in drugs. He understands that society is not alarmed by “liquor, gambling . . . even women.” But drugs are a dirty business to Don Vito, and one of the movie’s best scenes is the Mafia summit at which he argues his point. The implication is that in the godfather’s world there would be no drugs, only “victimless crimes,” and justice would be dispatched evenly and swiftly.

My argument is taking this form because I want to point out how cleverly Coppola structures his film to create sympathy for his heroes. The Mafia is not a benevolent and protective organization, and the Corleone family is only marginally better than the others. Yet when the old man falls dead among his tomato plants, we feel that a giant has passed.

Gordon Willis’ cinematography is celebrated for its darkness; it is rich, atmospheric, expressive. You cannot appreciate this on television because the picture is artificially brightened. Coppola populates his dark interior spaces with remarkable faces. The front-line actors--Brando, Pacino, Caan, Duvall--are attractive in one way or another, but those who play their associates are chosen for their fleshy, thickly lined faces--for huge jaws and deeply set eyes. Look at Abe Vigoda as Tessio, the fearsome enforcer. The first time we see him, he’s dancing with a child at the wedding, her satin pumps balanced on his shoes. The sun shines that day, but never again: He is developed as a hulking presence who implies the possibility of violent revenge. Only at the end is he brightly lit again, to make him look vulnerable as he begs for his life.

The Brando performance is justly famous and often imitated. We know all about his puffy cheeks, and his use of props like the kitten in the opening scene. Those are actor’s devices. Brando uses them but does not depend on them: He embodies the character so convincingly that at the end, when he warns his son two or three times that “the man who comes to you to set up a meeting--that’s the traitor,” we are not thinking of acting at all. We are thinking that the Don is growing old and repeating himself, but we are also thinking that he is probably absolutely right.

Pacino plays Michael close to his vest; he has learned from his father never to talk in front of outsiders, never to trust anyone unnecessarily, to take advice but keep his own counsel. All of the other roles are so successfully filled that a strange thing happened as I watched this restored 1997 version: Familiar as I am with Robert Duvall, when he first appeared on the screen I found myself thinking, “There’s Tom Hagen.”

Coppola went to Italy to find Nino Rota, composer of many Fellini films, to score the picture. Hearing the sadness and nostalgia of the movie’s main theme, I realized what the music was telling us: Things would have turned out better if we had only listened to the Godfather.
"""

In [21]:
# also look at similarity, check out word movers
similarity.word_movers(NLP(apocalypse_now_review), NLP(godfather_review))

0.6768157809129848

In [16]:
EBERT_READABILITY = 9.782916286401726
def score_summary(summary_text, desired_person):
    """Score a summarized piece of text
    """
    # Want high similarity between paragraphs
    inter_paragraph_similarities = []
    avg_similarity = None
    
    # readability measures close to ebert baseline
    readability = abs(text_stats.TextStats(NLP(summary_text)).automated_readability_index - EBERT_READABILITY)/EBERT_READABILITY
    
    
    # more mentions of person
    num_person = 0
    sentences = []
    for s in summary_text.split('.'):
        sentences += s.split('?')
            
    
    for i in sentences:
        if desired_person in i:
            num_person += 1
    person_score = num_person / len(sentences)
    
    # more subjective is better
    total_subjectivity = 0
    for i in sentences:
        total_subjectivity += TextBlob(i).sentiment[1]
    subjectivity = total_subjectivity/len(sentences)
    
    # thesis sentence doesn't have "this", "here", "it"
    if sentences[0] not in [' ', '', '\n']:
        thesis_penalty = sum(i in sentences[0] for i in [" this ", " This ", " here ", " Here"])
    elif sentences[1] not in [' ', '', '\n']:
        thesis_penalty = sum(i in sentences[1] for i in [" this ", " This ", " here ", " Here"])
    else:
        thesis_penalty = 0
    
    # Prefer expressions from the author
    author_count = 0
    for s in sentences:
        if any(i in s for i in ["I ", "I'd"]):
            author_count += 1
    
    # iterate through the paragraphs
    # sentiment within a paragraph is similar
    paragraphs = summary_text.split('\n')
    for i in range(1, len(paragraphs)):
        if paragraphs[i - 1] not in [' ', '', '\n'] and paragraphs[i] not in [' ', '', '\n']:
            inter_paragraph_similarities.append(similarity.word_movers(NLP(paragraphs[i - 1]), NLP(paragraphs[i])))
            
    max_diff = 0
    for p in paragraphs:
        p_sent_min = None
        p_sent_max = None    
        for s in p.split('.'):
            sent = TextBlob(s).sentiment[0]
            if p_sent_min is None:
                p_sent_min = sent
            if p_sent_max is None:
                p_sent_max = sent
                
            if sent < p_sent_min:
                p_sent_min = sent
            if sent > p_sent_max:
                p_sent_max = sent
        if max_diff < abs(p_sent_max - p_sent_min):
            max_diff = abs(p_sent_max - p_sent_min)
    max_diff = 1 - max_diff 
    avg_similarity = sum(inter_paragraph_similarities)/len(inter_paragraph_similarities)
    
    # high num_sentences / paragraph
    sents_per_par = 1 - (len(paragraphs) / len(sentences))
    
    #print(person_score, sents_per_par, avg_similarity, readability, max_diff)
    
    
    # Make score
    #score = sum([person_score, sents_per_par, avg_similarity, readability, max_diff])/5
    score = sum([person_score, avg_similarity, readability, max_diff])/4
    print('person_score', 'avg_similarity', 'readability', 'max_diff')
    print(person_score, avg_similarity, readability, max_diff)
    # boost by person count
    score = score * (1 + (0.1 * author_count))
    score = score - (0.2 * thesis_penalty)
    
    print(score)

In [23]:
score_summary(sample_string, 'Brando')

person_score avg_similarity readability max_diff
0.9444444444444444 0.5501685739883325 0.22034291774903275 0.30000000000000004
0.5037389840454525


In [24]:
other_brando = """There have been a lot of movies where stars have repeated the triumphs of their parts - but has any star ever done it more triumphantly than Marlon Brando does in "The Freshman"? Think of the criticism Brando risked, from those ready to attack him for cashing in on his most famous performance by reprising it for a comedy. Brando must have known the dangers, but he must have had confidence in himself, too - enough to go ahead anyway, and to win a considerable gamble. When Brando finished filming "The Freshman" last September, he attacked The Freshman in a notorious interview in the Toronto Globe and Mail, claiming it was trash and that he was retiring from acting. The other actors, perhaps aware of the chances Brando was taking, seem a little in awe of him - which is as it should be.
However, Brando doesn't so much walk through Don Juan DeMarco as coast, in a gassy, self-indulgent performance no one else could have gotten away with. In a new book about how he makes movies, director Sidney Lumet speculates that Marlon Brando has a way of testing his directors. Brando does a scene twice, he writes, once really putting his soul into it, the second time using only technique. And there are moments so bizarre we wonder if the director actually suggested them, or whether Brando had a brainstorm, and everybody was afraid to tell him it wasn't such a hot idea. If the director fails the test, Brando walks through the rest of Don Juan DeMarco.
And, yes, Brando is good as the old Brando. "Reflections in a Golden Eye" was proof that Brando is far from dead as an actor, although he may be finished as a box office draw.
If you can take that moment and hold it in your mind with the famous scene where he assaults Stella's sister, Blanche DuBois (Vivien Leigh), you can see the freedom Brando is giving to Stanley Kowalski - and the range. Brando held nothing back, and within a few years his was the style that dominated Hollywood movie acting."""

score_summary(other_brando, 'Brando')

person_score avg_similarity readability max_diff
0.9333333333333333 0.5584514089379983 0.21625024211246366 0.30000000000000004
0.5020087460959488


In [25]:
bad_cruise = """The archetypal Tom Cruise Movie is "Top Gun," in which the young fighter pilot, a natural, was tutored by a once-great pilot and emotionally nurtured by an older female flight instructor before testing his wings against the hot dogs of his unit, in preparation for a final showdown. And Cruise is so efficiently packaged in this product that he plays the same role as a saint in a Mexican village's holy day procession: It's not what he does that makes him so special; it's the way he manifests everybody's faith in him.
However, I'd sorta rather see Diaz and Cruise in action scenes on a human scale, rather than have it rubbed in that for long stretches, they're essentially replaced by animation. Knight and Day knows this sequence is monumentally silly, and so do Cruise and Diaz, and Cruise keeps up a reassuring line of patter all during it, even while trying to crash-land safely. Roy Miller, the Cruise character, has something, and there are bad guys who want it.
They find true love, which is shattered when Shue sees Cruise with the rich Manhattan executive. That's when Cruise drops out of school, becomes a full-time bartender, makes Brown his best friend and learns to juggle those bottles. Brown advises Cruise to keep his eyes open for a "rich chick," because that's his ticket to someday opening his own bar.
After Moore and Cruise meet with the accused young Marines, she realizes they have a sticky case on their hands, because the unwritten Marine code means that the two won't talk, even to save themselves."""

score_summary(bad_cruise, 'Cruise')

person_score avg_similarity readability max_diff
0.9 0.5138724477793329 0.5133846031421214 0.0
0.5299956890034


This sounds like an entertaining story, I suppose, but Scorsese doesn't direct a single scene for a payoff. Scorsese and De Niro are the most creative, productive director/actor team in The King of Comedys right now, and the fact that they feel the freedom to make such an odd, stimulating, unsatisfyng movie is good news, I guess. It's hard to believe Scorsese made it; instead of the big-city life, the violence and sexuality of his movies like "Taxi Driver" and "Mean Streets," what we have here is an agonizing portrait of lonely, angry people with their emotions all tightly bottled up.
Scorsese seems so much in command of his gift in GoodFellas. So says Henry Hill in the opening moments of Martin Scorseses GoodFellas, a movie about the tradecraft and culture of organized crime in New York.
However, The way he sees the character of Sam Bowden is the key to why Martin Scorsese wanted to remake the 1962 thriller Cape Fear. Unlike the simplistic version of this scene we have seen in a hundred thrillers, what Scorsese gives us is a villain who has been wronged, seeking to harm a hero who has sinned. Cape Fear is impressive moviemaking, showing Scorsese as a master of a traditional Hollywood genre who is able to mold it to his own themes and obsessions. This is the first film in a production deal Scorsese has with Universal Studios and Steven Spielberg's Amblin Entertainment, and it represents his access to budgets much larger than those he has worked with in the past. And with this new version of Max Cady, Scorsese gives us not simply a bad man, but an evil one - whose whole purpose is to show Sam Bowden he's a criminal, too. In the Scorsese version, Bowden is flawed and guilty, and indeed everyone in Cape Fear is weak in one way or another, and there are no heroes.
Scorsese remains one of the bright young hopes of American movies. Director Martin Scorsese has gone for mood and atmosphere more than for action, and his violence is always blunt and unpleasant -- never liberating and exhilarating, as the New Violence is supposed to be.

In [26]:
scorcese_sample = """This sounds like an entertaining story, I suppose, but Scorsese doesn't direct a single scene for a payoff. Scorsese and De Niro are the most creative, productive director/actor team in The King of Comedys right now, and the fact that they feel the freedom to make such an odd, stimulating, unsatisfyng movie is good news, I guess. It's hard to believe Scorsese made it; instead of the big-city life, the violence and sexuality of his movies like "Taxi Driver" and "Mean Streets," what we have here is an agonizing portrait of lonely, angry people with their emotions all tightly bottled up.
Scorsese seems so much in command of his gift in GoodFellas. So says Henry Hill in the opening moments of Martin Scorseses GoodFellas, a movie about the tradecraft and culture of organized crime in New York.
However, The way he sees the character of Sam Bowden is the key to why Martin Scorsese wanted to remake the 1962 thriller Cape Fear. Unlike the simplistic version of this scene we have seen in a hundred thrillers, what Scorsese gives us is a villain who has been wronged, seeking to harm a hero who has sinned. Cape Fear is impressive moviemaking, showing Scorsese as a master of a traditional Hollywood genre who is able to mold it to his own themes and obsessions. This is the first film in a production deal Scorsese has with Universal Studios and Steven Spielberg's Amblin Entertainment, and it represents his access to budgets much larger than those he has worked with in the past. And with this new version of Max Cady, Scorsese gives us not simply a bad man, but an evil one - whose whole purpose is to show Sam Bowden he's a criminal, too. In the Scorsese version, Bowden is flawed and guilty, and indeed everyone in Cape Fear is weak in one way or another, and there are no heroes.
Scorsese remains one of the bright young hopes of American movies. Director Martin Scorsese has gone for mood and atmosphere more than for action, and his violence is always blunt and unpleasant -- never liberating and exhilarating, as the New Violence is supposed to be."""

score_summary(scorcese_sample, "Scorsese")

person_score avg_similarity readability max_diff
0.9285714285714286 0.5305535211966649 0.41753748020396547 -0.02499999999999991
0.5554987289916178


Cape Fear is impressive moviemaking, showing Scorsese as a master of a traditional Hollywood genre who is able to mold it to his own themes and obsessions. The way he sees the character of Sam Bowden is the key to why Martin Scorsese wanted to remake the 1962 thriller Cape Fear. This is the first film in a production deal Scorsese has with Universal Studios and Steven Spielberg's Amblin Entertainment, and it represents his access to budgets much larger than those he has worked with in the past. And with this new version of Max Cady, Scorsese gives us not simply a bad man, but an evil one - whose whole purpose is to show Sam Bowden he's a criminal, too. Unlike the simplistic version of this scene we have seen in a hundred thrillers, what Scorsese gives us is a villain who has been wronged, seeking to harm a hero who has sinned.
Remarkably, writer-director Martin Scorsese is only 25 years old, and this is his first film. It is possible that with more experience and maturity Scorsese will direct more polished, finished films--but this work, completed when he was 25, contains a frankness he may have diluted by then. Scorsese is gifted at handling subtle moments, but he has some trouble with the more obvious ones. For another misplaced scene, however, Scorsese does not deserve full blame.
Scorsese shows a sure sense for the Hollywood of that time, as in a scene where Howard, new in town, approaches the mogul L.B. Mayer at the Coconut Grove and asks to borrow two cameras for a big "Hells' Angels" scene. Their lives, in fact, are even a little similar: Heedless ambition and talent when young, great early success, tempestuous romances and a dark period, although with Hughes it got darker and darker, while Scorsese has emerged into the full flower of his gifts. Scorsese adds subtle continuity: Every time we see Mayer, he seems to be surrounded by the same flunkies.
I have often thought that many of Scorsese's critics and admirers do not realize how deeply the Catholic Church of pre-Vatican II could burrow into the subconscious, or in how many ways Scorsese is a Catholic director. It is intriguing to wonder what Scorsese saw in the Hong Kong movie that inspired him to make the second remake of his career (after "Cape Fear"). It's strange that Jack Nicholson and Scorsese have never worked together, since they seem like a natural fit; he makes Frank Costello not a godfather, not a rat, not a blowhard, but a smart man who finally encounters a situation no one could fight free of, because he simply lacks all the necessary information."""


In [27]:
other_scorsese = """Cape Fear is impressive moviemaking, showing Scorsese as a master of a traditional Hollywood genre who is able to mold it to his own themes and obsessions. The way he sees the character of Sam Bowden is the key to why Martin Scorsese wanted to remake the 1962 thriller Cape Fear. This is the first film in a production deal Scorsese has with Universal Studios and Steven Spielberg's Amblin Entertainment, and it represents his access to budgets much larger than those he has worked with in the past. And with this new version of Max Cady, Scorsese gives us not simply a bad man, but an evil one - whose whole purpose is to show Sam Bowden he's a criminal, too. Unlike the simplistic version of this scene we have seen in a hundred thrillers, what Scorsese gives us is a villain who has been wronged, seeking to harm a hero who has sinned.
Remarkably, writer-director Martin Scorsese is only 25 years old, and this is his first film. It is possible that with more experience and maturity Scorsese will direct more polished, finished films--but this work, completed when he was 25, contains a frankness he may have diluted by then. Scorsese is gifted at handling subtle moments, but he has some trouble with the more obvious ones. For another misplaced scene, however, Scorsese does not deserve full blame.
Scorsese shows a sure sense for the Hollywood of that time, as in a scene where Howard, new in town, approaches the mogul L.B. Mayer at the Coconut Grove and asks to borrow two cameras for a big "Hells' Angels" scene. Their lives, in fact, are even a little similar: Heedless ambition and talent when young, great early success, tempestuous romances and a dark period, although with Hughes it got darker and darker, while Scorsese has emerged into the full flower of his gifts. Scorsese adds subtle continuity: Every time we see Mayer, he seems to be surrounded by the same flunkies.
I have often thought that many of Scorsese's critics and admirers do not realize how deeply the Catholic Church of pre-Vatican II could burrow into the subconscious, or in how many ways Scorsese is a Catholic director. It is intriguing to wonder what Scorsese saw in the Hong Kong movie that inspired him to make the second remake of his career (after "Cape Fear"). It's strange that Jack Nicholson and Scorsese have never worked together, since they seem like a natural fit; he makes Frank Costello not a godfather, not a rat, not a blowhard, but a smart man who finally encounters a situation no one could fight free of, because he simply lacks all the necessary information."""

score_summary(other_scorsese, 'Scorsese')

person_score avg_similarity readability max_diff
0.8333333333333334 0.5710885434736652 0.5363524419501641 -0.02499999999999991
0.5268379376582198


In [28]:
qt = """Now that we know Quentin Tarantino can make a movie like "Reservoir Dogs," it's time for him to move on and make a better one. Tarantino himself is also interesting as an actor; he could play great crazy villains. Reservoir Dogs feels like it's going to be terrific, but Tarantino's script doesn't have much curiosity about these guys. Having created the characters and fashioned the outline, Tarantino doesn't do much with his characters except to let them talk too much, especially when they should be unconscious from shock and loss of blood.
But this is all one film, and now that we see it whole, it's greater than its two parts; Tarantino remains the most brilliantly oddball filmmaker of his generation, and this is one of the best films of the year. Put the two parts together, and Tarantino has made a masterful saga that celebrates the martial arts genre while kidding it, loving it, and transcending it. Kill Bill, Volume 2 is a distillation of the countless grind house kung-fu movies Tarantino has absorbed, and which he loves beyond all reason. Tarantino, who began Kill Bill, Volume 2 in black and white before switching to color, plays with formats here, too; to suggest the claustrophobia of being buried, he shows The Bride inside her wooden casket, and as clods of earth rain down on the lid, he switches from widescreen to the classic 4x3 screen ratio.
Instead of waving his arms and talking brilliantly a mile a minute, Tarantino mumbles an approximation of a Southwestern accent and puts his charisma on hold. The first of many disappointments comes early in "Destiny Turns On the Radio," when Quentin Tarantino appears on the screen. The disappointment is that Tarantino is not playing "himself," the persona he has perfected on many talk shows and in celebrated roles in "Reservoir Dogs," "Pulp Fiction" and "Sleep With Me" (where he explained his theory that "Top Gun" was a gay movie).
Tarantino has a lot of good scenes in Jackie Brown. This is Jackie Brown that proves Tarantino is the real thing, and not just a two-film wonder boy. Tarantino leaves the hardest questions for last, hides his moves, conceals his strategies in plain view, and gives his characters dialogue that is alive, authentic and spontaneous."""

score_summary(qt, 'Tarantino')

person_score avg_similarity readability max_diff
0.9333333333333333 0.5409352920149993 0.45611934413809613 0.09999999999999998
0.5075969923716072


In [29]:
ke = """Why he likes Beethoven is never explained, but my notion is that Alex likes Beethoven in the same way that Kubrick likes to load his sound track with familiar classical music -- to add a cute, cheap, dead-end dimension. I don't know quite how to explain my disgust at Alex (whom Kubrick likes very much, as his visual style reveals and as we shall see in a moment). No, I think Kubrick is being too modest: Alex is all his. When Kubrick shows us Alex, however, he either places him in the center of a wide-angle shot (so Alex alone has normal human dimensions,) or uses a standard lens that does not distort. Kubrick has used visuals to alter the book's point of view and to nudge us toward a kind of grudging pal-ship with Alex. But that isn't what Kubrick is saying, He actually seems to be implying something simpler and more frightening: that in a world where society is criminal, the citizen might as well be a criminal, too. Kubrick uses the wide-angle lens almost all the time when he is showing events from Alex's point of view; this encourages us to see the world as Alex does, as a crazy-house of weird people out to get him. Alex is violent because it is necessary for him to be violent in order for A Clockwork Orange to entertain in the way Kubrick intends.
Barry Lyndon" isn't a great entertainment in the usual way, but it's a great example of directorial vision: Kubrick saying he's going to make this material function as an illustration of the way he sees the world. Kubrick told the critic Michel Ciment he used the narrator because the novel had too much incident even for a three-hour film, but there isn't the slightest sense he's condensing. There is no other way to see Barry than the way Kubrick sees him. By placing such small characters on such a big stage, by forcing our detachment from them, Kubrick supplies a philosophical position just as clearly as if he'd put speeches in his characters' mouths.
Stanley Kubrick himself was a perfectionist who went to obsessive lengths in order to get everything in his films to work just right. Kubrick the perfectionist left the unplanned slip in Dr. Strangelove (Restored Version), because Scott made it seem convincing, and not an accident. Yet out of these rudimentary physical props and a brilliant screenplay (which Kubrick and Terry Southern based on a novel by Peter George), Kubrick made what is arguably the best political satire of the century, a film that pulled the rug out from under the Cold War by arguing that if a ``nuclear deterrent'' destroys all life on Earth, it is hard to say exactly what it has deterred. Kubrick, whose attention to the smallest detail in every frame was obsessive, would have been aware of George C. Scott's facial gymnastics, and yet he endorsed them, and when you watch ``Strangelove'' you can see why.
Stanley Kubrick himself was a perfectionist who went to obsessive lengths in order to get everything in his films to work just right. Kubrick the perfectionist left the unplanned slip in Dr. Strangelove, because Scott made it seem convincing, and not an accident. Yet out of these rudimentary physical props and a brilliant screenplay (which Kubrick and Terry Southern based on a novel by Peter George), Kubrick made what is arguably the best political satire of the century, a film that pulled the rug out from under the Cold War by arguing that if a "nuclear deterrent" destroys all life on Earth, it is hard to say exactly what it has deterred. Kubrick, whose attention to the smallest detail in every frame was obsessive, would have been aware of George C. Scott's facial gymnastics, and yet he endorsed them, and when you watch "Strangelove" you can see why."""

score_summary(ke, 'Kubrick')

person_score avg_similarity readability max_diff
0.8 0.7148757556863313 0.48850340231830963 0.1875
0.6572637474013922


The archetypal Tom Cruise Movie is "Top Gun," in which the young fighter pilot, a natural, was tutored by a once-great pilot and emotionally nurtured by an older female flight instructor before testing his wings against the hot dogs of his unit, in preparation for a final showdown. And Cruise is so efficiently packaged in this product that he plays the same role as a saint in a Mexican village's holy day procession: It's not what he does that makes him so special; it's the way he manifests everybody's faith in him.
However, I'd sorta rather see Diaz and Cruise in action scenes on a human scale, rather than have it rubbed in that for long stretches, they're essentially replaced by animation. Knight and Day knows this sequence is monumentally silly, and so do Cruise and Diaz, and Cruise keeps up a reassuring line of patter all during it, even while trying to crash-land safely. Roy Miller, the Cruise character, has something, and there are bad guys who want it.
They find true love, which is shattered when Shue sees Cruise with the rich Manhattan executive. That's when Cruise drops out of school, becomes a full-time bartender, makes Brown his best friend and learns to juggle those bottles. Brown advises Cruise to keep his eyes open for a "rich chick," because that's his ticket to someday opening his own bar.
After Moore and Cruise meet with the accused young Marines, she realizes they have a sticky case on their hands, because the unwritten Marine code means that the two won't talk, even to save themselves.

I'd sorta rather see Diaz and Cruise in action scenes on a human scale, rather than have it rubbed in that for long stretches, they're essentially replaced by animation. Knight and Day knows this sequence is monumentally silly, and so do Cruise and Diaz, and Cruise keeps up a reassuring line of patter all during it, even while trying to crash-land safely.
The archetypal Tom Cruise Movie is "Top Gun," in which the young fighter pilot, a natural, was tutored by a once-great pilot and emotionally nurtured by an older female flight instructor before testing his wings against the hot dogs of his unit, in preparation for a final showdown. And Cruise is so efficiently packaged in this product that he plays the same role as a saint in a Mexican village's holy day procession: It's not what he does that makes him so special; it's the way he manifests everybody's faith in him.
However, Tom Cruise, who initially seemed to many people an unlikely choice to play Lestat, is never less than convincing, and his slight British accent, combined with makeup that is dramatic without being obtrusive, disguises the clean-cut star - makes him seem unwholesome in an odd, insinuating way.
I have no reason to believe that Tom Cruise is gay, except maybe for the time he used Oprah's couch as a trampoline, but since she brought it up, isn't his private life none of our beeswax?"""


In [30]:
other_cruise = """I'd sorta rather see Diaz and Cruise in action scenes on a human scale, rather than have it rubbed in that for long stretches, they're essentially replaced by animation. Knight and Day knows this sequence is monumentally silly, and so do Cruise and Diaz, and Cruise keeps up a reassuring line of patter all during it, even while trying to crash-land safely.
The archetypal Tom Cruise Movie is "Top Gun," in which the young fighter pilot, a natural, was tutored by a once-great pilot and emotionally nurtured by an older female flight instructor before testing his wings against the hot dogs of his unit, in preparation for a final showdown. And Cruise is so efficiently packaged in this product that he plays the same role as a saint in a Mexican village's holy day procession: It's not what he does that makes him so special; it's the way he manifests everybody's faith in him.
However, Tom Cruise, who initially seemed to many people an unlikely choice to play Lestat, is never less than convincing, and his slight British accent, combined with makeup that is dramatic without being obtrusive, disguises the clean-cut star - makes him seem unwholesome in an odd, insinuating way.
I have no reason to believe that Tom Cruise is gay, except maybe for the time he used Oprah's couch as a trampoline, but since she brought it up, isn't his private life none of our beeswax?"""

score_summary(other_cruise, 'Cruise')

person_score avg_similarity readability max_diff
0.8571428571428571 0.4960873088279995 1.057385377368458 0.8604166666666666
0.9813096630017943


# Work on Using Scorer on Generic Text

In [31]:
ebert_sample = """
The success of “The Godfather” as a novel was largely due to a series of unforgettable scenes. Puzo is a good storyteller, but no great shakes as a writer. The movie gives almost everything in the novel except the gynecological repair job. It doesn’t miss a single killing; it opens with the wedding of Don Corleone’s daughter (and attendant upstairs activity); and there are the right number of auto bombs, double crosses, and garrotings.
Coppola has found a style and a visual look for all this material so “The Godfather” becomes something of a rarity: a really good movie squeezed from a bestseller. The decision to shoot everything in period decor (the middle and late 1940s) was crucial; if they’d tried to save money as they originally planned, by bringing everything up-to-date, the movie simply wouldn’t have worked. But it’s uncannily successful as a period piece, filled with sleek, bulging limousines and postwar fedoras. Coppola and his cinematographer, Gordon Willis, also do some interesting things with the color photography. The earlier scenes have a reddish-brown tint, slightly overexposed and feeling like nothing so much as a 1946 newspaper rotogravure supplement.
Although the movie is three hours long, it absorbs us so effectively it never has to hurry. There is something in the measured passage of time as Don Corleone hands over his reins of power that would have made a shorter, faster moving film unseemly. Even at this length, there are characters in relationships you can’t quite understand unless you’ve read the novel. Or perhaps you can, just by the way the characters look at each other.
"""

In [32]:
score_summary(ebert_sample, 'Coppola')

person_score avg_similarity readability max_diff
0.14285714285714285 0.5477441397064999 0.03712192955661554 0.42500000000000004
0.28818080303006455


Those who have read the novel may be surprised to find Michael at the center of the movie, instead of Don Corleone. In fact, this is simply an economical way for Coppola to get at the heart of the Puzo story, which dealt with the transfer of power within the family. Marlon Brando, who plays the Godfather as a shrewd, unbreakable old man, actually has the character lead in the movie; Al Pacino, with a brilliantly developed performance as Michael, is the lead.
But Brando’s performance is a skillful throwaway, even though it earned him an Academy Award for best actor. His voice is wheezy and whispery, and his physical movements deliberately lack precision; the effect is of a man so accustomed to power that he no longer needs to remind others. Brando does look the part of old Don Corleone, mostly because of acting and partly because of the makeup, although he seems to have stuffed a little too much cotton into his jowls, making his lower face immobile.


In [33]:
ebert_sample_2 = """Those who have read the novel may be surprised to find Michael at the center of the movie, instead of Don Corleone. In fact, this is simply an economical way for Coppola to get at the heart of the Puzo story, which dealt with the transfer of power within the family. Marlon Brando, who plays the Godfather as a shrewd, unbreakable old man, actually has the character lead in the movie; Al Pacino, with a brilliantly developed performance as Michael, is the lead.
But Brando’s performance is a skillful throwaway, even though it earned him an Academy Award for best actor. His voice is wheezy and whispery, and his physical movements deliberately lack precision; the effect is of a man so accustomed to power that he no longer needs to remind others. Brando does look the part of old Don Corleone, mostly because of acting and partly because of the makeup, although he seems to have stuffed a little too much cotton into his jowls, making his lower face immobile."""
score_summary(ebert_sample_2, 'Brando')

person_score avg_similarity readability max_diff
0.42857142857142855 0.6038022790899997 0.4396381648179288 0.0
0.3680029681198393


In [34]:
ebert_sample_3 = """It's pale and unconvincing compared with the chemistry between Cruise and Rebecca De Mornay in "Risky Business," and between McGillis and Harrison Ford in "Witness" - not to mention between Richard Gere and Debra Winger in "An Officer and a Gentleman," which obviously inspired "Top Gun." Cruise and McGillis spend a lot of time squinting uneasily at each other and exchanging words as if they were weapons, and when they finally get physical, they look like the stars of one of those sexy new perfume ads. There's no flesh and blood here, which is remarkable, given the almost palpable physical presence McGillis had in "Witness." In its other scenes on the ground, the movie seems content to recycle old cliches and conventions out of countless other war movies.
Wouldn't you know, for example, that Maverick's commanding officer at the flying school is the only man who knows what happened to the kid's father in Vietnam? And are we surprised when Maverick's best friend dies in his arms? Is there any suspense as Maverick undergoes his obligatory crisis of conscience, wondering whether he can ever fly again? Movies like "Top Gun" are hard to review because the good parts are so good and the bad parts are so relentless. The dogfights are absolutely the best since Clint Eastwood's electrifying aerial scenes in "Firefox." But look out for the scenes where the people talk to one another."""
score_summary(ebert_sample_3, 'Cruise')

person_score avg_similarity readability max_diff
0.18181818181818182 0.5703697282889971 0.2886254112533559 0.4
0.36020333034013374


In [35]:
ebert_sample_4 = """His performance broke through some kind of psychic barrier, freeing actors of his and later generations to tap emotions that most earlier actors were unable or willing to reveal. It was said that his style was fashioned by the famed acting teacher Stella Adler, but perhaps he possessed it all along, and Adler simply recognized and encouraged it. It took her only a week of coaching Brando, recalls the AP's Bob Thomas, before she said that within a year he would be the best young actor in America.
Much is made of the generation that followed, the Method actors, and although there is much theory and lore associated with the Method, to some degree it consisted of a lot of actors trying to do what Brando did. Paul Newman, Montgomery Clift, James Dean, Robert De Niro, Jack Nicholson, Al Pacino, Sean Penn and Johnny Depp all owe something to Brando. And a performance like Charlize Ther-on's in "Monster" (2003) is almost literally made possible by the avenues that Brando opened.
A recent Premiere magazine poll named Brando's Don Corleone, from Francis Coppola's "The Godfather" (1972) as the single most memorable character in movie history. "The Godfather" is at or near the top of many lists of the greatest films, and Brando's masterpieces also include "A Streetcar Named Desire" (1951), "On the Waterfront" (1954), "Apocalypse Now" (1979) and "Last Tango in Paris" (1972). In those films, he was fearless, exposing his psyche in "Last Tango" in a famous sex scene that no other major actor might have dared.
"Apocalypse Now" traded on his mystique by keeping him offscreen until the closing act of the film; he was the enigmatic Col. Kurtz, brilliant, crazy, holed up in the Vietnamese jungle, running his own operation. Even when he appeared, he was seen mostly in shadow, speaking of the horrors of war in a way that transcended the movie and the character."""
score_summary(ebert_sample_4, 'Brando')

person_score avg_similarity readability max_diff
0.46153846153846156 0.5273591221373326 0.5200207438047236 0.5
0.5022295818701294


# Rebuild the Parts of Coh-Metrix that are Relevant

I couldn't find a good port of CohMetrix for English so I picked the relevant parts and elected to implement them myself.

* Referential Cohesion
    * Noun Overlap
    * Argument Overlap
    * Stem Overlap
    * Content Word Overlap
    * Anaphor Overlap
* Connectives
    * Incidence score of
        * Causal
        * Logical
        * Adversative/Contrastive
        * Temporal
        * Additive
        
I'm going to focus on the Anaphor Overlap and connectives since much of the rest should be caught in the LSA similiarity scoring. I'll use a lexicon for connectives.

In [17]:
def anaphor_overlap(summary_text):
    """This function computes the anaphor overlap index
    
    Per the Coh-Metrix Documentation:
        
        This measure considers the anphor overlap between pairs of 
        sentences. A pair of sentences has an anphor overlap if the 
        later sentence contains a pronoun that refers to a pronoun 
        or noun in the earlier sentence. The score for each pair of 
        sentences is binary, i.e., 0 or 1. The measure of the text 
        is the average of the pair scores.
    
    Args:
        summary_text (String): The text to be scored
        
    Return:
        Float - the anaphor overlap score
    """
    
    # Analyze Text w/ Spacy
    annotated_text = NLP(summary_text)
    sentences = list(annotated_text.sents)
    
    # Iterate through sentences
    num_anaphors = 0
    for i in range(1, len(sentences)):
        if sentences[i - 1].text not in [' ', '', '\n'] and sentences[i].text not in [' ', '', '\n']:
            # check if prounoun in i and either noun or matching pronoun in i-1
            
            # First, check if there is a person in i-1
            person_flag = False
            for word in sentences[i-1]:
                if word.ent_type_ == 'PERSON':
                    person_flag = True
            
            # Next, check if there is a pronoun in i-1
            pronouns = ['he', 'she', 'they', 'him', 'her', 'them', 'his', 'hers', 'theirs']
            pronoun_flag = any(j in sentences[i-1].text.lower() for j in pronouns)
            
            # Check if there is a pronoun in i
            pronoun_flag_2 = any(j in sentences[i].text.lower() for j in pronouns)
            
            if (pronoun_flag_2 and pronoun_flag) or (pronoun_flag_2 and person_flag):
                num_anaphors += 1
                
    return num_anaphors / (len(sentences) - 1)
                    
            
def person_overlap(summary_text):
    """This function computes the person overlap index
    
    Roughly based on the Noun Overlap index from Coh-Metrix
    
    Args:
        summary_text (String): The text to be scored
        
    Return:
        Float - the person overlap score
    """
    
    # Analyze Text w/ Spacy
    annotated_text = NLP(summary_text)
    sentences = list(annotated_text.sents)
    
    # Iterate through sentences
    num_anaphors = 0
    for i in range(1, len(sentences)):
        if sentences[i - 1].text not in [' ', '', '\n'] and sentences[i].text not in [' ', '', '\n']:
            # check if name in i and in i-1
            people_match = False
            
            # First, check if there is a person in i-1
            people = []
            for word in sentences[i-1]:
                if word.ent_type_ == 'PERSON':
                    people.append(word.text.lower())
            
            for word in sentences[i]:
                if word.ent_type_ == 'Person':
                    if word.text.lower() in people:
                        people_match = True
                        
            if people_match:
                num_anaphors += 1
                        
                
    return num_anaphors / (len(sentences) - 1)           
  
    
    

In [18]:
test_sentences = "Jerry Seinfeld starred in the show for like seven seasons. He was wildly successful from it."
print(anaphor_overlap(test_sentences))
print(person_overlap(test_sentences))

1.0
0.0


In [19]:
EBERT_READABILITY = 9.782916286401726
def score_summary_2(summary_text, desired_person):
    """Score a summarized piece of text
    """
    # Want high similarity between paragraphs
    inter_paragraph_similarities = []
    avg_similarity = None
    
    sentences = [i.text for i in NLP(summary_text).sents]
    
    # readability measures close to ebert baseline
    readability = abs(text_stats.TextStats(NLP(summary_text)).automated_readability_index - EBERT_READABILITY)/EBERT_READABILITY
    
    
    # Coh Metrix Indices
    anaphor_score = anaphor_overlap(summary_text)
    person_score = person_overlap(summary_text)
    
    
    # more subjective is better
    total_subjectivity = 0
    for i in sentences:
        total_subjectivity += TextBlob(i).sentiment[1]
    subjectivity = total_subjectivity/len(sentences)
    
    # thesis sentence doesn't have "this", "here", "it"
    if sentences[0] not in [' ', '', '\n']:
        thesis_penalty = sum(i in sentences[0] for i in [" this ", " This ", " here ", " Here"])
    elif sentences[1] not in [' ', '', '\n']:
        thesis_penalty = sum(i in sentences[1] for i in [" this ", " This ", " here ", " Here"])
    else:
        thesis_penalty = 0
    
    # Prefer expressions from the author
    author_count = 0
    for s in sentences:
        if any(i in s for i in ["I ", "I'd", "My"]):
            author_count += 1
    
    # iterate through the paragraphs
    # sentiment within a paragraph is similar
    paragraphs = summary_text.split('\n')
    for i in range(1, len(paragraphs)):
        if paragraphs[i - 1] not in [' ', '', '\n'] and paragraphs[i] not in [' ', '', '\n']:
            inter_paragraph_similarities.append(similarity.word_movers(NLP(paragraphs[i - 1]), NLP(paragraphs[i])))
            
    max_diff = 0
    for p in paragraphs:
        p_sent_min = None
        p_sent_max = None    
        for s in p.split('.'):
            sent = TextBlob(s).sentiment[0]
            if p_sent_min is None:
                p_sent_min = sent
            if p_sent_max is None:
                p_sent_max = sent
                
            if sent < p_sent_min:
                p_sent_min = sent
            if sent > p_sent_max:
                p_sent_max = sent
        if max_diff < abs(p_sent_max - p_sent_min):
            max_diff = abs(p_sent_max - p_sent_min)
    max_diff = 1 - max_diff 
    avg_similarity = sum(inter_paragraph_similarities)/len(inter_paragraph_similarities)
    
    
    
    # Make score
    score = (0.25 * avg_similarity) + \
            (0.20 * person_score) + \
            (0.15 * anaphor_score) + \
            (0.1 * max_diff) + \
            (0.05 * readability) + \
            (0.25 * subjectivity) 
    # boost by person count
    score = score * (1 + (0.1 * author_count))
    score = score - (0.2 * thesis_penalty)
    
    
    return score

In [83]:
ebert_sample_4 = """His performance broke through some kind of psychic barrier, freeing actors of his and later generations to tap emotions that most earlier actors were unable or willing to reveal. It was said that his style was fashioned by the famed acting teacher Stella Adler, but perhaps he possessed it all along, and Adler simply recognized and encouraged it. It took her only a week of coaching Brando, recalls the AP's Bob Thomas, before she said that within a year he would be the best young actor in America.
Much is made of the generation that followed, the Method actors, and although there is much theory and lore associated with the Method, to some degree it consisted of a lot of actors trying to do what Brando did. Paul Newman, Montgomery Clift, James Dean, Robert De Niro, Jack Nicholson, Al Pacino, Sean Penn and Johnny Depp all owe something to Brando. And a performance like Charlize Ther-on's in "Monster" (2003) is almost literally made possible by the avenues that Brando opened.
A recent Premiere magazine poll named Brando's Don Corleone, from Francis Coppola's "The Godfather" (1972) as the single most memorable character in movie history. "The Godfather" is at or near the top of many lists of the greatest films, and Brando's masterpieces also include "A Streetcar Named Desire" (1951), "On the Waterfront" (1954), "Apocalypse Now" (1979) and "Last Tango in Paris" (1972). In those films, he was fearless, exposing his psyche in "Last Tango" in a famous sex scene that no other major actor might have dared.
"Apocalypse Now" traded on his mystique by keeping him offscreen until the closing act of the film; he was the enigmatic Col. Kurtz, brilliant, crazy, holed up in the Vietnamese jungle, running his own operation. Even when he appeared, he was seen mostly in shadow, speaking of the horrors of war in a way that transcended the movie and the character."""
score_summary_2(ebert_sample_4, 'Brando')

0.46817376024501184

In [84]:
ebert_sample_4 = """Brando's performances broke through some kind of psychic barrier, freeing actors of his and later generations to tap emotions that most earlier actors were unable or willing to reveal. It was said that his style was fashioned by the famed acting teacher Stella Adler, but perhaps he possessed it all along, and Adler simply recognized and encouraged it. It took her only a week of coaching Brando, recalls the AP's Bob Thomas, before she said that within a year he would be the best young actor in America.
Much is made of the generation that followed, the Method actors, and although there is much theory and lore associated with the Method, to some degree it consisted of a lot of actors trying to do what Brando did. Paul Newman, Montgomery Clift, James Dean, Robert De Niro, Jack Nicholson, Al Pacino, Sean Penn and Johnny Depp all owe something to Brando. And a performance like Charlize Ther-on's in "Monster" (2003) is almost literally made possible by the avenues that Brando opened.
A recent Premiere magazine poll named Brando's Don Corleone, from Francis Coppola's "The Godfather" (1972) as the single most memorable character in movie history. "The Godfather" is at or near the top of many lists of the greatest films, and Brando's masterpieces also include "A Streetcar Named Desire" (1951), "On the Waterfront" (1954), "Apocalypse Now" (1979) and "Last Tango in Paris" (1972). In those films, he was fearless, exposing his psyche in "Last Tango" in a famous sex scene that no other major actor might have dared.
"Apocalypse Now" traded on his mystique by keeping him offscreen until the closing act of the film; he was the enigmatic Col. Kurtz, brilliant, crazy, holed up in the Vietnamese jungle, running his own operation. Even when he appeared, he was seen mostly in shadow, speaking of the horrors of war in a way that transcended the movie and the character."""
score_summary_2(ebert_sample_4, 'Brando')

0.4752293660917666

In [85]:
ebert_sample_3 = """It's pale and unconvincing compared with the chemistry between Cruise and Rebecca De Mornay in "Risky Business," and between McGillis and Harrison Ford in "Witness" - not to mention between Richard Gere and Debra Winger in "An Officer and a Gentleman," which obviously inspired "Top Gun." Cruise and McGillis spend a lot of time squinting uneasily at each other and exchanging words as if they were weapons, and when they finally get physical, they look like the stars of one of those sexy new perfume ads. There's no flesh and blood here, which is remarkable, given the almost palpable physical presence McGillis had in "Witness." In its other scenes on the ground, the movie seems content to recycle old cliches and conventions out of countless other war movies.
Wouldn't you know, for example, that Maverick's commanding officer at the flying school is the only man who knows what happened to the kid's father in Vietnam? And are we surprised when Maverick's best friend dies in his arms? Is there any suspense as Maverick undergoes his obligatory crisis of conscience, wondering whether he can ever fly again? Movies like "Top Gun" are hard to review because the good parts are so good and the bad parts are so relentless. The dogfights are absolutely the best since Clint Eastwood's electrifying aerial scenes in "Firefox." But look out for the scenes where the people talk to one another."""
score_summary_2(ebert_sample_3, 'Cruise')

0.48443035847907295

In [86]:
results_3 = like_person_3('ebert', 'Streep')
print(results_3)
print(score_summary_2(results_3, 'Streep'))

The proof of it is that, on the basis of She-Devil, Streep didn't have to retire to her own dressing room to ask herself what she was doing in a movie with Barr. It's a role that calls out for broad, fearless interpretation, and Streep has a lot of fun with it. And that sense of realized revenge is an undercurrent throughout "She-Devil," which works both on a fictional level and as a real-life demonstration that Barr and Streep are indeed right there in the same movie. If Barr is correctly cast, so is Streep, who has always had a rich vein of comedy bubbling through her personal life - few people are merrier during interviews - but who has dedicated her career to playing serious or even tragic women, most of them with accents. Streep, as Mary Fisher, has erected a glamorous fictional facade around the mundane actual facts of her life, and it is with grim precision that Barr's character pulls it to pieces.
However, And while it can be funny when a respectable lady gets stoned on pot, it

In [87]:
results_4 = like_person_4('ebert', 'Streep')
print(results_4)
print(score_summary_2(results_4, 'Streep'))

So Lisa continues the sessions, and perhaps only Streep could produce such gradations of facial expressions as her client describes her son's lovemaking, his opinion of his mother, and admirable details of his physique. And Streep has that ability to cut through the solemnity of a scene with a zinger that reveals how all human effort is, after all, comic at some level: How amusing, to think we can control fate! Meryl Streep and Uma Thurman have line readings that work as delicate and precise adjustments of dangerous situations.
Streep was so much more convincing in the somewhat similar role of Karen Silkwood. "Erin Brockovich" is "Silkwood" (Meryl Streep fighting nuclear wastes) crossed with "A Civil Action" (John Travolta against pollution) plus Julia Roberts in a plunging neckline.
The proof of it is that, on the basis of She-Devil, Streep didn't have to retire to her own dressing room to ask herself what she was doing in a movie with Barr. It's a role that calls out for broad, fearl

In [88]:
ebert_sample_2 = """Those who have read the novel may be surprised to find Michael at the center of the movie, instead of Don Corleone. In fact, this is simply an economical way for Coppola to get at the heart of the Puzo story, which dealt with the transfer of power within the family. Marlon Brando, who plays the Godfather as a shrewd, unbreakable old man, actually has the character lead in the movie; Al Pacino, with a brilliantly developed performance as Michael, is the lead.
But Brando’s performance is a skillful throwaway, even though it earned him an Academy Award for best actor. His voice is wheezy and whispery, and his physical movements deliberately lack precision; the effect is of a man so accustomed to power that he no longer needs to remind others. Brando does look the part of old Don Corleone, mostly because of acting and partly because of the makeup, although he seems to have stuffed a little too much cotton into his jowls, making his lower face immobile."""
score_summary_2(ebert_sample_2, 'Brando')

0.4197181922991106

In [90]:
nick_sample_1 = """Joel McHale has a particular kind of charisma that makes him perfect for a show like community, and stick out in a show like his new one. His overwhelming screen presence and self-aware delivery absolutely make the characater of Jeff Winger. But, in a more traditional sitcom -- laugh track or studio audience and all -- it just pulls you out of it.
I must say though, despite his limited range, I loved him in Community. His character is designed to showcase the sharp monologue delivery that carried him for most of his early career such as the host of The Soup. And, it makes every episode feel like the end of some old Bill Murray movie. He can even plausibly play the action hero, when the paintball homage calls for it.
I wouldn't cast McHale in just anything; but he's incredibly watchable in the right setting."""

score_summary_2(nick_sample_1, 'McHale')

0.5029025510961714

In [91]:
results_3 = like_person_3('ebert', 'Nicholson')
print(results_3)
print(score_summary_2(results_3, 'Nicholson'))

Nicholson is very good with the material (some of his line readings are balancing acts of the savage and the sublime), but this material can only take him, and us, so far. "Wolf" stars Jack Nicholson as a top editor for a New York publishing house, who is bitten by a wolf and begins to turn into a werewolf, just as a billionaire tycoon buys the company and replaces Nicholson with a back-stabbing yuppie. His replacement, a traitor Nicholson thought was his friend, is the polished young hypocrite Stewart Swinton (James Spader, playing what can only be called the James Spader role, and playing it very nicely, too). And more, because Nicholson and his director, Mike Nichols, are halfway serious about exploring what might happen if a New York book editor did become a werewolf. This scenario doesn't develop as office politics as usual, however, because of the strange experience Nicholson had a few nights earlier in Vermont, where he was bitten by a wolf.
It may seem surprising to say that Ba

In [92]:
results_3 = like_person_4('ebert', 'Nicholson')
print(results_3)
print(score_summary_2(results_3, 'Nicholson'))

Too bad, because in their best scenes together, Nicholson and Mohr achieve a kind of intimacy and immediate truth that is hard to find, and a shame to waste. They aren't sitcom types, but solid, loyal, comfortable, smart people; Alice reminds me a little of Nicholson's great performance in "Tully" (2000) as a veterinary student who knows who she wants to marry and captures her prey with infinite subtlety and tenderness.
"Easy Rider" proved in 1969 that Jack Nicholson was a great character actor. "Five Easy Pieces" proved in 1970 that he was a great actor and a star. Nicholson was not Five Easy Pieces's only discovery.
Historic moments in the cinema are not always this easy to identify: Nicholson had been in movies for years, but his jailhouse dialogue in "Easy Rider" instantly made him a star. In scenes like the one where Hopper and Fonda teach Nicholson how to inhale, there's a quietly approving air, as if life is a treatable disease, and pot is the cure. Easy Rider comes alive with t

# Topic Modeling

Use topic modeling to try and make better paragraphs

In [20]:
c = Corpus('ebert')
all_articles = c.slurp_articles()

In [28]:
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
    (textacy.Doc(doc, lang='en').to_terms_list(ngrams=1, named_entities=True, as_strings=True)
     for doc in all_articles),
    weighting='tfidf', normalize=True, smooth_idf=True, min_df=4, max_df=0.90)

In [29]:
all_articles[0]

'1. A robot may not injure a human being or, through inaction, allow a human being to come to harm.\n2. A robot must obey orders given it by human beings, except where such orders would conflict with the First Law.\n3. A robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\n--Isaac Asimov\'s I, Robot\n\'I, Robot" takes place in Chicago circa 2035, a city where spectacular new skyscrapers share the skyline with landmarks like the Sears (but not the Trump) Tower. The tallest of the buildings belongs to U.S. Robotics, and on the floor of its atrium lobby lies the dead body of its chief robot designer, apparently a suicide.\nDet. Del Spooner is on the case. Will Smith plays Spooner, a Chicago Police Department detective who doesn\'t think it\'s suicide. He has a deep-seated mistrust of robots, despite the famous Three Laws of Robotics, which declare above all that a robot must not harm a human being.\nThe dead man is Dr. Alfred Lan

In [30]:
model = textacy.tm.TopicModel('lda', n_topics=12)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)
doc_topic_matrix.shape



(7833, 12)

In [31]:
import numpy as np

In [32]:
#model = textacy.tm.TopicModel('lda', n_topics=30)

#model.fit(doc_term_matrix)
#doc_topic_matrix = model.transform(doc_term_matrix)

top_term_matrix = np.real(model.model.components_)
doc_lengths = [len(d) for d in all_articles]
vocab = list(id2term.values())
term_frequency = textacy.vsm.get_term_freqs(doc_term_matrix)
term_frequency = np.real(term_frequency)

import pyLDAvis

vis_data = pyLDAvis.prepare(
    top_term_matrix,
    doc_topic_matrix,
    doc_lengths,
    vocab,
    term_frequency,
)

In [33]:
top_term_matrix[0]

array([ 0.08333349,  0.08333338,  0.08333338, ...,  0.08333339,
        0.08333338,  0.08333339])

In [34]:
pyLDAvis.display(vis_data)

In [97]:
#model.topic_weights(doc_topic_matrix)

In [96]:
#model.termite_plot(doc_term_matrix, id2term,
#                   topics=-1,  n_terms=109, sort_terms_by='seriation')

In [95]:
#%matplotlib inline

In [94]:
#for topic_idx, top_terms in model.top_topic_terms(id2term, top_n=20):
#    print('topic', topic_idx, ':', ', '.join(top_terms))

In [58]:

model = textacy.tm.TopicModel('lda', n_topics=3)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)
doc_topic_matrix.shape

top_term_matrix = np.real(model.model.components_)
doc_lengths = [len(d) for d in all_articles]
vocab = list(id2term.values())
term_frequency = textacy.vsm.get_term_freqs(doc_term_matrix)
term_frequency = np.real(term_frequency)





In [59]:
vis_data = pyLDAvis.prepare(
    np.real(top_term_matrix),
    np.real(doc_topic_matrix),
    np.real(doc_lengths),
    vocab,
    np.real(term_frequency),
)

In [60]:
pyLDAvis.display(vis_data)

In [None]:
def like_person_wrapper(corpus, name):
    """This function evaluates a few handlers and selects the response with teh best fit score
    
    Args:
        corpus - the name of the corpus , either 'ebert', or 'jones'
        name - the name of the person
        
    Return:
        STring - a few paragraphs about the person
        
    """
    results = []
    for f in [like_person, like_person_2, like_person_3, like_person_4]:
        results.append(f(corpus, name))
        
    max_score = 0
    result = None
    for r in results:
        s = score_summary_2(r)
        if s >= max_score:
            max_score = s
            result = r
            
    return result
        
