### NLP - Homework 5
#### Paritosh Rai¶

In [3]:
# ref: https://shravan-kuchkula.github.io/scrape_imdb_movie_reviews/#analysis

import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import itertools
import nltk
from nltk import word_tokenize
import spacy # import spacy
import spacy
#load en_core_web_sm
nlp = spacy.load('en_core_web_sm')

pd.options.display.max_colwidth=500

In [4]:
def getSoup(url):
    """
    Utility function which takes a url and returns a Soup object.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    return soup

In [5]:
# API call to select:
## feature films
## which are rated atleast 4.0
## having atleast 50,000 votes
## in the romance genre
## sorted by user rating
## limit to 150 movies
url = '''https://www.imdb.com/search/title/?title_type=feature&user_rating=4.0,10.0
&num_votes=50000,&genres=romance&view=simple&sort=user_rating,desc&count=150'''

# get the soup from url using 'getSoup' function
movies_soup = getSoup(url)
#print(movies_soup)

In [6]:
# Find all a-tag with class:None

In [7]:
movie_tags = movies_soup.find_all('a', attrs={'class': None})

#print(movie_tags)

# filter the a-tags to get just the titles
movie_tags = [tag.attrs['href'] for tag in movie_tags 
              if tag.attrs['href'].startswith('/title') & tag.attrs['href'].endswith('/')]
#print(movie_tags)

# remove duplicate links
movie_tags = list(dict.fromkeys(movie_tags))
#print(movie_tags)

print("There are a total of " + str(len(movie_tags)) + " movie titles")
print("Displaying 10 titles")

movie_tags[:10]

There are a total of 150 movie titles
Displaying 10 titles


['/title/tt0109830/',
 '/title/tt0118799/',
 '/title/tt0095765/',
 '/title/tt0034583/',
 '/title/tt0027977/',
 '/title/tt0021749/',
 '/title/tt5311514/',
 '/title/tt0910970/',
 '/title/tt0338013/',
 '/title/tt0211915/']

In [8]:
# movie links
# use base_url+tag+review
base_url = "https://www.imdb.com"
movie_links = [base_url + tag + 'reviews' for tag in movie_tags]
print("There are a total of " + str(len(movie_links)) + " movie user reviews")
print("Displaying 10 user reviews links")
movie_links[:10]

There are a total of 150 movie user reviews
Displaying 10 user reviews links


['https://www.imdb.com/title/tt0109830/reviews',
 'https://www.imdb.com/title/tt0118799/reviews',
 'https://www.imdb.com/title/tt0095765/reviews',
 'https://www.imdb.com/title/tt0034583/reviews',
 'https://www.imdb.com/title/tt0027977/reviews',
 'https://www.imdb.com/title/tt0021749/reviews',
 'https://www.imdb.com/title/tt5311514/reviews',
 'https://www.imdb.com/title/tt0910970/reviews',
 'https://www.imdb.com/title/tt0338013/reviews',
 'https://www.imdb.com/title/tt0211915/reviews']

In [9]:
# function to find positive and negative review
def minMax(a):
    '''Returns the index of negative and positive review.'''
    
    # get the index of least rated user review
    minpos = a.index(min(a))
    
    # get the index of highest rated user review
    maxpos = a.index(max(a))
    
    return minpos, maxpos


In [10]:
def getReviews(soup):
    '''Function returns a negative and positive review for each movie.'''
    
    # get a list of user ratings
    user_review_ratings = [tag.previous_element for tag in 
                           soup.find_all('span', attrs={'class': 'point-scale'})]
    
    
    # find the index of negative and positive review
    n_index, p_index = minMax(list(map(int, user_review_ratings)))
    
    
    # get the review tags
    user_review_list = soup.find_all('a', attrs={'class':'title'})
    
    
    # get the negative and positive review tags
    n_review_tag = user_review_list[n_index]
    p_review_tag = user_review_list[p_index]
    
    # return the negative and positive review link
    n_review_link = "https://www.imdb.com" + n_review_tag['href']
    p_review_link = "https://www.imdb.com" + p_review_tag['href']
    
    return n_review_link, p_review_link

In [11]:
def getReviewText(review_url):
    '''Returns the user review text given the review url.'''
    
    # get the review_url's soup
    soup = getSoup(review_url)
    
    # find div tags with class text show-more__control
    tag = soup.find('div', attrs={'class': 'text show-more__control'})
    
    return tag.getText()

In [12]:
def getMovieTitle(review_url):
    '''Returns the movie title from the review url.'''
    
    # get the review_url's soup
    soup = getSoup(review_url)
    
    # find h1 tag
    tag = soup.find('h1')
    
    return list(tag.children)[1].getText()



In [13]:
def getNounChunks(user_review):
    
    # create the doc object
    doc = nlp(user_review)
    
    # get a list of noun_chunks
    noun_chunks = list(doc.noun_chunks)
    
    # convert noun_chunks from span objects to strings, otherwise it won't pickle
    noun_chunks_strlist = [chunk.text for chunk in noun_chunks]
    
    return noun_chunks_strlist

In [14]:
# get a list of soup objects
movie_soups = [getSoup(link) for link in movie_links]

# get all 150 movie review links
movie_review_list = [getReviews(movie_soup) for movie_soup in movie_soups]

movie_review_list = list(itertools.chain(*movie_review_list))
print(len(movie_review_list))

print("There are a total of " + str(len(movie_review_list)) + " individual movie reviews")
print("Displaying 10 reviews")
movie_review_list[:10]

300
There are a total of 300 individual movie reviews
Displaying 10 reviews


['https://www.imdb.com/review/rw3472550/',
 'https://www.imdb.com/review/rw1088679/',
 'https://www.imdb.com/review/rw0409038/',
 'https://www.imdb.com/review/rw0409131/',
 'https://www.imdb.com/review/rw1222045/',
 'https://www.imdb.com/review/rw1024609/',
 'https://www.imdb.com/review/rw4921675/',
 'https://www.imdb.com/review/rw0026277/',
 'https://www.imdb.com/review/rw0015304/',
 'https://www.imdb.com/review/rw3431894/']

### Construct a dataframe: 
#### Create, a dataframe is constructed using these results.

In [15]:
# get review text from the review link
review_texts = [getReviewText(url) for url in movie_review_list]

# get movie name from the review link
movie_titles = [getMovieTitle(url) for url in movie_review_list]

# label each review with negative or positive
review_sentiment = np.array(['negative', 'positive'] * (len(movie_review_list)//2))

# construct a dataframe
df = pd.DataFrame({'movie': movie_titles, 'user_review_permalink': movie_review_list,
             'user_review': review_texts, 'sentiment': review_sentiment})

In [16]:
df.head()

Unnamed: 0,movie,user_review_permalink,user_review,sentiment
0,Forrest Gump,https://www.imdb.com/review/rw3472550/,"I remember John Byner, the stand-up comic and impressionist of the 1970s talking about guys crying at movies, how it's not an acceptable behavior. He advised the men in his audience to drop their keys, do something that gets them to lean forward, wipe their faces, and get things under control.I dropped my keys watching Forrest Gump. Lieutenant Dan comes over the hill at Forrest and Jenny's wedding, new legs, fiancé at his side, clean-cut and happy.Forrest states the obvious, ""Lieutenant Dan,...",negative
1,Forrest Gump,https://www.imdb.com/review/rw1088679/,When I first saw this movie I didn't appreciate it like I do now. I think it may have been because I was so young when I first saw it. Just recently I saw the movie again. What an amazing story and moving meaning. That movie teaches you so much about life and the meaning of it. That life isn't as bad as most people make it seem. That an innocent man can impact so many lives with his innocence. The meaning of the movie to me is that everyone needs to have a better outlook on life. That we nee...,positive
2,Life Is Beautiful,https://www.imdb.com/review/rw0409038/,"I typically don't care much for sub-titled movies. Foreign films to me are slow and about nothing. So I naturally went to see Life is Beautiful with a chip on my shoulder. I wanted to hate it. I wanted it to be just another movie that Hollywood was praising to show that they were an intelligent bunch of people. Oh boy was I in for the surprise of my life. Not only is this film good, I honestly believe that it was the second best film of 98, next to Saving Private Ryan. The mere fact t...",negative
3,Life Is Beautiful,https://www.imdb.com/review/rw0409131/,"I am surprised about the negative comments that some people made on this web-page. I can see how some people might not experience the same kind of uplift or joy that most of us lived through when seeing the movie. But that some viewers felt insulted and betrayed because the movie did not depict ""reality"" as it really was or is, is not fair.I must concur with my fellow proponents of the movie -- it was a great and very satisfying movie. It provided me with something that everyone in one or...",positive
4,Cinema Paradiso,https://www.imdb.com/review/rw1222045/,"A love story, pure and simple. Not, as one might suppose, the doomed young love between Salvatore and Elena (a love he fails to recapture), but the pure father/son love between the local cinema projectionist, Alfredo and the fatherless Salvatore. The essential beauty of this film-long relationship is brought home in the greatest final scene any film in history ever had (IMHO). This has the hairs on the back of my neck stand to attention every single time I've seen it.With the themes this fil...",negative


#### Q2 Extract noun phrase (NP) chunks from your reviews using the following procedure:
#### a.	In Python, use BeautifulSoup to grab the main review text from each link.  
#### b.	Next run each review text through a tokenizer, and then try to NP-chunk it with a shallow parser. 
#### c.	You probably will have too many unknown words, owing to proper names of characters, actors, and so on that are not in your working dictionary. Make sure the main names that are relevant to the movies in your collection of reviews are added to the working lexicon, and then run the NP chunker again.



In [17]:
# used function BeautifulSoup to grabthe review
# covert list to text
#review_texts
#r_tokens = nltk.word_tokenize(review_texts) # tokenize the review 
review_texts1 = [getReviewText(url) for url in movie_review_list]
import nltk # import nltk
import spacy
nlp=spacy.load('en_core_web_sm')
from nltk.tokenize import word_tokenize
#doc = nlp(print(review_texts1))
def listToString(s): 
    
    # initialize an empty string
    str1 = "" 
    
    # traverse in the string  
    for ele in s: 
        str1 += ele  
    
    # return string  
    return str1 
        

res=listToString(review_texts1)
#res = [sub.split() for sub in review_texts1]
#res
doc = nlp(res)

# show text
doc

I remember John Byner, the stand-up comic and impressionist of the 1970s talking about guys crying at movies, how it's not an acceptable behavior. He advised the men in his audience to drop their keys, do something that gets them to lean forward, wipe their faces, and get things under control.I dropped my keys watching Forrest Gump. Lieutenant Dan comes over the hill at Forrest and Jenny's wedding, new legs, fiancé at his side, clean-cut and happy.Forrest states the obvious, "Lieutenant Dan, you gawt le-eggs!"And the water-works just started to flow.I sit up straight and clear my throat. Got 'em (the keys, that is). My wife leans over and gives me a kiss. She says, "That's why I love you."Other than a few historical fussinesses and plot slickeries, none of which are worth mentioning, this is as close to a perfect, emotionally-satisfying entertainment as I have ever seen. I love this movie. I never tire of the simple story of the guy with the lowest IQ in the room being the smartest guy

In [18]:
#Create Token
tokens = [token.text for token in doc]
print(tokens)

['I', 'remember', 'John', 'Byner', ',', 'the', 'stand', '-', 'up', 'comic', 'and', 'impressionist', 'of', 'the', '1970s', 'talking', 'about', 'guys', 'crying', 'at', 'movies', ',', 'how', 'it', "'s", 'not', 'an', 'acceptable', 'behavior', '.', 'He', 'advised', 'the', 'men', 'in', 'his', 'audience', 'to', 'drop', 'their', 'keys', ',', 'do', 'something', 'that', 'gets', 'them', 'to', 'lean', 'forward', ',', 'wipe', 'their', 'faces', ',', 'and', 'get', 'things', 'under', 'control', '.', 'I', 'dropped', 'my', 'keys', 'watching', 'Forrest', 'Gump', '.', 'Lieutenant', 'Dan', 'comes', 'over', 'the', 'hill', 'at', 'Forrest', 'and', 'Jenny', "'s", 'wedding', ',', 'new', 'legs', ',', 'fiancé', 'at', 'his', 'side', ',', 'clean', '-', 'cut', 'and', 'happy', '.', 'Forrest', 'states', 'the', 'obvious', ',', '"', 'Lieutenant', 'Dan', ',', 'you', 'gawt', 'le', '-', 'eggs!"And', 'the', 'water', '-', 'works', 'just', 'started', 'to', 'flow', '.', 'I', 'sit', 'up', 'straight', 'and', 'clear', 'my', 'thro

In [19]:
# Create Part of Speach 
from nltk import pos_tag
from nltk import RegexpParser
tokens_tag = pos_tag(tokens)
tokens_tag

[('I', 'PRP'),
 ('remember', 'VBP'),
 ('John', 'NNP'),
 ('Byner', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('stand', 'NN'),
 ('-', ':'),
 ('up', 'RB'),
 ('comic', 'JJ'),
 ('and', 'CC'),
 ('impressionist', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('1970s', 'CD'),
 ('talking', 'VBG'),
 ('about', 'IN'),
 ('guys', 'NNS'),
 ('crying', 'VBG'),
 ('at', 'IN'),
 ('movies', 'NNS'),
 (',', ','),
 ('how', 'WRB'),
 ('it', 'PRP'),
 ("'s", 'VBZ'),
 ('not', 'RB'),
 ('an', 'DT'),
 ('acceptable', 'JJ'),
 ('behavior', 'NN'),
 ('.', '.'),
 ('He', 'PRP'),
 ('advised', 'VBD'),
 ('the', 'DT'),
 ('men', 'NNS'),
 ('in', 'IN'),
 ('his', 'PRP$'),
 ('audience', 'NN'),
 ('to', 'TO'),
 ('drop', 'VB'),
 ('their', 'PRP$'),
 ('keys', 'NNS'),
 (',', ','),
 ('do', 'VBP'),
 ('something', 'NN'),
 ('that', 'WDT'),
 ('gets', 'VBZ'),
 ('them', 'PRP'),
 ('to', 'TO'),
 ('lean', 'VB'),
 ('forward', 'RB'),
 (',', ','),
 ('wipe', 'VBP'),
 ('their', 'PRP$'),
 ('faces', 'VBZ'),
 (',', ','),
 ('and', 'CC'),
 ('get', 'VB'),
 ('things', 'N

In [20]:
import spacy
#nlp = spacy.load('en_core_web_sm')
#doc = nlp(doc)

phrases = set() 
for nc in doc.noun_chunks:
    phrases.add(nc.text)
    phrases.add(doc[nc.root.left_edge.i:nc.root.right_edge.i+1].text)
print(phrases)

{'the right time to review this film', 'a project', 'America', 'technical advances', 'violent crimes', 'a huge heart', "Grace's mother", 'so many interesting ideas', "a deeply-felt clinical study of class relations conditioning men's [*] behaviour and destiny in the...deterministic social system", 'the angel', 'an effective, funny, heart-warming and moving arrangement', 'the cogwheels', 'a lot of documents/short films and photo that were amazingly made by the Japanese soldiers themselves for showing off and promoting a sense of fears at that time', 'a Masterpiece of Indian Cinema', 'no positive outcome', 'the melodrama', 'just blatantly overlooked explanations', 'his large use', 'the greater plight', 'English man', 'the Marquis de Sade', 'Visas, permissions and the like', 'no Lion King, and no Disneyland', 'the transgressions', 'a montage', 'a universal film', 'the protest of our youth', 'Celine and Jesse', 'the lively intelligence', 'the nihilistic punk rock', 'the other 5%', 'the che

In [21]:
# Export these links to a csv file to help with further analysis.
# save the dataframe to a csv file.
df.to_csv('IMDb_userReviews.csv', index=False)

In [22]:
df['noun_chunks'] = df['user_review'].apply(getNounChunks)
df.head()

Unnamed: 0,movie,user_review_permalink,user_review,sentiment,noun_chunks
0,Forrest Gump,https://www.imdb.com/review/rw3472550/,"I remember John Byner, the stand-up comic and impressionist of the 1970s talking about guys crying at movies, how it's not an acceptable behavior. He advised the men in his audience to drop their keys, do something that gets them to lean forward, wipe their faces, and get things under control.I dropped my keys watching Forrest Gump. Lieutenant Dan comes over the hill at Forrest and Jenny's wedding, new legs, fiancé at his side, clean-cut and happy.Forrest states the obvious, ""Lieutenant Dan,...",negative,"[I, John Byner, the 1970s, guys, movies, it, an acceptable behavior, He, the men, his audience, their keys, something, them, their faces, things, control, I, my keys, Forrest Gump, Lieutenant Dan, the hill, Forrest and Jenny's wedding, new legs, his side, Forrest, you, the water-works, I, my throat, 'em, the keys, My wife, me, a kiss, She, I, you, ""Other, a few historical fussinesses, plot slickeries, none, a perfect, emotionally-satisfying entertainment, I, I, this movie, I, the simple stor..."
1,Forrest Gump,https://www.imdb.com/review/rw1088679/,When I first saw this movie I didn't appreciate it like I do now. I think it may have been because I was so young when I first saw it. Just recently I saw the movie again. What an amazing story and moving meaning. That movie teaches you so much about life and the meaning of it. That life isn't as bad as most people make it seem. That an innocent man can impact so many lives with his innocence. The meaning of the movie to me is that everyone needs to have a better outlook on life. That we nee...,positive,"[I, this movie, I, it, I, I, it, I, I, it, I, the movie, What an amazing story, meaning, That movie, you, life, the meaning, it, That life, most people, it, an innocent man, so many lives, his innocence, The meaning, the movie, me, everyone, a better outlook, life, we, the little things, the big things, us, life, us, trials, tribulations, a box, chocolates, we, it, it, we, it, we, our values, a place, no hope, I, this movie, anyone, who, it, who, it, I, it, it, It]"
2,Life Is Beautiful,https://www.imdb.com/review/rw0409038/,"I typically don't care much for sub-titled movies. Foreign films to me are slow and about nothing. So I naturally went to see Life is Beautiful with a chip on my shoulder. I wanted to hate it. I wanted it to be just another movie that Hollywood was praising to show that they were an intelligent bunch of people. Oh boy was I in for the surprise of my life. Not only is this film good, I honestly believe that it was the second best film of 98, next to Saving Private Ryan. The mere fact t...",negative,"[I, sub-titled movies, Foreign films, me, nothing, I, Life, a chip, my shoulder, I, it, I, it, just another movie, Hollywood, they, an intelligent bunch, people, I, the surprise, my life, this film, I, it, the second best film, Saving, Private Ryan, The mere fact, Shakespeare, Love, best picture, not only Ryan, a joke, It, Benigni, everything, this film, He, the floors, night, that much raw energy, this film, someone, that kind, vision, the film, a terrific cinematic experience, the importan..."
3,Life Is Beautiful,https://www.imdb.com/review/rw0409131/,"I am surprised about the negative comments that some people made on this web-page. I can see how some people might not experience the same kind of uplift or joy that most of us lived through when seeing the movie. But that some viewers felt insulted and betrayed because the movie did not depict ""reality"" as it really was or is, is not fair.I must concur with my fellow proponents of the movie -- it was a great and very satisfying movie. It provided me with something that everyone in one or...",positive,"[I, the negative comments, some people, this web-page, I, some people, the same kind, uplift, joy, us, the movie, some viewers, the movie, ""reality, it, I, my fellow proponents, the movie, it, a great and very satisfying movie, It, me, something, another shape, form needs, Hope, The movie, the cruelty, life, some light, the beauty, love, life, I, anyone, who, La Vita è bella, it, we, us, just little bit, that hope, this movie, this world, a much better place, you, I, you, the movie, I]"
4,Cinema Paradiso,https://www.imdb.com/review/rw1222045/,"A love story, pure and simple. Not, as one might suppose, the doomed young love between Salvatore and Elena (a love he fails to recapture), but the pure father/son love between the local cinema projectionist, Alfredo and the fatherless Salvatore. The essential beauty of this film-long relationship is brought home in the greatest final scene any film in history ever had (IMHO). This has the hairs on the back of my neck stand to attention every single time I've seen it.With the themes this fil...",negative,"[A love story, one, the doomed young love, Salvatore, Elena, a love, he, the pure father/son love, the local cinema projectionist, Alfredo, the fatherless, Salvatore, The essential beauty, this film-long relationship, the greatest final scene, any film, history, IMHO, the hairs, the back, my neck, attention, I, it, the themes, this film, it, a gloop, mawkishness sentimentality, it, sentiments, a difference, I, this film, I, still a record).The Theatrical version, me, the director's cut, it, ..."


**Make sure the main names that are relevant to the movies in your collection of reviews are added to the working lexicon, and then run the NP chunker again**

After the review of noun chunk it was found that there are not many chunks missing (if not all) from the review. So the nothing was added to lexicon and re-run was not required. 

#### Q3.	Output all the chunks in a single list for each review, and submit that output for this assignment. Also submit a brief written summary of what you did (describe your selection of genre, your source of reviews, how many you collected, and by what means).

In [23]:
df['noun_chunks']

0      [I, John Byner, the 1970s, guys, movies, it, an acceptable behavior, He, the men, his audience, their keys, something, them, their faces, things, control, I, my keys, Forrest Gump, Lieutenant Dan, the hill, Forrest and Jenny's wedding, new legs, his side, Forrest, you, the water-works, I, my throat, 'em, the keys, My wife, me, a kiss, She, I, you, "Other, a few historical fussinesses, plot slickeries, none, a perfect, emotionally-satisfying entertainment, I, I, this movie, I, the simple stor...
1                                    [I, this movie, I, it, I, I, it, I, I, it, I, the movie, What an amazing story, meaning, That movie, you, life, the meaning, it, That life, most people, it, an innocent man, so many lives, his innocence, The meaning, the movie, me, everyone, a better outlook, life, we, the little things, the big things, us, life, us, trials, tribulations, a box, chocolates, we, it, it, we, it, we, our values, a place, no hope, I, this movie, anyone, who, it, who, it, I

I have collected the data from IMDB.com. We collected 150 movies with at least 50K votes to ensure these are popular movies and have sufficient feedback samples, selected genre as romance to filer the content. 

The following steps were followed to pull the information:
* Listed the moved based on above criteria (150 moves, with more than 50K votes and Romance genre.
* Build python functions to get review text, get Movie Title, pick positive and negative reviews, get the text of the review, and pick noun chunks.
* Organized the data in a data frame for future analysis and processing with indexing, movie name, review link, review text, and positive and negative sentiments.

Data was tokenized and broken into PoS (Part of Speech) and later broken down into chunks of noun phrases.
Added the noun chunks to a data frame.

