<a href="https://colab.research.google.com/github/murkydata/python-deliberate-practice/blob/master/Utilityfunctions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:

import re
import pandas as pd
import numpy as np
import itertools
# from imdbUtils import *

import requests
from bs4 import BeautifulSoup


def getSoup(url):
    """
    Utility function which takes a url and returns a Soup object.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    return soup

def minMax(a):
    '''Returns the index of negative and positive review.'''
    
    # get the index of least rated user review
    minpos = a.index(min(a))
    
    # get the index of highest rated user review
    maxpos = a.index(max(a))
    
    return minpos, maxpos

def getReviews(soup):
    '''Function returns a negative and positive review for each movie.'''
    
    # get a list of user ratings
    user_review_ratings = [tag.previous_element for tag in 
                           soup.find_all('span', attrs={'class': 'point-scale'})]
    
    
    # find the index of negative and positive review
    n_index, p_index = minMax(list(map(int, user_review_ratings)))
    
    
    # get the review tags
    user_review_list = soup.find_all('a', attrs={'class':'title'})
    
    
    # get the negative and positive review tags
    n_review_tag = user_review_list[n_index]
    p_review_tag = user_review_list[p_index]
    
    # return the negative and positive review link
    n_review_link = "https://www.imdb.com" + n_review_tag['href']
    p_review_link = "https://www.imdb.com" + p_review_tag['href']
    
    return n_review_link, p_review_link

def getReviewText(review_url):
    '''Returns the user review text given the review url.'''
    
    # get the review_url's soup
    soup = getSoup(review_url)
    
    # find div tags with class text show-more__control
    tag = soup.find('div', attrs={'class': 'text show-more__control'})
    
    return tag.getText()

def getMovieTitle(review_url):
    '''Returns the movie title from the review url.'''
    
    # get the review_url's soup
    soup = getSoup(review_url)
    
    # find h1 tag
    tag = soup.find('h1')
    
    return list(tag.children)[1].getText()

def getNounChunks(user_review):
    
    # create the doc object
    doc = nlp(user_review)
    
    # get a list of noun_chunks
    noun_chunks = list(doc.noun_chunks)
    
    # convert noun_chunks from span objects to strings, otherwise it won't pickle
    noun_chunks_strlist = [chunk.text for chunk in noun_chunks]
    
    return noun_chunks_strlist




In [5]:
url = '''https://www.imdb.com/search/title/?title_type=feature&user_rating=4.0,10.
            0&num_votes=50000,&genres=romance&view=simple&sort=user_rating,desc&count=150'''

def getSoup(url):
    """
    Utility function which takes a url and returns a Soup object.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
   
    return soup

# get the soup object for main api url   
movies_soup = getSoup(url) 









In [6]:
# find all a-tags with class:None
movie_tags = movies_soup.find_all('a', attrs={'class': None})

# filter the a-tags to get just the titles
movie_tags = [tag.attrs['href'] for tag in movie_tags 
              if tag.attrs['href'].startswith('/title') & tag.attrs['href'].endswith('/')]

# remove duplicate links
movie_tags = list(dict.fromkeys(movie_tags))

# print("There are a total of " + str(len(movie_tags)) + " movie titles")
# print("Displaying 10 titles")
movie_tags[:10]

There are a total of 150 movie titles
Displaying 10 titles


['/title/tt0109830/',
 '/title/tt0118799/',
 '/title/tt0095765/',
 '/title/tt0034583/',
 '/title/tt0027977/',
 '/title/tt0021749/',
 '/title/tt5311514/',
 '/title/tt0338013/',
 '/title/tt0211915/',
 '/title/tt0119217/']

In [None]:
# movie links
base_url = "https://www.imdb.com"
movie_links = [base_url + tag + 'reviews' for tag in movie_tags]
print("There are a total of " + str(len(movie_links)) + " movie user reviews")
print("Displaying 10 user reviews links")
movie_links[:10]

There are a total of 150 movie user reviews
Displaying 10 user reviews links


['https://www.imdb.com/title/tt0109830/reviews',
 'https://www.imdb.com/title/tt0118799/reviews',
 'https://www.imdb.com/title/tt0095765/reviews',
 'https://www.imdb.com/title/tt0034583/reviews',
 'https://www.imdb.com/title/tt0027977/reviews',
 'https://www.imdb.com/title/tt0021749/reviews',
 'https://www.imdb.com/title/tt5311514/reviews',
 'https://www.imdb.com/title/tt0338013/reviews',
 'https://www.imdb.com/title/tt0211915/reviews',
 'https://www.imdb.com/title/tt0119217/reviews']

Step 4: For each of the movie reviews link, get a positive user review link and a negative movie review link.
Now that we have obtained the user reviews link for each of the 250 movies, our next task is to get the links for one positive and one negative user review link.

The function getReviews() returns a tuple of positive and negative user review links for each movie.

In [None]:
# get a list of soup objects
movie_soups = [getSoup(link) for link in movie_links]

# get all 500 movie review links
movie_review_list = [getReviews(movie_soup) for movie_soup in movie_soups]

movie_review_list = list(itertools.chain(*movie_review_list))
print(len(movie_review_list))

print("There are a total of " + str(len(movie_review_list)) + " individual movie reviews")
print("Displaying 10 reviews")
movie_review_list[:10]

300
There are a total of 300 individual movie reviews
Displaying 10 reviews


['https://www.imdb.com/review/rw3472550/',
 'https://www.imdb.com/review/rw1088679/',
 'https://www.imdb.com/review/rw1078552/',
 'https://www.imdb.com/review/rw0409131/',
 'https://www.imdb.com/review/rw2860182/',
 'https://www.imdb.com/review/rw1024609/',
 'https://www.imdb.com/review/rw5081531/',
 'https://www.imdb.com/review/rw0026277/',
 'https://www.imdb.com/review/rw0015336/',
 'https://www.imdb.com/review/rw3431894/']

In [None]:
# Construct a dataframe
# Finally, a dataframe is constructed using these results

In [None]:
# get review text from the review link
review_texts = [getReviewText(url) for url in movie_review_list]

# get movie name from the review link
movie_titles = [getMovieTitle(url) for url in movie_review_list]

# label each review with negative or positive
review_sentiment = np.array(['negative', 'positive'] * (len(movie_review_list)//2))

# construct a dataframe
df = pd.DataFrame({'movie': movie_titles, 'user_review_permalink': movie_review_list,
             'user_review': review_texts, 'sentiment': review_sentiment})

In [None]:
df.head()

Unnamed: 0,movie,user_review_permalink,user_review,sentiment
0,Forrest Gump,https://www.imdb.com/review/rw3472550/,"I remember John Byner, the stand-up comic and ...",negative
1,Forrest Gump,https://www.imdb.com/review/rw1088679/,When I first saw this movie I didn't appreciat...,positive
2,Life Is Beautiful,https://www.imdb.com/review/rw1078552/,There are a small handful of reviewers who are...,negative
3,Life Is Beautiful,https://www.imdb.com/review/rw0409131/,I am surprised about the negative comments tha...,positive
4,Cinema Paradiso,https://www.imdb.com/review/rw2860182/,Don't approach the movie with your logic part ...,negative


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
all_phrase =[]
for text in df.user_review:
  doc = nlp(text)
  all_phrase.append([chunk.text for chunk in doc.noun_chunks]) 


In [None]:
doc = nlp("I have a red car")
# doc.noun_chunks is a generator that yields spans
[chunk.text for chunk in doc.noun_chunks]
# ['I', 'a red car']
df['Tokens'] = all_phrase

## USING NLTK Library

In [None]:
import nltk
import pprint
import re
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
 
def preprocess(doc):
    sentences = nltk.sent_tokenize(doc)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
token_text = [ preprocess(text) for text in df.user_review ]


In [None]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import pandas as pd
nltk.download('maxent_ne_chunker')
nltk.download('words')

def get_continuous_chunks(text, chunk_func=ne_chunk):
    chunked = chunk_func(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if type(subtree) == Tree:
            current_chunk.append(" ".join([token for token, pos in subtree.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    return continuous_chunk



[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
NP_tokens = df['user_review'].apply(lambda sent:get_continuous_chunks((sent)))

In [None]:
df['NP Chunk'] = NP_tokens

In [None]:
df.sample(12)

Unnamed: 0,movie,user_review_permalink,user_review,sentiment,Tokens,NP Chunk
12,Your Name.,https://www.imdb.com/review/rw4180336/,This anime is just superb. It is one of those ...,negative,"[This anime, It, those moments, a rare absorbi...","[JJ Abrams, Hollywood, Kimi No Na Wa, Dallas T..."
14,Eternal Sunshine of the Spotless Mind,https://www.imdb.com/review/rw1020113/,Joel (Jim Carrey) is a rather milquetoast man ...,negative,"[Joel, Jim Carrey, a rather milquetoast man, w...","[Joel, Jim Carrey, Kate, Clementine, Lacuna, J..."
30,Wild Strawberries,https://www.imdb.com/review/rw1113924/,This is a film school movie; one of the greate...,negative,"[a film school movie, the experts, I, the play...","[English, Wild Strawberries, Wild Strawberries..."
78,Big Fish,https://www.imdb.com/review/rw1583793/,"Subtle, delicate, touching and fascinating ple...",negative,"[Subtle, delicate, touching and fascinating pl...","[Subtle, Tom Burton, Christamas Eve, Ewan Mc G..."
6,Casablanca,https://www.imdb.com/review/rw5081531/,I say a 'simple' story because I am writing th...,negative,"[I, a 'simple' story, I, this piece, almost 77...","[Bogart, Bergman]"
215,The Last of the Mohicans,https://www.imdb.com/review/rw1818234/,My all time favorite film. Still gives me chil...,positive,"[My all time favorite film, me, chills, It, th...","[Mohicans, French, Indian, Michael Mann, Frenc..."
155,Moonrise Kingdom,https://www.imdb.com/review/rw2614681/,Let's try to understand the miracle I have jus...,positive,"['s, the miracle, I, Director Wes Anderson, hi...","[Director Wes Anderson, Summer, Norton, Willis..."
241,Stardust,https://www.imdb.com/review/rw1708237/,Why are the previews so blah for a movie that ...,positive,"[the previews, so blah, a movie, Everyone, wha...","[Princess Bride, Labyrinth, Robert DeNiro, Cha..."
197,500 Days of Summer,https://www.imdb.com/review/rw6468527/,LiarsMisandristsHosMisanthropes and\nCheats,positive,"[LiarsMisandristsHosMisanthropes, Cheats]",[]
115,Amour,https://www.imdb.com/review/rw2705833/,What introduction could this film possibly req...,positive,"[What introduction, this film, Any film enthus...","[Haneke, Amour, Palme, Cannes, Haneke Amour, H..."


## Spacy