In [1]:
import pandas as pd
import numpy as np

In [2]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv('C:/Users/DELL/Downloads/ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('C:/Users/DELL/Downloads/ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [3]:
ratings_train.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [4]:
n_users = ratings_train.user_id.unique().shape[0]
n_items = ratings_train.movie_id.unique().shape[0]


max_user_id = max(ratings_train['user_id'])
max_item_id = max(ratings_train['movie_id'])

print(n_users, n_items)
print(max_user_id, max_item_id)

943 1680
943 1682


In [5]:
data_matrix = np.zeros((n_users, max_item_id)) 
for line in ratings_train.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [6]:
data_matrix.shape

(943, 1682)

In [7]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [8]:
print(user_similarity.shape)
print(item_similarity.shape)

(943, 943)
(1682, 1682)


In [9]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])   # a[:, np.newaxis] gives transpose of a
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        
    return pred

In [130]:
user_prediction = predict(data_matrix, user_similarity, type='user')

In [131]:
r_cols = [ 'movieName', 'release_date','movie_url','unknown','Action', 'Adventure', 'Animation','Children','Comedy','Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir' , 'Horror' , 'Musical' , 'Mystery', 'Romance' , 'Sci-Fi' ,'Thriller', 'War','Western']
item_info = pd.read_csv('C:/Users/DELL/Downloads/ml-100k/u.item',names=r_cols, sep='|', encoding='latin-1')

indexes = item_info.index

movie_names = [i[1] for i in indexes]
# movie_names

user_predictions_df = pd.DataFrame(user_prediction, columns = movie_names)
user_predictions_df = user_predictions_df.T

In [147]:
def get_recommendations(userID):
    preds = user_predictions_df[userID]
#     print(preds)
    new_df=user_predictions_df[[userID]].sort_values(by=[userID],ascending=False)
    print(new_df[userID][:31])

In [160]:
get_recommendations(11)

Star Wars (1977)                                2.065476
Fargo (1996)                                    1.785245
Return of the Jedi (1983)                       1.672772
Contact (1997)                                  1.554194
English Patient, The (1996)                     1.483289
Raiders of the Lost Ark (1981)                  1.459260
Toy Story (1995)                                1.432938
Godfather, The (1972)                           1.404607
Silence of the Lambs, The (1991)                1.368524
Scream (1996)                                   1.335206
Pulp Fiction (1994)                             1.303767
Air Force One (1997)                            1.265387
Empire Strikes Back, The (1980)                 1.258287
Twelve Monkeys (1995)                           1.246951
Independence Day (ID4) (1996)                   1.233318
Liar Liar (1997)                                1.212342
Titanic (1997)                                  1.163257
Jerry Maguire (1996)           

In [138]:
user_predictions_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
Toy Story (1995),1.813028,1.499090,1.517559,1.460529,1.572856,1.732210,2.200339,1.448526,1.404242,1.751797,...,1.673475,1.396558,1.636429,1.438332,1.484525,1.434085,1.545759,1.367278,1.544525,1.593531
GoldenEye (1995),0.706541,0.341086,0.293120,0.249340,0.440241,0.616495,1.069593,0.274415,0.232788,0.612770,...,0.524027,0.272588,0.506221,0.277765,0.393649,0.304736,0.383555,0.234737,0.368189,0.454778
Four Rooms (1995),0.616523,0.183207,0.150444,0.110153,0.349785,0.511480,0.998240,0.182507,0.100171,0.512360,...,0.437236,0.129504,0.341792,0.128674,0.242496,0.163207,0.270766,0.092060,0.256787,0.373044
Get Shorty (1995),0.973042,0.676493,0.634967,0.590413,0.732799,0.878450,1.320885,0.580882,0.559717,0.869702,...,0.799187,0.611804,0.837390,0.608963,0.728223,0.635464,0.671546,0.571196,0.680572,0.747626
Copycat (1995),0.613974,0.198972,0.152929,0.112402,0.344278,0.509771,0.978426,0.174326,0.100660,0.501255,...,0.429158,0.141421,0.369652,0.137424,0.261146,0.171236,0.268153,0.104276,0.252736,0.365368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mat' i syn (1997),0.391011,-0.086303,-0.126632,-0.166271,0.107555,0.259371,0.760578,-0.073077,-0.167073,0.259618,...,0.185034,-0.129459,0.096703,-0.143001,-0.002179,-0.096244,0.014886,-0.172477,-0.012148,0.144145
B. Monkey (1998),0.393593,-0.084597,-0.125257,-0.164641,0.110065,0.261822,0.763146,-0.070696,-0.164931,0.262070,...,0.187428,-0.127275,0.098776,-0.140910,0.000141,-0.093941,0.016992,-0.170353,-0.010130,0.146744
Sliding Doors (1998),0.392302,-0.085450,-0.125945,-0.165456,0.108810,0.260596,0.761862,-0.071886,-0.166002,0.260844,...,0.186231,-0.128367,0.097740,-0.141956,-0.001019,-0.095093,0.015939,-0.171415,-0.011139,0.145444
You So Crazy (1994),0.391804,-0.083943,-0.124159,-0.163950,0.108678,0.260629,0.761231,-0.071904,-0.165133,0.260674,...,0.186271,-0.127451,0.098799,-0.140791,-0.000469,-0.094394,0.016425,-0.170362,-0.010445,0.144972


In [290]:
# import urllib2
import urllib.request as urllib2
from bs4 import BeautifulSoup
import re
import requests
movie = str(input('Movie Name: '))
movie_search = '+'.join(movie.split())
print(movie_search)
base_url = 'http://www.imdb.com/find?q='
url = base_url+movie_search+'&s=all'
print(url)


res = requests.get(url=url)
soup = BeautifulSoup(res.text, 'html.parser')
meta = soup.find_all('tr')[0]
print(meta)

Movie Name: Little Rascals, The (1994)
Little+Rascals,+The+(1994)
http://www.imdb.com/find?q=Little+Rascals,+The+(1994)&s=all
<tr class="findResult odd"> <td class="primary_photo"> <a href="/title/tt1067369/"><img src="https://m.media-amazon.com/images/M/MV5BOWRhM2U3ZmYtZDMxOS00ZmM4LWJlZTYtNGQ2YzVjOTQxZjVlXkEyXkFqcGdeQXVyNzkzODA4NzI@._V1_UX32_CR0,0,32,44_AL_.jpg"/></a> </td> <td class="result_text"> <a href="/title/tt1067369/">The Little Rascal: Stymie</a> (2000) (TV Episode) <br/> <small>- Season 3 <span class="ghost">|</span> Episode 8 </small> <br><small>- <a href="/title/tt0155428/">E! Mysteries &amp; Scandals</a> (1998) (TV Series) </small> </br></td> </tr>


In [291]:
poster_url = meta.find_all('img')[0].get('src')
# print(poster_url)
title = meta.find_all('a')[-1].get('href')

base = "https://www.imdb.com"
end = "?ref_=fn_al_tt_1"
movie_url = base+title+end
print(movie_url)

https://www.imdb.com/title/tt0155428/?ref_=fn_al_tt_1


In [292]:
res = requests.get(url=movie_url)
soup = BeautifulSoup(res.text, 'html.parser')
# meta = soup.find_all('tr')[0]
# print(soup)

In [324]:
# soup

In [294]:
soup.find_all("div", {'class':"ipc-html-content ipc-html-content--base"})

[<div class="ipc-html-content ipc-html-content--base"><div>Hollywood's most notorious scandals exposed through interviews and re-enactments.</div></div>,
 <div class="ipc-html-content ipc-html-content--base"><div>This show is a wonderful experience for anyone who is interested in the scandalous lives and often mysterious deaths of celebrities. I wish they would air some more original episodes of this exceptional series. It packs a lot of punch for a 30-minute segment and is absolutely addictive.</div></div>]

In [295]:
soup.find_all("meta")[5].get('content')

"E! Mysteries & Scandals: With A.J. Benza, A.C. Lyles, Marc Wanamaker, Bob Thomas. Hollywood's most notorious scandals exposed through interviews and re-enactments."

In [323]:
soup.find_all("div")[18]

<div class="_1IQgIe3JwGh2arzItRgYN3" role="presentation"><ul aria-orientation="vertical" class="ipc-list _1gB7giE3RrFWXvlzwjWk-q ipc-list--baseAlt" role="menu"><a aria-disabled="false" class="ipc-list__item nav-link NavLink-sc-19k0khm-0 dvLykY ipc-list__item--indent-one" href="/what-to-watch/?ref_=nv_watch" role="menuitem" tabindex="-1"><span class="ipc-list-item__text" role="presentation">What to Watch</span></a><a aria-disabled="false" class="ipc-list__item nav-link NavLink-sc-19k0khm-0 dvLykY ipc-list__item--indent-one" href="/trailers/?ref_=nv_mv_tr" role="menuitem" tabindex="-1"><span class="ipc-list-item__text" role="presentation">Latest Trailers</span></a><a aria-disabled="false" class="ipc-list__item nav-link NavLink-sc-19k0khm-0 dvLykY ipc-list__item--indent-one" href="/originals/?ref_=nv_sf_ori" role="menuitem" tabindex="-1"><span class="ipc-list-item__text" role="presentation">IMDb Originals</span></a><a aria-disabled="false" class="ipc-list__item nav-link NavLink-sc-19k0khm

In [296]:
title = soup.find('title')
print(title.string)

E! Mysteries & Scandals (TV Series 1998– ) - IMDb


In [222]:
print(soup.find("div",{'class':'titleBar'}))

None


In [228]:
data = {}
ratingValue = soup.find("span", {"itemprop" : "ratingValue"})
# data["ratingValue"] = ratingValue.string

# no of rating given
ratingCount = soup.find("span", {"itemprop" : "ratingCount"})
# data["ratingCount"] = ratingCount.string

# name
# titleName = soup.find("div",{'class':'titleBar'}).find("h1")
# data["name"] = titleName.contents[0].replace(u'\xa0', u'')

# additional details
subtext = soup.find("div",{'class':'subtext'})
data["subtext"] = ""
for i in subtext.contents:
    data["subtext"] += i.string.strip()

# summary
summary_text = soup.find("div",{'class':'summary_text'})
data["summary_text"] = summary_text.string.strip()

credit_summary_item = soup.find_all("div",{'class':'credit_summary_item'})
data["credits"] = {}
for i in credit_summary_item:
    item = i.find("h4")
    names = i.find_all("a")
    data["credits"][item.string] = []
    for i in names:
        data["credits"][item.string].append({
            "link": i["href"],
            "name": i.string
        })
print(data)

AttributeError: 'NoneType' object has no attribute 'contents'