# Term Project - M&M Movie Reviews

In [1]:
import pandas as pd 
import numpy as np

In [3]:
def read_api_key(file):
    """Read an API key from a text file.

    Parameters
    ----------
    file : str or path-like
        Path of the text file that holds the key.

    Returns
    -------
    str
        The file contents with all newline characters removed and any
        leading/trailing whitespace stripped.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(file, 'r') as f:
        raw = f.read()
    # Drop newlines (as before) and stray surrounding blanks so the key can be
    # used verbatim.
    return raw.replace('\n', '').strip()

# Load the key from a local text file so the secret itself is never written in
# the notebook. NOTE(review): presumably this is the OMDb API key used for the
# queries further down — confirm.
api_key = read_api_key('api_key.txt')

## Create dataset from GroupLens data
The data we downloaded can be found [here](https://grouplens.org/datasets/movielens/). The attributes in this dataset are: 
- movieId: the ID used on the MovieLens website (e.g., movieId 1 corresponds to https://movielens.org/movies/1) 
- Title: entered manually or imported 
- Genre: pipe separated list 
- imdbId: identifier for the movie used by IMDb (http://www.imdb.com)
- tmdbId: identifier for the movie used by TMDb (https://www.themoviedb.org) 
- userID: the id of the user 
- rating: on 5 star scale with half-star increments 
- timestamp: seconds since midnight Coordinated Universal Time of January 1, 1970 
- tag: user-generated metadata about movies 

In [4]:
# Load the four MovieLens tables in one pass; the unpacking order matches the
# order of the paths below.
movies, links, ratings, tags = map(
    pd.read_csv,
    (
        'ml-latest-small/movies.csv',
        'ml-latest-small/links.csv',
        'ml-latest-small/ratings.csv',
        'ml-latest-small/tags.csv',
    ),
)

In [5]:
# Both ratings and tags have a 'timestamp' column. Give each a table-specific
# name so they stay distinguishable once the two tables are merged.
# Reassigning the result (instead of inplace=True) keeps the mutation explicit
# and separate from the reporting below.
ratings = ratings.rename(columns={'timestamp': 'rating_timestamp'})
tags = tags.rename(columns={'timestamp': 'tags_timestamp'})

# Report the shape and column labels of every table.
print(movies.shape)
print(movies.columns)
print("----------")
print(links.shape)
print(links.columns)
print("----------")
print(ratings.shape)
print(ratings.columns)
print("----------")
print(tags.shape)
print(tags.columns)

(9742, 3)
Index(['movieId', 'title', 'genres'], dtype='object')
----------
(9742, 3)
Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')
----------
(100836, 4)
Index(['userId', 'movieId', 'rating', 'rating_timestamp'], dtype='object')
----------
(3683, 4)
Index(['userId', 'movieId', 'tag', 'tags_timestamp'], dtype='object')


In [6]:
# Attach the external identifiers from `links` to each movie record.
movie_links = movies.merge(links, how="inner", on=['movieId'])

# Inner join: only (userId, movieId) pairs present in BOTH ratings and tags
# survive, so ratings without a tag are dropped here.
ratings_tags = ratings.merge(tags, how="inner", on=['userId', 'movieId'])

# Combine movie metadata with the rating/tag rows.
data = movie_links.merge(ratings_tags, how="inner", on=['movieId'])

print(data.shape)
print(data.columns)
data.head()

(3476, 10)
Index(['movieId', 'title', 'genres', 'imdbId', 'tmdbId', 'userId', 'rating',
       'rating_timestamp', 'tag', 'tags_timestamp'],
      dtype='object')


In [8]:
# Persist the merged table next to the notebook, without the row index.
data.to_csv("./GroupLens.csv", index=False)

## Get IMDb data
The attributes from IMDb we are interested in are: 
- Rated
- Released 
- Runtime 
- Genre 
- Director 
- Writer 
- Actors 
- Awards 
- Ratings (need to parse) 
- Metascore 
- imdbRating 
- imdbVotes 
- imdbID 
- BoxOffice 
- Production 

In [11]:
# Collect the distinct IMDb identifiers present in the merged GroupLens data.
# NOTE(review): the printed sample shows these are plain integers, so any
# leading zeros are absent; zero-pad if the lookup API expects the
# "tt"-prefixed form — confirm against the query code.
imdbIds = data['imdbId'].unique()
print("Number of Ids: ", imdbIds.shape)
print(imdbIds[:10])

Number of Ids:  (1464,)
[114709 113497 113228 113041 114319 112346 113987 112641 114388 113161]


In [None]:
# Query each Id's data from OMDb API 