#### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import ast
import math

#### Load Files

In [3]:
movies = pd.read_csv('../data/tmdb_5000_movies.csv')
credits = pd.read_csv('../data/tmdb_5000_credits.csv') 

## Merge both dataframes

In [4]:
df = movies.merge(credits,on='title')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [6]:
df.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [7]:
#count null values
df.isna().sum()

budget                     0
genres                     0
homepage                3096
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
movie_id                   0
cast                       0
crew                       0
dtype: int64

#### Data Cleaning

In [8]:
# df.drop(['homepage','tagline','production_companies'],axis =1,inplace = True)
df.shape

(4809, 23)

In [9]:
df.drop_duplicates(inplace=True)
sum(df.duplicated())

0

In [10]:
df['release_date'] = df['release_date'] .apply(lambda x : str(x).split('-')[0])

In [11]:
df['release_date']

0       2009
1       2007
2       2015
3       2012
4       2012
        ... 
4804    1992
4805    2011
4806    2013
4807    2012
4808    2005
Name: release_date, Length: 4809, dtype: object

In [12]:
for i in range(df.shape[0]):
    if df['status'][i]=='Released':
        df['status'][i] = df['status'][i] +": "+ df['release_date'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['status'][i] = df['status'][i] +": "+ df['release_date'][i]


In [13]:
# drop all NaN Values
df.dropna(inplace = True)

In [14]:
#calculate Profit for each of the movie And add a new column in the dataframe name 'Profit'
df['Profit'] = df['revenue'] - df['budget']

In [15]:
#print first Five rows of the dataset
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew,Profit
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released: 2009,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",2550965087
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released: 2007,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",661000000
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released: 2015,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",635674609
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released: 2012,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",834939099
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released: 2012,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",24139100


In [16]:
df['overview_2'] = df['overview']

### Extraction of Genres, Keywords, Production Countries, spoken language, cast, crew

In [17]:
def hours(minutes):
    h = minutes/60
    if h<1:
        return(minutes,' min')
    else:
        hours = int(h)
        minutes = (h*60) % 60.
#         frac, whole = math.modf(h)
        return("{} h {} min".format(hours,int(minutes)))

In [18]:
# genre, keywords, prod countries, spoken language

def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L


In [19]:
#function for extracting top(first) 3 actors from the movie 

def convert_cast(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter !=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [20]:
#function to fetch the director of movie from the crew column
def director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [21]:
df['runtime'] = df['runtime'].apply(hours)
df['crew'] = df['crew'].apply(director)
df['cast'] = df['cast'].apply(convert_cast)
df['keywords'] = df['keywords'].apply(convert)
df['genres'] = df['genres'].apply(convert)
df['production_countries'] = df['production_countries'].apply(convert)
df['spoken_languages'] = df['spoken_languages'].apply(convert)
df['overview'] = df['overview'].apply(lambda x:x.split())

In [22]:
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,status,tagline,title,vote_average,vote_count,movie_id,cast,crew,Profit,overview_2
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,19995,"[culture clash, future, space war, space colon...",en,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,Released: 2009,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],2550965087,"In the 22nd century, a paraplegic Marine is di..."
1,300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drug abuse, exotic island, east india ...",en,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,Released: 2007,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],661000000,"Captain Barbossa, long believed to be dead, ha..."
2,245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, based on novel, secret agent, sequel, mi...",en,Spectre,"[A, cryptic, message, from, Bond’s, past, send...",107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,Released: 2015,A Plan No One Escapes,Spectre,6.3,4466,206647,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],635674609,A cryptic message from Bond’s past sends him o...
3,250000000,"[Action, Crime, Drama, Thriller]",http://www.thedarkknightrises.com/,49026,"[dc comics, crime fighter, terrorist, secret i...",en,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...",112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,Released: 2012,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],834939099,Following the death of District Attorney Harve...
4,260000000,"[Action, Adventure, Science Fiction]",http://movies.disney.com/john-carter,49529,"[based on novel, mars, medallion, space travel...",en,John Carter,"[John, Carter, is, a, war-weary,, former, mili...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,Released: 2012,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],24139100,"John Carter is a war-weary, former military ca..."


In [23]:
x = df[['movie_id', 'title', 'genres','keywords','overview', 'cast','crew','runtime','status','vote_average']]

In [25]:
x.head(3)

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew,runtime,status,vote_average
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],2 h 42 min,Released: 2009,7.2
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],2 h 49 min,Released: 2007,6.9
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],2 h 28 min,Released: 2015,6.3


In [54]:
df.to_csv('cleaned.csv', index=False)

### Removing Spaces

In [55]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [56]:
#Applying clean data function

features = ['cast','keywords','genres','crew', 'overview']

for i in features:
    df[i] = df[i].apply(clean_data)

## Features

In [57]:
# Making a new column 'tags' which is made by combining all features
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']

In [58]:
df['tags'] = df['tags'].apply(lambda x:" ".join(x))
df['tags'] = df['tags'].apply(lambda x:x.lower()) #lower casing all the alphabets in the tags column

In [59]:
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,tagline,title,vote_average,vote_count,movie_id,cast,crew,Profit,overview_2,tags
0,237000000,"[action, adventure, fantasy, sciencefiction]",http://www.avatarmovie.com/,19995,"[cultureclash, future, spacewar, spacecolony, ...",en,Avatar,"[in, the, 22nd, century,, a, paraplegic, marin...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[samworthington, zoesaldana, sigourneyweaver]",[jamescameron],2550965087,"In the 22nd century, a paraplegic Marine is di...","in the 22nd century, a paraplegic marine is di..."
1,300000000,"[adventure, fantasy, action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drugabuse, exoticisland, eastindiatrad...",en,Pirates of the Caribbean: At World's End,"[captain, barbossa,, long, believed, to, be, d...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[johnnydepp, orlandobloom, keiraknightley]",[goreverbinski],661000000,"Captain Barbossa, long believed to be dead, ha...","captain barbossa, long believed to be dead, ha..."
2,245000000,"[action, adventure, crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, basedonnovel, secretagent, sequel, mi6, ...",en,Spectre,"[a, cryptic, message, from, bond’s, past, send...",107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,A Plan No One Escapes,Spectre,6.3,4466,206647,"[danielcraig, christophwaltz, léaseydoux]",[sammendes],635674609,A cryptic message from Bond’s past sends him o...,a cryptic message from bond’s past sends him o...
3,250000000,"[action, crime, drama, thriller]",http://www.thedarkknightrises.com/,49026,"[dccomics, crimefighter, terrorist, secretiden...",en,The Dark Knight Rises,"[following, the, death, of, district, attorney...",112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[christianbale, michaelcaine, garyoldman]",[christophernolan],834939099,Following the death of District Attorney Harve...,following the death of district attorney harve...
4,260000000,"[action, adventure, sciencefiction]",http://movies.disney.com/john-carter,49529,"[basedonnovel, mars, medallion, spacetravel, p...",en,John Carter,"[john, carter, is, a, war-weary,, former, mili...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[taylorkitsch, lynncollins, samanthamorton]",[andrewstanton],24139100,"John Carter is a war-weary, former military ca...","john carter is a war-weary, former military ca..."


In [61]:
new_df = df[['movie_id','title', 'runtime', 'status', 'vote_average', 'overview_2','tags']]

In [62]:
new_df.head()

Unnamed: 0,movie_id,title,runtime,status,vote_average,overview_2,tags
0,19995,Avatar,2 h 42 min,Released: 2009,7.2,"In the 22nd century, a paraplegic Marine is di...","in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,2 h 49 min,Released: 2007,6.9,"Captain Barbossa, long believed to be dead, ha...","captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,2 h 28 min,Released: 2015,6.3,A cryptic message from Bond’s past sends him o...,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,2 h 45 min,Released: 2012,7.6,Following the death of District Attorney Harve...,following the death of district attorney harve...
4,49529,John Carter,2 h 12 min,Released: 2012,6.1,"John Carter is a war-weary, former military ca...","john carter is a war-weary, former military ca..."


In [63]:
new_df.to_csv('train_data.csv', index=False)

## Saving codes for deployment

In [64]:
pickle.dump(new_df.to_dict(), open('movies_dict.pkl', 'wb'))

In [65]:
pickle.dump(new_df, open('movies.pkl', 'wb'))