## Feature Engineering Notebook

##### In this notebook, we demonstrate some key feature engineering techniques to extract data from existing features and create new ones

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
%matplotlib inline

In [2]:
credits = pd.read_csv('movies_dataset/tmdb_5000_credits.csv')
movies = pd.read_csv('movies_dataset/tmdb_5000_movies.csv')

In [3]:
credits.columns

Index([u'movie_id', u'title', u'cast', u'crew'], dtype='object')

In [5]:
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


Extracting Unique Movie Genre tags for each instance

In [23]:
movies['unique_genres'] = [list(set([y['name'] for y in x])) for x in movies['genres'].apply(ast.literal_eval)]

In [24]:
movies['unique_genres'].head(3)

0    [Action, Fantasy, Adventure, Science Fiction]
1                     [Action, Fantasy, Adventure]
2                       [Action, Adventure, Crime]
Name: unique_genres, dtype: object

In [25]:
dummies = pd.get_dummies(movies['unique_genres'].apply(pd.Series).stack()).groupby(level=0).sum()

In [26]:
movies = movies.merge(dummies, right_index=True, left_index=True)

In [27]:
movies.dtypes

budget                    int64
genres                   object
homepage                 object
id                        int64
keywords                 object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                   int64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
vote_average            float64
vote_count                int64
unique_genres            object
Action                    uint8
Adventure                 uint8
Animation                 uint8
Comedy                    uint8
Crime                     uint8
Documentary               uint8
Drama                     uint8
Family                    uint8
Fantasy                   uint8
Foreign                   uint8
History 

In [28]:
nt = []
for x in movies['unique_genres']:
    for i in x:
        nt.append(i)
list(set(nt))

['Mystery',
 'Romance',
 'History',
 'Family',
 'Science Fiction',
 'Horror',
 'Crime',
 'Drama',
 'Fantasy',
 'Animation',
 'Music',
 'Adventure',
 'Foreign',
 'Action',
 'TV Movie',
 'Comedy',
 'Documentary',
 'War',
 'Thriller',
 'Western']

Now processing the credits dataframe

In [29]:
movies_out = movies.copy()

In [30]:
type(movies_out)

pandas.core.frame.DataFrame

In [35]:
#temporarily assigning to original - will modify this once data transformations are complete
movies_processed = movies_out.select_dtypes(include=['int64', 'float64', 'uint8'])
credits_processed = credits.select_dtypes(include=['int64', 'float64'])

In [36]:
%store movies_processed
%store credits_processed

Stored 'movies_processed' (DataFrame)
Stored 'credits_processed' (DataFrame)


In [37]:
movies_processed.dtypes

budget               int64
id                   int64
popularity         float64
revenue              int64
runtime            float64
vote_average       float64
vote_count           int64
Action               uint8
Adventure            uint8
Animation            uint8
Comedy               uint8
Crime                uint8
Documentary          uint8
Drama                uint8
Family               uint8
Fantasy              uint8
Foreign              uint8
History              uint8
Horror               uint8
Music                uint8
Mystery              uint8
Romance              uint8
Science Fiction      uint8
TV Movie             uint8
Thriller             uint8
War                  uint8
Western              uint8
dtype: object