##  Feature Engineering 

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

current_dir = Path().resolve()
data_path = current_dir.joinpath('Data').resolve()

ori_pickle_file_path  = data_path.joinpath('movies_df_original.pkl').resolve()
cleaned_pickle_file_path = data_path.joinpath('movies_df_cleaned.pkl').resolve()

# read df from pickle file
movies_df_cleaned = pd.read_pickle(cleaned_pickle_file_path)
movies_df_cleaned.head()

Unnamed: 0,movie_id,movie_title,movie_info,rating,genre,directors,in_theaters_date,on_streaming_date,runtime_in_minutes,critic_rating,critic_count,audience_rating,audience_count,release_year
0,1,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,2010-02-12,2010-06-29,83.0,49,144,53.0,254287.0,2010
1,2,Please Give,Kate has a lot on her mind. There's the ethics...,R,Comedy,Nicole Holofcener,2010-04-30,2010-10-19,90.0,86,140,64.0,11567.0,2010
2,3,10,Blake Edwards' 10 stars Dudley Moore as George...,R,"Comedy, Romance",Blake Edwards,1979-10-05,1997-08-27,118.0,68,22,53.0,14670.0,1979
3,4,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",NR,"Classics, Drama",Sidney Lumet,1957-04-13,2001-03-06,95.0,100,51,97.0,105000.0,1957
4,5,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,1954-01-01,2003-05-20,127.0,89,27,74.0,68860.0,1954


### 1. Split the data into a training and test set, with the training data including movies released in theatres before 2010 and the test data including movies released in theatres in 2010 and after.

In [2]:
train_data = movies_df_cleaned[movies_df_cleaned['release_year'] < 2010]
test_data = movies_df_cleaned[movies_df_cleaned['release_year'] >= 2010]

### 2. Your goal is to predict the critic_rating,  Update your training and test data sets to NOT include these columns.

In [3]:
columns_to_exclude = ['critic_rating', 'audience_rating', 'critic_count', 'audience_count']
train_data = train_data.drop(columns=columns_to_exclude)
test_data = test_data.drop(columns=columns_to_exclude)

### 3. Create a new DataFrame containing the following ID column and features(Using only the training data):
- movie_title
- runtime_in_minutes
- NEW: kid_friendly (1 if G or PG, 0 if other ratings)
- NEW: dummy variable columns for each genre

In [4]:
# kid_friendly

train_data['kid_friendly'] = train_data['rating'].apply(lambda x: 1 if x in ['G', 'PG'] else 0)
train_data.head()

Unnamed: 0,movie_id,movie_title,movie_info,rating,genre,directors,in_theaters_date,on_streaming_date,runtime_in_minutes,release_year,kid_friendly
2,3,10,Blake Edwards' 10 stars Dudley Moore as George...,R,"Comedy, Romance",Blake Edwards,1979-10-05,1997-08-27,118.0,1979,0
3,4,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",NR,"Classics, Drama",Sidney Lumet,1957-04-13,2001-03-06,95.0,1957,0
4,5,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,1954-01-01,2003-05-20,127.0,1954,1
5,6,"10,000 B.C.",A young outcast from a primitive tribe is forc...,PG-13,"Action & Adventure, Classics, Drama",Roland Emmerich,2008-03-07,2008-06-24,109.0,2008,0
6,7,The 39 Steps,A man in London tries to help a counterespiona...,NR,"Action & Adventure, Classics, Mystery & Suspense",Alfred Hitchcock,1935-08-01,2035-06-06,87.0,1935,0


In [5]:
new_train_data = train_data[['movie_title', 'runtime_in_minutes', 'rating', 'genre', 'kid_friendly']]
new_train_data.head()

Unnamed: 0,movie_title,runtime_in_minutes,rating,genre,kid_friendly
2,10,118.0,R,"Comedy, Romance",0
3,12 Angry Men (Twelve Angry Men),95.0,NR,"Classics, Drama",0
4,"20,000 Leagues Under The Sea",127.0,G,"Action & Adventure, Drama, Kids & Family",1
5,"10,000 B.C.",109.0,PG-13,"Action & Adventure, Classics, Drama",0
6,The 39 Steps,87.0,NR,"Action & Adventure, Classics, Mystery & Suspense",0


In [6]:
# Get genre dummies
new_train_data['genre_split'] = new_train_data['genre'].str.split(', ')
genre_dummy = pd.get_dummies(new_train_data['genre_split'].explode(), drop_first = True)
genre_dummy = genre_dummy.groupby(new_train_data['genre_split'].explode().index).sum()

new_train_data = pd.concat([new_train_data[['movie_title', 'runtime_in_minutes', 'kid_friendly', 'rating']], genre_dummy], axis=1)
new_train_data.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train_data['genre_split'] = new_train_data['genre'].str.split(', ')


Unnamed: 0,movie_title,runtime_in_minutes,kid_friendly,rating,Animation,Anime & Manga,Art House & International,Classics,Comedy,Cult Movies,...,Horror,Kids & Family,Musical & Performing Arts,Mystery & Suspense,Romance,Science Fiction & Fantasy,Special Interest,Sports & Fitness,Television,Western
2,10,118.0,0,R,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,12 Angry Men (Twelve Angry Men),95.0,0,NR,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"20,000 Leagues Under The Sea",127.0,1,G,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,"10,000 B.C.",109.0,0,PG-13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,The 39 Steps,87.0,0,NR,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


### 4. Create 3 new features that you think will do a good job predicting the critic_rating. Each new feature should use various combinations of the columns from your training data.

#### 1. Create binary feature for Not rated movie

In [7]:
new_train_data['Not_Rated'] = new_train_data['rating'].apply(lambda x : 1 if x in ['Not Rated', 'NR'] else 0)
new_train_data.head()

Unnamed: 0,movie_title,runtime_in_minutes,kid_friendly,rating,Animation,Anime & Manga,Art House & International,Classics,Comedy,Cult Movies,...,Kids & Family,Musical & Performing Arts,Mystery & Suspense,Romance,Science Fiction & Fantasy,Special Interest,Sports & Fitness,Television,Western,Not_Rated
2,10,118.0,0,R,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,12 Angry Men (Twelve Angry Men),95.0,0,NR,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,"20,000 Leagues Under The Sea",127.0,1,G,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,"10,000 B.C.",109.0,0,PG-13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,The 39 Steps,87.0,0,NR,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1


#### 2. Divde runtime into three categories

In [8]:
q33 = new_train_data['runtime_in_minutes'].quantile(0.33)
q67 = new_train_data['runtime_in_minutes'].quantile(0.67)

# Create categories based on quantiles
def categorize_runtime(runtime):
    if runtime < q33:
        return 'Short'
    elif q33 <= runtime <= q67:
        return 'Medium'
    else:
        return 'Long'

new_train_data['runtime_category'] = new_train_data['runtime_in_minutes'].apply(categorize_runtime)
new_train_data.head()


Unnamed: 0,movie_title,runtime_in_minutes,kid_friendly,rating,Animation,Anime & Manga,Art House & International,Classics,Comedy,Cult Movies,...,Musical & Performing Arts,Mystery & Suspense,Romance,Science Fiction & Fantasy,Special Interest,Sports & Fitness,Television,Western,Not_Rated,runtime_category
2,10,118.0,0,R,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,Long
3,12 Angry Men (Twelve Angry Men),95.0,0,NR,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,Medium
4,"20,000 Leagues Under The Sea",127.0,1,G,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Long
5,"10,000 B.C.",109.0,0,PG-13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Long
6,The 39 Steps,87.0,0,NR,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,Short


In [9]:
runtime_dummy = pd.get_dummies(new_train_data['runtime_category'], drop_first= True, dtype=int)
new_train_data = pd.concat([new_train_data, runtime_dummy], axis = 1 )
new_train_data = new_train_data.drop('runtime_category', axis = 1)
new_train_data.head()


Unnamed: 0,movie_title,runtime_in_minutes,kid_friendly,rating,Animation,Anime & Manga,Art House & International,Classics,Comedy,Cult Movies,...,Mystery & Suspense,Romance,Science Fiction & Fantasy,Special Interest,Sports & Fitness,Television,Western,Not_Rated,Medium,Short
2,10,118.0,0,R,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,12 Angry Men (Twelve Angry Men),95.0,0,NR,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,0
4,"20,000 Leagues Under The Sea",127.0,1,G,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,"10,000 B.C.",109.0,0,PG-13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,The 39 Steps,87.0,0,NR,0,0,0,1,0,0,...,1,0,0,0,0,0,0,1,0,1


#### 3. Director Popularity and movie count

In [10]:
new_train_data = pd.concat([new_train_data, train_data['directors']], axis = 1)
new_train_data.head()

Unnamed: 0,movie_title,runtime_in_minutes,kid_friendly,rating,Animation,Anime & Manga,Art House & International,Classics,Comedy,Cult Movies,...,Romance,Science Fiction & Fantasy,Special Interest,Sports & Fitness,Television,Western,Not_Rated,Medium,Short,directors
2,10,118.0,0,R,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,Blake Edwards
3,12 Angry Men (Twelve Angry Men),95.0,0,NR,0,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,Sidney Lumet
4,"20,000 Leagues Under The Sea",127.0,1,G,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Richard Fleischer
5,"10,000 B.C.",109.0,0,PG-13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Roland Emmerich
6,The 39 Steps,87.0,0,NR,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,Alfred Hitchcock


In [11]:
director_counts = new_train_data['directors'].value_counts()
new_train_data['director_movie_count'] = new_train_data['directors'].map(director_counts)

In [12]:
def categorize_director(count):
    if count > 5:
        return 'HIghPop'
    elif count > 2:
        return 'MediumPop'
    else:
        return 'LowPop'

new_train_data['director_popularity'] = new_train_data['director_movie_count'].apply(categorize_director)

dpdummy  = pd.get_dummies(new_train_data['director_popularity'], drop_first = True, dtype = int)
new_train_data = pd.concat([new_train_data,dpdummy], axis = 1)
new_train_data = new_train_data.drop(['director_popularity', 'directors'], axis = 1)
new_train_data.head()

Unnamed: 0,movie_title,runtime_in_minutes,kid_friendly,rating,Animation,Anime & Manga,Art House & International,Classics,Comedy,Cult Movies,...,Special Interest,Sports & Fitness,Television,Western,Not_Rated,Medium,Short,director_movie_count,LowPop,MediumPop
2,10,118.0,0,R,0,0,0,0,1,0,...,0,0,0,0,0,0,0,27.0,0,0
3,12 Angry Men (Twelve Angry Men),95.0,0,NR,0,0,0,1,0,0,...,0,0,0,0,1,1,0,30.0,0,0
4,"20,000 Leagues Under The Sea",127.0,1,G,0,0,0,0,0,0,...,0,0,0,0,0,0,0,16.0,0,0
5,"10,000 B.C.",109.0,0,PG-13,0,0,0,1,0,0,...,0,0,0,0,0,0,0,8.0,0,0
6,The 39 Steps,87.0,0,NR,0,0,0,1,0,0,...,0,0,0,0,1,0,1,36.0,0,0


## Modeling

### 1. Make sure you apply the same transformations on your X_test and y_test data sets that you applied on the X_train and y_train data sets.

In [13]:
# Do same feature engineering in test data set

test_data['kid_friendly'] = test_data['rating'].apply(lambda x: 1 if x in ['G', 'PG'] else 0)
new_test_data = test_data[['movie_title', 'runtime_in_minutes', 'rating', 'genre', 'kid_friendly']]

new_test_data['genre_split'] = new_test_data['genre'].str.split(', ')
genre_dummy_t = pd.get_dummies(new_test_data['genre_split'].explode(), drop_first = True)
genre_dummy_t = genre_dummy_t.groupby(new_test_data['genre_split'].explode().index).sum()

new_test_data = pd.concat([new_test_data[['movie_title', 'runtime_in_minutes', 'kid_friendly', 'rating']], genre_dummy_t], axis=1)

new_test_data['Not_Rated'] = new_test_data['rating'].apply(lambda x : 1 if x in ['Not Rated', 'NR'] else 0)

new_test_data['runtime_category'] = new_test_data['runtime_in_minutes'].apply(categorize_runtime)
runtime_dummyt = pd.get_dummies(new_test_data['runtime_category'], drop_first= True, dtype=int)
new_test_data = pd.concat([new_test_data, runtime_dummyt], axis = 1 )
new_test_data = new_test_data.drop('runtime_category', axis = 1)

new_test_data = pd.concat([new_test_data, test_data['directors']], axis = 1)
director_counts = new_test_data['directors'].value_counts()
new_test_data['director_movie_count'] = new_test_data['directors'].map(director_counts)
new_test_data['director_popularity'] = new_test_data['director_movie_count'].apply(categorize_director)

dpdummyt  = pd.get_dummies(new_test_data['director_popularity'], drop_first = True, dtype = int)
new_test_data = pd.concat([new_test_data,dpdummyt], axis = 1)
new_test_data = new_test_data.drop(['director_popularity', 'directors'], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_test_data['genre_split'] = new_test_data['genre'].str.split(', ')


In [None]:
y_train = train_data['critic_rating']
y_test = test_data['critic_rating']
X_train = new_train_data.drop('rating', axis = 1)
X_test = new_train_data.drop('rating', axis = 1)
