# Louis George    

## Feature Engineering

In [None]:
import numpy as np
import pandas as pd

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Apply the 'ggplot' style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')

#### Warning: Lots of the functions in this notebook take a LONG time to run!

### Reading in the data

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../../data/df_clean.csv')

In [None]:
# Preview the first rows to check which columns sit at the start and end of
# the frame, since the next cell selects features and targets by position.
df.head()

In [None]:
# Features: the last two columns of the cleaned frame (the notebook later
# reads 'genres' and 'scripts' from them).
# Targets: the first five columns (the notebook later reads 'titles',
# 'IMDb_score', 'RT_score', 'Budget' and 'Gross_world' from them).
#
# Explicit copies are taken because new columns are assigned onto these frames
# further down; writing into a slice of `df` is the SettingWithCopy pattern
# and is not guaranteed to behave consistently across pandas versions.
X = df.iloc[:, -2:].copy()
y = df.iloc[:, :5].copy()

### Dummying the genres:

In [None]:
# Build one indicator column per genre label (str.get_dummies splits each
# entry on its default '|' separator), append them to the features, and drop
# the raw 'genres' text column they replace.
genre_dummies = X['genres'].str.get_dummies()
X = pd.concat((X, genre_dummies), axis=1).drop('genres', axis=1)

In [None]:
# Sanity check: the column count now includes one indicator column per genre.
X.shape

### Writing the preprocessing, and tokenizing function

There are a lot of numbers in the scripts from the formatting, so I want to remove those.

In [None]:
def my_preprocessor(string):
    """Strip every digit character from `string` and lowercase the rest.

    Digits are dropped entirely (not replaced by a space), so the characters
    on either side of a removed digit become adjacent.
    """
    kept_chars = (ch for ch in string if not ch.isdigit())
    return ''.join(kept_chars).lower()

I will remove stopwords, punctuation, and some other unwanted things in the tokenizing function, as well as lemmatize.

In [None]:
# Cached spaCy pipeline, filled in on first use by _get_nlp().
_SPACY_MODEL = None


def _get_nlp():
    """Return the spaCy English pipeline, loading it only on the first call."""
    global _SPACY_MODEL
    if _SPACY_MODEL is None:
        _SPACY_MODEL = spacy.load("en_core_web_sm")
    return _SPACY_MODEL


def my_tokenizer(string):
    """Tokenize `string` with spaCy and return the lemmas of the kept tokens.

    A token is kept when it is not a stop word, not punctuation, not
    whitespace, and its text contains no newline character.

    The spaCy model is loaded once and reused: the vectorizers call this
    function once per document, so reloading the model on every call would
    dominate the run time.

    Parameters
    ----------
    string : str
        Text to tokenize (already preprocessed when called by a vectorizer).

    Returns
    -------
    list of str
        Lemma of each accepted token, in document order.
    """
    doc = _get_nlp()(string)
    return [
        token.lemma_
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.is_space
        and '\n' not in str(token)
    ]

### Creating some additional features       
Counting the part of speech    
Warning: Takes a LONG time to run!

In [None]:
# Part-of-speech tags to count; each one becomes a 'Num_<TAG>' column on X.
pos_tags = ['NOUN', 'PRON', 'PROPN', 'ADJ', 'VERB', 'ADV']

# Load the spaCy pipeline once: it does not change between scripts, so
# reloading it on every iteration only adds to an already long run time.
nlp = spacy.load("en_core_web_sm")

# One {tag: count} record per script, collected in row order.
pos_counts = []
for i, script in enumerate(X['scripts']):
    counts = dict.fromkeys(pos_tags, 0)
    for token in nlp(script):
        # Only count "good" tokens: no stop words, punctuation, whitespace,
        # or tokens containing a newline.
        if (not token.is_stop
                and not token.is_punct
                and not token.is_space
                and '\n' not in str(token)
                and token.pos_ in counts):
            counts[token.pos_] += 1
    pos_counts.append(counts)
    print(f'Just finished: {i}', end='\r')

# Assign each count column in a single step. Incrementing cells through
# chained indexing (X[col][i] += 1) is not guaranteed to write back to X and
# is a no-op under pandas Copy-on-Write.
pos_df = pd.DataFrame(pos_counts, columns=pos_tags, index=X.index)
for tag in pos_tags:
    X[f'Num_{tag}'] = pos_df[tag]

In [None]:
# Spot-check the last rows to confirm the Num_* part-of-speech columns were filled in.
X.tail()

This step takes a long time to run, so I will export a copy of the dataframe at this point and reload it for the remainder of the steps. I'm doing this because I will be tuning the vectorizer and don't want to have to rerun the part-of-speech counting each time.

In [None]:
# Checkpoint the feature table (all columns, no index column) so the slow
# part-of-speech step does not have to be repeated.
X.to_csv('../../data/X_plus.csv', index=False)

### Engineering the targets

Defining a new target as the percent profit of the film.    
This will be defined as:   
Percent Profit = Cumulative Worldwide Gross / Budget * 100   
Answer rounded to the nearest hundredth

In [None]:
# Worldwide gross expressed as a percentage of the budget, kept to two decimals.
gross_to_budget = y['Gross_world'] / y['Budget']
y['Per_Profit'] = (gross_to_budget * 100).round(2)

In [None]:
# Check that the new 'Per_Profit' column was added alongside the original targets.
y.head()

In [None]:
# Summary statistics per target (transposed so each target is a row); the
# means and medians inform the class cutoffs chosen below.
y.describe().T

Maybe combine the imdb and rt scores into a score ratio?

In [None]:
# IMDb score distribution, saved to the plots directory
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(y['IMDb_score'])
ax.set_title("Distribution of IMDb Votes")
ax.set_xlabel("Vote")
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig("../plots/imdb_hist.png")

# Rotten Tomatoes score distribution, saved to the plots directory
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(y['RT_score'])
ax.set_title("Distribution of Rotten Tomatoe Votes")
ax.set_xlabel("Vote")
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig("../plots/rt_hist.png")

# Budget distribution (displayed only, not saved)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(y['Budget'], bins=50)
plt.show()

# Worldwide gross distribution (displayed only, not saved)
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(y['Gross_world'], bins=50)
plt.show()

# Percent-profit distribution, saved to the plots directory
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(y['Per_Profit'], bins=50)
ax.set_title("Distribution of Profit Margins")
ax.set_xlabel("Profit Margins")
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig("../plots/profit_hist.png")

**IMDb Score:**    
Because the distribution is nice and normal-looking, I will categorize this variable at 0.70, which is right around both the mean and median.    

**RT Score:**    
It's interesting that this distribution is different from IMDb's, and it gives me hope that there may be some insights to be gained here. Because this does not look like a normal distribution, I'm going to split it at a more arbitrary value. Although RT classifies anything better than 60% as "fresh", we can see that there is a definite spike around 0.80, and the median score is 0.74. For this reason I will try a cutoff of 0.80, and reevaluate if necessary.    

**Budget:**    
Not sure if I'll classify this or not yet, might do regression for this and gross rev

**Cumulative Gross Worldwide:**
Same as above.

**Percent Profit:**
According to an article published in Gizmodo, a general rule of thumb is that a movie must make twice its budget in order to break even. I will therefore select 200% as the cutoff for the Percent Profit target variable. This is also close to the median, which is about 260%.      

https://io9.gizmodo.com/how-much-money-does-a-movie-need-to-make-to-be-profitab-5747305

In [None]:
# Save the targets (all columns, no index column) before the scores are
# converted to 0/1 classes below, so the raw values can be referenced later.
y.to_csv('../../data/y_wt.csv', index=False)

In [None]:
def imdb_converter(score, threshold=0.70):
    """Convert an IMDb score into a binary class label.

    Parameters
    ----------
    score : float
        IMDb score, on the same scale as `threshold`.
    threshold : float, optional
        Cutoff between the two classes; defaults to 0.70.

    Returns
    -------
    int
        0 when `score` is below `threshold`, otherwise 1. A NaN score fails
        the `<` comparison and is therefore labelled 1.
    """
    return 0 if score < threshold else 1

In [None]:
def rt_converter(score, threshold=0.80):
    """Convert a Rotten Tomatoes score into a binary class label.

    Parameters
    ----------
    score : float
        Rotten Tomatoes score, on the same scale as `threshold`.
    threshold : float, optional
        Cutoff between the two classes; defaults to 0.80. Pass another value
        to try a different cutoff without editing the function.

    Returns
    -------
    int
        0 when `score` is below `threshold`, otherwise 1. A NaN score fails
        the `<` comparison and is therefore labelled 1.
    """
    return 0 if score < threshold else 1

In [None]:
def profit_converter(score, threshold=200):
    """Convert a percent-profit value into a binary class label.

    Parameters
    ----------
    score : float
        Worldwide gross as a percentage of budget.
    threshold : float, optional
        Cutoff between the two classes; defaults to 200 (gross equal to twice
        the budget).

    Returns
    -------
    int
        0 when `score` is below `threshold`, otherwise 1. A NaN score fails
        the `<` comparison and is therefore labelled 1.
    """
    return 0 if score < threshold else 1

In [None]:
# Replace each continuous target with its binary class label.
# NOTE: the columns are overwritten in place, so run this cell only once per
# fresh `y`; a second pass would feed the 0/1 labels back through the cutoffs.
target_converters = {
    'IMDb_score': imdb_converter,
    'RT_score': rt_converter,
    'Per_Profit': profit_converter,
}
for column, converter in target_converters.items():
    y[column] = y[column].map(converter)

#### Baseline case for comparison

In [None]:
# The mean of a 0/1 column is the share of rows labelled 1.
baseline_columns = (
    ("IMDb", 'IMDb_score'),
    ("RT", 'RT_score'),
    ("Percent Profit", 'Per_Profit'),
)
for label, column in baseline_columns:
    print(f"{label} baseline: {round(y[column].mean(), 3)}")

In [None]:
# Confirm the three target columns now hold 0/1 class labels.
y.head()

In [None]:
# Exporting the targets (every column except the film titles, no index column)
y.drop('titles', axis=1).to_csv('../../data/y.csv', index=False)

### Splitting the data

In [None]:
# Reload the checkpointed feature table written earlier in this notebook, so
# the steps below can be rerun without repeating the part-of-speech counting.
X_n = pd.read_csv('../../data/X_plus.csv')

In [None]:
# Confirm the reloaded feature table has the expected dimensions.
X_n.shape

In [None]:
# Split features and targets in one call so their rows stay paired; the fixed
# random_state makes the split repeatable. test_size is left at the
# scikit-learn default. `y` still holds every target column at this point.
X_train, X_test, y_train, y_test = train_test_split(X_n, y, random_state=4)

In [None]:
# Size of the training portion after the split.
X_train.shape

### Vectorizing the scripts using TFIDF

WARNING This step takes a long time to run!

In [None]:
# Vectorizer settings: custom digit-stripping preprocessor and spaCy
# tokenizer, unigrams through trigrams, and document-frequency bounds given
# as proportions of the training scripts (min_df=0.1, max_df=0.9).
tfidf_settings = dict(
    preprocessor=my_preprocessor,
    tokenizer=my_tokenizer,
    ngram_range=(1, 3),
    min_df=0.1,
    max_df=0.9,
)

# Fit on the training scripts only, so nothing is learned from the test set.
tfidf = TfidfVectorizer(**tfidf_settings).fit(X_train['scripts'])

In [None]:
# Apply the fitted TF-IDF vectorizer to the training and test scripts.
X_train_tfidf_transformed, X_test_tfidf_transformed = (
    tfidf.transform(split['scripts']) for split in (X_train, X_test)
)

In [None]:
# Column labels for the TF-IDF matrix. scikit-learn 1.0 added
# get_feature_names_out() and 1.2 removed get_feature_names(), so prefer the
# new accessor and fall back to the old one on older installs.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_terms = tfidf.get_feature_names_out()
else:
    tfidf_terms = tfidf.get_feature_names()

# Densify the sparse matrices into DataFrames with one column per term.
X_train_tfidf_df = pd.DataFrame(columns=tfidf_terms, data=X_train_tfidf_transformed.toarray())
X_test_tfidf_df = pd.DataFrame(columns=tfidf_terms, data=X_test_tfidf_transformed.toarray())

X_train_tfidf_df.head()

In [None]:
# The 20 terms with the largest total TF-IDF weight across the training scripts.
X_train_tfidf_df.sum().sort_values(ascending=False).head(20)

In [None]:
# Drop the raw script text and reset the shuffled split index to 0..n-1 so
# the rows line up with the TF-IDF frames (which carry a default index), then
# join the two side by side.
train_meta = X_train.drop('scripts', axis=1).reset_index(drop=True)
test_meta = X_test.drop('scripts', axis=1).reset_index(drop=True)

X_train_tfidf_f = pd.concat([train_meta, X_train_tfidf_df], axis=1)
X_test_tfidf_f = pd.concat([test_meta, X_test_tfidf_df], axis=1)

### Vectorizing using count vectorizer    
Primarily for use with LDA - probably won't be used as its own metric.     
Warning: This step also takes a long time to run.

In [None]:
# Same settings as the TF-IDF vectorizer: custom preprocessor and tokenizer,
# unigrams through trigrams, and document-frequency bounds given as
# proportions of the training scripts (min_df=0.1, max_df=0.9).
countv_settings = dict(
    preprocessor=my_preprocessor,
    tokenizer=my_tokenizer,
    ngram_range=(1, 3),
    min_df=0.1,
    max_df=0.9,
)

# Fit on the training scripts only, so nothing is learned from the test set.
bagofwords = CountVectorizer(**countv_settings).fit(X_train['scripts'])

In [None]:
# Apply the fitted count vectorizer to the training and test scripts.
X_train_countv_transformed, X_test_countv_transformed = (
    bagofwords.transform(split['scripts']) for split in (X_train, X_test)
)

In [None]:
# (number of training scripts, number of n-gram features kept)
X_train_countv_transformed.shape

In [None]:
# Column labels for the count matrix. scikit-learn 1.0 added
# get_feature_names_out() and 1.2 removed get_feature_names(), so prefer the
# new accessor and fall back to the old one on older installs.
if hasattr(bagofwords, 'get_feature_names_out'):
    countv_terms = bagofwords.get_feature_names_out()
else:
    countv_terms = bagofwords.get_feature_names()

# Densify the sparse matrices into DataFrames with one column per term.
X_train_countv_df = pd.DataFrame(columns=countv_terms, data=X_train_countv_transformed.toarray())
X_test_countv_df = pd.DataFrame(columns=countv_terms, data=X_test_countv_transformed.toarray())

X_train_countv_df.head()

In [None]:
# The 20 most frequent terms across the training scripts.
X_train_countv_df.sum().sort_values(ascending=False).head(20)

In [None]:
# Drop the raw script text and reset the shuffled split index to 0..n-1 so
# the rows line up with the count frames (which carry a default index), then
# join the two side by side.
train_meta = X_train.drop('scripts', axis=1).reset_index(drop=True)
test_meta = X_test.drop('scripts', axis=1).reset_index(drop=True)

X_train_countv_f = pd.concat([train_meta, X_train_countv_df], axis=1)
X_test_countv_f = pd.concat([test_meta, X_test_countv_df], axis=1)

### Exporting the targets and features

TFIDF

In [None]:
# Write the TF-IDF feature tables (all columns, no index column).
tfidf_exports = (
    (X_train_tfidf_f, '../../data/X_train_tfidf.csv'),
    (X_test_tfidf_f, '../../data/X_test_tfidf.csv'),
)
for frame, path in tfidf_exports:
    frame.to_csv(path, index=False)

COUNT V

In [None]:
# Write the count-vectorizer feature tables (all columns, no index column).
countv_exports = (
    (X_train_countv_f, '../../data/X_train_countv.csv'),
    (X_test_countv_f, '../../data/X_test_countv.csv'),
)
for frame, path in countv_exports:
    frame.to_csv(path, index=False)

Isolating the targets

In [None]:
# Pick each target by column name rather than by position, so the selection
# does not depend on the column order of the source CSV.
y_imdb_train = y_train['IMDb_score']
y_rt_train = y_train['RT_score']
y_profit_train = y_train['Per_Profit']

y_imdb_test = y_test['IMDb_score']
y_rt_test = y_test['RT_score']
y_profit_test = y_test['Per_Profit']

Exporting the targets

In [None]:
# Write each target series to its own single-column CSV (header row, no index).
target_exports = (
    (y_imdb_train, '../../data/y_imdb_train.csv'),
    (y_rt_train, '../../data/y_rt_train.csv'),
    (y_profit_train, '../../data/y_profit_train.csv'),
    (y_imdb_test, '../../data/y_imdb_test.csv'),
    (y_rt_test, '../../data/y_rt_test.csv'),
    (y_profit_test, '../../data/y_profit_test.csv'),
)
for series, path in target_exports:
    series.to_csv(path, header=True, index=False)