# Capstone Modeling Part

In [None]:
import matplotlib
import seaborn as sns
import re
from collections import defaultdict
sns.set()
matplotlib.rcParams['figure.dpi'] = 144
import seaborn as sns
sns.set()
matplotlib.rcParams['figure.dpi'] = 144

In [None]:
%matplotlib inline
import requests
import dill
from bs4 import BeautifulSoup
from datetime import datetime
from requests_futures.sessions import FuturesSession
from retrying import retry
from math import ceil
import inspect
from sklearn import base
import itertools
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
import matplotlib.pyplot as plt
import random
import pandas as pd
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier, 
                              AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import numpy as np

from bokeh.io import output_file, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure
output_notebook()
from bokeh.layouts import row



## Data Scraping

* **set up page limitations**


In [None]:
totl_num = 9953
num_p_pg = 100
yearly_inflation = 4.7418/(2016-1972)
LIMIT = ceil(totl_num / num_p_pg)
imdb_domain= 'https://www.imdb.com'
list_path = '/list/ls057823854/'
url_sub = 'page'

In [None]:
src = inspect.getsource(FuturesSession)

In [None]:
all_links = []
def get_page_args(i):
    result = {"url": (imdb_domain + list_path),
              "params": {url_sub: i + 1}}
    return result
def get_movie_boxes_per_page(response):
    page_soup = BeautifulSoup(response.text, "lxml")
    movie_boxes = page_soup.select('.lister-item.mode-detail')
    return movie_boxes

In [None]:
def extract_simple_info(movie):
    year = genre_list = run_time = certificate = gross = img_url = title = title_link =imdb_rating = None
    title = movie.select('.lister-item-header a')[0].text    
    try:        
        title_link = movie.select('.lister-item-header a')[0]['href']
    except:
        print("There is something wrong with title_link: " + title +\
              ': ' + imdb_domain + title_link)
    try: 
        year_txt = movie.select('.lister-item-year.text-muted.unbold')[0].text
        year = int(re.search('.*?\(([0-9]*)\).*?', year_txt).groups()[0])
    except:
        print("There is something wrong with year_txt: " + title +\
              ': ' + imdb_domain + title_link)
    try: 
        genre_list = movie.select('.genre')[0].text.strip().split(', ')
    except:
        print("There is something wrong with genre_list: " + title +\
              ': ' + imdb_domain + title_link)
    try: 
        run_time = float(movie.select('.runtime')[0].text.strip(' min'))
    except:
        print("There is something wrong with run_time: " + title +\
              ': ' + imdb_domain + title_link)
    try: 
        certificate = movie.select('.certificate')[0].text.strip()
    except:
        print("There is something wrong with certificate: " + title +\
              ': ' + imdb_domain + title_link)
    try: 
        gross = float(movie.select('.text-muted.text-small .ghost ~ .text-muted + span')[0]['data-value'].replace(',', ''))
    except:
        print("There is something wrong with gross: " + title +\
              ': ' + imdb_domain + title_link)
    try: 
        img_url = movie.select('img.loadlate')[0]['loadlate']  
    except:
        print("There is something wrong with img_url: " + title +\
              ': ' + imdb_domain + title_link)
    try: 
        imdb_rating = float(movie.select(".ipl-rating-star.small span.ipl-rating-star__rating")[0].text)
    except:
        print("There is something wrong with imdb_rating: " + title +\
              ': ' + imdb_domain + title_link)
    return {'title':[title],'title_link':[title_link], 'year':[year],
           'genres':genre_list, 'run_time':[run_time], 'certificate':[certificate],
            'gross':[gross],'img_url':[img_url], 'imdb_rating':[imdb_rating]} 

In [None]:
def extract_keyroles(movie, title, imdb_domain, title_link):
    directors = None
    stars = None    
    try:
        blks = movie.select('.text-muted.text-small')
        for text_blk in blks:    
            if "Director" in text_blk.text:
                txt = text_blk.text.strip(' \n\r')
                dir_reg = re.compile(u'Directors?:([\w\s?,?]*)\|?')
                directors = re.search('Directors?:([\w\s?,?]*)\|?',txt).groups()[0].strip(' \n\r').replace('\n', ' ').replace('\r', '').split(',  ')
            if "Star" in text_blk.text:
                txt = text_blk.text.strip(' \n\r')
                stars = re.search('Stars?:([\w\s?,?]*)',txt).groups()[0].strip(' \n\r').replace('\n', ' ').replace('\r', '').split(',  ')
        if directors == None:
            print('No DIRECTOR was found for ' + title +\
              ': ' + imdb_domain + title_link)
        if stars == None:
            print('No STARS was found for ' + title +\
              ': ' + imdb_domain + title_link)  
    except:
        print("There is something wrong with extract_keyroles: " + title +\
              ': ' + imdb_domain + title_link)
    return {'directors':directors, 'stars':stars}


In [None]:
def extract_month(sub_page_soup, title_link, imdb_domain):
    month = None
    try:        
        title_link_reg = re.compile(u'^([^\?]*)?')
        href_tag = title_link_reg.match(title_link).groups()[0]
        s = str('div.subtext > a[href*="'+ href_tag + '"]')
        time_text = sub_page_soup.select(s)[0].text
        time_text_reg = re.compile(u'^([^\(]*)\(')
        month = time_text_reg.match(time_text).groups()[0].strip(' \n').split(' ')[1]
        if month == None:
            print('No MONTH was found for ' + sub_page_soup.select('h1')[0].text +\
                  ': ' + imdb_domain + title_link) 
    except:
        print("There is something wrong with extract_month: " + imdb_domain + title_link)
    return month

In [None]:
def extract_budget(sub_page_soup, title_link, imdb_domain):
    budget = None
    try:
        txt_blks = sub_page_soup.select('#titleDetails .txt-block')        
        for txt_b in txt_blks:
            if 'Budget' in txt_b.text:
                txt = txt_b.text.strip(' \n\r')
                txt_reg = re.compile(u'Budget:[^\d]*([\d,]*)[^\d]*')
                budget = int(txt_reg.match(txt).groups()[0].replace(',', ''))
        if budget == None:
            print('No BUDGET was found for ' + sub_page_soup.select('h1')[0].text +\
                  ': ' + imdb_domain + title_link)            
    except:
        print("There is something wrong with extract_budget: " + imdb_domain + title_link)
    return budget    
   

In [None]:
def adjust_inflation(start, yearly_inflation, value):
    value_adj = value * (1 + (2016 - start) * yearly_inflation)
    return value_adj
def extract_all_info(movie):
    profit = gross_adj = budget_adj = profit_adj = None
    single_movie = {}
    simple_info = extract_simple_info(movie) 
    title = simple_info['title'][0]
    title_link = simple_info['title_link'][0]
    gross = simple_info['gross'][0]
    year = simple_info['year'][0]
    key_roles = extract_keyroles(movie , title, imdb_domain, title_link)    
    sub_page = requests.get(imdb_domain + title_link)
    sub_page_soup = BeautifulSoup(sub_page.text, 'lxml')
    month = extract_month(sub_page_soup, title_link, imdb_domain)
    budget = extract_budget(sub_page_soup, title_link, imdb_domain)    
    profit = None
    
    if gross is not None and budget is not None:
        profit = float(gross) - float(budget)
    if gross is not None and year is not None and year < 2016:
        gross_adj = adjust_inflation(year, yearly_inflation, gross)
    if budget is not None and year is not None and year < 2016:
        budget_adj = adjust_inflation(year, yearly_inflation, budget)
    if profit is not None and budget is not None and year is not None and year < 2016:
        profit_adj = adjust_inflation(year, yearly_inflation, profit)
    single_movie = {**single_movie, **simple_info, **key_roles,
                   **{'month':[month], 'budget':[budget], 'profit':[profit], 
                     'gross_adj':[gross_adj], 'budget_adj':[budget_adj], 'profit_adj':[profit_adj]}}
    return(single_movie)

In [None]:

print(LIMIT)
session = FuturesSession(max_workers=5)
futures = [session.get(**get_page_args(i)) for i in range(LIMIT)]
movie_info_all = []
print("7.9")
for f in range(len(futures)):
    
    future = futures[f]
    movie_boxes_per_page = get_movie_boxes_per_page(future.result())
    movie_info_per_page = []    
    for i in range(len(movie_boxes_per_page)):
        print('page '+ str(f + 1) + ' movie ' + str(i + f * 100))
        movie = movie_boxes_per_page[i]
        single_movie_info = extract_all_info(movie)
        movie_info_per_page.append(single_movie_info)
    movie_info_all.append(movie_info_per_page)
   

In [None]:
movie_info_all_flat = [val for sublist in movie_info_all for val in sublist]

In [None]:
with open('movie_info_without_img.pkd', 'wb') as f:  #  'w' replaced by 'wb' (write binary) to avoid crashing
        dill.dump(movie_info_all_flat, f)

## Load and Prepare Data A Bit

In [None]:
with open('movie_info_without_img.pkd', 'rb') as f:
        movie_no_img_raw = dill.load(f)
        count = 1
for row in movie_no_img_raw:
    count += 1
    for key in ['directors', 'stars', 'genres']:       
        if not row[key]:
            row[key] = [None]   

In [None]:
len(movie_no_img_raw)

In [None]:
movie_no_img = []
for row in movie_no_img_raw:
    if row['certificate'][0] == 'Not Rated':
        row['certificate'][0] = 'Unrated'
    if row['month'][0]:
        if row['month'][0].lower() in['April'.lower(), 'August'.lower(), 'December'.lower(), 'February'.lower(),\
                               'January'.lower(), 'July'.lower(), 'June'.lower(), 'March'.lower(), 'May'.lower(), \
                               'November'.lower(), 'October'.lower(), 'September'.lower()]:
            movie_no_img.append(row)

len(movie_no_img) 

In [None]:

genre_set = set()
for row in movie_no_img:
    if row['genres'][0]:
        
        for g in row['genres']:
            
            genre_set.add(g)
print(genre_set)
            

In [None]:
data_imdb_rating = [row for row in movie_no_img if (row['imdb_rating'][0] and
                                                   row['year'][0] and
                                                   row['month'][0] and 
                                                   row['genres'][0] and 
                                                   row['run_time'][0] and
                                                   row['certificate'][0] and
                                                   row['budget_adj'][0] and 
                                                   row['directors'][0] and
                                                   row['stars'][0])]
data_profit_adj = [row for row in movie_no_img if (row['profit_adj'][0] and
                                                   row['year'][0] and
                                                   row['month'][0] and 
                                                   row['genres'][0] and 
                                                   row['run_time'][0] and
                                                   row['certificate'][0] and
                                                   row['budget_adj'][0] and 
                                                   row['directors'][0] and
                                                   row['stars'][0])]

print(len(data_imdb_rating))
print(len(data_profit_adj))


In [None]:
def split_train_test(te_p, data, y ):
    random.shuffle(data)
    te_size = int(len(data)*te_p)
    data_test = data[:te_size]
    data_train = data[te_size:]
    y_test = [row[y][0] for row in data_test]
    y_train = [row[y][0] for row in data_train]
    result = {'data_test':data_test, 'data_train':data_train, 'y_test':y_test, 'y_train':y_train}
    return result

imdb_rating = split_train_test(0.3, data_imdb_rating, 'imdb_rating')
data_imdb_rating_train = imdb_rating['data_train']
data_imdb_rating_test = imdb_rating['data_test']
y_imdb_rating_train = imdb_rating['y_train']
y_imdb_rating_test = imdb_rating['y_test']


In [None]:
print(len(data_imdb_rating_test))
print(len(data_imdb_rating_train))
print(len(y_imdb_rating_test))
print(len(y_imdb_rating_train))

## Grid Search Models

### Code for preparing data for the Grid Search Module

**The following code can select multiple variables and turn them into `np.array` which can be handled by the Grid Search Module.** 

In [None]:
class ColumnToNPArrayTransformer(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, col_names):
        self.col_names = col_names  # We will need these in transform()
    
    def fit(self, X, y=None):
        # This transformer doesn't need to learn anything about the data,
        # so it can just return self without any further processing
        return self
    
    def transform(self, X):
        # Return an array with the same number of rows as X and one
        # column for each in self.col_names
        if self.col_names is None:
            return X
        else:
            result = np.array([])
            for row in X:
                row_list =np.array([])
                for name in self.col_names:
                    if row[name]:
                        row_list = np.append(row_list, np.array(row[name]))            
                    else:
                        row_list = np.append(row_list, np.array([None]))
                if len(result) == 0:
                    result = np.array([row_list])
                else:
                    result = np.concatenate((result, np.array([row_list])))
            return result

**The following code can one categorical and turn it into dummy coding which can be handled by the Grid Search Module.** 

In [None]:
class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, col_names):
        self.col_names = col_names  # We will need these in transform()
    
    def fit(self, X, y=None):
        # This transformer doesn't need to learn anything about the data,
        # so it can just return self without any further processing
        return self
    
    def transform(self, X):
        # Return an array with the same number of rows as X and one
        # column for each in self.col_names
        if self.col_names is None:
            return X
        else:
            return [[row[col] for col in self.col_names] for row in X]
class DictEncoder(base.BaseEstimator, base.TransformerMixin):
    
    def fit(self, X, y=None):
        return self
   
    def transform(self, X):
        # X will come in as a list of lists of lists.  Return a list of
        # dictionaries corresponding to those inner lists.
        result = []
        count = 0
        for row in X:
            count += 1
            cats_dict = {}
            if row[0]:
                cats =  row[0] 
                
                if len(cats) > 0:
                    [cats_dict.update({cat:1}) for cat in cats]
            result.append(cats_dict)
        return result

### Grid Search Models: code

This is a helper class for running paramater grid search across different classification or regression models. The helper takes two dictionaries as its constructor parameters. The first dictionary contains the models to be scored, while the second contains the parameters for each model (see examples below or the GridSearchCV documentation for the expected format). The `fit(X, y)` method runs a parameter grid search with cross validation for each model and for the given training data. After calling `fit(X, y)`, the `score_summary()` method returns a data frame with a summary of the scores.

In [None]:
import pandas as pd
from sklearn.grid_search import GridSearchCV

class EstimatorSelectionHelper:
    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
    
    def fit(self, X, y, 
            cv = 8, 
            n_jobs=1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs, 
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X,y)
            self.grid_searches[key] = gs    
    
    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            params.update(d)
            return pd.Series(params)
                      
        rows = [row(k, gsc.cv_validation_scores, gsc.parameters) 
                     for k in self.keys
                     for gsc in self.grid_searches[k].grid_scores_]
        
       
        #df = pd.concat(rows, axis=1).sort_index([sort_by], ascending=False)
       
        #print(pd.concat(rows, axis=1).T)
        df = pd.concat(rows, axis=1).T.sort_values(by = [sort_by], ascending=False)
        
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        
        return df[columns]
        #return pd.concat(rows, axis=1).T

## Data Training Module

### <font color='red'> 1.1. Train the model with `run_time`: <font size = 10> IN

#### Modeling

**Prepare X and y**

In [None]:
h =  ColumnToNPArrayTransformer(['run_time'])
tr_x = h.fit_transform(data_imdb_rating_train, y_imdb_rating_train)
tr_y = np.array(y_imdb_rating_train)
te_x = h.fit_transform(data_imdb_rating_test, y_imdb_rating_test)
te_y = np.array(y_imdb_rating_test)

In [None]:
models1_1 = { 
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor()
}

params1_1 = { 
    'LinearRegression': { },
    'Ridge': { 'alpha': np.arange(0,2,0.05) },
    'Lasso': { 'alpha': np.arange(0,2,0.05) },
    'RandomForestRegressor':{'n_estimators': np.arange(1, 50 ,2),
                              'max_depth': np.arange(1, 20, 1)}
}


In [None]:
helper1_1 = EstimatorSelectionHelper(models1_1, params1_1)
helper1_1.fit(tr_x, tr_y, n_jobs=-1)

In [None]:
helper1_1_result = helper1_1.score_summary()

In [None]:
print(helper1_1_result.iloc[0])

#### Fit the best model

In [None]:
model1_1_est = RandomForestRegressor(n_estimators = 19, max_depth = 3) 
model1_1_est.fit(tr_x, tr_y)
print('using Random Forest modes the score for TEST set is: ' + \
      str(model1_1_est.score(te_x, te_y)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model1_1_est.score(tr_x, tr_y)))

In [25]:
model1_1_est = Pipeline([
        ('column_select_trans', ColumnToNPArrayTransformer(['run_time', 'budget_adj'])),
        ('rf', RandomForestRegressor(n_estimators = 21, max_depth = 4))
         ])
model1_1_est.fit(data_imdb_rating_train, y_imdb_rating_train) # Fit the model using data_transform as training data and star_ratings\
                                                     # as target values
print('using Random Forest modes the score for TEST set is: ' + \
     str(model1_1_est.score(data_imdb_rating_test, y_imdb_rating_test)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model1_1_est.score(data_imdb_rating_train, y_imdb_rating_train)))

using Random Forest modes the score for TEST set is: 0.20119845128657496
using Random Forest modes the score for TRAIN set is: 0.2184202614969487


#### Plot

In [None]:
def select_to_list(col_name):
    select = ColumnToNPArrayTransformer([col_name])
    pre = select.fit_transform(data_imdb_rating, imdb_rating)
    result = [ele[0] for ele in pre]
    return result

In [None]:
x = select_to_list('run_time')
y = select_to_list('imdb_rating')
title = select_to_list('title')
df = pd.DataFrame({'x':x, 'y':y, 'title':title})
df.describe()

In [69]:

output_file("Movie Length and IMDB Score.html")
source = ColumnDataSource(data={
    'x': x,
    'y': y,
    'title':title,
})

p = figure(plot_height=200, plot_width = 200, tools="", toolbar_location=None,
           title="Movie Length and IMDB Score", sizing_mode="scale_width")
p.background_fill_color="#f5f5f5"
p.grid.grid_line_color="white"
p.xaxis.axis_label = 'Movie length'
p.yaxis.axis_label = 'IMDB score'
p.axis.axis_line_color = None

p.scatter(x='x', y='y', line_width=2, color='#ebbd5b', source=source)

p.add_tools(HoverTool(
    tooltips=[
        ('Movie sength: ', '@x{%0f}'),
        ('IMDB score: ', '@y{%0.1f}'), # use @{ } for field names with spaces
        ('Movie Title: ', '@title'),
    ],

    formatters={
        'x':'printf', # use 'datetime' formatter for 'date' field
        'y':'printf',   # use 'printf' formatter for 'adj close' field
        'title':'printf'                         # use default 'numeral' formatter for other fields
    },

    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='mouse'
))

show(p)

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: te_predict_rating, y_imdb_rating_test [renderer: GlyphRenderer(id='d7649bd6-f43f-4dae-b397-8f9d1f89db9a', ...)]


KeyboardInterrupt: 

### <font color='red'> 1.2. Train the model with `budget_adj`: <font size = 10> out

#### train the model

In [294]:
h =  ColumnToNPArrayTransformer(['budget_adj'])
tr_x = h.fit_transform(data_imdb_rating_train, y_imdb_rating_train)
tr_y = np.array(y_imdb_rating_train)
te_x = h.fit_transform(data_imdb_rating_test, y_imdb_rating_test)
te_y = np.array(y_imdb_rating_test)

In [295]:
models1_2 = { 
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor()
}

params1_2 = { 
    'LinearRegression': { },
    'Ridge': { 'alpha': np.arange(0,2,0.05) },
    'Lasso': { 'alpha': np.arange(0,2,0.05) },
    'RandomForestRegressor':{'n_estimators': np.arange(1, 50 ,2),
                              'max_depth': np.arange(1, 20, 1)}
}


In [296]:
helper1_2 = EstimatorSelectionHelper(models1_2, params1_2)
helper1_2.fit(tr_x, tr_y, n_jobs=-1)

Running GridSearchCV for LinearRegression.
Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.2s finished


Running GridSearchCV for Ridge.
Fitting 8 folds for each of 40 candidates, totalling 320 fits
Running GridSearchCV for Lasso.
Fitting 8 folds for each of 40 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.2s finished
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  positive)
  positive)
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.2s finished


Running GridSearchCV for RandomForestRegressor.
Fitting 8 folds for each of 475 candidates, totalling 3800 fits


[Parallel(n_jobs=-1)]: Done 961 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 2623 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done 3785 out of 3800 | elapsed:   42.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 3800 out of 3800 | elapsed:   43.2s finished


In [297]:
helper1_2_result = helper1_2.score_summary()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [299]:
print(helper1_2_result.iloc[0])

estimator       RandomForestRegressor
min_score                -0.000349009
mean_score                 0.00506999
max_score                  0.00904127
std_score                   0.0032906
alpha                             NaN
max_depth                           1
n_estimators                       23
Name: 92, dtype: object


#### Fit the best model

In [300]:
model1_2_est = RandomForestRegressor(n_estimators = 21, max_depth = 4) 
model1_2_est.fit(tr_x, tr_y)
print('using Random Forest modes the score for TEST set is: ' + \
      str(model1_2_est.score(te_x, te_y)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model1_2_est.score(tr_x, tr_y)))

using Random Forest modes the score for TEST set is: 0.009840876547705046
using Random Forest modes the score for TRAIN set is: 0.027367223049927225


In [26]:
model1_2_est = Pipeline([
        ('column_select_trans', ColumnToNPArrayTransformer(['budget_adj'])),
        ('rf', RandomForestRegressor(n_estimators = 21, max_depth = 4))
         ])
model1_2_est.fit(data_imdb_rating_train, y_imdb_rating_train) # Fit the model using data_transform as training data and star_ratings\
                                                     # as target values
print('using Random Forest modes the score for TEST set is: ' + \
      str(model1_2_est.score(data_imdb_rating_test, y_imdb_rating_test)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model1_2_est.score(data_imdb_rating_train, y_imdb_rating_train)))

using Random Forest modes the score for TEST set is: 0.0016299897845416522
using Random Forest modes the score for TRAIN set is: 0.0284133498474346


### <font color='red'> 2. Train the model with `genres`: <font size = 10> IN

#### Modeling

In [134]:
temp_pipe = Pipeline([
        ('column_select_trans', ColumnSelectTransformer(['genres'])),
        ('hot_one', DictEncoder()),
        ('hot_one_dictionary', DictVectorizer())
    ])
tr_x = temp_pipe.fit_transform(data_imdb_rating_train, y_imdb_rating_train)
tr_y = np.array(y_imdb_rating_train)
te_x = temp_pipe.fit_transform(data_imdb_rating_test, y_imdb_rating_test)
te_y = np.array(y_imdb_rating_test)

In [135]:
models2 = { 
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor()
}
params2 = { 
    'LinearRegression': { },
    'Ridge': { 'alpha': np.arange(0,10,0.5) },
    'Lasso': { 'alpha': np.arange(0,10,0.5) },
    'RandomForestRegressor':{'n_estimators': np.arange(1, 50 ,2),
                              'max_depth': np.arange(1, 20, 1)}
}


In [136]:
helper2 = EstimatorSelectionHelper(models2, params2)
helper2.fit(tr_x, tr_y, n_jobs=-1)

Running GridSearchCV for LinearRegression.
Fitting 8 folds for each of 1 candidates, totalling 8 fits
Running GridSearchCV for Ridge.
Fitting 8 folds for each of 20 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.6s finished


Running GridSearchCV for Lasso.
Fitting 8 folds for each of 20 candidates, totalling 160 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Running GridSearchCV for RandomForestRegressor.
Fitting 8 folds for each of 475 candidates, totalling 3800 fits


[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Done 412 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1973 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 3115 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 3785 out of 3800 | elapsed:   53.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 3800 out of 3800 | elapsed:   53.7s finished


In [139]:
helper2_result = helper2.score_summary()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [190]:
helper2_result.iloc[0]

estimator       RandomForestRegressor
min_score                    0.125719
mean_score                   0.172179
max_score                    0.204761
std_score                   0.0259654
alpha                             NaN
max_depth                           7
n_estimators                       11
Name: 196, dtype: object

#### Fit the best model

In [143]:
model2_est = RandomForestRegressor(n_estimators = 11, max_depth = 7) 
model2_est.fit(tr_x, tr_y)
print('using Random Forest modes the score for TEST set is: ' + \
      str(model2_est.score(te_x, te_y)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model2_est.score(tr_x, tr_y)))

using Random Forest modes the score for TEST set is: 0.16091102904698762
using Random Forest modes the score for TRAIN set is: 0.21761220098461098


In [27]:
model2_est = Pipeline([
        ('column_select_trans', ColumnSelectTransformer(['genres'])),
        ('hot_one', DictEncoder()),
        ('hot_one_dictionary', DictVectorizer()),
        ('rf', RandomForestRegressor(n_estimators = 11, max_depth = 7) )
         ])
model2_est.fit(data_imdb_rating_train, y_imdb_rating_train) # Fit the model using data_transform as training data and star_ratings\
                                                     # as target values
#print('using Random Forest modes the score for TEST set is: ' + \
#      str(model1_est.score(y_imdb_rating_test, y_imdb_rating_test)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model2_est.score(data_imdb_rating_train, y_imdb_rating_train)))

using Random Forest modes the score for TRAIN set is: 0.21723335853383074


#### Plot

In [70]:
genre_count = {}
genre_sum = {}
for row in data_imdb_rating:
    if row['genres'][0]:        
        for g in row['genres']:            
            genre_sum.update({g:0})
            genre_count.update({g:0})
print(genre_count)
print(genre_sum)

{'Drama': 0, 'Romance': 0, 'Comedy': 0, 'Crime': 0, 'Thriller': 0, 'Adventure': 0, 'Family': 0, 'Fantasy': 0, 'Horror': 0, 'Mystery': 0, 'History': 0, 'Sci-Fi': 0, 'Action': 0, 'Musical': 0, 'Music': 0, 'Sport': 0, 'Biography': 0, 'Animation': 0, 'War': 0, 'Western': 0, 'Documentary': 0}
{'Drama': 0, 'Romance': 0, 'Comedy': 0, 'Crime': 0, 'Thriller': 0, 'Adventure': 0, 'Family': 0, 'Fantasy': 0, 'Horror': 0, 'Mystery': 0, 'History': 0, 'Sci-Fi': 0, 'Action': 0, 'Musical': 0, 'Music': 0, 'Sport': 0, 'Biography': 0, 'Animation': 0, 'War': 0, 'Western': 0, 'Documentary': 0}


In [71]:
for g in data_imdb_rating[0]['genres']:
    print(g)
    print(type(g))

Drama
<class 'str'>
Romance
<class 'str'>


In [72]:
for row in data_imdb_rating:
    if row['genres'] and row['imdb_rating'][0]:
        
        for g in row['genres'] :
            genre_sum[g] += row['imdb_rating'][0]
            genre_count[g] += 1
genre_mean = {}
for k, v in genre_sum.items():
    genre_mean.update({k:round(v/genre_count[k], 2)})


means = []
genres = []
sorted_d = sorted(genre_mean.items(), key=lambda x: x[1])
for t in sorted_d:
    genres.append(t[0])
    means.append(t[1])
means

[5.76,
 5.93,
 6.09,
 6.1,
 6.1,
 6.13,
 6.22,
 6.27,
 6.3,
 6.3,
 6.3,
 6.31,
 6.4,
 6.42,
 6.54,
 6.56,
 6.63,
 6.85,
 6.93,
 6.97,
 7.04]

In [73]:
from bokeh.palettes import Spectral6
from bokeh.transform import factor_cmap
import math
from bokeh.models import ColumnDataSource, ranges, LabelSet
Spectral6

['#3288bd', '#99d594', '#e6f598', '#fee08b', '#fc8d59', '#d53e4f']

In [74]:
source = ColumnDataSource(data = {'means' : means, 'genres' : genres})
color_set = []
count = 0
for g in genres:
    count += 1
    if count < 4:
        color = "#e84d60"
    elif count > len(genres) - 3:
        color = "#c9d9d3"
    else:
        color = "#718dbf"
    color_set.append(color)
    
print(color_set)

['#e84d60', '#e84d60', '#e84d60', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#718dbf', '#c9d9d3', '#c9d9d3', '#c9d9d3']


In [75]:
source = ColumnDataSource(data = {'means' : means, 'genres' : genres})
color_set = []
count = 0
for g in genres:
    count += 1
    if count < 4:
        color = "#e84d60"
    elif count > len(genres) - 3:
        color = "#c9d9d3"
    else:
        color = "#718dbf"
    color_set.append(color)
output_file("Movie Genre and Mean IMDB Score.html")
p = figure(x_range = genres, plot_height=400, plot_width = 1000, toolbar_location=None, title="Mean IMDB Score for Each Genre")
labels = LabelSet(x = 'genres', y='means', text='means', level='glyph',
        x_offset=-13.5, y_offset=0, source=source, render_mode='canvas', text_font_size = '9pt')
p.vbar(x='genres', top='means',   width=0.5, source=source,  line_color='white', 
       fill_color = factor_cmap('genres', palette=color_set, factors=genres), fill_alpha = 0.7 )

p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = math.pi/4
p.xaxis.axis_label = 'Movie genre'
p.yaxis.axis_label = 'Mean IMDB score'
p.y_range.start = 5
p.y_range.end = 8
p.add_layout(labels)

show(p)

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: te_predict_rating, y_imdb_rating_test [renderer: GlyphRenderer(id='d7649bd6-f43f-4dae-b397-8f9d1f89db9a', ...)]


KeyboardInterrupt: 

```python 
p.add_tools(HoverTool(
    tooltips=[
        ('Mean IMDB Score: ', '@means{%0.2f}'),
        
        ('Genre: ', '@genres')
    ],

    formatters={
        'means':'printf', # use 'datetime' formatter for 'date' field
        
        'genres':'printf'                         # use default 'numeral' formatter for other fields
    },

    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='mouse'
))

show(p)
```

### <font color='red'> 3. Train the model with `year`: <font size = 10> out

In [144]:
temp_pipe = Pipeline([
        ('column_select_trans', ColumnSelectTransformer(['year'])),
        ('hot_one', DictEncoder()),
        ('hot_one_dictionary', DictVectorizer())
    ])
tr_x = temp_pipe.fit_transform(data_imdb_rating_train, y_imdb_rating_train)
tr_y = np.array(y_imdb_rating_train)
te_x = temp_pipe.fit_transform(data_imdb_rating_test, y_imdb_rating_test)
te_y = np.array(y_imdb_rating_test)

In [145]:
models3 = { 
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor()
}
params3 = { 
    'LinearRegression': { },
    'Ridge': { 'alpha': np.arange(0,10,0.5) },
    'Lasso': { 'alpha': np.arange(0,10,0.05) },
    'RandomForestRegressor':{'n_estimators': np.arange(1, 50 ,2),
                              'max_depth': np.arange(1, 20, 1)}
}


In [146]:
helper3 = EstimatorSelectionHelper(models3, params3)
helper3.fit(tr_x, tr_y, n_jobs=-1)

Running GridSearchCV for LinearRegression.
Fitting 8 folds for each of 1 candidates, totalling 8 fits
Running GridSearchCV for Ridge.
Fitting 8 folds for each of 20 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.0s finished


Running GridSearchCV for Lasso.
Fitting 8 folds for each of 200 candidates, totalling 1600 fits


[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    0.6s finished
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:    0.5s


Running GridSearchCV for RandomForestRegressor.
Fitting 8 folds for each of 475 candidates, totalling 3800 fits


[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 3152 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 3800 out of 3800 | elapsed:   23.5s finished


In [147]:
helper3_result = helper3.score_summary()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [149]:
helper3_result.iloc[0]

estimator       RandomForestRegressor
min_score                 -0.00826196
mean_score               -9.68559e-05
max_score                  0.00538479
std_score                  0.00475345
alpha                             NaN
max_depth                           2
n_estimators                        9
Name: 250, dtype: object

In [150]:
model3_est = RandomForestRegressor(n_estimators = 3, max_depth = 4) 
model3_est.fit(tr_x, tr_y)
print('using Random Forest modes the score for TEST set is: ' + \
      str(model3_est.score(te_x, te_y)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model3_est.score(tr_x, tr_y)))

using Random Forest modes the score for TEST set is: -0.005814964911641685
using Random Forest modes the score for TRAIN set is: 0.006635725087965949


### <font color='red'> 4. Train the model with `month`: <font size = 10> out 

In [484]:
temp_pipe = Pipeline([
        ('column_select_trans', ColumnSelectTransformer(['month'])),
        ('hot_one', DictEncoder()),
        ('hot_one_dictionary', DictVectorizer())
    ])
tr_x = temp_pipe.fit_transform(data_imdb_rating_train, y_imdb_rating_train)
tr_y = np.array(y_imdb_rating_train)
te_x = temp_pipe.fit_transform(data_imdb_rating_test, y_imdb_rating_test)
te_y = np.array(y_imdb_rating_test)

In [485]:
models4 = { 
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor()
}
params4 = { 
    'LinearRegression': { },
    'Ridge': { 'alpha': np.arange(0,2,0.05) },
    'Lasso': { 'alpha': np.arange(0,2,0.05) },
    'RandomForestRegressor':{'n_estimators': np.arange(1, 50 ,2),
                              'max_depth': np.arange(1, 20, 1)}
}


In [486]:
helper4 = EstimatorSelectionHelper(models4, params4)
helper4.fit(tr_x, tr_y, n_jobs=-1)

Running GridSearchCV for LinearRegression.
Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.1s finished


Running GridSearchCV for Ridge.
Fitting 8 folds for each of 40 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    1.0s finished


Running GridSearchCV for Lasso.
Fitting 8 folds for each of 40 candidates, totalling 320 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.4s finished


Running GridSearchCV for RandomForestRegressor.
Fitting 8 folds for each of 475 candidates, totalling 3800 fits


[Parallel(n_jobs=-1)]: Done 448 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 2644 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 3800 out of 3800 | elapsed:   20.6s finished


In [487]:
helper4_result = helper4.score_summary()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [488]:
helper4_result.iloc[0]

estimator       RandomForestRegressor
min_score                 -0.00594131
mean_score                 0.00611434
max_score                   0.0203738
std_score                  0.00852963
alpha                             NaN
max_depth                           7
n_estimators                       11
Name: 236, dtype: object

In [489]:
model4_est = RandomForestRegressor(n_estimators = 11, max_depth = 2) 
model4_est.fit(tr_x, tr_y)
print('using Random Forest modes the score for TEST set is: ' + \
      str(model4_est.score(te_x, te_y)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model4_est.score(tr_x, tr_y)))

using Random Forest modes the score for TEST set is: 0.006417970536933382
using Random Forest modes the score for TRAIN set is: 0.009046201594576142


#### <font color = 'green'> plot `year` and `month` with `imdb_rating`

In [418]:
import pandas as pd

from bokeh.io import show
from bokeh.models import BasicTicker, ColorBar, ColumnDataSource, LinearColorMapper, PrintfTickFormatter
from bokeh.plotting import figure
from bokeh.transform import transform

In [443]:
month = select_to_list('month')
year_pr = select_to_list('year') 
year = [str(int(ele)) for ele in year_pr]
budget_adj = select_to_list('budget_adj')
imdb_rating = select_to_list('imdb_rating')
certificate = select_to_list('certificate')
run_time = select_to_list('run_time')
title = select_to_list['title']

print(type(month[0]))
print(type(year[0]))
print(type(budget_adj[0]))
print(type(imdb_rating[0]))
print(type(certificate[0]))
print(type(run_time[0]))
print(np.unique(certificate))
print(np.unique(month))
print(np.unique(year))

TypeError: 'function' object is not subscriptable

In [446]:

df = pd.DataFrame({'imdb_rating':imdb_rating, 'budget_adj':budget_adj, 'run_time':run_time, 'title':title})
df.describe()

Unnamed: 0,imdb_rating,budget_adj,run_time
count,5631.0,5631.0,5631.0
mean,6.295347,97087930.0,106.177056
std,1.046617,1726650000.0,18.419758
min,1.8,38.01973,56.0
25%,5.7,16902100.0,94.0
50%,6.4,43709000.0,103.0
75%,7.0,93282360.0,115.0
max,9.3,126991400000.0,325.0


In [398]:
year = [str(int(ele[0])) for ele in year_p]
y = [ele[0] for ele in y_p]
month = [ele[0] for ele in month_p]

data = pd.DataFrame({'year':year, 'y':y, 'month':month})

In [467]:
np.unique(certificate)

array(['G', 'GP', 'NC-17', 'Not Rated', 'PG', 'PG-13', 'R', 'TV-14',
       'TV-MA', 'TV-PG', 'Unrated', 'X'], dtype='<U9')

In [399]:
df = data.groupby(['year','month']).mean()

In [404]:
ha = [ele for ele in month if ele not in['April', 'August', 'December', 'February',
       'January', 'July', 'June', 'March', 'May', 'November', 'October',
       'September']]

In [407]:
np.unique(year)


array(['1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985',
       '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015'], dtype='<U4')

### <font color='red'> 5. Train the model with `directors`: <font size = 10> IN

#### train

In [157]:
temp_pipe = Pipeline([
        ('column_select_trans', ColumnSelectTransformer(['directors'])),
        ('hot_one', DictEncoder()),
        ('hot_one_dictionary', DictVectorizer())
    ])
tr_x = temp_pipe.fit_transform(data_imdb_rating_train, y_imdb_rating_train)
tr_y = np.array(y_imdb_rating_train)
te_x = temp_pipe.fit_transform(data_imdb_rating_test, y_imdb_rating_test)
te_y = np.array(y_imdb_rating_test)

In [158]:
models5 = { 
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor()
}
params5 = { 
    'LinearRegression': { },
    'Ridge': { 'alpha': np.arange(0,2,0.05) },
    'Lasso': { 'alpha': np.arange(0,2,0.05) },
    'RandomForestRegressor':{'n_estimators': np.arange(1, 50 ,2),
                              'max_depth': np.arange(1, 20, 1)}
}


In [159]:
helper5 = EstimatorSelectionHelper(models5, params5)
helper5.fit(tr_x, tr_y, n_jobs=-1)

Running GridSearchCV for LinearRegression.
Fitting 8 folds for each of 1 candidates, totalling 8 fits
Running GridSearchCV for Ridge.
Fitting 8 folds for each of 40 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.6s


Running GridSearchCV for Lasso.
Fitting 8 folds for each of 40 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    1.5s finished
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.7s


Running GridSearchCV for RandomForestRegressor.
Fitting 8 folds for each of 475 candidates, totalling 3800 fits


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:   18.4s finished
[Parallel(n_jobs=-1)]: Done 458 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 1165 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done 1915 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 2497 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done 3397 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 3800 out of 3800 | elapsed:  1.7min finished


In [160]:
helper5_result = helper5.score_summary()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [266]:
helper5_result.iloc[0]

estimator           Ridge
min_score       0.0899666
mean_score       0.170559
max_score        0.218317
std_score       0.0478031
alpha                1.55
max_depth             NaN
n_estimators          NaN
Name: 32, dtype: object

#### fit the best model

In [166]:
model5_est = Ridge(alpha = 1.55) 
model5_est.fit(tr_x, tr_y)
#print('using Random Forest modes the score for TEST set is: ' + \
#      str(model5_est.score(te_x, te_y)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model5_est.score(tr_x, tr_y)))

using Random Forest modes the score for TRAIN set is: 0.5589995951227082


In [29]:
model5_est = Pipeline([
        ('column_select_trans', ColumnSelectTransformer(['directors'])),
        ('hot_one', DictEncoder()),
        ('hot_one_dictionary', DictVectorizer()),
        ('ridge', Ridge(alpha = 1.55) )
         ])
model5_est.fit(data_imdb_rating_train, y_imdb_rating_train) # Fit the model using data_transform as training data and star_ratings\
                                                     # as target values
#print('using Random Forest modes the score for TEST set is: ' + \
#      str(model1_est.score(y_imdb_rating_test, y_imdb_rating_test)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model5_est.score(data_imdb_rating_train, y_imdb_rating_train)))

using Random Forest modes the score for TRAIN set is: 0.5553861649577794


#### plot

In [76]:
director_count = {}
director_sum = {}
for row in data_imdb_rating:
    if row['directors'][0]:        
        for g in row['directors']:            
            director_sum.update({g:0})
            director_count.update({g:0})
print(director_count)
print(director_sum)

{'Barbra Streisand': 0, 'Anthony Burns': 0, 'Jim Henson': 0, 'Joel Anderson': 0, 'David Cronenberg': 0, 'Phillip Noyce': 0, 'Jon Hurwitz': 0, 'Hayden Schlossberg': 0, 'Atom Egoyan': 0, 'Allan A': 0, 'Ronald Neame': 0, 'David Yates': 0, 'Rod Lurie': 0, 'Francis Ford Coppola': 0, 'Eric Valette': 0, 'Greg Berlanti': 0, 'Frank Oz': 0, 'Stephen Frears': 0, 'Sylvain White': 0, 'Stephen Hopkins': 0, 'Robert Aldrich': 0, 'Guy Jenkin': 0, 'Terry Zwigoff': 0, 'Michael Tuchner': 0, 'John Badham': 0, 'Paul Weiland': 0, 'John Mackenzie': 0, 'John Gatins': 0, 'Luke Greenfield': 0, 'mink': 0, 'John Dahl': 0, 'Chris Noonan': 0, 'Sidney Poitier': 0, 'Ethan Coen': 0, 'Joel Coen': 0, 'Mark Illsley': 0, 'John Landis': 0, 'Bob Clark': 0, 'Gregory Mackenzie': 0, 'George Clooney': 0, 'Clint Eastwood': 0, 'Adam Shankman': 0, 'Hall Bartlett': 0, 'Steven Feder': 0, 'Jay Roach': 0, 'Lou Adler': 0, 'Tommy Chong': 0, 'Roger Spottiswoode': 0, 'Buzz Kulik': 0, 'Sarah Smith': 0, 'Barry Cook': 0, 'Gabor Csupo': 0, 'Be

In [77]:
for row in data_imdb_rating:
    if row['directors'] and row['imdb_rating'][0]:        
        for g in row['directors'] :
            director_sum[g] += row['imdb_rating'][0]
            director_count[g] += 1
director_mean = {}
for k, v in director_sum.items():
    director_mean.update({k:round(v/director_count[k], 2)})

In [80]:
director_count

{'Barbra Streisand': 3,
 'Anthony Burns': 1,
 'Jim Henson': 2,
 'Joel Anderson': 1,
 'David Cronenberg': 17,
 'Phillip Noyce': 10,
 'Jon Hurwitz': 2,
 'Hayden Schlossberg': 2,
 'Atom Egoyan': 4,
 'Allan A': 2,
 'Ronald Neame': 2,
 'David Yates': 4,
 'Rod Lurie': 5,
 'Francis Ford Coppola': 13,
 'Eric Valette': 1,
 'Greg Berlanti': 2,
 'Frank Oz': 9,
 'Stephen Frears': 10,
 'Sylvain White': 2,
 'Stephen Hopkins': 7,
 'Robert Aldrich': 6,
 'Guy Jenkin': 1,
 'Terry Zwigoff': 2,
 'Michael Tuchner': 1,
 'John Badham': 5,
 'Paul Weiland': 3,
 'John Mackenzie': 2,
 'John Gatins': 1,
 'Luke Greenfield': 3,
 'mink': 1,
 'John Dahl': 8,
 'Chris Noonan': 2,
 'Sidney Poitier': 3,
 'Ethan Coen': 16,
 'Joel Coen': 16,
 'Mark Illsley': 2,
 'John Landis': 16,
 'Bob Clark': 10,
 'Gregory Mackenzie': 1,
 'George Clooney': 4,
 'Clint Eastwood': 29,
 'Adam Shankman': 8,
 'Hall Bartlett': 1,
 'Steven Feder': 1,
 'Jay Roach': 7,
 'Lou Adler': 1,
 'Tommy Chong': 1,
 'Roger Spottiswoode': 11,
 'Buzz Kulik': 2

In [81]:
means = []
directors = []
counts = []
for k , v in director_mean.items():
    directors.append(k)
    means.append(v)
    counts.append(director_count[k])

In [73]:
from bokeh.palettes import Spectral6
from bokeh.transform import factor_cmap
import math
from bokeh.models import ColumnDataSource, ranges, LabelSet
Spectral6

['#3288bd', '#99d594', '#e6f598', '#fee08b', '#fc8d59', '#d53e4f']

In [None]:
output_file("Director_movie_count_and_mean_IMDB_score.html")
source = ColumnDataSource(data={
    'x': counts,
    'y': means,
    'director':directors,
})

p = figure(plot_height=200, plot_width = 200, tools="", toolbar_location=None,
           title="Director Movie Count and Mean IMDB Score", sizing_mode="scale_width")
p.background_fill_color="#f5f5f5"
p.grid.grid_line_color="white"
p.xaxis.axis_label = 'Movie Count'
p.yaxis.axis_label = 'Mean IMDB Score'
p.axis.axis_line_color = None

p.scatter(x='x', y='y', line_width=2, color='#ebbd5b', source=source)

p.add_tools(HoverTool(
    tooltips=[
        ('Movie count: ', '@x{%0f}'),
        ('Mean IMDB score: ', '@y{%0.1f}'), # use @{ } for field names with spaces
        ('Director: ', '@director'),
    ],

    formatters={
        'x':'printf', # use 'datetime' formatter for 'date' field
        'y':'printf',   # use 'printf' formatter for 'adj close' field
        'director':'printf'                         # use default 'numeral' formatter for other fields
    },

    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='mouse'
))

show(p)

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: te_predict_rating, y_imdb_rating_test [renderer: GlyphRenderer(id='d7649bd6-f43f-4dae-b397-8f9d1f89db9a', ...)]


### <font color='red'> 6. Train the model with `stars`: <font size = 10> out

In [168]:
temp_pipe = Pipeline([
        ('column_select_trans', ColumnSelectTransformer(['stars'])),
        ('hot_one', DictEncoder()),
        ('hot_one_dictionary', DictVectorizer())
    ])
tr_x = temp_pipe.fit_transform(data_imdb_rating_train, y_imdb_rating_train)
tr_y = np.array(y_imdb_rating_train)
te_x = temp_pipe.fit_transform(data_imdb_rating_test, y_imdb_rating_test)
te_y = np.array(y_imdb_rating_test)

In [169]:
models6 = { 
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor()
}
params6 = { 
    'LinearRegression': { },
    'Ridge': { 'alpha': np.arange(0,2,0.05) },
    'Lasso': { 'alpha': np.arange(0,2,0.05) },
    'RandomForestRegressor':{'n_estimators': np.arange(1, 50 ,2),
                              'max_depth': np.arange(1, 20, 1)}
}


In [170]:
helper6 = EstimatorSelectionHelper(models6, params6)
helper6.fit(tr_x, tr_y, n_jobs=-1)

Running GridSearchCV for LinearRegression.
Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    2.2s remaining:    6.5s


Running GridSearchCV for Ridge.
Fitting 8 folds for each of 40 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.6s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s


Running GridSearchCV for Lasso.
Fitting 8 folds for each of 40 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    3.0s finished
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   54.3s


Running GridSearchCV for RandomForestRegressor.
Fitting 8 folds for each of 475 candidates, totalling 3800 fits


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:   56.5s finished
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 521 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 845 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 1195 tasks      | elapsed:   53.7s
[Parallel(n_jobs=-1)]: Done 1645 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2195 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2845 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3595 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 3800 out of 3800 | elapsed:  4.7min finished


In [171]:
helper6_result = helper6.score_summary()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [172]:
helper6_result.iloc[0]

estimator           Ridge
min_score       0.0151321
mean_score      0.0654673
max_score        0.120816
std_score       0.0433523
alpha                1.95
max_depth             NaN
n_estimators          NaN
Name: 40, dtype: object

In [173]:
model6_est = Ridge(alpha = 1.95) 
model6_est.fit(tr_x, tr_y)
#print('using Random Forest modes the score for TEST set is: ' + \
#      str(model6_est.score(te_x, te_y)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model6_est.score(tr_x, tr_y)))

using Random Forest modes the score for TRAIN set is: 0.7563899759770314


### <font color='red'> 7. Train the model with `certificate`: <font size = 10> out

In [490]:
temp_pipe = Pipeline([
        ('column_select_trans', ColumnSelectTransformer(['certificate'])),
        ('hot_one', DictEncoder()),
        ('hot_one_dictionary', DictVectorizer())
    ])
tr_x = temp_pipe.fit_transform(data_imdb_rating_train, y_imdb_rating_train)
tr_y = np.array(y_imdb_rating_train)
te_x = temp_pipe.fit_transform(data_imdb_rating_test, y_imdb_rating_test)
te_y = np.array(y_imdb_rating_test)

In [491]:
models7 = { 
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor()
}
params7 = { 
    'LinearRegression': { },
    'Ridge': { 'alpha': np.arange(0,2,0.05) },
    'Lasso': { 'alpha': np.arange(0,2,0.05) },
    'RandomForestRegressor':{'n_estimators': np.arange(1, 50 ,2),
                              'max_depth': np.arange(1, 20, 1)}
}


In [492]:
helper7 = EstimatorSelectionHelper(models7, params7)
helper7.fit(tr_x, tr_y, n_jobs=-1)

Running GridSearchCV for LinearRegression.
Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.0s finished


Running GridSearchCV for Ridge.
Fitting 8 folds for each of 40 candidates, totalling 320 fits
Running GridSearchCV for Lasso.
Fitting 8 folds for each of 40 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    1.3s finished
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.4s finished


Running GridSearchCV for RandomForestRegressor.
Fitting 8 folds for each of 475 candidates, totalling 3800 fits


[Parallel(n_jobs=-1)]: Done 412 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 3800 out of 3800 | elapsed:   20.1s finished


In [493]:
helper7_result = helper7.score_summary()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [494]:
helper7_result.iloc[0]

estimator       RandomForestRegressor
min_score                 -0.00370824
mean_score                 0.00414301
max_score                    0.010826
std_score                  0.00491699
alpha                             NaN
max_depth                           5
n_estimators                        3
Name: 182, dtype: object

In [495]:
model7_est = RandomForestRegressor(n_estimators=5, max_depth=9) 
model7_est.fit(tr_x, tr_y)
#print('using Random Forest modes the score for TEST set is: ' + \
#      str(model7_est.score(te_x, te_y)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model7_est.score(tr_x, tr_y)))

using Random Forest modes the score for TRAIN set is: 0.009720094461203255


## Full Model

In [30]:
class EstimatorTransformer(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, estimator):
        # What needs to be done here?
        self.estimator = estimator
    
    def fit(self, X, y):
        # Fit the stored estimator.
        # Question: what should be returned?
        self.estimator.fit(X,y)
        return self
    
    def transform(self, X):
        # Use predict on the stored estimator as a "transformation".
        # Be sure to return a 2-D array.
        #predict_list = self.estimator.predict(X)
        return[[prediction] for prediction in self.estimator.predict(X)]
        
            

In [35]:
from sklearn.pipeline import FeatureUnion

union = FeatureUnion([
        # FeatureUnions use the same syntax as Pipelines
    ('run_time and budget_adj', EstimatorTransformer(model1_1_est)),
    ('genres', EstimatorTransformer(model2_est)),
    ('director',EstimatorTransformer(model5_est))
    ])

full_est = Pipeline([
    ("features", union),
    ('regression', LinearRegression())
  ])
full_est.fit(data_imdb_rating_train, y_imdb_rating_train)
print(full_est.score(data_imdb_rating_train, y_imdb_rating_train))
print(full_est.score(data_imdb_rating_test, y_imdb_rating_test))

0.6723587488546856
0.2189749542157564


In [36]:
from sklearn.pipeline import FeatureUnion

union = FeatureUnion([
        # FeatureUnions use the same syntax as Pipelines
    ('run_time and budget_adj', EstimatorTransformer(model1_1_est)),
    ('genres', EstimatorTransformer(model2_est)),
    ('director',EstimatorTransformer(model5_est))
    ])

full_pipe = Pipeline([
    ("features", union)
  ])
tr_x = full_pipe.fit_transform(data_imdb_rating_train, y_imdb_rating_train)
tr_y = np.array(y_imdb_rating_train)
te_x = full_pipe.transform(data_imdb_rating_test)
te_y = np.array(y_imdb_rating_test)
model_full_est = LinearRegression() 
model_full_est.fit(tr_x, tr_y)
print('using Random Forest modes the score for TEST set is: ' + \
      str(model_full_est.score(te_x, te_y)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model_full_est.score(tr_x, tr_y)))

using Random Forest modes the score for TEST set is: 0.21804370030195164
using Random Forest modes the score for TRAIN set is: 0.6722308987569847


In [37]:
models_full = { 
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor()
}
params_full = { 
    'LinearRegression': { },
    'Ridge': { 'alpha': np.arange(0,2,0.05) },
    'Lasso': { 'alpha': np.arange(0,2,0.05) },
    'RandomForestRegressor':{'n_estimators': np.arange(1, 50 ,2),
                              'max_depth': np.arange(1, 20, 1)}
}


In [674]:
helper_full = EstimatorSelectionHelper(models_full, params_full)
helper_full.fit(tr_x, tr_y, n_jobs=-1)


Running GridSearchCV for LinearRegression.
Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.2s finished


Running GridSearchCV for Ridge.
Fitting 8 folds for each of 40 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    1.2s finished


Running GridSearchCV for Lasso.
Fitting 8 folds for each of 40 candidates, totalling 320 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    0.3s finished


Running GridSearchCV for RandomForestRegressor.
Fitting 8 folds for each of 475 candidates, totalling 3800 fits


[Parallel(n_jobs=-1)]: Done 696 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 1676 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 2676 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 3452 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done 3800 out of 3800 | elapsed:  1.1min finished


In [675]:
helper_full_result = helper_full.score_summary()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [676]:
helper_full_result.iloc[0]

estimator       RandomForestRegressor
min_score                     0.60759
mean_score                   0.694087
max_score                    0.734634
std_score                   0.0381972
alpha                             NaN
max_depth                           6
n_estimators                       19
Name: 215, dtype: object

In [60]:
model_full_est = LinearRegression() 
model_full_est.fit(tr_x, tr_y)
print('using Random Forest modes the score for TEST set is: ' + \
      str(model_full_est.score(te_x, te_y)))
print('using Random Forest modes the score for TRAIN set is: ' + \
      str(model_full_est.score(tr_x, tr_y)))

using Random Forest modes the score for TEST set is: 0.21804370030195164
using Random Forest modes the score for TRAIN set is: 0.6722308987569847


In [670]:
te_y

array([3.7, 5. , 6.2, ..., 7.6, 7.8, 7.3])

In [671]:
tr_y

array([7.3, 7.4, 6. , ..., 6. , 7.6, 5.5])

In [315]:
full_est.score(data_imdb_rating_train, y_imdb_rating_train)

0.6799145198725955

In [316]:
full_est.score(data_imdb_rating_test, y_imdb_rating_test)

0.22677964984199495

# <font color = 'red'> create a plot which shows the correlation between the predicted ratings and real ratings.

In [45]:
tr_predict_rating = full_est.predict(data_imdb_rating_train)

te_predict_rating = full_est.predict(data_imdb_rating_test)


In [61]:
output_file("Final_Prediction_Real.html")
source = ColumnDataSource(data={
    'tr_predict_rating': tr_predict_rating,
    'y_imdb_rating_train': y_imdb_rating_train,
})

p = figure(plot_height=400, plot_width = 500, tools="", toolbar_location=None,
           title="Training Set", sizing_mode="scale_width")
p.background_fill_color="#f5f5f5"
p.grid.grid_line_color="white"
p.xaxis.axis_label = 'Predicted IMDB Rating'
p.yaxis.axis_label = 'Real IMDB Rating'
p.axis.axis_line_color = None

p.scatter(x='y_imdb_rating_train', y='tr_predict_rating', line_width=2, color="#99d594", source=source)

p.add_tools(HoverTool(
    tooltips=[
        ('Predicted IMDB Score: ', '@tr_predict_rating{%0.1f}'),
        ('Real IMDB Score: ', '@y_imdb_rating_train{%0.1f}')
    ],

    formatters={
        'tr_predict_rating':'printf', # use 'datetime' formatter for 'date' field
        'y_imdb_rating_train':'printf'             # use default 'numeral' formatter for other fields
    },

    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='mouse'
))



source2 = ColumnDataSource(data={
    'te_predict_rating': te_predict_rating,
    'y_imdb_rating_test': y_imdb_rating_test,
})

p2 = figure(plot_height=400, plot_width = 500, tools="", toolbar_location=None,
           title="Test Set", sizing_mode="scale_width")
p2.background_fill_color="#f5f5f5"
p2.grid.grid_line_color="white"
p2.xaxis.axis_label = 'Predicted IMDB Rating'
p2.yaxis.axis_label = 'Real IMDB Rating'
p2.axis.axis_line_color = None

p2.scatter(x='y_imdb_rating_test', y='te_predict_rating', line_width=2, color="#fee08b", source=source2)

p2.add_tools(HoverTool(
    tooltips=[
        ('Predicted IMDB Score: ', '@te_predict_rating{%0.1f}'),
        ('Real IMDB Score: ', '@y_imdb_rating_test{%0.1f}')
    ],

    formatters={
        'te_predict_rating':'printf', # use 'datetime' formatter for 'date' field
        'y_imdb_rating_test':'printf'             # use default 'numeral' formatter for other fields
    },

    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='mouse'
))


show(row(p,p2))

ERROR:bokeh.core.validation.check:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: te_predict_rating, y_imdb_rating_test [renderer: GlyphRenderer(id='d7649bd6-f43f-4dae-b397-8f9d1f89db9a', ...)]


KeyboardInterrupt: 