# MIT Movie Dataset - Base Estimator

This notebook implements a baseline predictor that classifies words purely from memory (i.e., for each word it predicts the tag that word was most frequently labelled with in the training data; words never seen in training are tagged `O`).


In [None]:
import pandas as pd
import numpy as np

# Run this cell to mount your Google Drive at /content/drive; the data and
# model folders used by the cells below live under that mount point.
# NOTE(review): google.colab is presumably only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Make the data folder the working directory so that the relative path
# 'mitmovie.pickle' used when loading the dataset resolves.
# Alternative location, kept for reference (same folder as data_dir below):
#os.chdir('/content/drive/My Drive/Colab Notebooks/DAAN888/data')
os.chdir('/content/drive/My Drive/DAAN888/data')
# Show the resolved working directory as the cell output.
os.getcwd()

'/content/drive/.shortcut-targets-by-id/1tuCnSXHVzIUUyCjMTEqmKbY7JxKZ-5eU/DAAN888/data'

In [None]:
# Root folder for model artefacts; this notebook writes its evaluation
# reports into the 'base_estimator/' subfolder of this directory.
model_dir = '/content/drive/My Drive/Colab Notebooks/DAAN888/models/'

In [None]:
# NOTE(review): data_dir is not referenced by any other cell shown in this
# notebook, and it differs from the folder passed to os.chdir() above
# ('My Drive/DAAN888/data') - confirm which location is intended.
data_dir = '/content/drive/My Drive/Colab Notebooks/DAAN888/data/'

## Load Dataset

In [None]:
import pickle 

# Load the dataset from the current working directory. Later cells read the
# keys 'train_tokens', 'train_labels', 'test_tokens' and 'test_labels'.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('mitmovie.pickle', 'rb') as handle:
    dataset = pickle.load(handle)

## Define Base Estimator

Tobias Sterbak on his blog "Depends on the Definition" provides a base estimator useful for NER. 

https://www.depends-on-the-definition.com/introduction-named-entity-recognition-python/



In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class MemoryTagger(BaseEstimator, TransformerMixin):
    '''
    Baseline tagger that predicts tags purely from memory.

    ``fit`` records, for every word seen in training, the tag that word was
    most frequently paired with. ``predict`` looks each word up and falls
    back to 'O' for words that were never seen.

    Attributes set by ``fit``:
        tags:   list of the distinct tags, in order of first appearance.
        memory: dict mapping each training word to its most frequent tag.
    '''

    def fit(self, X, y):
        '''
        Memorise the most frequent tag of every word.

        Expects a list of words as X and a list of tags as y. The two are
        paired positionally with zip(), so surplus items in the longer one
        are ignored.

        Returns self, following the scikit-learn estimator convention, so
        that calls can be chained (e.g. ``MemoryTagger().fit(X, y).predict(X)``).
        '''
        voc = {}  # word -> {tag: number of times the word carried that tag}
        self.tags = []
        for x, t in zip(X, y):
            if t not in self.tags:
                self.tags.append(t)
            counts = voc.setdefault(x, {})
            counts[t] = counts.get(t, 0) + 1
        # max() returns the first maximal key in the dict's iteration order,
        # so a tie goes to whichever of the tied tags was seen first.
        self.memory = {k: max(d, key=d.get) for k, d in voc.items()}
        return self

    def predict(self, X, y=None):
        '''
        Predict the tag from memory. If word is unknown, predict 'O'.

        X is a list (or other iterable) of words. A single string is treated
        as one word; without this guard it would be iterated character by
        character and every character would be tagged separately.
        y is not used.

        Returns a list with one tag per word.
        '''
        if isinstance(X, str):
            X = [X]
        return [self.memory.get(x, 'O') for x in X]


### Train Base Estimator

In [None]:
# Instantiate the memory-based baseline tagger defined above.
base_estimator = MemoryTagger()

In [None]:
# Flatten the per-document lists into one flat list of tokens and one flat
# list of tags, so that position i in both lists refers to the same token.
train_words = [
    token
    for sentence in dataset['train_tokens']
    for token in sentence
]
train_labels = [
    label
    for sentence_labels in dataset['train_labels']
    for label in sentence_labels
]

In [None]:
# fit estimator based on training data: for every training word, remember
# the tag it was most often labelled with
base_estimator.fit(train_words, train_labels)

### Evaluate Base Estimator

In [None]:
# Flatten the per-document lists into one flat list of tokens and one flat
# list of tags, so that position i in both lists refers to the same token.
test_words = [
    token
    for sentence in dataset['test_tokens']
    for token in sentence
]
test_labels = [
    label
    for sentence_labels in dataset['test_labels']
    for label in sentence_labels
]

In [None]:
# get token-level predictions for the flattened train and test word lists
# NOTE: test_preds is rebound further down to a per-document (nested) list
# for the seqeval report; re-run this cell before re-running the token-level
# test classification report.
train_preds = base_estimator.predict(train_words)
test_preds = base_estimator.predict(test_words)

['what', 'movies', 'star', 'bruce', 'willis', 'show', 'me', 'films', 'with', 'drew', 'barrymore', 'from', 'the', '1980s', 'what', 'movies', 'starred', 'both', 'al', 'pacino', 'and', 'robert', 'deniro', 'find', 'me', 'all', 'of', 'the', 'movies', 'that', 'starred', 'harold', 'ramis', 'and', 'bill', 'murray', 'find', 'me', 'a', 'movie', 'with', 'a', 'quote', 'about', 'baseball', 'in', 'it', 'what', 'movies', 'have', 'mississippi', 'in', 'the', 'title', 'show', 'me', 'science', 'fiction', 'films', 'directed', 'by', 'steven', 'spielberg', 'do', 'you', 'have', 'any', 'thrillers', 'directed', 'by', 'sofia', 'coppola', 'what', 'leonard', 'cohen', 'songs', 'have', 'been', 'used', 'in', 'a', 'movie', 'show', 'me', 'films', 'elvis', 'films', 'set', 'in', 'hawaii', 'what', 'movie', 'is', 'references', 'zydrate', 'are', 'there', 'any', 'musical', 'films', 'with', 'patrick', 'dempsey', 'list', 'westerns', 'starring', 'john', 'wayne', 'show', 'me', 'military', 'related', 'movies', 'with', 'demi', 'm

In [None]:
# First five training tokens.
train_words[:5]

['what', 'movies', 'star', 'bruce', 'willis']

In [None]:
# Gold tags of the first five training tokens.
train_labels[:5]

['O', 'O', 'O', 'B-ACTOR', 'I-ACTOR']

In [None]:
# Predicted tags for the same five tokens, for comparison with the gold tags.
base_estimator.predict(train_words[:5])

['what', 'movies', 'star', 'bruce', 'willis']


['O', 'O', 'O', 'B-ACTOR', 'I-ACTOR']

In [None]:
# Inspect a single training token.
train_words[3]

'bruce'

In [None]:
# predict() expects a list of words. A bare string would be iterated
# character by character (one tag per letter), so wrap the token in a list.
base_estimator.predict([train_words[2]])

star


['I-YEAR', 'I-DIRECTOR', 'O', 'B-RATING']

In [None]:
from sklearn.metrics import classification_report


# Token-level precision / recall / F1 per tag, computed on the same data the
# tagger was fit on.
train_report = classification_report(train_labels, train_preds)

# Blank line, heading, blank line - then the report itself.
print('', 'TRAINING SET Classification Report', '', sep='\n')
print(train_report)

  _warn_prf(average, modifier, msg_start, len(result))



TRAINING SET Classification Report

                   precision    recall  f1-score   support

          B-ACTOR       0.77      0.88      0.82      3220
      B-CHARACTER       0.77      0.47      0.58       385
       B-DIRECTOR       0.77      0.63      0.70      1720
          B-GENRE       0.88      0.95      0.91      4354
           B-PLOT       0.74      0.68      0.71      1927
         B-RATING       0.97      0.97      0.97      2007
B-RATINGS_AVERAGE       0.70      0.64      0.67      1869
         B-REVIEW       0.78      0.08      0.15       221
           B-SONG       0.74      0.28      0.40       245
          B-TITLE       0.77      0.57      0.65      2376
        B-TRAILER       0.80      0.94      0.87       113
           B-YEAR       0.92      0.96      0.94      2858
          I-ACTOR       0.87      0.88      0.88      3474
      I-CHARACTER       0.72      0.58      0.64       342
       I-DIRECTOR       0.86      0.81      0.83      1850
          I-GENRE 

In [None]:
print()
print('TEST SET Classification Report')

# Compute the report before touching the file, so that a failure here does
# not leave behind an empty, truncated report file.
class_report = classification_report(test_labels, test_preds)

# Persist the report under the model folder. The directory is created on
# demand, and the `with` block guarantees the handle is closed even if the
# write fails.
report_dir = os.path.join(model_dir, 'base_estimator')
os.makedirs(report_dir, exist_ok=True)
with open(os.path.join(report_dir, 'class_report_test.txt'), 'w') as f:
    print(class_report, file=f)

print(class_report)


TEST SET Classification Report


  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

          B-ACTOR       0.76      0.84      0.80       812
      B-CHARACTER       0.69      0.24      0.36        90
       B-DIRECTOR       0.70      0.50      0.58       456
          B-GENRE       0.86      0.94      0.90      1117
           B-PLOT       0.58      0.45      0.51       491
         B-RATING       0.98      0.97      0.97       500
B-RATINGS_AVERAGE       0.67      0.61      0.64       451
         B-REVIEW       0.33      0.02      0.03        56
           B-SONG       0.36      0.09      0.15        54
          B-TITLE       0.58      0.35      0.44       562
        B-TRAILER       0.82      0.90      0.86        30
           B-YEAR       0.90      0.94      0.92       720
          I-ACTOR       0.82      0.77      0.80       862
      I-CHARACTER       0.54      0.25      0.35        75
       I-DIRECTOR       0.76      0.48      0.59       496
          I-GENRE       0.91      0.49      0.64       

In [None]:
# Pin the version so the evaluation is reproducible; %pip (rather than !pip)
# installs into the environment of the running kernel.
%pip install -q seqeval==1.2.2

Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |███████▌                        | 10kB 17.6MB/s eta 0:00:01[K     |███████████████                 | 20kB 5.9MB/s eta 0:00:01[K     |██████████████████████▌         | 30kB 7.8MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 7.5MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.7MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-cp36-none-any.whl size=16171 sha256=3a2122c9a31c049a1e1a7a3f3ec2ffebcd57cf841ecf755e7570c167a2f42ee8
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
# seqeval scores whole entities and therefore needs one list of tags per
# document rather than a single flat list.
# NOTE: this rebinds test_preds from the flat token-level list created
# earlier to a nested list (one inner list per test document).
test_preds = [base_estimator.predict(doc) for doc in dataset['test_tokens']]

In [None]:
from seqeval.metrics import classification_report as classification_report_seqeval

# Compute the entity-level report before touching the file, so that a
# failure here does not leave behind an empty, truncated report file.
seq_class_report = classification_report_seqeval(dataset['test_labels'], test_preds)

# Persist the report under the model folder. The directory is created on
# demand, and the `with` block guarantees the handle is closed even if the
# write fails.
report_dir = os.path.join(model_dir, 'base_estimator')
os.makedirs(report_dir, exist_ok=True)
with open(os.path.join(report_dir, 'seq_class_report_test.txt'), 'w') as f:
    print(seq_class_report, file=f)

print(seq_class_report)

                 precision    recall  f1-score   support

          ACTOR       0.54      0.71      0.61       812
      CHARACTER       0.31      0.19      0.24        90
       DIRECTOR       0.35      0.35      0.35       456
          GENRE       0.78      0.88      0.83      1117
           PLOT       0.34      0.39      0.37       491
         RATING       0.93      0.92      0.92       500
RATINGS_AVERAGE       0.30      0.39      0.34       451
         REVIEW       0.25      0.02      0.03        56
           SONG       0.02      0.02      0.02        54
          TITLE       0.20      0.25      0.23       562
        TRAILER       0.82      0.90      0.86        30
           YEAR       0.58      0.74      0.65       720

      micro avg       0.53      0.61      0.57      5339
      macro avg       0.45      0.48      0.45      5339
   weighted avg       0.53      0.61      0.57      5339

