# MSDS 534: Statistical Learning - Homework 3

### NAME:  Michael Ryvin

### NET ID:  mar643

### Teammates: David Amiel - dja168, Devyani Mardia - dm1633, Neeti Patel - np912

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
import warnings
import itertools
from transformers import BertTokenizer, BertModel


warnings.filterwarnings('ignore')



## Movie data
* each movie has a short "overview" text description
* movie is assigned genres


### key attributes:
* genres in genres_to_predict
* bert features of "overview" stores in bert_feats columns

In [2]:
movies_df = pd.read_csv('movies_embedding.csv')

## shuffle the rows
movies_df = movies_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(movies_df.shape)

genres_to_predict = ['Action', 'Adventure', 'Comedy', 'Drama', 'Romance',
                        'Science Fiction', 'Crime', 'Horror',
                        'Fantasy', 'Animation']

BERT_LEN = 768
bert_feats = ['bert_'+str(i) for i in range(BERT_LEN)]

print(movies_df[["title", "overview"] + genres_to_predict].head())



(4771, 803)
                         title  \
0            The Shipping News   
1                Jack and Jill   
2                    Show Boat   
3  The Man with the Iron Fists   
4                     Red Dawn   

                                            overview  Action  Adventure  \
0  An emotionally-beaten man with his young daugh...       0          0   
1  Jack Sadelstein, a successful advertising exec...       0          0   
2  A dashing Mississippi river gambler wins the a...       0          0   
3  In feudal China, a blacksmith who makes weapon...       1          0   
4  It is the mid-1980s. From out of the sky, Sovi...       1          0   

   Comedy  Drama  Romance  Science Fiction  Crime  Horror  Fantasy  Animation  
0       0      1        1                0      0       0        0          0  
1       1      0        0                0      0       0        0          0  
2       0      0        1                0      0       0        0          0  
3       0   

## Create data

In [3]:
x = np.array(movies_df[bert_feats])
y = np.array(movies_df[genres_to_predict])

x = torch.tensor(x, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)


In [4]:
movies_df.shape

(4771, 803)

In [5]:
x.shape

torch.Size([4771, 768])

In [6]:
y.shape

torch.Size([4771, 10])

In [7]:
movies_df[bert_feats]

Unnamed: 0,bert_0,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,bert_8,bert_9,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,-0.855048,-0.529212,-0.897732,0.782060,0.459933,-0.215221,0.700999,0.440923,-0.833135,-0.999983,...,0.645016,-0.828362,0.989878,0.860511,-0.590727,0.544752,0.629952,-0.679922,-0.618484,0.796385
1,-0.271300,-0.425127,-0.953177,0.152986,0.512233,-0.024680,-0.279279,0.306333,-0.896802,-0.999300,...,0.354122,-0.939310,0.984160,0.531259,-0.025841,0.314838,0.390870,-0.590269,-0.508324,0.156365
2,-0.797073,-0.534549,-0.937934,0.658841,0.682308,-0.064351,0.744677,0.385673,-0.828768,-0.999992,...,0.674496,-0.792085,0.981960,0.792339,-0.504937,0.363750,0.737342,-0.763061,-0.671523,0.872107
3,-0.817654,-0.627679,-0.942563,0.794428,0.915340,-0.549173,0.398550,0.354581,-0.764943,-0.999916,...,0.192651,-0.773497,0.996642,0.661998,-0.953551,0.550534,0.312776,-0.957872,-0.732633,0.629061
4,-0.866573,-0.684388,-0.983777,0.795782,0.843119,-0.317151,0.434409,0.437486,-0.942618,-0.999994,...,0.539079,-0.968427,0.999213,0.752414,-0.483252,0.547676,0.570960,-0.909331,-0.730859,0.773438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4766,-0.315452,-0.405777,-0.958666,0.332656,0.653589,0.044818,-0.260017,0.263614,-0.863831,-0.999861,...,0.571485,-0.916849,0.975452,0.368222,-0.548295,0.095054,0.622448,-0.795626,-0.541701,0.075756
4767,-0.808146,-0.479640,-0.885991,0.696989,0.533721,-0.237040,0.607855,0.474232,-0.834073,-0.999963,...,0.451719,-0.846852,0.979823,0.776020,-0.016500,0.412000,0.467651,-0.705459,-0.656380,0.673918
4768,-0.815457,-0.522907,-0.988168,0.652234,0.887861,-0.138129,0.481904,0.308311,-0.964156,-0.999952,...,0.467353,-0.935382,0.999024,0.731358,-0.507446,0.113466,0.493366,-0.881984,-0.524299,0.803279
4769,-0.694710,-0.470595,-0.824760,0.188147,0.635146,-0.216050,-0.254090,0.221325,-0.304033,-0.999560,...,0.155456,-0.442276,0.872166,0.526895,-0.596313,0.620168,0.222343,-0.892673,-0.498745,0.189312


In [8]:
movies_df

Unnamed: 0,id,budget,homepage,original_language,original_title,overview,popularity,release_date,revenue,runtime,...,bert_758,bert_759,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767
0,6440,0,,en,The Shipping News,An emotionally-beaten man with his young daugh...,11.139900,2001-12-18,0.0,111.0,...,0.645016,-0.828362,0.989878,0.860511,-0.590727,0.544752,0.629952,-0.679922,-0.618484,0.796385
1,71880,79000000,http://www.jackandjill-movie.com/,en,Jack and Jill,"Jack Sadelstein, a successful advertising exec...",22.132418,2011-11-11,149673788.0,91.0,...,0.354122,-0.939310,0.984160,0.531259,-0.025841,0.314838,0.390870,-0.590269,-0.508324,0.156365
2,17820,2300000,,en,Show Boat,A dashing Mississippi river gambler wins the a...,1.606314,1951-07-13,11000000.0,107.0,...,0.674496,-0.792085,0.981960,0.792339,-0.504937,0.363750,0.737342,-0.763061,-0.671523,0.872107
3,97430,15000000,,en,The Man with the Iron Fists,"In feudal China, a blacksmith who makes weapon...",17.672021,2012-11-02,15608545.0,96.0,...,0.192651,-0.773497,0.996642,0.661998,-0.953551,0.550534,0.312776,-0.957872,-0.732633,0.629061
4,1880,4200000,,en,Red Dawn,"It is the mid-1980s. From out of the sky, Sovi...",11.743085,1984-08-10,38376497.0,114.0,...,0.539079,-0.968427,0.999213,0.752414,-0.483252,0.547676,0.570960,-0.909331,-0.730859,0.773438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4766,250184,0,http://www.locker13.com/,en,Locker 13,"The story of Skip, a young ex-convict who take...",1.905197,2014-03-29,0.0,90.0,...,0.571485,-0.916849,0.975452,0.368222,-0.548295,0.095054,0.622448,-0.795626,-0.541701,0.075756
4767,1359,7000000,,en,American Psycho,A wealthy New York investment banking executiv...,45.310443,2000-04-13,34266564.0,102.0,...,0.451719,-0.846852,0.979823,0.776020,-0.016500,0.412000,0.467651,-0.705459,-0.656380,0.673918
4768,34099,0,,en,Brave New Girl,Holly has everything it takes to be a star; th...,0.180940,2004-04-25,0.0,120.0,...,0.467353,-0.935382,0.999024,0.731358,-0.507446,0.113466,0.493366,-0.881984,-0.524299,0.803279
4769,74084,0,,hi,दिल जो भी कहे,"During the British rule in India, several Indi...",0.122704,2006-12-07,0.0,0.0,...,0.155456,-0.442276,0.872166,0.526895,-0.596313,0.620168,0.222343,-0.892673,-0.498745,0.189312


## Question 1:

Split data X, Y into training and test data.

Further split training data into D1 and D2, where D1 is used to train predictive model and D2 is used to calibrate prediction set.

In [9]:
ntrain = 3000
n1 = 2500

x_train = x[:ntrain]
x_test = x[ntrain:]

y_train = y[:ntrain]
y_test = y[ntrain:]

x_train1 = x_train[:n1]
y_train1 = y_train[:n1]

x_train2 = x_train[n1:]
y_train2 = y_train[n1:]

print(x_train1.shape)
print(x_train2.shape)
print(y_train1.shape)
print(y_train2.shape)

torch.Size([2500, 768])
torch.Size([500, 768])
torch.Size([2500, 10])
torch.Size([500, 10])


In [10]:
x_test.shape

torch.Size([1771, 768])

## Question 2:
Create a feed-forward neural network to predict movie genres

In [11]:
## NN should have 2 hidden layers, with dimension hidden_dims[0] and hidden_dims[1]
## Use ReLU as the activation function for the hidden layers
## Use Sigmoid as the activation function for the output layer
class MovieNet(nn.Module):
    def __init__(self, input_dim, hidden_dims, out_dim):
        super(MovieNet, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dims[0]),
            nn.ReLU(),
            nn.Linear(hidden_dims[0], hidden_dims[1]),
            nn.ReLU(),
            nn.Linear(hidden_dims[1], out_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)


## Question 3:
Train the neural network to predict movie genres

In [12]:
## train neural network to predict genres
train_data = TensorDataset(x_train1, y_train1)

train_loader = DataLoader(train_data, batch_size=20, shuffle=True)

epochs = 200
hidden_dims = [32, 16]

input_dim = x_train1.shape[1]
output_dim = y_train1.shape[1]
model = MovieNet(BERT_LEN, hidden_dims, output_dim)
lr = 0.001
optimizer = optim.Adam(model.parameters(), lr=lr)

for epoch in range(epochs):
    for x_batch, y_batch in train_loader:
        y_pred = model(x_batch)
        loss = F.binary_cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    if epoch % 50 == 0:
        print('Epoch %d, loss %.4f' % (epoch, loss.item()))


Epoch 0, loss 0.4911
Epoch 50, loss 0.3856
Epoch 100, loss 0.2832
Epoch 150, loss 0.2359


In [13]:
loss

tensor(0.2642, grad_fn=<BinaryCrossEntropyBackward0>)

## Question 4:

Recall we have 10 genres.

`outcome` is a 10-dimensional binary list or tuple.

`probs` is a 10-dimensional list of probability of each genre.

You will complete the following functions:
* `computeScore` compute non-conformity score of an outcome, based on predicted probabilities
* `computeConformalSet` computes the prediction set


In [14]:
import math

In [15]:
## you may find this function helpful
def computeProbability(sequence, probs):
    return(np.prod([probs[i] if sequence[i] == 1 else 1 - probs[i] for i in range(len(sequence))]))


def computeScore(my_outcome, probs):

    outcome_probability = computeProbability(my_outcome, probs)
    all_out = list()
    for outcome in itertools.product([0,1], repeat=len(probs)):
      all_out.append(computeProbability(outcome, probs))

    rev_sort = sorted(all_out, reverse = True)
    ind = rev_sort.index(outcome_probability)

    score = 0
    for i in range(ind+1):
      score = score + rev_sort[i]

    return score

def computeConformalSet(probs, scores, alpha):
    n2 = len(scores)

    q_index = math.ceil((1-alpha)*(n2+1))/n2
    threshold = np.quantile(scores, q_index)

    all_outcomes = list(itertools.product([0,1], repeat=len(probs)))
    sorted_outcomes = sorted(all_outcomes, key=lambda x: computeProbability(x, probs), reverse=True)

    sorted_probs = [computeProbability(outcome, probs) for outcome in sorted_outcomes]
    cumulative_probs = np.cumsum(sorted_probs)

    conformal_ix = np.where(cumulative_probs > threshold)[0][0]
    conformal_prediction_set = sorted_outcomes[:conformal_ix + 1]
    return(conformal_prediction_set)

## used to more easily display the conformal prediction set
def confGenreSet(conf_set, genres_to_predict):
    return( [ [genres_to_predict[i] for i in range(len(conf_set[k]))
        if conf_set[k][i] == 1] for k in range(len(conf_set)) ] )

## Question 5:
Compute the non-conformity scores on D2.

In [16]:
y_pred2 = model(x_train2)
y_pred2 = y_pred2.detach().numpy() #predictions

scores = np.zeros(ntrain-n1)
for j in range(ntrain-n1):
    scores[j] = computeScore(y_train2[j], y_pred2[j])

In [17]:
scores.shape

(500,)

In [18]:
y_train2.shape

torch.Size([500, 10])

In [19]:
y_pred2.shape

(500, 10)

In [20]:
alpha = 0.2

print("Adjusted threshold: ", np.quantile(scores, 1-alpha))

Adjusted threshold:  0.9470250489658011


## Question 6:

Check whether our conformal prediction set has the desired coverage guarantee on test data.

In [21]:
ntest = 200
covered = 0
for i in range(ntest):

    cur_y_pred = model(x_test[i])
    cur_y_pred = cur_y_pred.detach().numpy()
    conf_set = computeConformalSet(cur_y_pred, scores, alpha)
#Predicted set is conf_set
    y_test_tuple = tuple(y_test[i].int().tolist())
#y_test_tuple actual results
    if (y_test_tuple in conf_set):
        covered = covered + 1
    else:
        conf_genres = confGenreSet(conf_set, genres_to_predict)

        ## display mistake
        print(movies_df.iloc[i+ntest]['title'])
        print(movies_df.iloc[i+ntest]['overview'])
        print('Actual genres: ', [genres_to_predict[j] for j in range(len(genres_to_predict)) if y_test_tuple[j] == 1])
        print('Prediction set: ', conf_genres)
        print('')

print("(1-alpha): %.3f    Percent covered: %.3f" % (1-alpha, covered/ntest))

Backmask
During an all-night, drug-fueled party at an abandoned asylum known for the horrific treatment of its patients, a group of ordinary teens decide to experiment with the occult, mysteriously leading to a violent possession. In an effort to find help, the group rushes to escape, only to find themselves locked inside with no means of communication. Tempers flare, trusts are broken and in attempt to save one of their friends possessed by the demon, the amateurs try to perform an exorcism. Instead of solving the problem, and unbeknownst to them, they unleash an even more powerful and vengeful spirit, one with a distinct motive and which wants them all dead. The teen's only chance of survival is to uncover the asylum's deep mysteries and find a way out before it's too late.
Actual genres:  ['Comedy', 'Drama', 'Crime']
Prediction set:  [['Drama', 'Romance'], ['Drama'], ['Comedy', 'Drama', 'Romance'], ['Comedy', 'Drama'], ['Romance'], [], ['Comedy', 'Romance'], ['Comedy'], ['Drama', 'R

## Extra:

Try generating the prediction sets for each of the following made-up movie descriptions.



In [22]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

my_movie_collection = [
"Orphaned fox befriends a lost magical bird, together they embark on a quest, discovering true family, courage, and enchanted realms",
"In a galaxy ruled by machines, a rebellious pilot and a rogue android uncover a cosmic secret, igniting an interstellar chase, alliances, betrayals, and the fate of sentient life.",
"In post-war Paris, an estranged violinist encounters a mute child prodigy, together navigating trauma and rediscovery, their melodies weaving stories of loss, love, and the power of connection",
"Exploring abandoned asylums, a documentary crew uncovers chilling histories intertwined with the supernatural. As they delve deeper, the line between reality and nightmare becomes hauntingly blurred, ensnaring them forever.",
"An uptight lawyer's world turns upside down when a free-spirited barista accidentally receives his trial notes. Amidst coffee spills and court blunders, they find love in the most unexpected verdict",
]

for my_movie in my_movie_collection:

    my_movie_tokens = tokenizer(my_movie, return_tensors='pt',
                                max_length=128, padding='max_length', truncation=True)

    with torch.no_grad():
        my_movie_embed = bert(**my_movie_tokens)[1].detach().numpy()[0]

    my_pred = model(torch.tensor(my_movie_embed))
    my_pred = my_pred.detach().numpy()

    conf_set = computeConformalSet(my_pred, scores, alpha)
    genre_set = confGenreSet(conf_set, genres_to_predict)

    mydf = pd.DataFrame({'genres' : genres_to_predict, 'probs' : my_pred})

    print(my_movie)

    print("Output probabilities:")
    print(mydf)

    print("Prediction set:")
    for guess in genre_set:
        print(guess)
    print("")


Orphaned fox befriends a lost magical bird, together they embark on a quest, discovering true family, courage, and enchanted realms
Output probabilities:
            genres     probs
0           Action  0.242403
1        Adventure  0.804917
2           Comedy  0.119322
3            Drama  0.352219
4          Romance  0.228931
5  Science Fiction  0.107090
6            Crime  0.006174
7           Horror  0.012914
8          Fantasy  0.847737
9        Animation  0.604277
Prediction set:
['Adventure', 'Fantasy', 'Animation']
['Adventure', 'Fantasy']
['Adventure', 'Drama', 'Fantasy', 'Animation']
['Adventure', 'Drama', 'Fantasy']
['Action', 'Adventure', 'Fantasy', 'Animation']
['Adventure', 'Romance', 'Fantasy', 'Animation']
['Fantasy', 'Animation']
['Action', 'Adventure', 'Fantasy']
['Adventure', 'Romance', 'Fantasy']
['Adventure', 'Animation']
['Action', 'Adventure', 'Drama', 'Fantasy', 'Animation']
['Adventure', 'Drama', 'Romance', 'Fantasy', 'Animation']
['Fantasy']
['Adventure', 'Comed