In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
with open('/Users/lodino/Desktop/CSE250A/hws/hw8/hw8_ids.txt', 'r') as f:
    id_ls = f.read().strip('\n').split('\n')

with open('/Users/lodino/Desktop/CSE250A/hws/hw8/hw8_movies.txt', 'r') as f:
    movie_ls = f.read().strip('\n').split('\n')

with open('/Users/lodino/Desktop/CSE250A/hws/hw8/hw8_probR_init.txt', 'r') as f:
    probR_init_ls = f.read().strip('\n').split('\n')
    probR_init_arr = np.array([n.strip('   ').split('   ') for n in probR_init_ls]).astype(float)

with open('/Users/lodino/Desktop/CSE250A/hws/hw8/hw8_probZ_init.txt', 'r') as f:
    probZ_init_ls = f.read().strip('\n').split('\n')
    probZ_init_arr = np.array([n.strip('    ').split('    ') for n in probZ_init_ls]).astype(float)

with open('/Users/lodino/Desktop/CSE250A/hws/hw8/hw8_ratings.txt', 'r') as f:
    ratings_ls = f.read().strip('\n').split('\n')
    ratings_arr = np.array([n.strip(' ').split(' ') for n in ratings_ls])
    ratings_arr[ratings_arr=='?'] = 2
    ratings_arr = ratings_arr.astype(float)

# (a)

In [3]:
ratio = np.sum(ratings_arr==1, axis=0)/np.sum(ratings_arr!=2, axis=0)
df = pd.DataFrame(np.array([movie_ls, ratio]).T, columns=['movie', 'ratio'])
sorted_df = df.sort_values(by='ratio').reset_index(drop=True)
sorted_df.ratio = sorted_df.ratio.astype(float)
pd.set_option('display.max_rows', sorted_df.shape[0]+1)
sorted_df

Unnamed: 0,movie,ratio
0,I_Feel_Pretty,0.358974
1,Fifty_Shades_of_Grey,0.377143
2,Hustlers,0.456522
3,The_Last_Airbender,0.473684
4,Magic_Mike,0.515152
5,Fast_&_Furious:_Hobbs_&_Shaw,0.540541
6,The_Shape_of_Water,0.558442
7,Prometheus,0.584071
8,Phantom_Thread,0.586207
9,World_War_Z,0.589552


In [4]:
print(f'most popular: {sorted_df.movie[len(sorted_df)-1]} with ratio: {round(sorted_df.ratio[len(sorted_df)-1], 6)}')
print(f'most popular: {sorted_df.movie[0]} with ratio: {round(sorted_df.ratio[0], 6)}')

most popular: Inception with ratio: 0.965753
most popular: I_Feel_Pretty with ratio: 0.358974


# (e)

In [5]:
def joint_p_R_given_Z(p_R_given_Z):
    filter_arr = np.transpose(np.tile(ratings_arr!=2, (4, 1, 1)), (1, 2, 0))
    return np.prod(np.where((ratings_arr==1).reshape(len(ratings_arr), -1, 1), p_R_given_Z, 1-p_R_given_Z),
                   axis=1, where=filter_arr)

def get_p_Z_estimate(p_Z, p_R_given_Z):
    numerator = joint_p_R_given_Z(p_R_given_Z)*p_Z.ravel()
    denominator = np.sum(numerator, axis=1).reshape(-1, 1)
    return numerator/denominator

def get_p_Z_update(p_Z_est):
    return (np.sum(p_Z_est, axis=0) / len(p_Z_est)).reshape(-1, 1)

def get_p_R_given_Z_update(p_R_given_Z, p_Z_est):
    numerator = np.zeros_like(p_R_given_Z)
    denominator = np.sum(p_Z_est, axis=0)
    for j in range(76):
        sub_group_idx = ratings_arr[:, j]!=2
        sub_group = ratings_arr[sub_group_idx, j]
        numerator[j] = np.sum(np.where(sub_group==1, 1, 0).reshape(-1, 1)*p_Z_est[sub_group_idx], axis=0)
        
        sub_group_idx = ratings_arr[:, j]==2
        sub_group = ratings_arr[sub_group_idx, j]
        numerator[j] += np.sum(p_Z_est[sub_group_idx]*p_R_given_Z[j, :], axis=0)        

    return (numerator/denominator).reshape(*p_R_given_Z.shape)

def log_likelihood(p_Z, p_R_given_Z):
    p_R_observed = np.sum(joint_p_R_given_Z(p_R_given_Z)*p_Z.ravel(), axis=1)
    return np.mean(np.log(p_R_observed))

In [6]:
probR_arr = probR_init_arr.copy()
probZ_arr = probZ_init_arr.copy()
log_likelihood_ls = []
log_likelihood_ls = []
verbose_ls = [0] + [2**k for k in range(9)]
for i in range(256):
    ll = log_likelihood(probZ_arr, probR_arr)
    if i in verbose_ls:
        print(f'log-likelihood in iter {i}: {round(ll, 4)}')
        log_likelihood_ls.append(ll)

    p_Z_est = get_p_Z_estimate(probZ_arr, probR_arr)
    updated_probZ_arr = get_p_Z_update(p_Z_est)
    updated_probR_arr = get_p_R_given_Z_update(probR_arr, p_Z_est)

    probZ_arr = updated_probZ_arr
    probR_arr = updated_probR_arr

print(f'log-likelihood in iter {256}: {round(ll, 4)}')
log_likelihood_ls.append(ll)

log-likelihood in iter 0: -27.0358
log-likelihood in iter 1: -17.5604
log-likelihood in iter 2: -16.0024
log-likelihood in iter 4: -15.0606
log-likelihood in iter 8: -14.5016
log-likelihood in iter 16: -14.2638
log-likelihood in iter 32: -14.1802
log-likelihood in iter 64: -14.1701
log-likelihood in iter 128: -14.164
log-likelihood in iter 256: -14.1637


# (f)

In [7]:
my_idx = id_ls.index('A59009476')
unseen_movies_idx = ratings_arr[my_idx]==2
p_Z_est = get_p_Z_estimate(probZ_arr, probR_arr)
new_prob = np.sum(probR_arr*p_Z_est[my_idx], axis=1)

df = pd.DataFrame(np.array([movie_ls, new_prob]).T, columns=['unseen_movie', 'prob'])
sorted_df = df[unseen_movies_idx].sort_values(by='prob').reset_index(drop=True)
sorted_df.prob = sorted_df.prob.astype(float)
sorted_df

Unnamed: 0,unseen_movie,prob
0,I_Feel_Pretty,0.33574
1,Fifty_Shades_of_Grey,0.488174
2,Magic_Mike,0.557991
3,Hustlers,0.590871
4,The_Shape_of_Water,0.651946
5,Rocketman,0.685499
6,Midnight_in_Paris,0.694458
7,Fast_&_Furious:_Hobbs_&_Shaw,0.698717
8,Room,0.716118
9,World_War_Z,0.760699


In [8]:
print(f'most popular: {sorted_df.movie[len(sorted_df)-1]} with prob: {round(sorted_df.prob[len(sorted_df)-1], 6)}')
print(f'most popular: {sorted_df.movie[0]} with prob: {round(sorted_df.prob[0], 6)}')

AttributeError: 'DataFrame' object has no attribute 'movie'