In [152]:
import numpy as np
import sklearn
import pandas as pd
import re

from scipy import spatial
from sklearn.preprocessing import MinMaxScaler

## Read the data

In [153]:
df = pd.read_csv('./ml-100k/u.data', sep='\\t', encoding='iso-8859-1', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Group-by MovieID and calculate Metrics

In [154]:
# Calculate Average Rating to use as a metric for the vector
rmean = pd.DataFrame(df.groupby('MovieID')['Rating'].mean()).rename(columns={'Rating':'Avg'})

# Calculate standard deviation of ratings to use as a metric for the vector
rstd = pd.DataFrame(df.groupby('MovieID')['Rating'].std()).rename(columns={'Rating':'STD'})

# Calculate the number of ratings to use as a metric for the vector
rcount = pd.DataFrame(df.groupby('MovieID')['Rating'].count()).rename(columns={'Rating':'ReviewCount'})

# Join
mov_rt = pd.DataFrame(rmean).join(rstd).join(rcount)
# Replace standard deviation value for movies with only one rating with 0
mov_rt.fillna(0, inplace=True)
mov_rt.head()


Unnamed: 0_level_0,Avg,STD,ReviewCount
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.878319,0.927897,452
2,3.206107,0.966497,131
3,3.033333,1.21276,90
4,3.550239,0.965069,209
5,3.302326,0.946446,86


## Append Movie Genre data, and normalize database to be between 0-1

In [155]:
# Pretty self explanatory code here

genres =  pd.read_csv(filepath_or_buffer="./ml-100k/u.genre",
                                   sep='|',
                                   names=['genre', 'index'],
                                   encoding='iso-8859-1')
# print(genres.head(30))

c = ['MovieName', 'ReleaseDate', 'IMDBLink']
c.extend(genres.genre)
movs = pd.read_csv(filepath_or_buffer="./ml-100k/u.item",
                              sep="|",
                              names=c,
                              encoding='iso-8859-1', index_col=0)

movs.dropna(inplace=True)
movs.index.name = 'MovieID'
movs.drop(['IMDBLink', 'ReleaseDate'], axis=1, inplace=True)
movs['MovieName'] =  movs['MovieName'].apply(lambda x: re.sub(r'\s\(\d{4}\)', '', str(x)))

movs.astype({col: float for col in genres.genre})

movs = movs.join(mov_rt)

scaler = MinMaxScaler()
movs[['Avg', 'STD', 'ReviewCount']] = scaler.fit_transform(movs[['Avg', 'STD', 'ReviewCount']])

movs.head()

Unnamed: 0_level_0,MovieName,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Avg,STD,ReviewCount
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0.0,0.71958,0.328061,0.774914
2,GoldenEye,0,1,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0.0,0.551527,0.341708,0.223368
3,Four Rooms,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0.0,0.508333,0.428775,0.152921
4,Get Shorty,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0.0,0.63756,0.341203,0.357388
5,Copycat,0,0,0,0,0,0,1,0,1,...,0,0,0,0,1,0,0.0,0.575581,0.334619,0.146048


## Create Vectors out of Genre and Review Data

In [156]:
def getVector(row):
    v = row.drop(['MovieName'])
    return v.tolist()

try:
    movs.loc[0, 'Vector']
except KeyError:
    movs['Vector'] = movs.apply(getVector, axis=1)

movs.head(50)
# print(cosineSimilarity(movs.at[1, 'Vector'], movs.at[2, 'Vector']))

Unnamed: 0_level_0,MovieName,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,Avg,STD,ReviewCount,Vector
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0.0,0.71958,0.328061,0.774914,"[0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,GoldenEye,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0.0,0.551527,0.341708,0.223368,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Four Rooms,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0.0,0.508333,0.428775,0.152921,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Get Shorty,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0.0,0.63756,0.341203,0.357388,"[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
5,Copycat,0,0,0,0,0,0,1,0,1,...,0,0,0,1,0,0.0,0.575581,0.334619,0.146048,"[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ..."
6,Shanghai Triad (Yao a yao yao dao waipo qiao),0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0.0,0.644231,0.460142,0.042955,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
7,Twelve Monkeys,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0.0,0.699617,0.347203,0.671821,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
8,Babe,0,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0.0,0.748858,0.35436,0.37457,"[0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
9,Dead Man Walking,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0.0,0.72408,0.368533,0.512027,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
10,Richard III,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0.0,0.707865,0.358485,0.151203,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


## Check the cosine similarity/distance between vectors

In [157]:
def cosineDistance(a, b):
    return spatial.distance.cosine(a, b)

i = 29
j = 50
print(f'Distance between {movs.loc[i, "MovieName"]} and {movs.loc[j, "MovieName"]}: {cosineDistance(movs.loc[i, 'Vector'], movs.loc[j, 'Vector'])}')

Distance between Batman Forever and Star Wars: 0.5111275408951674


## Save Vectors in file

In [158]:
final = movs[['MovieName', 'Vector']]
final.head()
final.to_csv("MovieVectors.csv")