### Finding Similarities between movies

In [1]:
# Libs
import pandas as pd

### Load data from csvs

In [2]:
r_cols  = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('data/u.data', sep='\t', names=r_cols, usecols=range(3)) 

m_cols = ['movie_id', 'title']
movies = pd.read_csv('data/u.item', sep='|', names=m_cols, usecols=range(2) , encoding='latin-1')

### Merge dataset

In [3]:
ratings = pd.merge(movies, ratings)
ratings.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


### EAD

In [4]:
ratings.shape

(100003, 4)

In [5]:
ratings.columns

Index(['movie_id', 'title', 'user_id', 'rating'], dtype='object')

### Transform Data

In [6]:
moviesRatings = ratings.pivot_table(index=['user_id'], columns=['title'], values='rating')
moviesRatings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [7]:
moviesRatings.shape

(944, 1664)

### Star Wars films - get all ratings from users

In [8]:
starWarsRatings = moviesRatings['Star Wars (1977)']
starWarsRatings.head()

user_id
0    5.0
1    5.0
2    5.0
3    NaN
4    5.0
Name: Star Wars (1977), dtype: float64

In [9]:
starWarsRatings.isna().sum()

360

In [10]:
starWarsRatings.shape

(944,)

In [11]:
starWarsRatings.mean()

4.359589041095891

In [12]:
starWarsRatings.median()

5.0

In [13]:
starWarsRatings.min()

1.0

In [14]:
starWarsRatings

user_id
0      5.0
1      5.0
2      5.0
3      NaN
4      5.0
      ... 
939    NaN
940    4.0
941    NaN
942    5.0
943    4.0
Name: Star Wars (1977), Length: 944, dtype: float64

### Correlate every column 
#### starWars movie with all other columns

In [18]:
similarMovies = moviesRatings.corrwith(starWarsRatings)
similarMovies = similarMovies.dropna()

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


In [20]:
df = pd.DataFrame(similarMovies)
df.head(50)

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
'Til There Was You (1997),0.872872
1-900 (1994),-0.645497
101 Dalmatians (1996),0.211132
12 Angry Men (1957),0.184289
187 (1997),0.027398
2 Days in the Valley (1996),0.066654
"20,000 Leagues Under the Sea (1954)",0.289768
2001: A Space Odyssey (1968),0.230884
"39 Steps, The (1935)",0.106453
8 1/2 (1963),-0.142977


In [23]:
similarMovies.sort_values(ascending=False)

title
Hollow Reed (1996)                   1.0
Stripes (1981)                       1.0
Star Wars (1977)                     1.0
Man of the Year (1995)               1.0
Beans of Egypt, Maine, The (1994)    1.0
                                    ... 
For Ever Mozart (1996)              -1.0
Frankie Starlight (1995)            -1.0
I Like It Like That (1994)          -1.0
American Dream (1990)               -1.0
Theodore Rex (1995)                 -1.0
Length: 1410, dtype: float64

### Evaluating similarity and quantity

In [26]:
import numpy as np
movieStats = ratings.groupby('title').agg({'rating': [np.size, np.mean]})
movieStats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
'Til There Was You (1997),9,2.333333
1-900 (1994),5,2.6
101 Dalmatians (1996),109,2.908257
12 Angry Men (1957),125,4.344
187 (1997),41,3.02439


### Cut of in 100 ratings

In [27]:
popularMovies = movieStats['rating']['size'] >= 100
movieStats[popularMovies].sort_values([('rating', 'mean')], ascending=False)[:15]

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Close Shave, A (1995)",112,4.491071
Schindler's List (1993),298,4.466443
"Wrong Trousers, The (1993)",118,4.466102
Casablanca (1942),243,4.45679
"Shawshank Redemption, The (1994)",283,4.44523
Rear Window (1954),209,4.38756
"Usual Suspects, The (1995)",267,4.385768
Star Wars (1977),584,4.359589
12 Angry Men (1957),125,4.344
Citizen Kane (1941),198,4.292929


### New DataFrame of startwars recomendations

In [29]:
df = movieStats[popularMovies].join(pd.DataFrame(similarMovies, columns=['similarity']))

  df = movieStats[popularMovies].join(pd.DataFrame(similarMovies, columns=['similarity']))


In [30]:
df.head()

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101 Dalmatians (1996),109,2.908257,0.211132
12 Angry Men (1957),125,4.344,0.184289
2001: A Space Odyssey (1968),259,3.969112,0.230884
Absolute Power (1997),127,3.370079,0.08544
"Abyss, The (1989)",151,3.589404,0.203709


In [31]:
df.sort_values(['similarity'], ascending=False)[:15]

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Star Wars (1977),584,4.359589,1.0
"Empire Strikes Back, The (1980)",368,4.206522,0.748353
Return of the Jedi (1983),507,4.00789,0.672556
Raiders of the Lost Ark (1981),420,4.252381,0.536117
Austin Powers: International Man of Mystery (1997),130,3.246154,0.377433
"Sting, The (1973)",241,4.058091,0.367538
Indiana Jones and the Last Crusade (1989),331,3.930514,0.350107
Pinocchio (1940),101,3.673267,0.347868
"Frighteners, The (1996)",115,3.234783,0.332729
L.A. Confidential (1997),297,4.161616,0.319065
