In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

## user data

In [294]:
user = pd.read_csv('user.csv')
user.head(10)

Unnamed: 0,user id,age,gender,occupation,zip code
0,2,53,F,other,94043
1,3,23,M,writer,32067
2,4,24,M,technician,43537
3,5,33,F,other,15213
4,6,42,M,executive,98101
5,7,57,M,administrator,91344
6,8,36,M,administrator,5201
7,9,29,M,student,1002
8,10,53,M,lawyer,90703
9,11,39,F,other,30329


In [295]:
def combined_feature(row):
    return str(row['age']) + ' ' + row['gender'] + ' ' + row['occupation']

user['combined'] = user.apply(combined_feature, axis=1)

cv = CountVectorizer()
user_count_matrix = cv.fit_transform(user['combined'])
user_sim = cosine_similarity(user_count_matrix)
user = user.drop(columns=['combined', 'zip code'])

## create new user

In [296]:
newUser = [{'age': 25, 'gender': 'F', 'occupation': 'student'}]
newUser = pd.DataFrame(newUser)

userMatch = user[user['occupation'].isin(newUser['occupation'])]
userMatch = userMatch[userMatch['age'].isin(newUser['age'])]
userMatch = userMatch[userMatch['gender'].isin(newUser['gender'])]
userMatch.head(10)

Unnamed: 0,user id,age,gender,occupation
201,203,25,F,student
370,372,25,F,student


In [297]:
user_index = int(userMatch.index[0])
print(user_index)

201


## similer user

In [298]:
similar_user = []
for value in enumerate(user_sim[user_index]):
    similar_user.append(value)
sorted_similar_user = sorted(similar_user, key = lambda x:x[1], reverse = True)

selected_user = pd.DataFrame()
for value in sorted_similar_user[:5]:
    selected_user = selected_user.append(user[user.index == value[0]])
selected_user

Unnamed: 0,user id,age,gender,occupation
151,153,25,M,student
152,154,25,M,student
201,203,25,F,student
246,248,25,M,student
247,249,25,M,student


## user similarity score

In [299]:
for value in sorted_similar_user[:5]:
    print(user[user.index == value[0]], 'score: ', value[1])


     user id  age gender occupation
151      153   25      M    student score:  0.9999999999999998
     user id  age gender occupation
152      154   25      M    student score:  0.9999999999999998
     user id  age gender occupation
201      203   25      F    student score:  0.9999999999999998
     user id  age gender occupation
246      248   25      M    student score:  0.9999999999999998
     user id  age gender occupation
247      249   25      M    student score:  0.9999999999999998


## user rated movie

In [300]:
rating = pd.read_csv('rating.csv')
rating.head(10)

Unnamed: 0,user id,item id,rating,timestamp
0,186,302,3,891717742
1,22,377,1,878887116
2,244,51,2,880606923
3,166,346,1,886397596
4,298,474,4,884182806
5,115,265,2,881171488
6,253,465,5,891628467
7,305,451,3,886324817
8,6,86,3,883603013
9,62,257,2,879372434


In [304]:
user_rated_movie = rating[rating['user id'].isin(selected_user['user id'].tolist())]
user_rated_movie = user_rated_movie[user_rated_movie['rating'] >= 5]
user_rated_movie = user_rated_movie.groupby(['item id'])
user_rated_movie = sorted(user_rated_movie, key=lambda x: len(x[1]), reverse=True)

user_rated_movie[:5]

[(50,
         user id  item id  rating  timestamp
  7786       154       50       5  879138657
  14160      203       50       5  880434810
  34817      248       50       5  884535013),
 (64,
         user id  item id  rating  timestamp
  7673       153       64       5  881371005
  27885      249       64       5  879572210
  37321      248       64       5  884534735),
 (89,
         user id  item id  rating  timestamp
  16410      154       89       5  879138910
  24284      249       89       5  879572229
  49741      248       89       5  884535046),
 (182,
         user id  item id  rating  timestamp
  4747       249      182       5  879640949
  7780       154      182       5  879138783
  43635      153      182       5  881371198),
 (11,
         user id  item id  rating  timestamp
  2427       249       11       5  879640868
  11682      248       11       5  884534992)]

In [307]:
movie = pd.read_csv('movie.csv')
movie = movie.dropna(axis=1)
movie.head(10)

Unnamed: 0,item id,movie title,unknown,action,adventure,animation,children's,comedy,crime,documentary,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,7,Twelve Monkeys (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,8,Babe (1995),0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,9,Dead Man Walking (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,10,Richard III (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,11,Seven (Se7en) (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [305]:
movie_filter_list = pd.DataFrame()
for id, group in user_rated_movie[:5]:
    movie_filter_list = movie_filter_list.append(movie[movie['item id'] == id])

movie_filter_list

Unnamed: 0,item id,movie title,unknown,action,adventure,animation,children's,comedy,crime,documentary,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
48,50,Star Wars (1977),0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
62,64,"Shawshank Redemption, The (1994)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87,89,Blade Runner (1982),0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
180,182,GoodFellas (1990),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,11,Seven (Se7en) (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
movie['combined'] = movie[movie.columns[4:]].apply(lambda x: ''.join(x.dropna().astype(str)), axis=1)

count = CountVectorizer()
movie_count_matrix = count.fit_transform(movie['combined'])
movie_sim = cosine_similarity(movie_count_matrix)

movie = movie.drop(columns=['combined'])

In [308]:
suggested_movie_list = pd.DataFrame()

In [310]:
for index in movie_filter_list.index:
    list = []
    for value in enumerate(movie_sim[index]):
        list.append(value)
    sorted_list = sorted(list, key = lambda x:x[1], reverse = True)

    for x in sorted_list[:5]:
        suggested_movie_list = suggested_movie_list.append(movie[movie.index == x[0]])

suggested_movie_list['movie title'].head(25)

48                                      Star Wars (1977)
179                            Return of the Jedi (1983)
0                                       GoldenEye (1995)
1                                      Four Rooms (1995)
2                                      Get Shorty (1995)
4      Shanghai Triad (Yao a yao yao dao waipo qiao) ...
7                                Dead Man Walking (1995)
13                             Mr. Holland's Opus (1995)
16                             White Balloon, The (1995)
17                                 Antonia's Line (1995)
87                                   Blade Runner (1982)
0                                       GoldenEye (1995)
1                                      Four Rooms (1995)
2                                      Get Shorty (1995)
3                                         Copycat (1995)
54                                   Pulp Fiction (1994)
74                                  Carlito's Way (1993)
125                            