# INITIAL INFORMATION - 참조 함수

In [None]:
# !pip install scikit-surprise

In [None]:
import pandas as pd
import numpy as np

def load_movies_dataset(path: str = 'datasets/ml-100k/u.item') -> pd.DataFrame:
    """Load the MovieLens movie metadata file into a DataFrame.

    Args:
        path: Location of the pipe-separated ``u.item`` file. Defaults to the
            location this notebook has always read from.

    Returns:
        A DataFrame indexed by ``movie_id`` holding the title, the release
        dates, the URL and the genre columns. ``release_date`` is converted
        to datetime.
    """
    movie_data_columns = [
        'movie_id', 'title', 'release_date', 'video_release_date', 'url',
        'unknown', 'Action', 'Adventure', 'Animation', "Children's",
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
        'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
        'War', 'Western',
    ]

    # The file has no header row, so the column names are supplied here.
    movie_data = pd.read_csv(
        path,
        sep='|',
        encoding='ISO-8859-1',
        header=None,
        names=movie_data_columns,
        index_col='movie_id',
    )
    movie_data['release_date'] = pd.to_datetime(movie_data['release_date'])
    return movie_data

def load_ratings(path: str = 'datasets/ml-100k/u.data') -> pd.DataFrame:
    """Load the MovieLens user/movie ratings file into a DataFrame.

    Args:
        path: Location of the tab-separated ``u.data`` file. Defaults to the
            location this notebook has always read from.

    Returns:
        A DataFrame with the columns ``user_id``, ``movie_id``, ``rating``
        and ``timestamp``, one row per rating.
    """
    # The file has no header row, so the column names are supplied here.
    ratings_data = pd.read_csv(
        path,
        sep='\t',
        encoding='ISO-8859-1',
        header=None,
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
    )
    return ratings_data

In [None]:
# Load the movie metadata and the user/movie ratings used by every cell below.
movie_data = load_movies_dataset()
ratings_data = load_ratings()

In [None]:
# Preview the first rows of the movie metadata.
movie_data.head()

In [None]:
# Preview the first 10 ratings.
ratings_data.head(10)

In [None]:
# Largest user id present in the ratings.
ratings_data['user_id'].max()

# Ratings dataset

Contains the **interactions** between users and movies

- User **196** rated movie **242** with a score of **3** 
- User **186** rated movie **302** with a score of **3** 
- User **22** rated movie **377** with a score of **1** 

In [None]:
# Summary statistics of the ratings given to movie 1.
ratings_data.loc[ratings_data['movie_id'] == 1, 'rating'].describe()

> NOW SOLVE!!!!

# 해답) 문제 풀이

In [None]:
from surprise import SVD, NMF, accuracy
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split

# Surprise has some preset datasets, including ml-100k!
# data = Dataset.load_builtin('ml-100k')

# Ratings are on a 1-5 scale; only the user, movie and rating columns are handed to Surprise.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_data[['user_id', 'movie_id', 'rating']], reader)

# Hold out 25% of the ratings for evaluation. No random_state is set, so the split differs between runs.
trainset, testset = train_test_split(data, test_size=.25)

# Train an SVD matrix factorization with 100 latent factors and no bias terms (biased=False).
# Note: this is not a non-negative factorization; NMF (imported above) would be the non-negative variant.
model = SVD(n_factors=100, biased=False)
model.fit(trainset)

# Score the held-out ratings and report the RMSE as a check that the model has learned something.
predictions = model.test(testset)
accuracy.rmse(predictions)

## Inspecting our Product Matrix

Surprise SVD stores the product matrix under the `model.qi` attribute.

In [None]:
# Show the first 10 rows of the item-factor matrix as a DataFrame.
pd.DataFrame(model.qi).head(10)

## Exploring the product matrix

The matrix has `n_factors` columns (we chose 100). Every row represents a movie.

In [None]:
# model.qi was fitted on the training split only, so its row count can be lower
# than the number of unique movies in the full ratings table.
print(f"The shape of our product matrix is {model.qi.shape}.")
print(f"There are {len(ratings_data['movie_id'].unique())} unique movies")

## Generating predictions with simplicity

Before looking into the latent features of our movies, let's use the API provided by Surprise. More specifically, Surprise provides us 1 API

 - `model.predict` computes the rating prediction for given user and movie
 
Let's look at how we can use this API to generate movies that a given user may like

```python
>>> model.predict('302', '1')
Prediction(uid=302, iid=1, r_ui=None, est=3.5327866666666665, details={'was_impossible': False})
```

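NOTE: The user ID and movie ID passed to `model.predict` must have the **same type as the ids in the training data**. This notebook loads the ratings from a DataFrame with integer ids, so pass integers (`model.predict(302, 1)`). String ids are only correct when the data was loaded from a file (e.g. `Dataset.load_builtin`). With mismatched types, as in the example above, Surprise treats the user and movie as unknown and the estimate falls back to the global mean rating.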

In [None]:
# Show the metadata of movie 1 (Toy Story), then predict the rating user 196 would give it.
# The ids are passed as ints to match the id columns of ratings_data that the model was built from.
print(movie_data.loc[1])
print()
user_score_prediction = model.predict(196, 1)
print(user_score_prediction)
# .est is the estimated rating carried by the returned prediction.
print(f"\n\nUSER 196 gives Toy Story: {user_score_prediction.est}")

## Recommend 출력 함수 만들기

In [None]:
# Lookup table from movie_id (the index of movie_data) to the movie title.
movie_id_to_title_map = movie_data['title'].to_dict()

In [None]:
def generate_recommended_movies_for_user(model, user_id, movie_titles=None):
    """Predict the user's rating for every movie in a catalogue.

    Args:
        model: Fitted model exposing ``predict(user_id, movie_id)`` whose
            result has an ``est`` attribute (e.g. a Surprise algorithm).
        user_id: Id of the user to score movies for, in the same type as the
            ids the model was trained on.
        movie_titles: Optional mapping of movie id to title to score. When
            omitted, the notebook-level ``movie_id_to_title_map`` is used.

    Returns:
        A DataFrame indexed by ``movie_id`` with the columns
        'Estimated Prediction' and 'Movie Title', one row per movie in the
        catalogue, in catalogue order (movies the user already rated are
        included).
    """
    if movie_titles is None:
        # Fall back to the catalogue built from movie_data earlier in the notebook.
        movie_titles = movie_id_to_title_map

    rows = [
        (movie_id, model.predict(user_id, movie_id).est, movie_title)
        for movie_id, movie_title in movie_titles.items()
    ]
    columns = ['movie_id', 'Estimated Prediction', 'Movie Title']
    return pd.DataFrame(rows, columns=columns).set_index('movie_id')


def display_best_and_worse_recommendations(recommendations: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
    """Return the highest-scored recommendations, best first.

    Despite the name, only the best rows are returned; the worst
    recommendations are not part of the result.

    Args:
        recommendations: Frame with an 'Estimated Prediction' column, such as
            the one built by ``generate_recommended_movies_for_user``.
        top_n: Number of rows to keep. Defaults to 10.

    Returns:
        A new DataFrame with the ``top_n`` rows ranked by descending
        prediction, where 'Estimated Prediction' is renamed to
        'Prediction (sorted by best)'. The input frame is left unchanged.
    """
    # Sort into a new frame so the caller's DataFrame is not reordered.
    ranked = recommendations.sort_values('Estimated Prediction', ascending=False)
    return ranked.head(top_n).rename(
        columns={'Estimated Prediction': 'Prediction (sorted by best)'}
    )

In [None]:
# Score every movie for user 302 and show the highest-scored ones.
recommendations = generate_recommended_movies_for_user(model, 302)
display_best_and_worse_recommendations(recommendations)

## 내가 좋아하는 영화 고르고, 데이터에 추가해서 추천 영화 뽑기

In [None]:
# I only know recent movies, so look at the 100 most recently released titles.
movie_data.sort_values('release_date', ascending=False).head(100)

In [None]:
# Copy the 200 most recently released movies to the clipboard (tab-separated)
# so the favourites can be picked in Excel.
movie_data.sort_values('release_date', ascending=False).head(200).to_clipboard(sep='\t')

In [None]:
# Ids of the movies I picked as my favourites, followed by a look at their titles.
my_movie_lst = pd.Series(
    [916, 355, 350, 258, 298, 252, 987, 250],
    name='movie_id',
)
movie_data.loc[my_movie_lst, ['title', 'release_date']]

In [None]:
# Rate every favourite movie 5 on behalf of a new user with id 1000.
ratings_attach = pd.DataFrame({
    'user_id': 1000,
    'movie_id': my_movie_lst,
    'rating': 5,
})

In [None]:
# Append my ratings to the original ones and renumber the rows.
# ratings_attach has no timestamp column, so those rows get NaN there.
ratings_data_ = pd.concat([ratings_data, ratings_attach], ignore_index=True)

In [None]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_data_[['user_id', 'movie_id', 'rating']], reader)

# The model used for recommendations is fitted on ALL ratings. A random
# hold-out could remove some (or all) of the few ratings just added for the
# new user, leaving the model with little or nothing to personalise on.
# This is a plain SVD without bias terms (biased=False), not a non-negative one.
model = SVD(n_factors=100, biased=False)
model.fit(data.build_full_trainset())

# Accuracy is estimated separately: a model with the same settings is trained
# on 75% of the ratings and scored on the held-out 25%.
trainset, testset = train_test_split(data, test_size=.25)
eval_model = SVD(n_factors=100, biased=False)
eval_model.fit(trainset)

predictions = eval_model.test(testset)
accuracy.rmse(predictions)

In [None]:
# Score every movie for myself (user_id 1000) and show the highest-scored ones.
recommendations = generate_recommended_movies_for_user(model, 1000)
display_best_and_worse_recommendations(recommendations)