In [160]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Практика Python
В данном разделе мы выступим в роли data scientist и попытаемся построить простую модель для рекомендации фильмов пользователям.

## Загрузка данных
Загружаю дата-сеты по оценкам и фильмам 

In [129]:
ratings_df_columns = ['user_id', 'item_id', 'rating', 'timestamp']
movies_df_columns = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb URL', 'unknow', 
    'Action', 'Adventure', 'Animation', 'Children s', 'Comedy', 'Crime', 'Documentary', 'Drama',
    'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

In [130]:
ratings_df = pd.read_csv(
    './Data/u.data.csv', 
    sep='\t', 
    names=ratings_df_columns
)

movies_df = pd.read_csv(
    './Data/u.item.csv', 
    sep='|',
    names=movies_df_columns,
    encoding='latin-1'
)

In [131]:
movies_df.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb URL,unknow,Action,Adventure,Animation,Children s,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Найти пользователя, который поставил больше всех оценок.

In [132]:
ratings_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [133]:
ratings_df.groupby('user_id').agg({'rating': 'sum'}).sort_values(by='rating', ascending=False).head(1)

Unnamed: 0_level_0,rating
user_id,Unnamed: 1_level_1
450,2087


## Оставить в датафрейме ratings только те фильмы, который оценил данный пользователь 

In [134]:
ratings_the_best_user = ratings_df[ratings_df['user_id'] == 450]

## Добавить к датафрейму столбцы
Для построения модели нам нужны признаки. В качестве таковых будем использовать:
* Год выхода
* Жанры
* Общее количество оценок
* Суммарную оценку


По жанрам. Каждый столбец - это жанр. 
* Единицу записываем, если фильм принадлежит данному жанру и 0 - если нет
* Cтолбцы с общим количеством оценок от всех пользователей на фильм и суммарной оценкой от всех пользователей


In [135]:
selected_colun_movies_df = [
    'movie_id', 'release_date','Action', 'Adventure', 'Animation', 'Children s', 'Comedy', 'Crime', 'Documentary', 'Drama',
    'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

In [136]:
ratings_the_best_user = ratings_the_best_user.merge(
    movies_df[selected_colun_movies_df], 
    left_on='item_id', 
    right_on='movie_id'
)

In [137]:
ratings_the_best_user['count_ratings'] = ratings_the_best_user['rating'].count()
ratings_the_best_user['sum_ratings'] = ratings_the_best_user['rating'].sum()
ratings_the_best_user['release_date'] = pd.to_datetime(ratings_the_best_user['release_date'])
ratings_the_best_user['release_date'] = ratings_the_best_user['release_date'].dt.year

## Train and test sample

In [138]:
ratings_the_best_user.head(2)

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,release_date,Action,Adventure,Animation,Children s,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,count_ratings,sum_ratings
0,450,470,5,887139517,470,1993,0,0,0,0,...,0,0,0,0,0,0,0,1,540,2087
1,450,783,3,882399818,783,1994,0,0,0,0,...,0,0,0,1,0,0,0,0,540,2087


In [139]:
X = ratings_the_best_user.drop('rating', axis=1)
y = ratings_the_best_user['rating']

In [140]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Построение модели 

In [142]:
ratings_the_best_user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 540 entries, 0 to 539
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   user_id        540 non-null    int64
 1   item_id        540 non-null    int64
 2   rating         540 non-null    int64
 3   timestamp      540 non-null    int64
 4   movie_id       540 non-null    int64
 5   release_date   540 non-null    int64
 6   Action         540 non-null    int64
 7   Adventure      540 non-null    int64
 8   Animation      540 non-null    int64
 9   Children s     540 non-null    int64
 10  Comedy         540 non-null    int64
 11  Crime          540 non-null    int64
 12  Documentary    540 non-null    int64
 13  Drama          540 non-null    int64
 14  Fantasy        540 non-null    int64
 15  Film-Noir      540 non-null    int64
 16  Horror         540 non-null    int64
 17  Musical        540 non-null    int64
 18  Mystery        540 non-null    int64
 19  Romance 

In [143]:
model_linear_reg = LinearRegression()

In [144]:
model_linear_reg.fit(X_train, y_train)

## Оценка качества модели

In [162]:
y_pred = model_linear_reg.predict(X_test)

In [163]:
mean_squared_error(y_pred, y_test)

0.8654995944809156

In [164]:
import pyspark
from pyspark.sql import SparkSession

In [165]:
spark = SparkSession.builder.master('local[*]').getOrCreate()
sc = spark.sparkContext

## Загрузка данных в Spark

In [225]:
ratings_df_columns = ['user_id', 'item_id', 'rating', 'timestamp']
movies_df_columns = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb URL', 'unknow', 
    'Action', 'Adventure', 'Animation', 'Children s', 'Comedy', 'Crime', 'Documentary', 'Drama',
    'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

In [226]:
movies_df = spark.read.csv('./Data/u.item.csv', 
                    inferSchema=True, 
                    sep='|', 
                    header=False
)

ratings_df = spark.read.csv('./Data/u.data.csv', 
                    inferSchema=True, 
                    sep='\t', 
                    header=False
)

In [232]:
movies_df = movies_df.toDF(*movies_df_columns)
ratings_df = ratings_df.toDF(*ratings_df_columns)

In [233]:
merge_df_movies = ratings_df.join(movies_df, ratings_df['item_id'] == movies_df['movie_id'])

## Средняя оценка каждого фильма

In [229]:
merge_df_movies.groupBy('movie_title').mean('rating').show()

+--------------------+------------------+
|         movie_title|       avg(rating)|
+--------------------+------------------+
|   Annie Hall (1977)| 3.911111111111111|
|Heavenly Creature...|3.6714285714285713|
|       Psycho (1960)| 4.100418410041841|
|Snow White and th...|3.7093023255813953|
|Night of the Livi...|          3.421875|
|When We Were King...| 4.045454545454546|
| If Lucy Fell (1996)|2.7586206896551726|
|    Fair Game (1995)|2.1818181818181817|
| Three Wishes (1995)|3.2222222222222223|
|         Cosi (1996)|               4.0|
|Paris, France (1993)|2.3333333333333335|
|Spanking the Monk...| 3.074074074074074|
|I'll Do Anything ...|               2.6|
|        Mondo (1996)|               3.0|
| Evil Dead II (1987)|3.5168539325842696|
|    Threesome (1994)| 2.838709677419355|
|Last Action Hero ...|2.7457627118644066|
|Reality Bites (1994)| 2.961038961038961|
|Colonel Chabert, ...|               3.5|
|   Blue Chips (1994)|2.6666666666666665|
+--------------------+------------

## Средняя оценка для каждого жанра

In [230]:
movies_genre = [
    'Action', 'Adventure', 'Animation', 'Children s', 'Comedy', 'Crime', 'Documentary', 'Drama',
    'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

In [231]:
merge_df_movies.groupBy(genre).mean('rating').show()

+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------------------+
|Action|Adventure|Animation|Children s|Comedy|Crime|Documentary|Drama|Fantasy|Film-Noir|Horror|Musical|Mystery|Romance|Sci-Fi|Thriller|War|Western|       avg(rating)|
+------+---------+---------+----------+------+-----+-----------+-----+-------+---------+------+-------+-------+-------+------+--------+---+-------+------------------+
|     0|        0|        0|         0|     1|    1|          0|    0|      1|        0|     0|      0|      0|      0|     0|       0|  0|      0| 3.193798449612403|
|     1|        0|        0|         0|     0|    1|          0|    0|      0|        0|     0|      0|      1|      0|     0|       0|  0|      0|          3.359375|
|     0|        0|        0|         0|     0|    0|          0|    0|      0|        1|     0|      0|      0|      0|     0|       0|  0|      0| 4.119402985074627

## Получние двух датафреймов с 5-ю самыми популярными и самыми непопулярными фильмами по количеству оценок.

In [287]:
most_popular_films = merge_df_movies.groupBy('movie_title').count()
most_popular_films = most_popular_films.sort(most_popular_films['count'].desc()).take(5)

In [290]:
most_not_popular_films = merge_df_movies.groupBy('movie_title').count()
most_not_popular_films = most_not_popular_films.sort(most_not_popular_films['count'].asc()).take(5)