# 영화 평점 분석 실습

실습관련 YouTube 영상  
https://youtu.be/krmthaX9WD4  
https://youtu.be/bbSDVNYUmb4  
https://youtu.be/JOska9sZVrw

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## 영화 평점 데이터 적재 및 전처리

In [3]:
# Load the user table. Column names are supplied here through `names`,
# and the fields are separated by '::'; a separator longer than one
# character is handled by the python parsing engine.
users = pd.read_csv(
    'data/movielens/users.dat',
    sep='::',
    engine='python',
    names=['사용자아이디', '성별', '연령', '직업', '지역'],
    # Zip codes are labels, not quantities: read them as text so a leading
    # zero (e.g. '02460') is not dropped by integer type inference.
    dtype={'지역': str},
)
users.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
# Load the ratings table (user id, movie id, rating, timestamp).
# The '::' separator is longer than one character, so the python engine is used.
ratings = pd.read_csv(
    'data/movielens/ratings.dat',
    names=['사용자아이디', '영화아이디', '평점', '타임스탬프'],
    engine='python',
    sep='::',
)
ratings.head()

Unnamed: 0,사용자아이디,영화아이디,평점,타임스탬프
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# Load the movie table (movie id, title, '|'-joined genre string).
# The file is decoded as latin-1 rather than the default encoding.
movies = pd.read_csv(
    'data/movielens/movies.dat',
    names=['영화아이디', '영화제목', '장르'],
    encoding='latin-1',
    engine='python',
    sep='::',
)
movies.head()

Unnamed: 0,영화아이디,영화제목,장르
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Step 1 of combining the three tables: attach each rating to the profile
# of the user who gave it, matching rows on the shared user-id column.
data = users.merge(ratings, on='사용자아이디')
data.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프
0,1,F,1,10,48067,1193,5,978300760
1,1,F,1,10,48067,661,3,978302109
2,1,F,1,10,48067,914,3,978301968
3,1,F,1,10,48067,3408,4,978300275
4,1,F,1,10,48067,2355,5,978824291


In [7]:
# Step 2: add the title and genre of each rated movie, matching on movie id.
data = data.merge(movies, on='영화아이디')
data.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역,영화아이디,평점,타임스탬프,영화제목,장르
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


## 보고 싶은 영화 찾기 (평점 높은 영화 찾기)

In [8]:
# Number of distinct movie titles among the rated movies.
len(data['영화제목'].unique())

3706

In [9]:
# Number of distinct movie ids; when this equals the distinct-title count,
# no two movie ids share a title, so titles can be used as a grouping key.
len(data['영화아이디'].unique())

3706

In [10]:
# Mean rating per movie, highest first (top 10).
# Grouping by title is safe here because titles were checked to be unique per movie.
(
    data
    .pivot_table(values='평점', index='영화제목', aggfunc='mean')
    .sort_values(by='평점', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,평점
영화제목,Unnamed: 1_level_1
Ulysses (Ulisse) (1954),5.0
Lured (1947),5.0
Follow the Bitch (1998),5.0
Bittersweet Motel (2000),5.0
Song of Freedom (1936),5.0
One Little Indian (1973),5.0
Smashing Time (1967),5.0
Schlafes Bruder (Brother of Sleep) (1995),5.0
"Gate of Heavenly Peace, The (1995)",5.0
"Baby, The (1973)",5.0


In [11]:
# Per-title mean rating together with the number of ratings.
영화평점 = data.pivot_table(
    values=['평점'],
    index='영화제목',
    aggfunc=['mean', 'count'],
)
# Replace the two-level (statistic, value) column labels with flat names.
영화평점.columns = ['평균', '갯수']

In [12]:
# Preview the first five rows of the per-title mean/count table.
영화평점.head(5)

Unnamed: 0_level_0,평균,갯수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.027027,37
'Night Mother (1986),3.371429,70
'Til There Was You (1997),2.692308,52
"'burbs, The (1989)",2.910891,303
...And Justice for All (1979),3.713568,199


In [13]:
# Movies with at least 2000 ratings and a mean rating above 4.3,
# sorted by mean rating in descending order.
영화평점[(영화평점['갯수'] >= 2000) & (영화평점['평균'] > 4.3)].sort_values(by='평균', ascending=False)

Unnamed: 0_level_0,평균,갯수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
"Shawshank Redemption, The (1994)",4.554558,2227
"Godfather, The (1972)",4.524966,2223
Schindler's List (1993),4.510417,2304
Raiders of the Lost Ark (1981),4.477725,2514
Star Wars: Episode IV - A New Hope (1977),4.453694,2991
"Sixth Sense, The (1999)",4.406263,2459
"Silence of the Lambs, The (1991)",4.351823,2578
Saving Private Ryan (1998),4.337354,2653
American Beauty (1999),4.317386,3428
"Matrix, The (1999)",4.31583,2590


## (실습) 여자들이 좋아하는 영화 찾기 (여성 평점 평균이 4.0 이상이고 여성 평점의 개수가 500개 이상인 영화)

In [16]:
# Keep only the ratings given by female users.
ex1 = data.loc[data['성별'] == 'F']

In [22]:
# Per-title mean rating and rating count, computed over the rows currently in ex1.
ex1 = ex1.pivot_table(
    values='평점',
    index='영화제목',
    aggfunc=['mean', 'count'],
)

In [23]:
# Give the two aggregate columns flat names: female mean rating, female rating count.
ex1.columns = ['여성평점평균','여성평점개수']

In [25]:
# Titles with at least 500 ratings in ex1 and a mean rating of 4.0 or higher.
여성인기영화 = ex1.loc[(ex1['여성평점개수'] >= 500) & (ex1['여성평점평균'] >= 4.0)]

In [26]:
# Alternative approach: aggregate ratings per title for every gender at once,
# which yields (statistic, gender) column pairs.
ex1_1 = data.pivot_table(
    values='평점',
    index='영화제목',
    columns='성별',
    aggfunc=['mean', 'count'],
)

In [28]:
# Titles whose female mean rating is at least 4.0 with at least 500 female ratings.
ex1_1.loc[(ex1_1['mean', 'F'] >= 4.0) & (ex1_1['count', 'F'] >= 500)]

Unnamed: 0_level_0,mean,mean,count,count
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
American Beauty (1999),4.238901,4.347301,946.0,2482.0
Being John Malkovich (1999),4.15993,4.113636,569.0,1672.0
Braveheart (1995),4.016484,4.297839,546.0,1897.0
Casablanca (1942),4.30099,4.46134,505.0,1164.0
E.T. the Extra-Terrestrial (1982),4.08985,3.920264,601.0,1668.0
Fargo (1996),4.217656,4.26778,657.0,1856.0
Forrest Gump (1994),4.045031,4.105806,644.0,1550.0
L.A. Confidential (1997),4.106007,4.256678,566.0,1722.0
"Matrix, The (1999)",4.128405,4.362235,514.0,2076.0
"Princess Bride, The (1987)",4.342767,4.288942,636.0,1682.0


### (심화) 여자들이 어떤 장르의 영화를 좋아하는지 찾아보자.(위에서 찾은 여성들이 좋아하는 영화들의 장르를 찾아낸 후, 어떤 장르가 많았는지 확인)

In [30]:
# Display the table of titles selected by female mean rating and rating count.
여성인기영화

Unnamed: 0_level_0,여성평점평균,여성평점개수
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
American Beauty (1999),4.238901,946
Being John Malkovich (1999),4.15993,569
Braveheart (1995),4.016484,546
Casablanca (1942),4.30099,505
E.T. the Extra-Terrestrial (1982),4.08985,601
Fargo (1996),4.217656,657
Forrest Gump (1994),4.045031,644
L.A. Confidential (1997),4.106007,566
"Matrix, The (1999)",4.128405,514
"Princess Bride, The (1987)",4.342767,636


In [31]:
# Show the raw genre column of the movie table.
movies.장르

0          Animation|Children's|Comedy
1         Adventure|Children's|Fantasy
2                       Comedy|Romance
3                         Comedy|Drama
4                               Comedy
5                Action|Crime|Thriller
6                       Comedy|Romance
7                 Adventure|Children's
8                               Action
9            Action|Adventure|Thriller
10                Comedy|Drama|Romance
11                       Comedy|Horror
12                Animation|Children's
13                               Drama
14            Action|Adventure|Romance
15                      Drama|Thriller
16                       Drama|Romance
17                            Thriller
18                              Comedy
19                              Action
20                 Action|Comedy|Drama
21                Crime|Drama|Thriller
22                            Thriller
23                        Drama|Sci-Fi
24                       Drama|Romance
25                       

In [33]:
# Genre strings indexed by movie title, so genres can be looked up by title.
movies2 = movies[['영화제목', '장르']].set_index('영화제목')['장르']

In [38]:
# Align the selected-title table with the title-indexed genre lookup,
# keeping only titles present in both, then keep just the genre column.
ex2 = pd.concat(
    [여성인기영화, movies2],
    join='inner',
    axis=1,
)['장르']

In [42]:
# Break each '|'-separated genre string into one column per genre position.
ex2 = ex2.str.split('|', expand=True)

In [44]:
# Frequency of each genre among the values in the first split column only.
ex2[0].value_counts()

Action        7
Comedy        4
Drama         4
Crime         3
Animation     1
Thriller      1
Adventure     1
Children's    1
Name: 0, dtype: int64

In [45]:
# Add the genre counts of split columns 0 and 4; fill_value=0 keeps genres
# that appear in only one of the two columns instead of turning them into NaN.
ex2[0].value_counts().add(
    ex2[4].value_counts(),
    fill_value=0,
)

Action        7.0
Adventure     1.0
Animation     1.0
Children's    1.0
Comedy        4.0
Crime         3.0
Drama         4.0
Thriller      1.0
War           1.0
dtype: float64

In [54]:
# Total genre counts across every split column of ex2.
# Start from an explicitly float64 empty Series: constructing an empty Series
# without a dtype is deprecated and its default dtype differs between pandas
# versions, which would change the dtype of the accumulated totals.
여성인기장르 = Series(dtype='float64')
for col in ex2.columns:
    # fill_value=0 keeps genres that are missing from either side of the sum.
    여성인기장르 = 여성인기장르.add(ex2[col].value_counts(), fill_value=0)
여성인기장르

Action         7.0
Adventure      5.0
Animation      1.0
Children's     3.0
Comedy         6.0
Crime          3.0
Drama         12.0
Fantasy        2.0
Film-Noir      1.0
Musical        1.0
Mystery        1.0
Romance        4.0
Sci-Fi         4.0
Thriller       5.0
War            6.0
dtype: float64

## (실습) 남자와 여자의 호불호가 크게 갈리는 영화 10개를 출력 
### 전체 평점의 개수가 500개 이상인 영화만 대상으로 함.

## (실습) 연령대 별로 영화 평점 분석하기
### 연령대(10대 미만, 10대, 20대, ...50대) 컬럼을 추가한 후, 연령대로 집계를 수행하여 영화별 연령대별 영화평점 구하기