In [2]:
# 분석 절차(PDCNLDNSAER)
# 1. Package import
# 2. Data loading
# 3. Column select: 구조 파악 및 필요한 컬럼 선별
# 4. NaN: 결측치 처리
# 5. Label encoding: 범주형 변수의 변환
# 6. Derivative variable: 파생 변수 만들기
# 7. Normal: 정규화
# 8. Split: 데이터 분할
# 9. Analysis: 분석
# 10. Evaluation: 평가
# 11. Result save: 결과 저장

In [2]:
# NPTLSMRRXX
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
# from xgboost import XGBClassifier
# from xgboost import XGBRegressor

In [3]:
# Load the pipe-delimited u.user file into a DataFrame keyed by user_id.
# Column names are supplied explicitly through `names`.
u_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = (
    pd.read_csv('./data/u.user', sep='|', names=u_cols, encoding='utf-8')
    .set_index('user_id')
)
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [4]:
# Load the pipe-delimited u.item file into a DataFrame keyed by movie_id.
# The first five names are descriptive fields; the remaining ones are the
# per-genre columns.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = (
    pd.read_csv('./data/u.item', sep='|', names=i_cols, encoding='utf-8')
    .set_index('movie_id')
)
movies.head()

Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# Load the tab-delimited u.data file into a DataFrame keyed by user_id.
# Because a user can rate many movies, the user_id index is not unique.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = (
    pd.read_csv('./data/u.data', sep='\t', names=r_cols, encoding='utf-8')
    .set_index('user_id')
)
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


In [6]:
# Boolean mask selecting the rating rows that belong to user 196
# (compares every user_id in the index against 196).
cdt = ratings.index == 196
print(cdt)

[ True False False ... False False False]


In [10]:
# Keep only the rows flagged True in the boolean mask `cdt`.
ratings.loc[cdt]

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
196,393,4,881251863
196,381,4,881251728
196,251,3,881251274
196,655,5,881251793
196,67,5,881252017
196,306,4,881251021
196,238,4,881251820
196,663,5,881251911
196,111,4,881251793


In [11]:
# Movie ids rated by user 196, fetched with one .loc[row, column] lookup
# and reused for both the type check and the value dump.
user_196_movie_ids = ratings.loc[196, 'movie_id']
print(type(user_196_movie_ids))
print(user_196_movie_ids)

<class 'pandas.core.series.Series'>
user_id
196     242
196     393
196     381
196     251
196     655
196      67
196     306
196     238
196     663
196     111
196     580
196      25
196     286
196      94
196     692
196       8
196     428
196    1118
196      70
196      66
196     257
196     108
196     202
196     340
196     287
196     116
196     382
196     285
196    1241
196    1007
196     411
196     153
196      13
196     762
196     173
196    1022
196     845
196     269
196     110
Name: movie_id, dtype: int64


In [13]:
# Rating values given by user 196, fetched with one .loc[row, column] lookup.
print(ratings.loc[196, 'rating'])

user_id
196    3
196    4
196    4
196    3
196    5
196    5
196    4
196    4
196    5
196    4
196    2
196    4
196    5
196    3
196    5
196    5
196    4
196    4
196    3
196    3
196    2
196    4
196    3
196    3
196    3
196    3
196    4
196    5
196    3
196    4
196    4
196    5
196    2
196    3
196    2
196    4
196    4
196    3
196    1
Name: rating, dtype: int64


In [None]:
# Rating values given by user 196.
# The original line had a comma instead of a dot (`ratings, loc[...]`), which
# referenced an undefined name `loc` and raised NameError.
print(ratings.loc[196, 'rating'])

In [14]:
# Structural overview of the ratings frame: index range, columns, non-null
# counts, dtypes and memory usage.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 196 to 12
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   movie_id   100000 non-null  int64
 1   rating     100000 non-null  int64
 2   timestamp  100000 non-null  int64
dtypes: int64(3)
memory usage: 5.1 MB


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# column of the ratings frame.
ratings.describe()

Unnamed: 0,movie_id,rating,timestamp
count,100000.0,100000.0,100000.0
mean,425.53013,3.52986,883528900.0
std,330.798356,1.125674,5343856.0
min,1.0,1.0,874724700.0
25%,175.0,3.0,879448700.0
50%,322.0,4.0,882826900.0
75%,631.0,4.0,888260000.0
max,1682.0,5.0,893286600.0


In [18]:
# 영화별 평균 평점
movie_mean=ratings.groupby(['movie_id'])['rating'].mean()
print(type(movie_mean))
print(movie_mean.head())

<class 'pandas.core.series.Series'>
movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64


In [19]:
# Label-based slice on the movie_id index; .loc includes both endpoints,
# so this shows the mean rating of movies 1 through 3.
movie_mean.loc[1:3]

movie_id
1    3.878319
2    3.206107
3    3.033333
Name: rating, dtype: float64

In [20]:
#가장 평점 높은 영화 20건 출력
movie_mean.sort_values(ascending=False)[:20]

movie_id
814     5.000000
1599    5.000000
1201    5.000000
1122    5.000000
1653    5.000000
1293    5.000000
1500    5.000000
1189    5.000000
1536    5.000000
1467    5.000000
1449    4.625000
119     4.500000
1398    4.500000
1642    4.500000
1594    4.500000
408     4.491071
318     4.466443
169     4.466102
483     4.456790
114     4.447761
Name: rating, dtype: float64

In [21]:
#가장 평점 높은 영화명 20건 출력
top20=movie_mean.sort_values(ascending=False)[:20].index
print(top20)
print(movies.loc[top20][['title']])

Int64Index([ 814, 1599, 1201, 1122, 1653, 1293, 1500, 1189, 1536, 1467, 1449,
             119, 1398, 1642, 1594,  408,  318,  169,  483,  114],
           dtype='int64', name='movie_id')
                                                      title
movie_id                                                   
814                           Great Day in Harlem, A (1994)
1599                          Someone Else's America (1995)
1201             Marlene Dietrich: Shadow and Light (1996) 
1122                         They Made Me a Criminal (1939)
1653      Entertaining Angels: The Dorothy Day Story (1996)
1293                                        Star Kid (1997)
1500                              Santa with Muscles (1996)
1189                                     Prefontaine (1997)
1536                                   Aiqing wansui (1994)
1467                   Saint of Fort Washington, The (1993)
1449                                 Pather Panchali (1955)
119                  Maya Lin: A

In [22]:
# 평점이 가장 높은 영화 수를 전달받아 영화 정보를 리턴하는 함수 제작
def recom_movie1(k):
    top=movie_mean.sort_values(ascending=False)[:k].index
    print(movies.loc[top][['title']])
recom_movie1(5)

                                                      title
movie_id                                                   
814                           Great Day in Harlem, A (1994)
1599                          Someone Else's America (1995)
1201             Marlene Dietrich: Shadow and Light (1996) 
1122                         They Made Me a Criminal (1939)
1653      Entertaining Angels: The Dorothy Day Story (1996)
