In [1]:
import pandas as pd

# 전처리
## u.user : 사용자

In [5]:
# Column labels for u.user; they are passed via `names=` because the labels
# come from this list rather than from the file. (Korean labels could be used
# instead: ID, age, gender, occupation, zip code.)
user_col = ['user_id', 'age', 'gender', 'job', 'zip_code']

# u.user is pipe-delimited
users = pd.read_csv('u.user', names=user_col, sep='|')
users.head(3)

Unnamed: 0,user_id,age,gender,job,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067


In [6]:
# Check the column labels
users.columns

Index(['user_id', 'age', 'gender', 'job', 'zip_code'], dtype='object')

In [8]:
# Change the index: promote the user_id column to the row index.
# Rebinding the name (instead of inplace=True) yields the same `users` frame.
users = users.set_index('user_id')

In [9]:
# Preview the first two rows now that user_id is the index
users.head(2)

Unnamed: 0_level_0,age,gender,job,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043


In [10]:
# Check the column labels (user_id is now the index, so it is no longer listed)
users.columns

Index(['age', 'gender', 'job', 'zip_code'], dtype='object')

In [12]:
# Summarize index range, per-column non-null counts and dtypes
users.info()

<class 'pandas.core.frame.DataFrame'>
Index: 943 entries, 1 to 943
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       943 non-null    int64 
 1   gender    943 non-null    object
 2   job       943 non-null    object
 3   zip_code  943 non-null    object
dtypes: int64(1), object(3)
memory usage: 36.8+ KB


In [13]:
# Save the frame to users.csv (UTF-8)
users.to_csv('users.csv', encoding='utf-8')

## u.item : 영화 정보
- 영화아이디, 제목, 개봉날짜, IMDb URL, 장르(장르별 0/1 플래그 컬럼)

In [18]:
# Labels for the 24 pipe-separated fields of u.item.
# 'a' and 'b' are placeholder names for two fields that a later cell drops
# (presumably the video release date and the "unknown" genre flag —
# TODO confirm against the dataset README).
item_col = [
    'movie_id', 'title', 'release date', 'a', 'imdb url', 'b',
    'action', 'adventure', 'animation', 'children', 'comedy', 'crime',
    'documentary', 'drama', 'fantasy', 'film-noir', 'horror', 'musical',
    'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western',
]

# The file is read as latin-1
movies = pd.read_csv('u.item', names=item_col, sep='|', encoding='latin-1')
movies.head(2)

Unnamed: 0,movie_id,title,release date,a,imdb url,b,action,adventure,animation,children,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
# Summarize per-column non-null counts and dtypes
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movie_id      1682 non-null   int64  
 1   title         1682 non-null   object 
 2   release date  1681 non-null   object 
 3   a             0 non-null      float64
 4   imdb url      1679 non-null   object 
 5   b             1682 non-null   int64  
 6   action        1682 non-null   int64  
 7   adventure     1682 non-null   int64  
 8   animation     1682 non-null   int64  
 9   children      1682 non-null   int64  
 10  comedy        1682 non-null   int64  
 11  crime         1682 non-null   int64  
 12  documentary   1682 non-null   int64  
 13  drama         1682 non-null   int64  
 14  fantasy       1682 non-null   int64  
 15  film-noir     1682 non-null   int64  
 16  horror        1682 non-null   int64  
 17  musical       1682 non-null   int64  
 18  mystery       1682 non-null 

In [21]:
# Drop the placeholder columns a and b.
# Rebinding the name (instead of inplace=True) yields the same `movies` frame.
movies = movies.drop(columns=['a', 'b'])

In [22]:
# Check the remaining column labels after the drop
movies.columns

Index(['movie_id', 'title', 'release date', 'imdb url', 'action', 'adventure',
       'animation', 'children', 'comedy', 'crime', 'documentary', 'drama',
       'fantasy', 'film-noir', 'horror', 'musical', 'mystery', 'romance',
       'sci-fi', 'thriller', 'war', 'western'],
      dtype='object')

In [24]:
# Promote movie_id to the row index so titles can be looked up by id with .loc
movies = movies.set_index('movie_id')

In [26]:
# Save the frame to movies.csv (UTF-8); the movie_id index is written as the first column
movies.to_csv('movies.csv', encoding='utf-8')

## u.data : 영화 평점
- 사용자 아이디, 영화 아이디, 평점, timestamp
- timestamp : 평점을 매긴 시각을 나타내는 유닉스 타임스탬프 (1970-01-01 UTC 이후 경과한 초, 예: 881250949)

In [31]:
data_col = ['user_id', 'movie_id', 'rating', 'timestamp']

# Read the tab-separated ratings and set user_id as the index in one chain
ratings = pd.read_csv('u.data', names=data_col, sep='\t').set_index('user_id')

ratings.head(2)

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742


In [34]:
# Sort rows by the index (user_id) in ascending order
ratings = ratings.sort_index()
ratings.head(3)

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,55,5,875072688
1,203,4,878542231
1,183,5,875072262


In [35]:
# Save the frame to ratings.csv (UTF-8); the user_id index is written as the first column
ratings.to_csv('ratings.csv', encoding='utf-8')

# 추천 시스템
## best-seller 
- 가장 인기있는 제품(상품)을 추천
- 사용자 개인(개별) 정보가 없을 때 사용
- 각 영화의 평점 평균을 구한 후 평균값이 높은 것을 순서대로 추천

In [39]:
# Mean rating per movie: a Series indexed by movie_id
movie_mean = ratings.groupby('movie_id')['rating'].mean()

In [42]:
# Print every mean rating, one per line, in movie_mean's order.
# NOTE: this writes one line per movie; movie_mean.head() gives a shorter preview.
for mean_rating in movie_mean:
    print(mean_rating)

3.8783185840707963
3.2061068702290076
3.033333333333333
3.550239234449761
3.302325581395349
3.576923076923077
3.798469387755102
3.9954337899543377
3.8963210702341136
3.831460674157303
3.847457627118644
4.385767790262173
3.4184782608695654
3.9672131147540983
3.7781569965870307
3.2051282051282053
3.119565217391304
2.8
3.9565217391304346
3.4166666666666665
2.761904761904762
4.151515151515151
4.1208791208791204
3.4482758620689653
3.4436860068259385
3.452054794520548
3.1052631578947367
3.931159420289855
2.6666666666666665
3.945945945945946
3.6298701298701297
3.7901234567901234
3.4536082474226806
2.7142857142857144
2.1818181818181817
2.1538461538461537
2.25
3.0083333333333333
3.264367816091954
2.8947368421052633
3.081081081081081
3.804054054054054
3.0
3.3417721518987342
4.05
3.5555555555555554
3.601503759398496
4.094017094017094
3.3209876543209877
4.3584905660377355
3.45679012345679
3.769230769230769
2.953125
3.2403846153846154
3.704697986577181
4.060913705583756
4.0
3.645714285714286
4.0602

In [46]:
# Ten highest mean ratings, best first
movie_mean.sort_values(ascending=False).head(10)

movie_id
814     5.0
1599    5.0
1201    5.0
1122    5.0
1653    5.0
1293    5.0
1500    5.0
1189    5.0
1536    5.0
1467    5.0
Name: rating, dtype: float64

In [48]:
# Look up the titles of several movies by movie_id.
# A single .loc[rows, column] call replaces the two chained forms
# movies.loc[[2, 4]]['title'] and movies.loc[[2, 4]].title: both give the same
# Series, but only the last expression of a cell is displayed, so the first
# one was evaluated and thrown away.
movies.loc[[2, 4], 'title']

movie_id
2     GoldenEye (1995)
4    Get Shorty (1995)
Name: title, dtype: object

In [49]:
# Best-seller recommendation
# Function definition
def recom_movie(n_items, min_ratings=0):
    """Return the titles of the ``n_items`` movies with the highest mean rating.

    Reads the notebook-level ``movie_mean`` Series (mean rating per movie_id)
    and ``movies`` DataFrame (indexed by movie_id, with a ``title`` column).

    Parameters
    ----------
    n_items : int
        Number of titles to return.
    min_ratings : int, default 0
        Minimum number of ratings a movie needs to be eligible. The default
        keeps every movie, i.e. the plain mean-rating ranking. A positive
        value also reads the notebook-level ``ratings`` frame to count ratings
        per movie, so a movie rated 5 by only one or two users cannot outrank
        widely rated ones.

    Returns
    -------
    pandas.Series
        Titles indexed by movie_id, ordered by descending mean rating.
    """
    candidates = movie_mean
    if min_ratings > 0:
        rating_counts = ratings.groupby('movie_id')['rating'].count()
        # Align the counts to movie_mean's index so the boolean mask always matches it
        rating_counts = rating_counts.reindex(movie_mean.index, fill_value=0)
        candidates = movie_mean[rating_counts >= min_ratings]
    top_means = candidates.sort_values(ascending=False).head(n_items)
    # Select rows and the title column in one .loc call (no chained indexing)
    return movies.loc[top_means.index, 'title']

In [51]:
# Recommend the top 15 movies
# Function call
recom_movie(15)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
1293                                      Star Kid (1997)
1500                            Santa with Muscles (1996)
1189                                   Prefontaine (1997)
1536                                 Aiqing wansui (1994)
1467                 Saint of Fort Washington, The (1993)
1449                               Pather Panchali (1955)
119                Maya Lin: A Strong Clear Vision (1994)
1398                                          Anna (1996)
1642                             Some Mother's Son (1996)
1594                                       Everest (1998)
Name: title, dtype: object