# Tree 기반 모델 예제

## 데이터 전처리

In [177]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics.pairwise import cosine_similarity
from typing import List

In [178]:
data_path = '../../../data/anime/'

# Read the dataset
animes = pd.read_csv(data_path + 'anime.csv')
ratings = pd.read_csv(data_path + 'rating.csv')

In [179]:
animes.sample(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
3790,1755,Di Gi Charat Ohanami Special,Comedy,Special,4,6.88,1553
1803,13333,Tari Tari,"Music, School, Slice of Life",TV,13,7.44,80960
5375,8026,Super Street Fighter IV,"Action, Adventure, Shounen",OVA,1,6.48,4498
3044,3620,Blue Seed Omake,"Comedy, Parody",Special,14,7.09,1952
7607,30334,Sleepy,Dementia,Movie,1,5.25,111
7524,32813,2010,Music,Music,1,5.34,217
8862,9882,High School Mystery: Gakuen Nanafushigi,"Horror, Mystery",TV,41,7.14,344
9745,21491,Ninjaman Ippei,"Action, Comedy, School, Slice of Life",TV,13,6.56,75
8077,17501,Abe George Kattobi Seishun Ki: Shibuya Honky Tonk,"Drama, Ecchi",OVA,4,4.79,175
2899,19697,Toaru Kagaku no Railgun S: Motto Marutto Railgun,Comedy,Special,2,7.14,14807


In [180]:
ratings.sample(10)

Unnamed: 0,user_id,anime_id,rating
3037681,28180,807,5
538101,5388,2943,7
6077903,57017,30276,9
2736115,25767,846,9
578919,5688,1887,6
4455221,42016,12753,7
6788882,62830,31404,8
6676936,61578,9074,9
5773447,54067,9756,10
1846222,17888,10713,9


In [181]:
# 사용하는 피처만 남김
animes = animes[['anime_id', 'type', 'rating', 'members', 'name', 'genre']]

In [182]:
animes

Unnamed: 0,anime_id,type,rating,members,name,genre
0,32281,Movie,9.37,200630,Kimi no Na wa.,"Drama, Romance, School, Supernatural"
1,5114,TV,9.26,793665,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
2,28977,TV,9.25,114262,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S..."
3,9253,TV,9.17,673572,Steins;Gate,"Sci-Fi, Thriller"
4,9969,TV,9.16,151266,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S..."
...,...,...,...,...,...,...
12289,9316,OVA,4.15,211,Toushindai My Lover: Minami tai Mecha-Minami,Hentai
12290,5543,OVA,4.28,183,Under World,Hentai
12291,5621,OVA,4.88,219,Violence Gekiga David no Hoshi,Hentai
12292,6133,OVA,4.98,175,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai


In [183]:
# 컬럼명 변경
animes.columns = ['anime_id', 'type', 'anime_total_rating', 'members', 'name', 'genre']

In [184]:
animes

Unnamed: 0,anime_id,type,anime_total_rating,members,name,genre
0,32281,Movie,9.37,200630,Kimi no Na wa.,"Drama, Romance, School, Supernatural"
1,5114,TV,9.26,793665,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili..."
2,28977,TV,9.25,114262,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S..."
3,9253,TV,9.17,673572,Steins;Gate,"Sci-Fi, Thriller"
4,9969,TV,9.16,151266,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S..."
...,...,...,...,...,...,...
12289,9316,OVA,4.15,211,Toushindai My Lover: Minami tai Mecha-Minami,Hentai
12290,5543,OVA,4.28,183,Under World,Hentai
12291,5621,OVA,4.88,219,Violence Gekiga David no Hoshi,Hentai
12292,6133,OVA,4.98,175,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai


In [185]:
len(animes)

12294

In [186]:
# 결측값이 포함된  애니는 dropna()
animes = animes.dropna()

In [187]:
len(animes)

12017

In [188]:
# 평점테이블도 동일하게 처리
ratings = ratings[['user_id', 'anime_id', 'rating']]

In [189]:
ratings

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [190]:
ratings = ratings[ratings['rating'] != -1]

In [191]:
len(ratings)

6337241

In [192]:
# dataframe join
data = ratings.merge(animes, on='anime_id', how='inner')

In [193]:
data.head(5)

Unnamed: 0,user_id,anime_id,rating,type,anime_total_rating,members,name,genre
0,1,8074,10,TV,7.46,535892,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
1,3,8074,6,TV,7.46,535892,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
2,5,8074,2,TV,7.46,535892,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
3,12,8074,6,TV,7.46,535892,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
4,14,8074,6,TV,7.46,535892,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"


In [194]:
# No explicit user attributes are available, so user features are derived from
# each user's rating history: the mean rating here, the rating count in a later cell.
user_avg_rating = data.groupby('user_id')['rating'].mean().rename('User-AvgRating')

In [195]:
user_avg_rating.head(5)

user_id
1    10.000000
2    10.000000
3     7.565217
5     4.355120
7     7.387755
Name: User-AvgRating, dtype: float64

In [196]:
user_num_ratings = data.groupby('user_id')['rating'].count().rename('User-NumRatings')

In [197]:
user_num_ratings.head(5)

user_id
1      4
2      1
3     92
5    459
7    343
Name: User-NumRatings, dtype: int64

In [198]:
# 추출한 유저 피처 결합
user_features = pd.concat([user_avg_rating, user_num_ratings], axis=1)

In [199]:
user_features.head()

Unnamed: 0_level_0,User-AvgRating,User-NumRatings
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10.0,4
2,10.0,1
3,7.565217,92
5,4.35512,459
7,7.387755,343


In [200]:
# 애니 메타 데이터에서 아이템 관련 피처 추가 생성
item_num_ratings = data.groupby('anime_id')['rating'].count().rename('Item-NumRatings')

In [201]:
item_num_ratings.head(5)

anime_id
1    13449
5     5790
6     9385
7     2169
8      308
Name: Item-NumRatings, dtype: int64

In [202]:
# Build TF-IDF features from the anime titles, limited to 10 vocabulary terms
# (max_features=10 keeps the 10 most frequent words).
# The rows follow the row order of `animes`, one row per title.
tfidf_vectorizer = TfidfVectorizer(max_features=10)
item_name_tfidf = tfidf_vectorizer.fit_transform(animes['name']).toarray()

In [203]:
item_name_tfidf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.89261098],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [204]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [205]:
tfidf_feature_names

array(['chan', 'ga', 'movie', 'ni', 'no', 'special', 'specials', 'the',
       'to', 'wa'], dtype=object)

In [206]:
# One-hot encode the 'type' column.
# The encoder is left at its default (sparse) output and densified with
# .toarray(), the same pattern the TF-IDF cell uses. The `sparse=` keyword was
# renamed to `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so passing
# either spelling would tie the notebook to one range of versions.
onehot_encoder = OneHotEncoder()
item_type_onehot = onehot_encoder.fit_transform(animes[['type']]).toarray()

In [207]:
item_type_onehot

array([[1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.]])

In [208]:
# TF-IDF 피처와 type 원핫 피처를 결합
tf_idf_df = pd.DataFrame(item_name_tfidf, columns=tfidf_feature_names, index=animes.anime_id)

In [209]:
tf_idf_df

Unnamed: 0_level_0,chan,ga,movie,ni,no,special,specials,the,to,wa
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
32281,0.0,0.0,0.0,0.0,0.450828,0.0,0.0,0.0,0.0,0.892611
5114,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
28977,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
9253,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
9969,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...
9316,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
5543,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
5621,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.000000
6133,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.000000


In [210]:
type_df = pd.DataFrame(item_type_onehot, columns=[f'Type-{cat}' for cat in onehot_encoder.categories_[0]], index=animes.anime_id)

In [211]:
type_df.head()

Unnamed: 0_level_0,Type-Movie,Type-Music,Type-ONA,Type-OVA,Type-Special,Type-TV
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
32281,1.0,0.0,0.0,0.0,0.0,0.0
5114,0.0,0.0,0.0,0.0,0.0,1.0
28977,0.0,0.0,0.0,0.0,0.0,1.0
9253,0.0,0.0,0.0,0.0,0.0,1.0
9969,0.0,0.0,0.0,0.0,0.0,1.0


In [212]:
animes_features = animes[['anime_id', 'members', 'anime_total_rating']]

In [213]:
animes_features

Unnamed: 0,anime_id,members,anime_total_rating
0,32281,200630,9.37
1,5114,793665,9.26
2,28977,114262,9.25
3,9253,673572,9.17
4,9969,151266,9.16
...,...,...,...
12289,9316,211,4.15
12290,5543,183,4.28
12291,5621,219,4.88
12292,6133,175,4.98


In [214]:
animes_features.set_index('anime_id')

Unnamed: 0_level_0,members,anime_total_rating
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
32281,200630,9.37
5114,793665,9.26
28977,114262,9.25
9253,673572,9.17
9969,151266,9.16
...,...,...
9316,211,4.15
5543,183,4.28
5621,219,4.88
6133,175,4.98


In [215]:
item_features = pd.DataFrame(item_num_ratings)\
    .merge(tf_idf_df, on='anime_id')\
    .merge(type_df, on='anime_id')\
    .merge(animes_features.set_index('anime_id'), on='anime_id')

In [216]:
item_features

Unnamed: 0_level_0,Item-NumRatings,chan,ga,movie,ni,no,special,specials,the,to,wa,Type-Movie,Type-Music,Type-ONA,Type-OVA,Type-Special,Type-TV,members,anime_total_rating
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,13449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,486824,8.82
5,5790,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,137636,8.40
6,9385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,283069,8.32
7,2169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,64905,7.36
8,308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9848,7.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34324,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,817,5.40
34325,15,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1454,7.05
34349,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1885,7.25
34367,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,582,5.97


In [217]:
# 인터렉션 데이터와 함께, user 및 item 피처 결합
display(data.head(5))
data = data.drop(['anime_total_rating', 'members'], axis=1)

Unnamed: 0,user_id,anime_id,rating,type,anime_total_rating,members,name,genre
0,1,8074,10,TV,7.46,535892,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
1,3,8074,6,TV,7.46,535892,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
2,5,8074,2,TV,7.46,535892,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
3,12,8074,6,TV,7.46,535892,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
4,14,8074,6,TV,7.46,535892,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"


In [218]:
data.head(5)

Unnamed: 0,user_id,anime_id,rating,type,name,genre
0,1,8074,10,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
1,3,8074,6,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
2,5,8074,2,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
3,12,8074,6,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"
4,14,8074,6,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural"


In [219]:
data = data.join(user_features, on='user_id')
data

Unnamed: 0,user_id,anime_id,rating,type,name,genre,User-AvgRating,User-NumRatings
0,1,8074,10,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",10.000000,4
1,3,8074,6,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",7.565217,92
2,5,8074,2,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",4.355120,459
3,12,8074,6,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",8.818182,22
4,14,8074,6,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",7.195122,123
...,...,...,...,...,...,...,...,...
6337141,69964,23585,7,Special,Haha wo Tazunete Sanzenri Specials,"Adventure, Drama, Slice of Life",7.891859,823
6337142,69964,33659,6,Special,Fushigi na Somera-chan Special,"Comedy, Magic, Slice of Life",7.891859,823
6337143,72800,30738,4,Movie,Gamba: Gamba to Nakama-tachi,"Adventure, Kids",5.930159,315
6337144,73135,8723,5,OVA,Anime Rakugo Kan,Comedy,5.710953,986


In [220]:
data = data.merge(item_features, on ='anime_id')
data.head(5)

Unnamed: 0,user_id,anime_id,rating,type,name,genre,User-AvgRating,User-NumRatings,Item-NumRatings,chan,...,to,wa,Type-Movie,Type-Music,Type-ONA,Type-OVA,Type-Special,Type-TV,members,anime_total_rating
0,1,8074,10,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",10.0,4,19488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
1,3,8074,6,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",7.565217,92,19488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
2,5,8074,2,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",4.35512,459,19488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
3,12,8074,6,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",8.818182,22,19488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
4,14,8074,6,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",7.195122,123,19488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46


In [221]:
# Binarize the target to mimic implicit feedback: 1 when the rating is strictly
# above that user's own mean rating, 0 otherwise.
# (Thresholding on a fixed absolute rating is an alternative.)
data['rating'] = (data['rating'] > data['User-AvgRating']).astype(int)

In [222]:
data.head(5)

Unnamed: 0,user_id,anime_id,rating,type,name,genre,User-AvgRating,User-NumRatings,Item-NumRatings,chan,...,to,wa,Type-Movie,Type-Music,Type-ONA,Type-OVA,Type-Special,Type-TV,members,anime_total_rating
0,1,8074,0,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",10.0,4,19488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
1,3,8074,0,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",7.565217,92,19488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
2,5,8074,0,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",4.35512,459,19488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
3,12,8074,0,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",8.818182,22,19488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
4,14,8074,0,TV,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",7.195122,123,19488,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46


In [223]:
# Build the feature matrix X and the target y; the actual train/test split is
# done in a later cell.
# Identifier columns, the target and the raw string columns are dropped from X.
X = data.drop(columns=['user_id', 'anime_id', 'rating', 'type', 'name', 'genre'])
display(X.head(5))
# y is the binarized 'rating' column.
y = data['rating']
display(y.head(5))

Unnamed: 0,User-AvgRating,User-NumRatings,Item-NumRatings,chan,ga,movie,ni,no,special,specials,...,to,wa,Type-Movie,Type-Music,Type-ONA,Type-OVA,Type-Special,Type-TV,members,anime_total_rating
0,10.0,4,19488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
1,7.565217,92,19488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
2,4.35512,459,19488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
3,8.818182,22,19488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46
4,7.195122,123,19488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,535892,7.46


0    0
1    0
2    0
3    0
4    0
Name: rating, dtype: int64

In [224]:
X.columns

Index(['User-AvgRating', 'User-NumRatings', 'Item-NumRatings', 'chan', 'ga',
       'movie', 'ni', 'no', 'special', 'specials', 'the', 'to', 'wa',
       'Type-Movie', 'Type-Music', 'Type-ONA', 'Type-OVA', 'Type-Special',
       'Type-TV', 'members', 'anime_total_rating'],
      dtype='object')

In [225]:
# y is a pandas Series, which exposes a single `name` instead of `columns`.
y.name

'rating'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 모델 정의
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    # 'LGBM': LGBMClassifier(random_state=42)
}

In [None]:
X_train.head()

In [None]:
X_train.shape

In [None]:
y_train.head()

In [None]:
X_test.head(5)

## 모델 학습

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [None]:
# 데이터셋 사이즈 지정
dataset_sizes = [1_000, 10_000, 100_000]

In [None]:
# 각 모델 별 auc score를 저장하기 위해 딕셔너리 정의
auc_scores = {model_name: [] for model_name in models.keys()}

In [None]:
auc_scores

In [None]:
# 각 모델 별 roc curve 정보 저장을 위해 딕셔너리 정의
roc_curves = {model_name: [] for model_name in models.keys()}

In [None]:
roc_curves

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw one ROC curve with pyplot, plus a dashed diagonal reference line.

    Args:
        fpr: False positive rates (fall-out), plotted on the x axis.
        tpr: True positive rates (recall), plotted on the y axis.
        label: Optional legend label for the curve.
    """
    diagonal = [0, 1]
    unit_square = [0, 1, 0, 1]

    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed black diagonal from (0, 0) to (1, 1).
    plt.plot(diagonal, diagonal, 'k--')
    # Clamp both axes to the [0, 1] range.
    plt.axis(unit_square)
    plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16)
    plt.ylabel('True Positive Rate (Recall)', fontsize=16)

In [None]:
# Train every model on progressively larger samples of the training split and
# evaluate each fit on the full test split.
# NOTE: the estimators stored in `models` are refit in place, so after this loop
# each one holds the fit from the last entry of `dataset_sizes`.
for size in dataset_sizes:
    # Draw a reproducible sample of `size` training rows and the matching targets.
    X_train_sample = X_train.sample(n=size, random_state=42)
    y_train_sample = y_train.loc[X_train_sample.index]

    # Fit and score each model on this sample.
    for model_name, model in tqdm(models.items()):
        print(f'Training {model_name} with {size} samples...')

        model.fit(X_train_sample, y_train_sample)

        # Column 1 of predict_proba is the positive-class probability.
        y_pred = model.predict_proba(X_test)[:, 1]
        print('----------------')
        print(y_pred)
        print('----------------')

        # Record the AUC for this (model, size) pair.
        auc_score = roc_auc_score(y_test, y_pred)
        auc_scores[model_name].append(auc_score)

        # Keep the curve points for plotting; the thresholds are not used.
        fpr, tpr, _ = roc_curve(y_test, y_pred)
        roc_curves[model_name].append((fpr, tpr))

        # Report the score of this run only, not the whole auc_scores dict.
        print(f'{model_name} AUC: {auc_score}')

In [None]:
# Visualize the AUC scores: one line per model, AUC against training-set size.
for model_name, model_auc_scores in auc_scores.items():
    plt.plot(dataset_sizes, model_auc_scores, label=model_name)
plt.xlabel('Training dataset size')
plt.ylabel('AUC score')
plt.legend()
plt.show()


In [None]:
# Plot each model's ROC curve for the largest training-set size, i.e. the last
# entry appended to its list in roc_curves / auc_scores.
for model_name, model_roc_curves in roc_curves.items():
    fpr, tpr = model_roc_curves[-1]
    final_auc = auc_scores[model_name][-1]
    # The legend label closes its parenthesis: "<model> (AUC = 0.xx)".
    plot_roc_curve(fpr, tpr, label=f'{model_name} (AUC = {final_auc:.2f})')
plt.legend()
plt.show()

## 실제 예측값 확인

In [None]:
user_id = 20

In [None]:
user_data = data[data['user_id'] == user_id]

In [None]:
user_data

In [None]:
rated_animes = user_data['anime_id'].unique()

In [None]:
rated_animes

In [None]:
# rated_animes에 속하지 않은 것(~)들
unratted_animes = item_features[~item_features.index.isin(rated_animes)]

In [None]:
unratted_animes

In [None]:
# 유저정보 붙여주지
user_features_df = user_features.loc[user_id]

In [None]:
user_features_df

In [None]:
# wrong. assign method doesn't mutate the original dataframe
unrated_animes = unratted_animes.assign(**user_features_df)

In [None]:
unratted_animes.head(5)

In [None]:
unrated_animes.assign(**user_features_df).head(5)

In [None]:
combined_unrated_animes = unratted_animes.assign(**user_features_df)

In [None]:
combined_unrated_animes

In [None]:
def recommend_top_n(user_id, model, n=5):
    """Return the top-``n`` animes the user has not rated, ranked by model score.

    Reads the notebook-level frames ``data`` (user/anime interactions),
    ``item_features`` (indexed by anime_id) and ``user_features`` (indexed by
    user_id).

    Args:
        user_id: Id of the user to recommend for; must be in the index of
            ``user_features``.
        model: Fitted classifier exposing ``predict_proba``.
        n: Maximum number of rows to return.

    Returns:
        DataFrame indexed like ``item_features`` holding the model's feature
        columns plus ``predicted_rating`` (positive-class probability), sorted
        by ``predicted_rating`` in descending order and cut to ``n`` rows.

    Raises:
        KeyError: If ``user_id`` is not in ``user_features``, or a feature the
            model expects is missing from the combined frame.
    """
    # Animes this user has not rated yet.
    user_data = data[data['user_id'] == user_id]
    rated_animes = user_data['anime_id'].unique()
    unrated_animes = item_features[~item_features.index.isin(rated_animes)]

    # Broadcast the user's feature values onto every candidate row.
    user_features_df = user_features.loc[user_id]
    combined_unrated_animes = unrated_animes.assign(**user_features_df)

    # assign() appends the user columns last, but the training matrix has them
    # first. Put the columns in the order the model was fitted with;
    # scikit-learn rejects frames whose feature names are in a different order.
    feature_order = getattr(model, 'feature_names_in_', None)
    if feature_order is None:
        # Fallback: the training layout (user features, then item features).
        feature_order = list(user_features.columns) + list(item_features.columns)
    model_input = combined_unrated_animes[list(feature_order)]

    # Score with the positive-class probability; assign() returns a new frame,
    # so the model input itself is never mutated.
    scores = model.predict_proba(model_input)[:, 1]
    scored_animes = model_input.assign(predicted_rating=scores)

    # Keep the n highest-scoring candidates.
    top_n_animes = scored_animes.sort_values('predicted_rating', ascending=False).head(n)

    return top_n_animes


In [None]:
# 특정 유저에 대한 상위 n개 예측 생성
user_id = 26
top_n = recommend_top_n(user_id, models['Random Forest'], n=5)[['predicted_rating']]

In [None]:
# top n개 예측의 상세 정보
top_n_details = top_n.merge(animes, how='left', left_index=True, right_on='anime_id')

In [None]:
print(f'Top 5 anime recommendations for user {user_id}:')
display(top_n_details)

In [None]:
# 좋은 평점을 준 애니 검색
user_ratings = ratings[ratings['user_id'] == user_id]
well_rated_animes = user_ratings[user_ratings['rating'] >= user_ratings['rating'].mean()]


In [None]:
# 상세 정보 조회
well_rated_details = animes[animes['anime_id'].isin(well_rated_animes['anime_id'])]

In [None]:
well_rated_animes

In [None]:
well_rated_details = well_rated_details.merge(well_rated_animes[['anime_id', 'rating']], on='anime_id', how='inner')

In [None]:
well_rated_details

In [None]:
print(f'\nWell-rated animes by user {user_id}:')
display(well_rated_details)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Feature-importance chart for one fitted model.
# To inspect another model, change the key; it must exist in `models`.
model_name = 'Random Forest'
model = models[model_name]

# Pair each importance with its feature name (column order of the training matrix).
feature_imp = pd.DataFrame(sorted(zip(model.feature_importances_,X_train.columns)), columns=['Value','Feature'])

plt.figure(figsize=(20, 10))
# Largest importance first.
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
# The title names the model actually plotted, from a single fit (no folds).
plt.title(f'{model_name} feature importances')
plt.tight_layout()
plt.show()