## PART 3: TikToker recommendation

In [None]:
# Attach the columns of `influencer_counts` to the per-influencer stats,
# matching rows on `name`. The left join keeps every row of mean_df even when
# no matching name exists on the right-hand side.
# NOTE(review): this cell reassigns mean_df from itself, so running it a second
# time merges the same columns again (pandas would then suffix the duplicates) —
# re-run from the cell that creates mean_df instead.
mean_df = mean_df.merge(
    right=influencer_counts,
    how='left',
    on='name',
)

In [None]:
# Strip thousands separators (",") and convert the metric columns to numbers.
cols_to_int = ["follower_cnt", "view_cnt", "like_cnt", "comment_cnt", "save_cnt", "ad_cost"]
cols_to_float = ["ER", "avg_upload_interval"]

# Integer-valued columns: parse as float, round to whole numbers, then cast to
# an explicit 64-bit integer. A bare `int` resolves to the platform default
# integer, which can be 32-bit (e.g. Windows with NumPy < 2); ad_cost values
# above 2**31 - 1 then wrap to -2147483648.
mean_df[cols_to_int] = (
    mean_df[cols_to_int]
    .replace(",", "", regex=True)
    .astype(float)
    .round(0)
    .astype("int64")
)

# Float-valued columns: same cleanup, kept to two decimal places.
mean_df[cols_to_float] = (
    mean_df[cols_to_float]
    .replace(",", "", regex=True)
    .astype(float)
    .round(2)
)
mean_df.head(3)

Unnamed: 0,name,follower_cnt,view_cnt,like_cnt,comment_cnt,save_cnt,avg_upload_interval,tiktoker_size,ad_cost,ER,...,fashiion_tf,unboxing_tf,skin_routine_tf,trend_tf,asmr,eating_tf,others_tf,no.1,no.2,no.3
0,.blissdiaries,81700,9718,1447,60,196,4.07,micro_influancer,1634000000,1.85,...,29,17,5,3,8,1,0,fashiion_tf,color_tf,skincare_tf
1,.m.egan,238200,178070,23864,183,3527,5.03,middle_influancer,-2147483648,10.1,...,3,4,5,0,2,2,5,skincare_tf,color_tf,skin_routine_tf
2,.woniluv,135600,41956,2399,86,370,2.38,middle_influancer,-2147483648,1.83,...,3,4,9,7,7,1,4,skincare_tf,skin_routine_tf,trend_tf


In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Work on a copy so the recommendation columns do not leak into mean_df.
final_df3 = mean_df.copy()

# Scale ER with MinMaxScaler (default range [0, 1]) so it can drive the
# keyword repetition counts below.
scaler = MinMaxScaler()
final_df3['normalized_ER'] = scaler.fit_transform(final_df3[['ER']])


def _weighted_keywords(row):
    """Build the text fed to TF-IDF from a row's top-3 content categories.

    Each category name is repeated according to the row's normalized ER
    (x3 for `no.1`, x2 for `no.2`, x1 for `no.3`, rounded). A very low ER
    would round the repeat count to 0 and drop the keyword entirely, so every
    keyword is emitted at least once via max(1, ...). With normalized_ER in
    [0, 1] the `no.3` count is therefore always 1.
    """
    er = row['normalized_ER']
    return (
        (row['no.1'] + ' ') * max(1, int(round(er * 3, 0)))
        + (row['no.2'] + ' ') * max(1, int(round(er * 2, 0)))
        + (row['no.3'] + ' ') * max(1, int(round(er, 0)))
    )


# Combine the content columns into one ER-weighted keyword string per row.
final_df3['content_features'] = final_df3.apply(_weighted_keywords, axis=1)

# TF-IDF vectorization of the weighted keyword strings.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(final_df3['content_features'])

# Measure similarity against the influencers that were already selected.
selected_influencers = ["krystallee2222", "emchu_"]
selected_mask = final_df3['name'].isin(selected_influencers)
selected_indices = final_df3[selected_mask].index
# tfidf_matrix rows are positional, so slice it by row position rather than by
# index label; the two only coincide when the index is a default 0..n-1 range.
selected_positions = selected_mask.to_numpy().nonzero()[0]
if len(selected_positions) == 0:
    raise ValueError(
        f"None of the selected influencers {selected_influencers} "
        "were found in final_df3['name']"
    )
similarity_scores = cosine_similarity(tfidf_matrix, tfidf_matrix[selected_positions])

# Average the similarity to all selected influencers, then rank descending.
final_df3["content_similarity"] = similarity_scores.mean(axis=1)
df_sorted = final_df3.sort_values(by="content_similarity", ascending=False)

# Final recommendation list, excluding the influencers used as the reference.
top_n = 10  # number of recommendations to return
recommended_influencers = df_sorted.loc[~df_sorted['name'].isin(selected_influencers)].head(top_n)
display(recommended_influencers[["name", "content_similarity"]])

Unnamed: 0,name,content_similarity
14,cleangirlhacks,0.73
28,jasminnlily_,0.73
13,caspertheghostyy,0.73
36,minseonk1m,0.73
1,.m.egan,0.73
24,its.kaylas,0.73
17,dearwonii_,0.73
10,ayheyt3u,0.73
51,thelipstickgirly,0.66
37,misscasxie,0.62
