# Tiktok K-beauty Dataset

## PART 1: Data Preprocessing

In [None]:
# 틱톡커 추천을 위한 라이브러리
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Load Data

In [None]:
# Posts crawled by search keyword: one CSV per keyword, read in the order listed.
(
    clean_beauty_df,
    glow_skin_df,
    kbeauty_skin_care_df,
    korean_skincare_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'tiktok_post_clean_beauty_0124.csv',
        'tiktok_post_glow_skin_0123.csv',
        'tiktok_post_kbeauty_skin_care_0124.csv',
        'tiktok_post_korean_skincare_0124.csv',
    )
)

# Posts crawled per influencer account.
tiktoker_df = pd.read_csv('tiktoker_crawling_df_0127.csv')

### Data Preprocessing

#### 1. 키워드df 합치기

##### 1-1 : 키워드 df별 중복행 제거

In [None]:
# Drop fully duplicated rows from every keyword frame in place, keeping the
# first occurrence (the pandas default) and renumbering the index from 0.
for keyword_frame in (clean_beauty_df, glow_skin_df, kbeauty_skin_care_df, korean_skincare_df):
    keyword_frame.drop_duplicates(inplace=True, ignore_index=True)

##### 1-2 : 각 df별 키워드 컬럼 만들어주기
> - 어떤 검색어를 입력하여 추출한 영상인지 알기 위해 검색어 컬럼을 만들어줌

In [None]:
# Tag every row with the search term that produced it.
# A scalar assignment fills the whole column at once and also creates the
# column when a frame is empty, which the former per-row `.loc` loop did not.
# NOTE(review): 'clean_beauty' uses an underscore while the other labels use
# spaces; it is kept unchanged here - confirm whether downstream code relies on it.
clean_beauty_df['search_term'] = 'clean_beauty'
glow_skin_df['search_term'] = 'glow skin'
kbeauty_skin_care_df['search_term'] = 'kbeauty skin care'
korean_skincare_df['search_term'] = 'korean skincare'

##### 1-3 : 각 df별 노출 순서 컬럼 만들어주기 
> - 각 검색어를 입력했을 때 영상들이 몇 번째에 떴는지에 대한 컬럼 만들어주기

In [None]:
# 1-based position of each video within its keyword frame (row order is taken
# as the order in which the videos appeared in the search results).
# The range is assigned positionally in one step; the former per-row `.loc`
# loop left the column missing for an empty frame, so the cast below raised
# KeyError. The cast to `int` is kept so the resulting dtype is unchanged.
# NOTE(review): the column name 'vedio_order' (sic) is kept because later
# cells select it by that name.
for keyword_frame in (clean_beauty_df, glow_skin_df, kbeauty_skin_care_df, korean_skincare_df):
    keyword_frame['vedio_order'] = range(1, len(keyword_frame) + 1)
    keyword_frame['vedio_order'] = keyword_frame['vedio_order'].astype(int)

##### 1-4 : clean_beauty_df 컬럼 이름 바꿔주기
> - 틱톡커 이름(tiktoker_name)컬럼이 titoker_name으로 잘못 입력되어 있어서 tiktoker_name으로 바꿔줌

In [None]:
# Fix the misspelled column header so this frame lines up with the other keyword frames.
clean_beauty_df = clean_beauty_df.rename(columns={'titoker_name': 'tiktoker_name'})

##### 1-5 : df 합치기

In [None]:
# Stack the four keyword frames row-wise (two pairwise concats, then the pairs),
# renumbering the index at each step.
cb_gl_concat = pd.concat((clean_beauty_df, glow_skin_df), ignore_index=True)
kbs_ks_concat = pd.concat((kbeauty_skin_care_df, korean_skincare_df), ignore_index=True)
keyword_df = pd.concat((cb_gl_concat, kbs_ks_concat), ignore_index=True)
keyword_df

Unnamed: 0,like,comment,save,tiktoker_name,date,info,search_term,vedio_order
0,1563,73,894,morganlkeen,2024-9-5,Replying to @user8022071884889 Non-Toxic Makeu...,clean_beauty,1
1,1337,102,189,therealkatiestone,1-3,#greenscreen new year new diagram #cleansers #...,clean_beauty,2
2,8057,164,5654,kylies.muse,2024-2-1,Replying to @MTMama3 clean makeup options! Som...,clean_beauty,3
3,831,32,432,balancewithani,2024-7-2,I've been looking for a non-toxic beauty brand...,clean_beauty,4
4,5634,104,542,labmuffinbeautyscience,2024-6-22,Clean beauty isn’t making products safer - tha...,clean_beauty,5
...,...,...,...,...,...,...,...,...
778,21.8K,187,6709,4chriisty,2024-6-9,use my yesstyle rewards code CRISPY5 to save m...,korean skincare,193
779,216.3K,381,22.6K,moonskinclub,2023-2-26,✨✨douyin evening skincare routine ✨✨ D: 248154...,korean skincare,194
780,9841,59,2616,koocat,2024-11-18,(non sponsored) a very candid random vid but i...,korean skincare,195
781,881,8,73,ririsglow_,2024-8-26,I’m not leaving the house once my 12 step skin...,korean skincare,196


#### 2. 숫자형, 날짜형 컬럼 전처리
> - view, like, comment, save 컬럼 : 단위(k,m) 변환
> - date
>   - 25년도 1월 업로드한 경우 2025-01-nn으로 변환
>   - nh ago, nd ago, 1w ago인 경우 2025-01-21 기준으로 날짜 계산

##### 2-1. date 컬럼 형식 통일 및 datetime으로 변환

In [None]:
def _resolve_upload_date(raw, today):
    """Convert one raw date label into a ``datetime``; return ``pd.NaT`` if it cannot be parsed."""
    import re
    from datetime import datetime, timedelta

    if not isinstance(raw, str):
        return pd.NaT
    label = raw.strip()

    # Relative labels such as "14h ago", "3d ago", "1w ago". The whole number
    # is read (not just its first digit) so "10d ago" means ten days.
    relative = re.fullmatch(r'(\d+)\s*([hdw])(?:\s*ago)?', label, flags=re.IGNORECASE)
    if relative:
        amount = int(relative.group(1))
        unit = relative.group(2).lower()
        if unit == 'h':
            # Hour-level labels are mapped to the reference day itself.
            return today
        if unit == 'd':
            return today - timedelta(days=amount)
        return today - timedelta(weeks=amount)

    # Absolute labels: "YYYY-M-D", or "M-D" which gets the reference year.
    try:
        numbers = [int(part) for part in label.split('-')]
    except ValueError:
        return pd.NaT
    if len(numbers) == 2:
        numbers.insert(0, today.year)
    if len(numbers) != 3:
        return pd.NaT
    try:
        return datetime(*numbers)
    except ValueError:
        # e.g. month 13 or day 32
        return pd.NaT


def preprocess_date(df, today=None):
    """Add a datetime ``upload_date`` column derived from the text ``date`` column.

    ``df`` is modified in place and nothing is returned.

    Recognised ``date`` labels:
      * ``"YYYY-M-D"``                   -> that date
      * ``"M-D"``                        -> that month/day in the year of ``today``
      * ``"<n>h ago"``                   -> ``today``
      * ``"<n>d ago"`` / ``"<n>w ago"``  -> ``today`` minus n days / weeks
    Anything else (including non-string values) becomes ``NaT``.

    Args:
        df: DataFrame with a ``date`` column; any index is accepted.
        today: reference ``datetime`` for relative labels. Defaults to
            2025-01-21, the reference date used by this notebook.
    """
    from datetime import datetime

    if today is None:
        today = datetime(2025, 1, 21)

    parsed = [_resolve_upload_date(raw, today) for raw in df['date']]
    # Build the column in one step so it is created even for an empty frame
    # and always ends up with a datetime dtype.
    df['upload_date'] = pd.to_datetime(
        pd.Series(parsed, index=df.index, dtype='object'), errors='coerce'
    )


# Add the parsed upload_date column to both frames.
for dated_frame in (keyword_df, tiktoker_df):
    preprocess_date(dated_frame)

In [None]:
# Follower-count unit conversion (run on tiktoker_df only).

def process_follower(df):
    """Add a float ``follower_cnt`` column parsed from ``df['follower']``.

    Values such as ``"399.8K"`` or ``"1.2M"`` are expanded using their unit
    suffix; plain numbers are converted with ``float``. ``df`` is modified in
    place and nothing is returned.

    Unlike the former per-row ``.loc`` loop, this works for any index (not only
    ``0..n-1``), accepts values that are already numeric, treats the suffix
    case-insensitively, understands ``B`` (billions) and maps missing values
    to NaN.

    Raises:
        ValueError: if a value is neither a plain number nor a number
            followed by K/M/B.
    """
    unit_scale = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    def to_number(raw):
        if pd.isna(raw):
            return float('nan')
        text = str(raw).replace(' ', '')
        suffix = text[-1:].upper()
        if suffix in unit_scale:
            return float(text[:-1]) * unit_scale[suffix]
        return float(text)

    df['follower_cnt'] = df['follower'].map(to_number).astype(float)

# Derive the numeric follower_cnt column for the per-influencer frame.
process_follower(tiktoker_df)

In [None]:
# Inspect keyword_df (1): last rows, to check the new upload_date column.
keyword_df.tail()

Unnamed: 0,like,comment,save,tiktoker_name,date,info,search_term,vedio_order,upload_date
778,21.8K,187,6709,4chriisty,2024-6-9,use my yesstyle rewards code CRISPY5 to save m...,korean skincare,193,2024-06-09
779,216.3K,381,22.6K,moonskinclub,2023-2-26,✨✨douyin evening skincare routine ✨✨ D: 248154...,korean skincare,194,2023-02-26
780,9841,59,2616,koocat,2024-11-18,(non sponsored) a very candid random vid but i...,korean skincare,195,2024-11-18
781,881,8,73,ririsglow_,2024-8-26,I’m not leaving the house once my 12 step skin...,korean skincare,196,2024-08-26
782,226.5K,1086,55.5K,amyflamy1,1-11,Cheap skincare routine pt.2 #ILLIYOONpartner #...,korean skincare,197,2025-01-11


In [None]:
# Inspect keyword_df (2): column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
keyword_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783 entries, 0 to 782
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   like           783 non-null    object        
 1   comment        783 non-null    object        
 2   save           783 non-null    object        
 3   tiktoker_name  783 non-null    object        
 4   date           783 non-null    object        
 5   info           782 non-null    object        
 6   search_term    783 non-null    object        
 7   vedio_order    783 non-null    int32         
 8   upload_date    779 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int32(1), object(7)
memory usage: 52.1+ KB
None


In [None]:
# Inspect tiktoker_df (1): first three rows.
tiktoker_df.head(3)

Unnamed: 0.1,Unnamed: 0,name,follower,view,like,comment,save,date,info_tag,hash_tag,upload_date,follower_cnt
0,0,mydelicate,399.8K,5158,1334,74,175,14h ago,my favourite blushes and why (with shades)💓 yo...,"['@YesStyle', '@Flower Knows Makeup', '@House ...",2025-01-21,399800.0
1,1,mydelicate,399.8K,24.3K,6606,246,917,3d ago,spend a productive morning with me 💓🎀 why is i...,"['#koreanskincare', '#kbeauty', '#morningrouti...",2025-01-18,399800.0
2,2,mydelicate,399.8K,10.4K,1881,90,218,6d ago,for my girlies who still want to look cute whe...,"['@SHEINUS', '@SHEIN', '#fashion', '#winterclo...",2025-01-15,399800.0


In [None]:
# Inspect tiktoker_df (2): column dtypes and non-null counts.
tiktoker_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1680 entries, 0 to 1679
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Unnamed: 0    1680 non-null   int64         
 1   name          1680 non-null   object        
 2   follower      1680 non-null   object        
 3   view          1680 non-null   object        
 4   like          1680 non-null   object        
 5   comment       1680 non-null   int64         
 6   save          1680 non-null   object        
 7   date          1680 non-null   object        
 8   info_tag      1664 non-null   object        
 9   hash_tag      1680 non-null   object        
 10  upload_date   1680 non-null   datetime64[us]
 11  follower_cnt  1680 non-null   float64       
dtypes: datetime64[us](1), float64(1), int64(2), object(8)
memory usage: 157.6+ KB


##### 2-2. view, like, comment, save컬럼 단위 변환

In [None]:
# View-count unit conversion.

def process_view(df):
    """Add a float ``view_cnt`` column parsed from ``df['view']``.

    Values such as ``"24.3K"`` or ``"1.2M"`` are expanded using their unit
    suffix; plain numbers are converted with ``float``. ``df`` is modified in
    place and nothing is returned.

    Unlike the former per-row ``.loc`` loop, this works for any index (not only
    ``0..n-1``), accepts values that are already numeric, treats the suffix
    case-insensitively, understands ``B`` (billions) and maps missing values
    to NaN.

    Raises:
        ValueError: if a value is neither a plain number nor a number
            followed by K/M/B.
    """
    unit_scale = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    def to_number(raw):
        if pd.isna(raw):
            return float('nan')
        text = str(raw).replace(' ', '')
        suffix = text[-1:].upper()
        if suffix in unit_scale:
            return float(text[:-1]) * unit_scale[suffix]
        return float(text)

    df['view_cnt'] = df['view'].map(to_number).astype(float)

# Only tiktoker_df is passed to process_view; keyword_df is not converted here.
process_view(tiktoker_df)

In [None]:
# Like-count unit conversion.

def process_like(df):
    """Add a float ``like_cnt`` column parsed from ``df['like']``.

    Values such as ``"21.8K"`` or ``"1.2M"`` are expanded using their unit
    suffix; plain numbers are converted with ``float``. ``df`` is modified in
    place and nothing is returned.

    Unlike the former per-row ``.loc`` loop, this works for any index (not only
    ``0..n-1``), accepts values that are already numeric (as
    ``process_comment`` does), treats the suffix case-insensitively,
    understands ``B`` (billions) and maps missing values to NaN.

    Raises:
        ValueError: if a value is neither a plain number nor a number
            followed by K/M/B.
    """
    unit_scale = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    def to_number(raw):
        if pd.isna(raw):
            return float('nan')
        text = str(raw).replace(' ', '')
        suffix = text[-1:].upper()
        if suffix in unit_scale:
            return float(text[:-1]) * unit_scale[suffix]
        return float(text)

    df['like_cnt'] = df['like'].map(to_number).astype(float)

# Add like_cnt to both frames.
for liked_frame in (keyword_df, tiktoker_df):
    process_like(liked_frame)

In [None]:
# Comment-count unit conversion.

def process_comment(df):
    """Add a float ``comment_cnt`` column parsed from ``df['comment']``.

    Values are stringified first, so both text (``"1.5K"``) and numeric
    columns are accepted. A unit suffix is expanded; plain numbers are
    converted with ``float``. ``df`` is modified in place and nothing is
    returned.

    Unlike the former per-row ``.loc`` loop, this works for any index (not only
    ``0..n-1``), treats the suffix case-insensitively and understands ``B``
    (billions). Missing values stay NaN.

    Raises:
        ValueError: if a value is neither a plain number nor a number
            followed by K/M/B.
    """
    unit_scale = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    def to_number(raw):
        if pd.isna(raw):
            return float('nan')
        text = str(raw).replace(' ', '')
        suffix = text[-1:].upper()
        if suffix in unit_scale:
            return float(text[:-1]) * unit_scale[suffix]
        return float(text)

    df['comment_cnt'] = df['comment'].map(to_number).astype(float)

# Add comment_cnt to both frames.
for commented_frame in (keyword_df, tiktoker_df):
    process_comment(commented_frame)

In [None]:
# Save-count unit conversion.

def process_save(df):
    """Add a float ``save_cnt`` column parsed from ``df['save']``.

    Values such as ``"22.6K"`` or ``"1.2M"`` are expanded using their unit
    suffix; plain numbers are converted with ``float``. ``df`` is modified in
    place and nothing is returned.

    Unlike the former per-row ``.loc`` loop, this works for any index (not only
    ``0..n-1``), accepts values that are already numeric (as
    ``process_comment`` does), treats the suffix case-insensitively,
    understands ``B`` (billions) and maps missing values to NaN.

    Raises:
        ValueError: if a value is neither a plain number nor a number
            followed by K/M/B.
    """
    unit_scale = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    def to_number(raw):
        if pd.isna(raw):
            return float('nan')
        text = str(raw).replace(' ', '')
        suffix = text[-1:].upper()
        if suffix in unit_scale:
            return float(text[:-1]) * unit_scale[suffix]
        return float(text)

    df['save_cnt'] = df['save'].map(to_number).astype(float)

# Add save_cnt to both frames.
for saved_frame in (keyword_df, tiktoker_df):
    process_save(saved_frame)

In [None]:
# Cast the *_cnt columns from float to int, rounding to a whole number first.
for count_col in ('like_cnt', 'comment_cnt', 'save_cnt'):
    keyword_df[count_col] = keyword_df[count_col].round(0).astype(int)

for count_col in ('view_cnt', 'like_cnt', 'comment_cnt', 'save_cnt'):
    tiktoker_df[count_col] = tiktoker_df[count_col].round(0).astype(int)

In [None]:
# Inspect keyword_df (1): first five rows, including the new *_cnt columns.
keyword_df.head(5)

Unnamed: 0,like,comment,save,tiktoker_name,date,info,search_term,vedio_order,upload_date,like_cnt,comment_cnt,save_cnt
0,1563,73,894,morganlkeen,2024-9-5,Replying to @user8022071884889 Non-Toxic Makeu...,clean_beauty,1,2024-09-05,1563,73,894
1,1337,102,189,therealkatiestone,1-3,#greenscreen new year new diagram #cleansers #...,clean_beauty,2,2025-01-03,1337,102,189
2,8057,164,5654,kylies.muse,2024-2-1,Replying to @MTMama3 clean makeup options! Som...,clean_beauty,3,2024-02-01,8057,164,5654
3,831,32,432,balancewithani,2024-7-2,I've been looking for a non-toxic beauty brand...,clean_beauty,4,2024-07-02,831,32,432
4,5634,104,542,labmuffinbeautyscience,2024-6-22,Clean beauty isn’t making products safer - tha...,clean_beauty,5,2024-06-22,5634,104,542


In [None]:
# Inspect keyword_df (2): column dtypes and non-null counts.
keyword_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783 entries, 0 to 782
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   like           783 non-null    object        
 1   comment        783 non-null    object        
 2   save           783 non-null    object        
 3   tiktoker_name  783 non-null    object        
 4   date           783 non-null    object        
 5   info           782 non-null    object        
 6   search_term    783 non-null    object        
 7   vedio_order    783 non-null    int32         
 8   upload_date    779 non-null    datetime64[ns]
 9   like_cnt       783 non-null    int32         
 10  comment_cnt    783 non-null    int32         
 11  save_cnt       783 non-null    int32         
dtypes: datetime64[ns](1), int32(4), object(7)
memory usage: 61.3+ KB


In [None]:
# Inspect tiktoker_df (1): first three rows, including the new *_cnt columns.
tiktoker_df.head(3)

Unnamed: 0.1,Unnamed: 0,name,follower,view,like,comment,save,date,info_tag,hash_tag,upload_date,follower_cnt,view_cnt,like_cnt,comment_cnt,save_cnt
0,0,mydelicate,399.8K,5158,1334,74,175,14h ago,my favourite blushes and why (with shades)💓 yo...,"['@YesStyle', '@Flower Knows Makeup', '@House ...",2025-01-21,399800.0,5158,1334,74,175
1,1,mydelicate,399.8K,24.3K,6606,246,917,3d ago,spend a productive morning with me 💓🎀 why is i...,"['#koreanskincare', '#kbeauty', '#morningrouti...",2025-01-18,399800.0,24300,6606,246,917
2,2,mydelicate,399.8K,10.4K,1881,90,218,6d ago,for my girlies who still want to look cute whe...,"['@SHEINUS', '@SHEIN', '#fashion', '#winterclo...",2025-01-15,399800.0,10400,1881,90,218


In [None]:
# Inspect tiktoker_df (2): column dtypes and non-null counts.
tiktoker_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1680 entries, 0 to 1679
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Unnamed: 0    1680 non-null   int64         
 1   name          1680 non-null   object        
 2   follower      1680 non-null   object        
 3   view          1680 non-null   object        
 4   like          1680 non-null   object        
 5   comment       1680 non-null   int64         
 6   save          1680 non-null   object        
 7   date          1680 non-null   object        
 8   info_tag      1664 non-null   object        
 9   hash_tag      1680 non-null   object        
 10  upload_date   1680 non-null   datetime64[us]
 11  follower_cnt  1680 non-null   float64       
 12  view_cnt      1680 non-null   int32         
 13  like_cnt      1680 non-null   int32         
 14  comment_cnt   1680 non-null   int32         
 15  save_cnt      1680 non-null   int32   

#### 3. 해시태그 컬럼 생성
> - 크롤링 한 데이터를 csv파일로 변환하는 과정에서 hash_tag컬럼의 일부 행들이 손상되는 문제 발생
> - info 컬럼에서 해시태그 추출하여 해시태그 컬럼 다시 생성

##### 3-1. info 컬럼에서 해시태그 추출

In [None]:
# Rename tiktoker_df's caption column from 'info_tag' to 'info', the name new_hash_tag_process reads.
tiktoker_df = tiktoker_df.rename(columns={'info_tag': 'info'})

def new_hash_tag_process(df):
    """Rebuild ``df['hash_tag']`` from the caption text in ``df['info']``.

    Every whitespace-separated token that starts with ``#`` or ``@`` is kept,
    in order of appearance, and the tokens are joined with ``,``. ``df`` is
    modified in place (an existing ``hash_tag`` column is overwritten) and
    nothing is returned.

    The caption is split on any whitespace, not only on a single space, so a
    tag followed by a newline or tab no longer drags the next word along
    (e.g. ``"@SHEIN\\nuse"``). Captions are passed through ``str()``, so a
    missing caption yields an empty string. Any index is accepted.
    """
    def extract_tags(info):
        tokens = str(info).split()
        return ','.join(token for token in tokens if token.startswith(('#', '@')))

    df['hash_tag'] = df['info'].map(extract_tags)

# Rebuild the hash_tag column of both frames from their captions.
for captioned_frame in (keyword_df, tiktoker_df):
    new_hash_tag_process(captioned_frame)

In [None]:
# Inspect keyword_df (1): first three rows, including the rebuilt hash_tag column.
keyword_df.head(3)

Unnamed: 0,like,comment,save,tiktoker_name,date,info,search_term,vedio_order,upload_date,like_cnt,comment_cnt,save_cnt,hash_tag
0,1563,73,894,morganlkeen,2024-9-5,Replying to @user8022071884889 Non-Toxic Makeu...,clean_beauty,1,2024-09-05,1563,73,894,@user8022071884889
1,1337,102,189,therealkatiestone,1-3,#greenscreen new year new diagram #cleansers #...,clean_beauty,2,2025-01-03,1337,102,189,"#greenscreen,#cleansers,#skincare,#beauty,#bea..."
2,8057,164,5654,kylies.muse,2024-2-1,Replying to @MTMama3 clean makeup options! Som...,clean_beauty,3,2024-02-01,8057,164,5654,@MTMama3


In [None]:
# Inspect keyword_df (2): column dtypes and non-null counts.
keyword_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783 entries, 0 to 782
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   like           783 non-null    object        
 1   comment        783 non-null    object        
 2   save           783 non-null    object        
 3   tiktoker_name  783 non-null    object        
 4   date           783 non-null    object        
 5   info           782 non-null    object        
 6   search_term    783 non-null    object        
 7   vedio_order    783 non-null    int32         
 8   upload_date    779 non-null    datetime64[ns]
 9   like_cnt       783 non-null    int32         
 10  comment_cnt    783 non-null    int32         
 11  save_cnt       783 non-null    int32         
 12  hash_tag       783 non-null    object        
dtypes: datetime64[ns](1), int32(4), object(8)
memory usage: 67.4+ KB


In [None]:
# Inspect tiktoker_df (1): first three rows, including the rebuilt hash_tag column.
tiktoker_df.head(3)

Unnamed: 0.1,Unnamed: 0,name,follower,view,like,comment,save,date,info,hash_tag,upload_date,follower_cnt,view_cnt,like_cnt,comment_cnt,save_cnt
0,0,mydelicate,399.8K,5158,1334,74,175,14h ago,my favourite blushes and why (with shades)💓 yo...,"@YesStyle,@House,#cbeauty,#koreanmakeup,#skinc...",2025-01-21,399800.0,5158,1334,74,175
1,1,mydelicate,399.8K,24.3K,6606,246,917,3d ago,spend a productive morning with me 💓🎀 why is i...,"#kbeauty,#morningroutine,#vlog,#wonyoungism,#s...",2025-01-18,399800.0,24300,6606,246,917
2,2,mydelicate,399.8K,10.4K,1881,90,218,6d ago,for my girlies who still want to look cute whe...,"@SHEINUS,@SHEIN\nuse,#winterclothes,#coquette,...",2025-01-15,399800.0,10400,1881,90,218


In [None]:
# Inspect tiktoker_df (2): column dtypes and non-null counts.
tiktoker_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1680 entries, 0 to 1679
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Unnamed: 0    1680 non-null   int64         
 1   name          1680 non-null   object        
 2   follower      1680 non-null   object        
 3   view          1680 non-null   object        
 4   like          1680 non-null   object        
 5   comment       1680 non-null   int64         
 6   save          1680 non-null   object        
 7   date          1680 non-null   object        
 8   info          1664 non-null   object        
 9   hash_tag      1680 non-null   object        
 10  upload_date   1680 non-null   datetime64[us]
 11  follower_cnt  1680 non-null   float64       
 12  view_cnt      1680 non-null   int32         
 13  like_cnt      1680 non-null   int32         
 14  comment_cnt   1680 non-null   int32         
 15  save_cnt      1680 non-null   int32   

#### 4. 필요한 컬럼들로 데이터프레임 다시 생성

In [None]:
# Keep only the columns used from here on.
# `.copy()` makes each result an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning ("A value is trying to
# be set on a copy of a slice from a DataFrame").
keyword_df = keyword_df[['search_term','vedio_order','tiktoker_name', 'upload_date', 'like_cnt', 'comment_cnt', 'save_cnt', 'info', 'hash_tag']].copy()
tiktoker_df = tiktoker_df[['name', 'follower_cnt', 'info', 'hash_tag', 'upload_date', 'view_cnt', 'like_cnt', 'comment_cnt','save_cnt']].copy()

display(keyword_df.head(3))
display(tiktoker_df.head(3))

Unnamed: 0,search_term,vedio_order,tiktoker_name,upload_date,like_cnt,comment_cnt,save_cnt,info,hash_tag
0,clean_beauty,1,morganlkeen,2024-09-05,1563,73,894,Replying to @user8022071884889 Non-Toxic Makeu...,@user8022071884889
1,clean_beauty,2,therealkatiestone,2025-01-03,1337,102,189,#greenscreen new year new diagram #cleansers #...,"#greenscreen,#cleansers,#skincare,#beauty,#bea..."
2,clean_beauty,3,kylies.muse,2024-02-01,8057,164,5654,Replying to @MTMama3 clean makeup options! Som...,@MTMama3


Unnamed: 0,name,follower_cnt,info,hash_tag,upload_date,view_cnt,like_cnt,comment_cnt,save_cnt
0,mydelicate,399800.0,my favourite blushes and why (with shades)💓 yo...,"@YesStyle,@House,#cbeauty,#koreanmakeup,#skinc...",2025-01-21,5158,1334,74,175
1,mydelicate,399800.0,spend a productive morning with me 💓🎀 why is i...,"#kbeauty,#morningroutine,#vlog,#wonyoungism,#s...",2025-01-18,24300,6606,246,917
2,mydelicate,399800.0,for my girlies who still want to look cute whe...,"@SHEINUS,@SHEIN\nuse,#winterclothes,#coquette,...",2025-01-15,10400,1881,90,218


In [None]:
# Save both cleaned frames as CSV (index omitted, "utf-8-sig" encoding).
for final_frame, csv_path in (
    (keyword_df, 'tiktok_post_final_df.csv'),
    (tiktoker_df, 'tiktoker_final_df_0127.csv'),
):
    final_frame.to_csv(csv_path, index=False, encoding="utf-8-sig")

#### 5. 자연어 처리

In [None]:
# Load the two final CSV files back into keyword_df and tiktoker_df.
# NOTE: no parse_dates is given, so upload_date is read back as plain text.
keyword_df, tiktoker_df = (
    pd.read_csv(csv_path)
    for csv_path in ('tiktok_post_final_df.csv', 'tiktoker_final_df_0127.csv')
)

In [None]:
# Porter stemmer and English stop-word set (approach adapted from a Kaggle notebook).
# NOTE(review): PorterStemmer and stopwords are not imported in the notebook's
# import cell - presumably they come from nltk; confirm and import them.
stemmer = PorterStemmer()

# Extra domain words to drop in addition to the English stop words.
# NOTE(review): remove_stopwords compares single tokens, and clean_text
# lower-cases the text first, so the entries containing spaces or capital
# letters probably never match - confirm before relying on them.
additional_stopwords = ['clean beauty', 'glow skin', 'kbeauty skin care', 'korean skincare', 'kbeauty', 'product', 'skincare', 'skin', 'beauty',
                        'Kbeauty', 'wonyoungism ', ' wonyoungism', 'wonyoungism', ' wonyoungism ', 'koreanskincare', 'Wonyoungism']
stop_words = set(stopwords.words('english')) | set(additional_stopwords)

# Text cleaning
def clean_text(text):
    """Normalise one caption or tag string for tokenisation.

    Steps, in order: lower-case, strip URLs, strip markdown-style links
    (``[label](target)``), strip ``@mentions``, delete ASCII punctuation.
    Non-string input (e.g. NaN) is returned unchanged.

    Mentions are matched with ``@[\\w.]+`` so handles containing dots
    (e.g. ``@kylies.muse``) are removed whole instead of leaving ``muse``
    behind as a stray token.
    """
    # Local imports keep the function usable even though the notebook's
    # import cell does not import `re` or `string`.
    import re
    import string

    if not isinstance(text, str):
        return text

    # Lower-case everything
    text = text.lower()

    # Remove URLs (`http\S+` already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove markdown-style links
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)

    # Remove @mentions, including handles that contain dots
    text = re.sub(r'@[\w.]+', '', text)

    # Delete punctuation and special characters (ASCII only)
    return text.translate(str.maketrans('', '', string.punctuation))

# Tokenisation
def tokenize_text(text):
    """Split a string into tokens with ``word_tokenize``; return any other value unchanged."""
    return word_tokenize(text) if isinstance(text, str) else text

# Stop-word removal
def remove_stopwords(tokens, stopword_set=None, verbose=False):
    """Return ``tokens`` without the entries found in the stop-word set.

    Args:
        tokens: list of tokens; any non-list value is returned unchanged.
        stopword_set: collection of words to drop. Defaults to the
            module-level ``stop_words``.
        verbose: when True, print the tokens and the stop-word set. This was
            previously printed unconditionally, which wrote the whole
            stop-word set to the output once per row.
    """
    if not isinstance(tokens, list):
        return tokens
    if stopword_set is None:
        stopword_set = stop_words
    if verbose:
        print(f"Tokens: {tokens}")  # intermediate result check
        print(f"Stopwords: {stopword_set}")
    return [token for token in tokens if token not in stopword_set]

# Stemming
def stem_tokens(tokens):
    """Stem every token of a list with the module-level ``stemmer``; return any other value unchanged."""
    if not isinstance(tokens, list):
        return tokens
    stemmed = []
    for word in tokens:
        stemmed.append(stemmer.stem(word))
    return stemmed

In [None]:
# NLP pipeline for keyword_df: clean -> tokenize -> remove stop words -> stem.
# hash_tag holds tags joined by ',' with no spaces, and clean_text deletes
# punctuation, so the commas are turned into spaces first; otherwise all tags
# of a row fuse into a single token.
keyword_df['cleaned_info'] = keyword_df['info'].apply(clean_text)
keyword_df['cleaned_hash_tag'] = keyword_df['hash_tag'].apply(
    lambda tags: clean_text(tags.replace(',', ' ') if isinstance(tags, str) else tags)
)

keyword_df['tokenized_info'] = keyword_df['cleaned_info'].apply(tokenize_text)
keyword_df['tokenized_hash_tag'] = keyword_df['cleaned_hash_tag'].apply(tokenize_text)

keyword_df['removed_info'] = keyword_df['tokenized_info'].apply(remove_stopwords)
keyword_df['removed_hash_tag'] = keyword_df['tokenized_hash_tag'].apply(remove_stopwords)

keyword_df['stpw_processed_info'] = keyword_df['removed_info'].apply(stem_tokens)
keyword_df['stpw_processed_hash_tag'] = keyword_df['removed_hash_tag'].apply(stem_tokens)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  keyword_df['cleaned_info'] = keyword_df['info'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  keyword_df['cleaned_hash_tag'] = keyword_df['hash_tag'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  keyword_df['tokenized_info'] = keyword_df['cleaned_info'].apply(t

Tokens: ['replying', 'to', 'nontoxic', 'makeup', 'blushes', 'powder', 'cream', 'stick', 'cheek', 'stain', 'what', 'clean', 'swaps', 'do', 'you', 'want', 'to', 'see', 'next', 'brands', 'mentioned']
Stopwords: {'am', 'any', "isn't", 'themselves', 'up', 'for', 'of', 'i', 'too', 'beauty', 'you', 'each', 'same', 'why', 'or', 'over', 'our', 'such', 'these', "hasn't", "shan't", "haven't", 'wouldn', 'clean beauty', 'that', 'hadn', 'were', 'wonyoungism', 'until', "she's", 'by', 'at', 'will', 'isn', 'while', 'not', 'd', "that'll", 'skin', 'her', 'does', 'the', 'Kbeauty', 'aren', 'some', ' wonyoungism ', 'do', 'where', 'to', 'because', 'into', 'ours', 'hasn', 'whom', 've', 'herself', "shouldn't", "wouldn't", 'yourself', 'ma', "mightn't", 'an', "weren't", 'glow skin', 'with', 'it', 'about', 'who', 'itself', 'koreanskincare', 'few', 'in', 'theirs', 'here', 'very', "you've", "don't", 'during', 'than', ' wonyoungism', 'y', 'needn', 'shouldn', 'again', "hadn't", 'yourselves', 'himself', 'a', 'against'

In [None]:
# NLP pipeline for tiktoker_df: clean -> tokenize -> remove stop words -> stem.
# The rename only has an effect when the frame still has an 'info_tag' column.
tiktoker_df = tiktoker_df.rename(columns={'info_tag': 'info'})

# hash_tag holds tags joined by ',' with no spaces, and clean_text deletes
# punctuation, so the commas are turned into spaces first; otherwise all tags
# of a row fuse into a single token.
tiktoker_df['cleaned_info'] = tiktoker_df['info'].apply(clean_text)
tiktoker_df['cleaned_hash_tag'] = tiktoker_df['hash_tag'].apply(
    lambda tags: clean_text(tags.replace(',', ' ') if isinstance(tags, str) else tags)
)

tiktoker_df['tokenized_info'] = tiktoker_df['cleaned_info'].apply(tokenize_text)
tiktoker_df['tokenized_hash_tag'] = tiktoker_df['cleaned_hash_tag'].apply(tokenize_text)

tiktoker_df['removed_info'] = tiktoker_df['tokenized_info'].apply(remove_stopwords)
tiktoker_df['removed_hash_tag'] = tiktoker_df['tokenized_hash_tag'].apply(remove_stopwords)

tiktoker_df['stpw_processed_info'] = tiktoker_df['removed_info'].apply(stem_tokens)
tiktoker_df['stpw_processed_hash_tag'] = tiktoker_df['removed_hash_tag'].apply(stem_tokens)

Tokens: ['my', 'favourite', 'blushes', 'and', 'why', 'with', 'shades💓', 'you', 'can', 'find', 'them', 'at', 'using', 'rewards', 'cde', 'mydelicate10', '💓', 'knows', 'makeup', 'of', 'hur', 'kbeauty', 'cbeauty', 'koreanmakeup', 'skincare', 'wonyoungism', 'blush']
Stopwords: {'am', 'any', "isn't", 'themselves', 'up', 'for', 'of', 'i', 'too', 'beauty', 'you', 'each', 'same', 'why', 'or', 'over', 'our', 'such', 'these', "hasn't", "shan't", "haven't", 'wouldn', 'clean beauty', 'that', 'hadn', 'were', 'wonyoungism', 'until', "she's", 'by', 'at', 'will', 'isn', 'while', 'not', 'd', "that'll", 'skin', 'her', 'does', 'the', 'Kbeauty', 'aren', 'some', ' wonyoungism ', 'do', 'where', 'to', 'because', 'into', 'ours', 'hasn', 'whom', 've', 'herself', "shouldn't", "wouldn't", 'yourself', 'ma', "mightn't", 'an', "weren't", 'glow skin', 'with', 'it', 'about', 'who', 'itself', 'koreanskincare', 'few', 'in', 'theirs', 'here', 'very', "you've", "don't", 'during', 'than', ' wonyoungism', 'y', 'needn', 'sho