In [1]:
import pandas as pd
import os
import emoji
import regex as re

In [2]:
folder_path = "../dataset/clockworks-scraper"

In [3]:
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

In [4]:
dfs = [pd.read_csv(os.path.join(folder_path, file), encoding='utf-8') for file in csv_files]
combined_df = pd.concat(dfs, ignore_index=True)

In [5]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23043 entries, 0 to 23042
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   avatarThumbnail    23039 non-null  object 
 1   cid                23042 non-null  float64
 2   createTime         23042 non-null  float64
 3   createTimeISO      23042 non-null  object 
 4   diggCount          23042 non-null  float64
 5   repliesToId        4430 non-null   float64
 6   replyCommentTotal  18612 non-null  float64
 7   submittedVideoUrl  23042 non-null  object 
 8   text               23042 non-null  object 
 9   uid                23042 non-null  float64
 10  uniqueId           23039 non-null  object 
 11  videoWebUrl        23042 non-null  object 
 12  error              1 non-null      object 
 13  input              13744 non-null  object 
 14  likedByAuthor      13744 non-null  object 
 15  pinnedByAuthor     11235 non-null  object 
 16  url                1 n

In [6]:
# Remove NaN values before processing
data = combined_df.dropna(subset=['text'])

# Text Normalization - Convert to lowercase
data['text'] = data['text'].astype(str).str.lower()

# Removing Mentions, Emojis, and Emoticons
def remove_mentions(text):
    return re.sub(r'@\w+', '', text)

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

def remove_emoticons(text):
    emoticon_pattern = r'[:;=8xX][-^]?[\)DPOp\*\(\|\\\/ ]'
    return re.sub(emoticon_pattern, '', text)

def majority_special_chars(text):
    special_chars = re.sub(r'[a-zA-Z0-9\s]', '', text)  # Extract special characters
    return len(special_chars) > len(text) / 2  # Check if majority are special chars

def clean_tiktok_text(text):
    if pd.isna(text):  # Handle missing values
        return ""
    text = remove_mentions(text)  # Remove mentions
    text = remove_emojis(text)    # Remove emojis
    text = remove_emoticons(text) # Remove ASCII emoticons
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply cleaning function
data['text'] = data['text'].apply(clean_tiktok_text)

# Remove rows where 'text' is empty after cleaning
data = data[data['text'].str.strip() != ""]

# Remove rows where 'text' has 4 characters or less
data = data[data['text'].str.len() > 4]

# Remove rows where the majority of characters are special characters
data = data[~data['text'].apply(majority_special_chars)]

# Remove duplicate text entries
data = data.drop_duplicates(subset=['text'])

# Keep only required columns
data = data[['cid', 'submittedVideoUrl', 'text']]

# Display cleaned text
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].astype(str).str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = data['text'].apply(clean_tiktok_text)


Unnamed: 0,cid,submittedVideoUrl,text
0,7.42429e+18,https://www.tiktok.com/@sundarindah/video/7424...,sayangnya orang-orang terdekat yang justru mem...
1,7.424297e+18,https://www.tiktok.com/@sundarindah/video/7424...,justru malah org trdekat yg menghancurkan ment...
2,7.467563e+18,https://www.tiktok.com/@sundarindah/video/7424...,halo ka indah sundari
3,7.424316e+18,https://www.tiktok.com/@sundarindah/video/7424...,justru orang terdekat malah jadi penyebabnya
4,7.455553e+18,https://www.tiktok.com/@sundarindah/video/7424...,ka tolong aku


In [7]:
cleaned_df = data

In [8]:
cleaned_df.duplicated(subset=['text']).sum()

0

In [9]:
output_filename = "cleaned_dataset.csv"
cleaned_df.to_csv(output_filename, index=False, encoding='utf-8')

In [10]:
print(f"Dataset bersih telah disimpan di {output_filename}")
cleaned_df.head()

Dataset bersih telah disimpan di cleaned_dataset.csv


Unnamed: 0,cid,submittedVideoUrl,text
0,7.42429e+18,https://www.tiktok.com/@sundarindah/video/7424...,sayangnya orang-orang terdekat yang justru mem...
1,7.424297e+18,https://www.tiktok.com/@sundarindah/video/7424...,justru malah org trdekat yg menghancurkan ment...
2,7.467563e+18,https://www.tiktok.com/@sundarindah/video/7424...,halo ka indah sundari
3,7.424316e+18,https://www.tiktok.com/@sundarindah/video/7424...,justru orang terdekat malah jadi penyebabnya
4,7.455553e+18,https://www.tiktok.com/@sundarindah/video/7424...,ka tolong aku


In [11]:
cleaned_df.info()
print(f"Dataset bersih telah disimpan di {output_filename}")
print(f"Jumlah data sebelum: {combined_df.shape[0]}, setelah: {cleaned_df.shape[0]}")

<class 'pandas.core.frame.DataFrame'>
Index: 14021 entries, 0 to 23041
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cid                14021 non-null  float64
 1   submittedVideoUrl  14021 non-null  object 
 2   text               14021 non-null  object 
dtypes: float64(1), object(2)
memory usage: 438.2+ KB
Dataset bersih telah disimpan di cleaned_dataset.csv
Jumlah data sebelum: 23043, setelah: 14021
