## Aim: supplement the existing TikTok scraper with creator-level features

Check the DataFrame for an `authorMeta.name` column and collect its unique values. For each unique `authorMeta.name`, use an Apify actor to scrape data on the creator's features (e.g. number of followers, distribution source, watch time, completion rate, geographic reach, close/medium shots, text overlays, first-person POV), then left join the features back to `df`.

In [None]:
# pip install apify-client


Collecting apify-client
  Downloading apify_client-2.0.0-py3-none-any.whl.metadata (17 kB)
Collecting apify-shared<3.0.0,>=2.0.0 (from apify-client)
  Downloading apify_shared-2.1.0-py3-none-any.whl.metadata (14 kB)
Collecting colorama>=0.4.0 (from apify-client)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting impit>=0.5.3 (from apify-client)
  Downloading impit-0.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Downloading apify_client-2.0.0-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.9/84.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading apify_shared-2.1.0-py3-none-any.whl (16 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading impit-0.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m79.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling colle

In [None]:
from google.colab import drive
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pathlib
from collections import Counter
import os, time, json, re, requests
from tqdm.auto import tqdm

# Apify API token consumed by ApifyClient below. It is blank in this copy of
# the notebook, so a valid token has to be filled in before running.
# NOTE(review): this assignment overwrites any APIFY_TOKEN that is already set
# in the environment.
os.environ['APIFY_TOKEN'] = ''

from apify_client import ApifyClient
# Module-level client; later cells call client.actor(...) and client.dataset(...).
client = ApifyClient(os.environ['APIFY_TOKEN'])

# Fetch the account record, plan limits and monthly usage for the token's user.
# The results are not read anywhere in the visible cells — presumably a manual
# sanity check of the token/quota before scraping (TODO confirm).
me      = client.user().get()
limits  = client.user().limits()
usage   = client.user().monthly_usage()

In [None]:
# Mount Google Drive (Colab) and change into the project folder so that the
# relative file names used by the following cells resolve inside it.
drive.mount('/content/drive')
%cd /content/drive/My Drive/FYP/

Mounted at /content/drive
/content/drive/My Drive/FYP


In [None]:
# Load the scraped TikTok items from JSON.
# JSON text is UTF-8, and captions/usernames routinely contain emoji and other
# non-ASCII characters, so decode explicitly instead of relying on the
# platform's default encoding.
with open("tiktok_general_catmum.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(data)

#uncomment to convert
# # Save as CSV
# df.to_csv("tiktok_general_data.csv", index=False)

In [None]:
# !pip -q install apify-client tqdm
# !pip -q install tqdm

In [None]:
# Make sure a flat 'authorMeta.name' column exists. If the export kept
# 'authorMeta' as a nested dict column, pull the 'name' field out of it.
if 'authorMeta.name' not in df.columns:
    # The all-None fallback is aligned to df's index so the assignment below
    # lines up even when df does not have a default RangeIndex.
    author_meta = df.get(
        'authorMeta',
        pd.Series([None] * len(df), index=df.index, dtype=object),
    )
    df['authorMeta.name'] = author_meta.apply(
        lambda x: x.get('name') if isinstance(x, dict) else None
    )

# Normalise usernames: trim whitespace and drop any leading '@'.
raw_names = df['authorMeta.name']
clean_names = raw_names.astype(str).str.strip().str.lstrip('@')
# Restore missing values from the ORIGINAL null positions. astype(str) turns
# None/NaN into the literal strings 'None'/'nan'; masking on the original
# nulls keeps those out of the username list while still allowing a creator
# whose handle really is "None" or "nan". Blank handles are treated as missing.
df['authorMeta.name'] = clean_names.mask(raw_names.isna() | (clean_names == ''))

usernames = df['authorMeta.name'].dropna().unique().tolist()
print(f"Unique creators detected: {len(usernames)}")

# Client used by fetch_one_row_per_creator below.
client = ApifyClient(os.environ['APIFY_TOKEN'])

# helpers: extract embedded profile stats from each video item
def _safe_get(d, *path):
    cur = d
    for k in path:
        if not isinstance(cur, dict):
            return None
        cur = cur.get(k)
    return cur

def extract_creator_row(item: dict) -> dict | None:
    uname = _safe_get(item, "authorMeta", "name") or _safe_get(item, "user", "uniqueId") or item.get("username")
    if not uname:
        return None

    followers = (
        _safe_get(item, "authorMeta", "fans")
        or _safe_get(item, "authorStats", "followerCount")
        or _safe_get(item, "stats", "followerCount")
        or item.get("followers")
    )
    following = (
        _safe_get(item, "authorMeta", "following")
        or _safe_get(item, "authorStats", "followingCount")
        or _safe_get(item, "stats", "followingCount")
        or item.get("following")
    )
    likes_total = (
        _safe_get(item, "authorMeta", "heart")
        or _safe_get(item, "authorStats", "heartCount")
        or _safe_get(item, "stats", "heartCount")
        or item.get("likes") or item.get("hearts")
    )
    videos = (
        _safe_get(item, "authorMeta", "video")
        or _safe_get(item, "authorStats", "videoCount")
        or _safe_get(item, "stats", "videoCount")
        or item.get("videos")
    )
    verified = (
        _safe_get(item, "authorMeta", "verified")
        or _safe_get(item, "user", "verified")
        or item.get("verified")
    )

    return {
        "authorMeta.name": str(uname).lstrip("@"),
        "creator_followers": pd.to_numeric(followers, errors="coerce"),
        "creator_following": pd.to_numeric(following, errors="coerce"),
        "creator_likes_total": pd.to_numeric(likes_total, errors="coerce"),
        "creator_videos": pd.to_numeric(videos, errors="coerce"),
        "creator_verified": bool(verified) if verified is not None else pd.NA,
    }


# run the actor with 1 video to read profile stats
def fetch_one_row_per_creator(usernames, batch_size=50, pause_s=1.5) -> pd.DataFrame:
    """Scrape one profile-stats row per creator via the Apify profile actor.

    Usernames are sent to the ``clockworks/tiktok-profile-scraper`` actor in
    batches, requesting a single item per profile with all media downloads
    disabled. Each returned item is reduced to a creator row by
    ``extract_creator_row``.

    Args:
        usernames: Sequence of TikTok usernames (sliceable, e.g. a list).
        batch_size: Number of profiles submitted per actor run.
        pause_s: Seconds to sleep between consecutive actor runs.

    Returns:
        DataFrame with one row per creator (case-insensitive on username) and
        a fixed set of columns, even when nothing was scraped.
    """
    columns = [
        "authorMeta.name",
        "creator_followers",
        "creator_following",
        "creator_likes_total",
        "creator_videos",
        "creator_verified",
    ]
    rows_by_user = {}  # lower-cased username -> row
    for i in tqdm(range(0, len(usernames), batch_size)):
        batch = usernames[i:i+batch_size]
        run = client.actor("clockworks/tiktok-profile-scraper").call(
            run_input={
                "profiles": batch,
                "resultsPerPage": 1, #1 item per profile
                "shouldDownloadVideos": False,
                "shouldDownloadCovers": False,
                "shouldDownloadSlideshowImages": False,
                "shouldDownloadSubtitles": False,
            }
        )
        # call() may hand back no run object; skip the batch instead of
        # crashing so rows gathered from earlier (paid) runs are not lost.
        if not run or not run.get("defaultDatasetId"):
            print(f"Actor run returned no dataset for batch starting at index {i}; skipping {len(batch)} profiles")
        else:
            ds = client.dataset(run["defaultDatasetId"])
            for item in ds.iterate_items():
                row = extract_creator_row(item)
                if not row or not row["authorMeta.name"]:
                    continue
                # Key case-insensitively: the downstream join lower-cases the
                # username, so case variants must collapse into one row here.
                key = row["authorMeta.name"].lower()
                old = rows_by_user.get(key)
                # Keep the first row seen; replace it only when the new row has
                # a larger follower count (assumed by the author to be the more
                # recent snapshot).
                if old is None or (
                    pd.notna(row["creator_followers"])
                    and (pd.isna(old["creator_followers"]) or row["creator_followers"] > old["creator_followers"])
                ):
                    rows_by_user[key] = row
        # Pause only between runs; nothing follows the last batch.
        if i + batch_size < len(usernames):
            time.sleep(pause_s)
    # Explicit columns keep the schema stable when no rows were collected.
    out = pd.DataFrame(list(rows_by_user.values()), columns=columns)
    return out

# Scrape one stats row per unique creator.
features_df = fetch_one_row_per_creator(usernames)

# # add placeholders for non-public analytics
# for col in [
#     "distribution_source",   # e.g., FYP vs Following (not public)
#     "watch_time",            # private analytics
#     "completion_rate",       # private analytics
#     "geographic_reach",      # private analytics
#     "close_or_medium_shots", # needs CV on videos
#     "has_text_overlays",     # needs OCR on frames
#     "first_person_pov"       # needs CV heuristics
# ]:
#     features_df[col] = pd.NA

print("features_df (deduped, one row per creator):", features_df.shape)
display(features_df.head(3))

# left join back to your original df on a case-insensitive username key
features_df["__join_key__"] = features_df["authorMeta.name"].str.lower()
# Keep missing usernames missing: astype(str) alone would turn NaN into the
# string 'nan', which could match a creator whose handle is literally "nan".
df["__join_key__"] = (
    df["authorMeta.name"].astype(str).str.lower().where(df["authorMeta.name"].notna())
)

# One feature row per join key. Without this, usernames differing only in case
# would share a key and the left join would duplicate the matching video rows.
creator_features = (
    features_df
    .drop(columns=["authorMeta.name"])
    .drop_duplicates(subset="__join_key__", keep="first")
)

df_merged = df.merge(
    creator_features,
    on="__join_key__", how="left"
).drop(columns="__join_key__")

print("Merged shape:", df_merged.shape)
# df_merged.to_parquet("tiktok_general_with_creator_features.parquet", index=False)
df_merged.to_csv("tiktok_general_with_creator_features.csv", index=False)

Unique creators detected: 732


  0%|          | 0/15 [00:00<?, ?it/s]

[36m[apify.tiktok-profile-scraper runId:NbOsIfBylT6p8U3MP][0m -> Status: RUNNING, Message: 
[36m[apify.tiktok-profile-scraper runId:NbOsIfBylT6p8U3MP][0m -> 2025-09-09T08:20:51.866Z ACTOR: Pulling Docker image of build ReRPZBhgCxpRFSIAU from registry.
[36m[apify.tiktok-profile-scraper runId:NbOsIfBylT6p8U3MP][0m -> 2025-09-09T08:20:51.870Z ACTOR: Creating Docker container.
[36m[apify.tiktok-profile-scraper runId:NbOsIfBylT6p8U3MP][0m -> 2025-09-09T08:20:52.026Z ACTOR: Starting Docker container.
[36m[apify.tiktok-profile-scraper runId:NbOsIfBylT6p8U3MP][0m -> 2025-09-09T08:20:52.249Z Will run command: xvfb-run -a -s "-ac -screen 0 1920x1080x24+32 -nolisten tcp" /bin/sh -c ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
[36m[apify.tiktok-profile-scraper runId:NbOsIfBylT6p8U3MP][0m -> 2025-09-09T08:20:54.175Z [32mINFO[39m  System info[90m {"apifyVersion":"3.4.2","apifyClientVersion":"2.12.6","crawleeVersion":"3.13.9","osType":"Linux","nodeVersion":"v20.19.4"}[39

features_df (deduped, one row per creator): (732, 6)


Unnamed: 0,authorMeta.name,creator_followers,creator_following,creator_likes_total,creator_videos,creator_verified
0,pet_health_daily,160800,66.0,749800,326,
1,zoomies.official,112800,5.0,1500000,38,
2,julissa__xv,3878,692.0,288900,19,


Merged shape: (998, 19)
