<a href="https://colab.research.google.com/github/mabdulghofur142/Analisis-Media-Sosial/blob/main/Mid%20Project%20AMS/Get_Data_Youtube_No_duplicates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-api-python-client
!pip install unidecode

In [None]:
from googleapiclient.discovery import build
import pandas as pd

# === CONFIGURATION ===
api_key = 'Your API'  # placeholder; replace with a real YouTube Data API key
keywords = ['keyword1', 'keyword2']  # search terms, each queried separately
max_results = 3 # value sent as maxResults for each keyword search

# API client shared by every helper function below.
youtube = build('youtube', 'v3', developerKey=api_key)

# Module-level accumulators that the helper functions append to.
video_info_list = []
top_comments = []
replies = []

# === FUNCTIONS ===
def search_videos_by_keyword(keywords, max_results=5):
    """Search YouTube for each keyword and collect the matching video IDs.

    One ``search().list`` request restricted to ``type='video'`` is sent per
    keyword through the module-level ``youtube`` client, and the number of
    IDs found for that keyword is printed.

    Args:
        keywords: Iterable of search terms.
        max_results: Value forwarded as ``maxResults`` on every request.

    Returns:
        A ``set`` of the video IDs found across all keywords, so an ID
        returned for several keywords appears only once.
    """
    collected = []
    for keyword in keywords:
        request = youtube.search().list(
            q=keyword,
            part='id',
            type='video',
            maxResults=max_results
        )
        found = []
        for entry in request.execute().get('items', []):
            # Only keep results whose id part actually carries a videoId.
            id_part = entry.get('id', {})
            if 'videoId' in id_part:
                found.append(id_part['videoId'])
        collected.extend(found)
        print(f"Ditemukan {len(found)} video untuk keyword '{keyword}'")
    return set(collected)


def get_video_info(video_id):
    """Fetch title, uploader and statistics for a single video.

    Sends a ``videos().list`` request for ``snippet`` and ``statistics``
    through the module-level ``youtube`` client.

    Args:
        video_id: ID of the video to look up.

    Returns:
        A dict with the keys ``video_id``, ``title``, ``description``,
        ``uploader``, ``upload_date``, ``view_count``, ``like_count`` and
        ``comment_count`` (fields missing from the response are ``None``),
        or an empty dict when the response contains no items.
    """
    request = youtube.videos().list(
        part='snippet,statistics',
        id=video_id
    )
    items = request.execute()['items']
    if not items:
        return {}

    first = items[0]
    snippet, stats = first['snippet'], first['statistics']

    # (output column, key in the API payload) pairs, in output order.
    snippet_fields = (
        ('title', 'title'),
        ('description', 'description'),
        ('uploader', 'channelTitle'),
        ('upload_date', 'publishedAt'),
    )
    stats_fields = (
        ('view_count', 'viewCount'),
        ('like_count', 'likeCount'),
        ('comment_count', 'commentCount'),
    )

    info = {'video_id': video_id}
    for column, key in snippet_fields:
        info[column] = snippet.get(key)
    for column, key in stats_fields:
        info[column] = stats.get(key)
    return info

def _reply_row(reply, parent_id):
    """Flatten one reply resource into the row layout used for replies."""
    details = reply['snippet']
    return {
        'reply_id': reply['id'],
        'parent_id': parent_id,
        'author': details.get('authorDisplayName'),
        'text': details.get('textOriginal'),
        'likes': details.get('likeCount'),
        'published': details.get('publishedAt'),
        'updated': details.get('updatedAt')
    }


def get_all_replies(parent_id):
    """Return every reply to a top-level comment, following pagination.

    Requests pages of up to 100 replies from ``comments().list`` via the
    module-level ``youtube`` client until the response no longer carries a
    ``nextPageToken``.

    Args:
        parent_id: ID of the top-level comment whose replies are wanted.

    Returns:
        A list of dicts with the keys ``reply_id``, ``parent_id``,
        ``author``, ``text``, ``likes``, ``published`` and ``updated``.
    """
    collected = []
    page_token = None
    while True:
        page = youtube.comments().list(
            part="snippet",
            parentId=parent_id,
            maxResults=100,
            pageToken=page_token,
            textFormat="plainText"
        ).execute()

        for reply in page.get("items", []):
            collected.append(_reply_row(reply, parent_id))

        page_token = page.get("nextPageToken")
        if not page_token:
            return collected

def get_all_comments(video_id, cek_replies = True):
    """Collect every top-level comment of a video, optionally with replies.

    Pages through ``commentThreads().list`` (100 threads per page) using the
    module-level ``youtube`` client. Each top-level comment is appended to
    the module-level ``top_comments`` list; when ``cek_replies`` is true the
    replies returned by ``get_all_replies`` are appended to the module-level
    ``replies`` list.

    Args:
        video_id: ID of the video whose comment threads are fetched.
        cek_replies: When true, also fetch the replies of each thread.

    Returns:
        None. Results are written to ``top_comments`` and ``replies``.
    """
    next_page_token = None
    while True:
        response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=100,
            pageToken=next_page_token,
            textFormat='plainText'
        ).execute()

        for item in response.get("items", []):
            try:
                thread = item['snippet']
                top = thread['topLevelComment']
                s = top['snippet']
                comment_id = top['id']

                top_comment_data = {
                    'comment_id': comment_id,
                    'video_id': video_id,
                    'author': s.get('authorDisplayName'),
                    'text': s.get('textOriginal'),
                    'likes': s.get('likeCount'),
                    'published': s.get('publishedAt'),
                    'updated': s.get('updatedAt')
                }
                top_comments.append(top_comment_data)

                # Skip the extra API request when the thread reports zero
                # replies. If the count is absent (None) we still fetch, so
                # nothing is lost when the field is missing.
                if cek_replies and thread.get('totalReplyCount') != 0:
                    replies.extend(get_all_replies(comment_id))

            except KeyError:
                # Best-effort: an item lacking an expected field is skipped
                # instead of aborting the whole video.
                continue

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

# === MAIN PROCESS ===
# Collect the de-duplicated video IDs for all configured keywords.
video_ids = search_videos_by_keyword(keywords, max_results=max_results)
print(f"Ditemukan {len(video_ids)} video untuk keyword '{keywords}'")
print(video_ids)
for vid in video_ids:
    try:
        print(f"Mengambil data video: {vid}")
        info = get_video_info(vid)
        # An empty dict means the API returned no item for this ID; skip it.
        if info:
            video_info_list.append(info)
            get_all_comments(vid, cek_replies=False) # pass cek_replies=True to also fetch the replies to each comment
    except Exception as e:
        # Report the failure and continue with the next video.
        print(f"Error saat memproses video {vid}: {e}")

# === SAVE TO EXCEL ===
# Turn the accumulated lists of dicts into one DataFrame each.
df_video = pd.DataFrame(video_info_list)
df_top = pd.DataFrame(top_comments)
df_replies = pd.DataFrame(replies)

# Write the video info and top-level comments as separate sheets of one workbook.
output_file = "Filename.xlsx"
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    df_video.to_excel(writer, sheet_name='VideoInfo', index=False)
    df_top.to_excel(writer, sheet_name='TopComments', index=False)
    # df_replies.to_excel(writer, sheet_name='Replies', index=False) # enable to also save the comment replies

print(f"✅ Data disimpan ke file Excel: {output_file}")


In [None]:
import pandas as pd

# Read the Excel file written by the previous cell
file_path = output_file

# Load each sheet into its own DataFrame
df_video = pd.read_excel(file_path, sheet_name='VideoInfo')
df_top = pd.read_excel(file_path, sheet_name='TopComments')
# df_replies = pd.read_excel(file_path, sheet_name='Replies') # enable if the comment replies were saved earlier

In [None]:
# Helper functions used for text cleaning
import re
import pandas as pd
from html import unescape # Import unescape from html module
from unidecode import unidecode # Import unidecode

def get_teks(teks):
    """Strip ``@mention`` and ``#hashtag`` tokens from a piece of text.

    Args:
        teks: The text to clean.

    Returns:
        The text with mentions and hashtags removed and surrounding
        whitespace trimmed, or ``''`` when ``teks`` is not a string.
    """
    if not isinstance(teks, str):
        return ''

    cleaned = teks
    # Mentions first, then hashtags; trim after each removal.
    for pattern in (r'@\w+', r'#\w+'):
        cleaned = re.sub(pattern, '', cleaned).strip()
    return cleaned

def remove_links_email(teks):
    """Replace http(s) links and e-mail addresses in a text with a space.

    Args:
        teks: The text to clean.

    Returns:
        The text, trimmed first and then with every link and e-mail address
        replaced by a single space, or ``''`` when ``teks`` is not a string.
    """
    if not isinstance(teks, str):
        return ''

    url_re = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    email_re = re.compile(r'[\w._%+-]+@[\w\.-]+\.[a-zA-Z]{2,4}')

    # Links are removed before e-mail addresses.
    without_links = url_re.sub(' ', teks.strip())
    return email_re.sub(' ', without_links)


def unidecode_text(text):
    """Decode HTML entities and transliterate the text to ASCII.

    Entities are unescaped BEFORE transliteration. In the opposite order an
    entity such as ``&eacute;`` passes through ``unidecode`` untouched
    (it is already ASCII) and is then expanded by ``unescape`` into a
    non-ASCII character, so the result would not be ASCII after all.

    Args:
        text: The text to normalise.

    Returns:
        The unescaped, transliterated text, or ``''`` when ``text`` is not a
        string.
    """
    if isinstance(text, str):
        return unidecode(unescape(text))
    return ''  # non-string values (e.g. NaN) are treated as empty text

def lower(text):
    """Return ``text`` lower-cased.

    Non-string values yield ``''``, matching the other cleaning helpers in
    this cell, instead of raising ``AttributeError``.

    Args:
        text: The text to lower-case.

    Returns:
        The lower-cased text, or ``''`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        return text.lower()
    return ''

# Replace missing comment text (NaN after the Excel round trip) with empty strings.
df_top['text'] = df_top['text'].fillna('')

# Remove links and e-mail addresses BEFORE mentions: get_teks deletes every
# "@word", which would cut "user@domain.com" down to "user.com" and keep the
# e-mail pattern from ever matching.
df_top['teks_new'] = df_top['text'].apply(remove_links_email)
df_top['teks_new'] = df_top['teks_new'].apply(get_teks)
df_top['teks_new'] = df_top['teks_new'].apply(unidecode_text)
# Lower-cased copy of the cleaned text.
df_top['teks_new_lower'] = df_top['teks_new'].apply(lower)
df_top.info()

In [None]:
# Keep only the first row for each (author, lower-cased cleaned text) pair.
df_top_uniq = df_top.drop_duplicates(subset=['author', 'teks_new_lower'], keep='first')
# df_top_uniq = df_top_uniq.drop(columns=['teks_new', 'teks_new_lower'])
df_top_uniq.info()

In [None]:
# Write the video info and the de-duplicated comments to a second workbook.
output_file = "Filename_uniq.xlsx"
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
    df_video.to_excel(writer, sheet_name='VideoInfo', index=False)
    df_top_uniq.to_excel(writer, sheet_name='TopComments', index=False)
