In [25]:
import requests
import pandas as pd
import time
import random
from openpyxl import load_workbook
import json
import os

In [26]:
def fetch_instagram_data(api_key, code_or_id_or_url, amount=500, pagination_token=None):
    url = "https://instagram-scraper-api2.p.rapidapi.com/v1/followers"
    querystring = {"username_or_id_or_url": code_or_id_or_url, "amount": amount, "pagination_token": pagination_token}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "instagram-scraper-api2.p.rapidapi.com"
    }
    response = requests.get(url, headers=headers, params=querystring)
    response.raise_for_status()
    return response.json()

In [27]:
def parse_instagram_data(data):
    parsed_data = []
    items = data.get("data", {}).get("items", [])
    for item in items:
        username = item.get("username", "Unknown")
        parsed_data.append({
            "User ID": item.get("id", ""),
            "Username": username,
            "Fullname": item.get("full_name",""),
            "is Private": item.get("is_private",False),
            "Profile URL": f"https://www.instagram.com/{username}/" if username else ""
        })
    return parsed_data

In [28]:
def save_data_to_excel(data, username):
    if not data:
        print("No data to save.")
        return

    # 새로운 데이터프레임 생성
    df = pd.DataFrame(data)
    filename = f'instagram_followers_{username}.xlsx'

    try:
        from openpyxl import Workbook

        # 파일이 없으면 새로 생성
        if not os.path.exists(filename):
            print(f"File does not exist. Creating new file: {filename}")
            wb = Workbook()
            ws = wb.active
            ws.title = "Data"
            wb.save(filename)

        # 파일 로드
        book = load_workbook(filename)
        writer = pd.ExcelWriter(filename, engine='openpyxl')
        writer.book = book
        writer.sheets = {ws.title: ws for ws in book.worksheets}

        # 기존 데이터 로드
        if "Data" in writer.sheets:
            try:
                existing_df = pd.read_excel(filename, sheet_name="Data", engine='openpyxl')
                if existing_df.empty or set(existing_df.columns) != set(df.columns):
                    #print("Existing data structure is incompatible. Reinitializing with new data structure.")
                    existing_df = pd.DataFrame({col: pd.Series(dtype=str) for col in df.columns})
            except Exception as e:
                print(f"Error reading existing data: {e}")
                existing_df = pd.DataFrame({col: pd.Series(dtype=str) for col in df.columns})
        else:
            print("No existing sheet found. Initializing new sheet.")
            existing_df = pd.DataFrame({col: pd.Series(dtype=str) for col in df.columns})

        # 데이터 타입 강제 변환
        df = df.astype({col: str for col in df.columns})
        existing_df = existing_df.astype({col: str for col in df.columns})

        # 중복 제거 (User ID가 있는 경우)
        if "User ID" in df.columns and "User ID" in existing_df.columns:
            existing_ids = set(existing_df["User ID"])
            df = df[~df["User ID"].isin(existing_ids)]

        # 새로운 데이터 추가
        if not df.empty:
            start_row = writer.sheets['Data'].max_row if writer.sheets['Data'].max_row else 1
            df.to_excel(writer, index=False, header=False, startrow=start_row, sheet_name="Data", engine='openpyxl')
            writer.save()
            print(f"{len(df)} new rows added to {filename}.")
        else:
            print("No new data to add. All entries are duplicates.")

    except Exception as e:
        print(f"An error occurred while saving to Excel: {e}")

In [29]:
def save_state(username, pagination_token):
    state = {"username": username, "pagination_token": pagination_token}
    with open("state.json", "w") as f:
        json.dump(state, f)

In [30]:
def load_state():
    try:
        with open("state.json", "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return None

In [31]:
def main(file_path, api_key):
    state = load_state()
    if state:
        print(f"Resuming from saved state: {state}")
        start_username = state["username"]
        pagination_token = state["pagination_token"]
    else:
        start_username = None
        pagination_token = None

    # 파일 읽기
    if file_path.endswith('.csv'):
        posts_df = pd.read_csv(file_path)
    elif file_path.endswith('.xlsx'):
        posts_df = pd.read_excel(file_path)
    else:
        print("지원되지 않는 파일 형식입니다. CSV 또는 엑셀 파일을 사용하세요.")
        return

    if 'user_name' not in posts_df.columns:
        print("파일에 'user_name' 열이 포함되어 있어야 합니다.")
        return

    # 각 사용자 이름에 대해 데이터 수집
    for _, row in posts_df.iterrows():
        username = row['user_name']
        
        # 이전에 처리 중이던 사용자부터 시작
        if start_username and username != start_username:
            continue
        start_username = None  # 첫 번째 매칭 이후 리셋

        all_data = []

        while True:
            try:
                # 데이터 가져오기
                data = fetch_instagram_data(api_key, username, 200, pagination_token)
                parsed_data = parse_instagram_data(data)
                all_data.extend(parsed_data)

                # 중간 저장
                save_data_to_excel(data=parsed_data, username=username)

                # 다음 페이지 토큰 업데이트
                pagination_token = data.get("pagination_token")

                # 상태 저장
                save_state(username, pagination_token)

                #print(f"Fetched {username} followers with pagination token {pagination_token} successfully.")

                if not pagination_token:
                    print(f"Scraped {username} followers successfully.")
                    break
                time.sleep(random.randint(1, 5))

            except Exception as e:
                print(f"An error occurred: {e}")
                # 중간 데이터 저장
                if all_data:
                    save_data_to_excel(data=all_data, username=username)
                save_state(username, pagination_token)  # 상태 저장
                break

    print("Data fetching completed.")
    # 상태 초기화
    save_state(None, None)


In [32]:
api_key = "fb1be9caf7mshc1fea79903f370fp1c0b11jsn97070d82933d"  # 실제 API 키로 교체하세요
file_path = "./success_users.xlsx"  # 게시물 URL 목록이 포함된 파일 경로

main(file_path, api_key)

Resuming from saved state: {'username': 'dadazzizzi', 'pagination_token': 'Ik1ICwEpGAdJLDMTA15RIRM1GCIsFh4-Fh0UPX0VSzRGEF8WGwkRNwxCBzgEKi02ESNEFU0CRzoxOwIFLBsBERsbXytIG0Y4BB4-NC0JGDIXChoNRitNPU4uLwAVWj8BBAIaECMRBSw_IikjPDEhAykbCBYCKjY5GDFDMwcICgEeHgJCBA1KJV4nEBQfCjoqEUAJAQ8xARxlHlQeUSsOFhAHDAQQAwIVSCNnLENeETgCJh0DKDQdNR0gT1RrbVIBVRsLKgBfOlIVOwcxGzcIfVVQEAcaC1AGLztDCFII'}
An error occurred while saving to Excel: File is not a zip file
An error occurred while saving to Excel: File is not a zip file


KeyboardInterrupt: 