In [9]:
import os
import json
from google.oauth2 import service_account
from googleapiclient.discovery import build
import pandas as pd
import gspread
# from gspread_dataframe import set_with_dataframe
# from io import BytesIO
# import openpyxl
import re
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Read the service-account key path and the comma-separated OAuth scopes.
SERVICE_ACCOUNT_FILE = os.getenv("SERVICE_ACCOUNT_FILE")
_raw_scopes = os.getenv("API_SCOPES")

# Fail early with a readable message instead of an AttributeError on
# `None.split(...)` or an obscure error inside the credentials loader.
if not SERVICE_ACCOUNT_FILE or not _raw_scopes:
    raise RuntimeError(
        "SERVICE_ACCOUNT_FILE and API_SCOPES must both be set "
        "(for example in the .env file)."
    )

# Tolerate spaces around the commas and ignore empty entries ("a, b,").
SCOPES = [scope.strip() for scope in _raw_scopes.split(",") if scope.strip()]

credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE,
    scopes=SCOPES
)

# Setup Drive and Sheets API clients
drive_service = build('drive', 'v3', credentials=credentials)
gc = gspread.authorize(credentials)

In [8]:
def list_google_sheets_in_first_level_subfolders(drive_service, root_folder_id):
    """List the Google Sheets located directly inside each first-level subfolder.

    Args:
        drive_service: Drive API client exposing ``files().list(...).execute()``.
        root_folder_id: ID of the folder whose direct subfolders are scanned.

    Returns:
        A list of dicts with the keys ``id`` and ``name`` (the sheet) and
        ``folder`` and ``folder_id`` (the subfolder that contains it).

    Side effects:
        Prints the number of subfolders that were found.
    """

    def _list_children(parent_id, mime_type):
        # List every child of `parent_id` with the given MIME type.
        # `files.list` answers one page at a time, so keep requesting pages
        # until the response no longer carries a `nextPageToken`; otherwise
        # everything past the first page is silently lost.
        query = f"'{parent_id}' in parents and mimeType='{mime_type}'"
        found = []
        page_token = None
        while True:
            response = drive_service.files().list(
                q=query,
                spaces='drive',
                fields='nextPageToken, files(id, name)',
                pageToken=page_token
            ).execute()
            found.extend(response.get("files", []))
            page_token = response.get("nextPageToken")
            if not page_token:
                return found

    sheets = []

    # Step 1: Get first-level sub-folders
    subfolders = _list_children(root_folder_id, 'application/vnd.google-apps.folder')

    print(f"📁 Found {len(subfolders)} subfolders")

    # Step 2: In each subfolder, get Google Sheets
    for folder in subfolders:
        folder_id = folder['id']
        folder_name = folder['name']

        for sheet in _list_children(folder_id, 'application/vnd.google-apps.spreadsheet'):
            sheets.append({
                "id": sheet["id"],
                "name": sheet["name"],
                "folder": folder_name,
                "folder_id": folder_id
            })

    return sheets

In [13]:
# Load environment variables from the .env file
load_dotenv()

# ID of the Drive folder whose first-level subfolders are scanned.
DRIVER_ROOT_FOLDER_ID = os.getenv("DRIVER_ROOT_FOLDER_ID")
if not DRIVER_ROOT_FOLDER_ID:
    # Without this check the Drive query would be built as "'None' in parents".
    raise RuntimeError("DRIVER_ROOT_FOLDER_ID must be set (for example in the .env file).")

sheets = list_google_sheets_in_first_level_subfolders(
    drive_service, DRIVER_ROOT_FOLDER_ID
)

print(f"✅ Found {len(sheets)} Google Sheets:")

# Keep every sheet whose name contains the marker word. The check depends only
# on the sheet name, so each sheet is tested exactly once; several matching
# sheets from the same folder are all kept.
dates = [
    {
        "file_name": s["name"],
        "folder_name": s["folder"],
        "id": s["id"],
    }
    for s in sheets
    if "תגובות" in s["name"]
]
party_files_counter = len(dates)


print(f"✅ Found {party_files_counter} Google Sheets with 'תגובות' in their names:")
# print(dates)

📁 Found 30 subfolders
✅ Found 33 Google Sheets:
✅ Found 20 Google Sheets with 'תגובות' in their names:


In [17]:
# Load environment variables from the .env file
load_dotenv()

CACHED_FILE_NAME = os.getenv("CACHED_FILE_NAME")
if not CACHED_FILE_NAME:
    # os.path.join would otherwise fail with a TypeError on None.
    raise RuntimeError("CACHED_FILE_NAME must be set (for example in the .env file).")

# Initialize an empty set to store unique column names
columns_set = set()

# Define the cache file (resolved against the current working directory)
CACHE_FILE = os.path.join(os.getcwd(), CACHED_FILE_NAME)

# Load the cache if it exists. The cache is written with ensure_ascii=False,
# so it is read and written explicitly as UTF-8 instead of relying on the
# platform's default encoding.
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r", encoding="utf-8") as f:
        cache = json.load(f)
else:
    cache = {}

# Iterate through all files in the dates object
for sheet_info in dates:
    file_id = sheet_info["id"]
    folder_name = sheet_info["folder_name"]

    # Check if the file is already cached (cache layout: folder name -> file id -> rows)
    if folder_name in cache and file_id in cache[folder_name]:
        print(f"Loading data for file '{sheet_info['folder_name']}' from cache.")
        data = cache[folder_name][file_id]
    else:
        print(f"Fetching data for file '{sheet_info['folder_name']}' from Google Sheets.")

        # Open the Google Sheet by ID
        sheet = gc.open_by_key(file_id)

        # Access the first worksheet
        worksheet = sheet.get_worksheet(0)  # 0 is the index of the first worksheet

        # Fetch all rows at once (including headers)
        all_rows = worksheet.get_all_values()  # Single API call to fetch all data

        # First row is assumed to be the header row; an empty worksheet has none.
        headers = all_rows[0] if all_rows else []
        print(f"Headers in sheet '{sheet_info['folder_name']}': {headers}")

        # Map each remaining row to the headers.
        # NOTE: repeated header names (e.g. several blank headers) collapse to
        # a single key, keeping the value of the last such column.
        data = [dict(zip(headers, row)) for row in all_rows[1:]]

        # Cache the data
        cache.setdefault(folder_name, {})[file_id] = data

        # Save the updated cache right away so a later failure keeps earlier fetches
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(cache, f, ensure_ascii=False, indent=4)

    # Collect column names for cached and freshly fetched sheets alike;
    # doing this only on the fetch path leaves the set empty on cached runs.
    for row in data:
        columns_set.update(row.keys())

# Print the unique column names
print("Unique columns across all sheets:", columns_set)

CACHED_FILE_NAME cached_sheets.json
Loading data for file 'Douze Snapua 8/5/25' from cache.
Loading data for file 'PRIDE & LEGACY 20.3.25' from cache.
Loading data for file 'Gay Haifa 23.1.25' from cache.
Loading data for file 'Divas Party 28/11/24' from cache.
Loading data for file 'halloween party 31/10' from cache.
Loading data for file '5/9/24 Tropical Party' from cache.
Loading data for file 'Pride 2024' from cache.
Loading data for file 'making history 16.5.24' from cache.
Loading data for file 'יש מגדר 28/3/24' from cache.
Loading data for file '2 queer 2 handle   ' from cache.
Loading data for file 'New Year Party 2024' from cache.
Loading data for file 'holigay party 28.9.23' from cache.
Loading data for file 'Summerparty 3.8.23' from cache.
Loading data for file 'PRIDE 15\6\23' from cache.
Loading data for file 'May the fourth Parrty 4/5/23' from cache.
Loading data for file 'אין מגדר תרקדו 30.3.23' from cache.
Loading data for file 'אין יומולדת 16.2' from cache.
Loading data

In [137]:
# Load the cached sheets data
with open(CACHE_FILE, "r") as f:
    cached_sheets = json.load(f)

df_list = []

# Iterate through each file ID and its corresponding data
for folder_name, files in cached_sheets.items():
    for file_id, rows in files.items():
        # Flatten the rows for the current file ID
        df = pd.json_normalize(rows)
        df["file_id"] = file_id  # Add a column for the file ID
        df["folder_name"] = folder_name  # Add a column for the file ID
        df_list.append(df)

# Concatenate all DataFrames into one
final_df = pd.concat(df_list, ignore_index=True)

# Remove columns with empty or whitespace-only headers
final_df = final_df.loc[:, ~(final_df.columns.str.strip() == "")]

# Remove columns where all values are either NaN or empty strings
final_df = final_df.loc[:, ~(final_df.isna() | (final_df == '')).all(axis=0)]

def merge_and_rename(content, new_column_name, df):
    """Merge every column whose name contains ``content`` into one text column.

    Args:
        content: Substring to look for in the column names (case-insensitive).
        new_column_name: Name of the column that receives the merged text.
        df: Source DataFrame; it is not modified.

    Returns:
        A new DataFrame in which the matching columns are replaced by
        ``new_column_name``. Each merged cell holds the non-empty values of the
        matching columns, stripped and joined with single spaces, or ``""``
        when the row has no value in any of them (or when nothing matched).
    """
    needle = content.lower()
    matching = [col for col in df.columns if needle in str(col).lower()]

    if matching:
        # Skip missing/blank cells so the result has no leading, trailing or
        # doubled spaces (each row usually has a value in only one column).
        merged = [
            " ".join(
                text
                for text in (str(value).strip() for value in row if not pd.isna(value))
                if text
            )
            for row in df[matching].itertuples(index=False, name=None)
        ]
    else:
        merged = [""] * len(df)

    # Drop on a new frame so the caller's DataFrame is left untouched, and keep
    # the target column itself when its name happens to contain `content`
    # (otherwise the freshly merged column would be dropped again).
    result = df.drop(columns=[col for col in matching if col != new_column_name])
    result[new_column_name] = merged
    return result


# Collapse the per-form column variants into one column each.
# Every pair is (substring searched in the column names, merged column name):
# names, special requests / notes, and song requests, applied in that order.
for needle, merged_name in (
    ("שם", "full_name"),
    ("הערות", "special_requests"),
    ("song", "song_requests"),
):
    final_df = merge_and_rename(needle, merged_name, final_df)

# Give the remaining Hebrew headers English names
header_translations = {
    "חותמת זמן": "timestamp",
    "הגיע.ה?": "arrived",
}
final_df = final_df.rename(columns=header_translations)

# Display the resulting DataFrame
final_df.head()

Unnamed: 0,timestamp,file_id,folder_name,arrived,full_name,special_requests,song_requests
0,25/04/2025 09:58:03,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,בדיקה,,צ'ה צ'ה צ'ה
1,25/04/2025 12:17:23,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,מורן ענתבי,,
2,25/04/2025 12:23:05,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,מיכל שולדמן,Firstמגיבים לראשונים?,
3,25/04/2025 12:37:56,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,עדי ויסמן,,"ריאהנה, דמי לובאטו, ליידי גאגא, ברונו מארס"
4,25/04/2025 12:39:23,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,קרולינה גררו,,"שאנל, סלין דיון, לורין, דנה אינטרנשיונל"
...,...,...,...,...,...,...,...
3928,08/06/2022 16:52:52,1Vh3HgS-c0VYczLnwY3rVgAUDCYqSbtUeWiLdZEN2eo8,מסיבת פרה פרייד 23.6.22,,Matvei,,Talking Head - Psychokiller
3929,23/06/2022 05:44:21,1Vh3HgS-c0VYczLnwY3rVgAUDCYqSbtUeWiLdZEN2eo8,מסיבת פרה פרייד 23.6.22,,Nacho Dlugo,,Nathy Peluso:Bzrp Music Session
3930,19/06/2022 17:44:12,1Vh3HgS-c0VYczLnwY3rVgAUDCYqSbtUeWiLdZEN2eo8,מסיבת פרה פרייד 23.6.22,,Shneor lirnovoy,,אחלה גבר דנה אינטרנשיונל
3931,18/06/2022 18:00:44,1Vh3HgS-c0VYczLnwY3rVgAUDCYqSbtUeWiLdZEN2eo8,מסיבת פרה פרייד 23.6.22,,Zahi Ajami,,


In [18]:
# duplicates = final_df[final_df["full_name"].duplicated(keep=False)]
# print(duplicates)

In [19]:
# Group by "הגיע.ה?" and count the non-null values in every other column
# final_df.groupby("הגיע.ה?").count()

In [None]:
# TODO: Reshape the DataFrame so that names are in rows and folder names are in columns
# TODO: Add arrival column by the rows painted by marker with google apps scripts
# TODO: ML Bag of words for the song requests (For fun)
# TODO: Sentiment analysis for the special requests
# TODO: Check how many of the audience are returning and how many are new (each party and overall)

In [120]:
# Show the distinct values stored in this one column.
# NOTE(review): the recorded output contains only NaN and '', i.e. the column
# is entirely empty. The DataFrame-building cell removes columns whose values
# are all NaN or '', so this lookup presumably raises KeyError on a fresh
# Restart & Run All — confirm, and delete this exploratory cell if so.
final_df["עמודה 5"].unique()

array([nan, ''], dtype=object)

In [109]:
# Preview the first 10 rows of final_df.
# NOTE(review): the recorded output still lists the raw per-form columns with
# only full_name merged, so it presumably predates the current merge/rename
# cell — re-run to refresh.
final_df.head(10)

Unnamed: 0,timestamp,בקשות לשירים מהדיג'יי طلبات أغاني خاصة من الديجي Song request,הערות\בקשות? ملاحظات/طلبات؟ Any notes?,file_id,folder_name,עמודה 1,בקשות לשירים מהדיג'יי. Song request,הערות\בקשות. Any notes?,עמודה 5,Unnamed: 10,Unnamed: 11,הגיע.ה?,Unnamed: 13,בקשות לשירים. Song request,"הערה, משאלה או השיר האהוב 3> Notes\favourite song",full_name
0,25/04/2025 09:58:03,צ'ה צ'ה צ'ה,,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,,,,,,,,,,בדיקה
1,25/04/2025 12:17:23,,,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,,,,,,,,,,מורן ענתבי
2,25/04/2025 12:23:05,,Firstמגיבים לראשונים?,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,,,,,,,,,,מיכל שולדמן
3,25/04/2025 12:37:56,"ריאהנה, דמי לובאטו, ליידי גאגא, ברונו מארס",,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,,,,,,,,,,עדי ויסמן
4,25/04/2025 12:39:23,"שאנל, סלין דיון, לורין, דנה אינטרנשיונל",,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,,,,,,,,,,קרולינה גררו
5,25/04/2025 12:44:24,,,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,,,,,,,,,,נדב שביט
6,25/04/2025 13:11:00,,,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,,,,,,,,,,רועי רצקובסקי
7,25/04/2025 13:16:11,,,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,,,,,,,,,,עמית ויינברגר
8,25/04/2025 13:16:24,,,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,,,,,,,,,,עמיאל ליבלינג
9,25/04/2025 13:17:46,"Hi, I'm a slut -lil mariko",,1G_kmdFNmcB3M3FXzObnqTsBOfQCwzW6DO4zJPRmvtes,Douze Snapua 8/5/25,,,,,,,,,,,דניאל מלמד


In [31]:
# Print the kernel's current working directory; the cache file path is built
# from os.getcwd(), so this shows where the cache is read from and written to.
# NOTE(review): `os` is already imported in the first cell, and the recorded
# output exposes an absolute local path — consider clearing it before sharing.
import os
print("Current working directory:", os.getcwd())

Current working directory: /Users/orikanner/Documents/python/meta_snap_project


In [None]:
# Display the set of column names collected by the caching cell.
# NOTE(review): the recorded output holds single characters rather than whole
# header names, which is what set.update() produces when it is given a string.
# Presumably it comes from an earlier version of the collection loop —
# re-run to confirm the set now contains full column names.
columns_set

{' ',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '/',
 '1',
 '3',
 '5',
 '>',
 '?',
 'A',
 'F',
 'N',
 'S',
 '\\',
 'a',
 'e',
 'f',
 'g',
 'i',
 'l',
 'm',
 'n',
 'o',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'א',
 'ב',
 'ג',
 'ד',
 'ה',
 'ו',
 'ז',
 'ח',
 'י',
 'ך',
 'ל',
 'ם',
 'מ',
 'ן',
 'ע',
 'ק',
 'ר',
 'ש',
 'ת',
 '؟',
 'أ',
 'ا',
 'ب',
 'ة',
 'ت',
 'ج',
 'ح',
 'خ',
 'د',
 'س',
 'ص',
 'ط',
 'ظ',
 'غ',
 'ف',
 'ق',
 'ك',
 'ل',
 'م',
 'ن',
 'و',
 'ي'}

In [7]:
# Debug print of the list of matching sheets built by the filtering cell.
# NOTE(review): the recorded output is an empty list and the execution count
# is lower than the filtering cell's, so this presumably ran before `dates`
# was populated — re-run after the filtering cell or remove.
print(dates)


[]
