In [7]:
%pip install google-generativeai
%pip install --upgrade google-auth-oauthlib google-auth-httplib2 google-api-python-client

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [37]:
import os
import base64
import random
from googleapiclient.errors import HttpError
import time
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import google.generativeai as genai
from google_auth_oauthlib.flow import Flow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/gmail.modify']

# Set up Gemini API
# Read the key through a context manager so the file handle is closed
# as soon as the key has been loaded instead of lingering in the kernel.
with open('work/api_key.txt', 'r') as key_file:
    api_key = key_file.read().strip()
genai.configure(api_key=api_key)
model = genai.GenerativeModel('gemini-1.5-flash')

# 'work/client_secret_192399790253-hmkpdfgc9mt0rk2luvdjusrpr9rkc1sc.apps.googleusercontent.com.json',


def authenticate(token_path='token.json',
                 client_secrets_path='apps.googleusercontent.com.json'):
    """Return Gmail OAuth credentials, running the consent flow if required.

    Cached credentials are loaded from ``token_path`` when that file exists.
    If they are missing or not valid, they are refreshed (when expired and a
    refresh token is available) or a manual copy/paste authorization flow is
    started. Any newly obtained or refreshed credentials are written back to
    ``token_path``.

    Args:
        token_path: File holding the cached authorized-user credentials.
        client_secrets_path: OAuth client-secrets JSON used for the consent
            flow.

    Returns:
        The credentials object requested for ``SCOPES``.
    """
    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path, SCOPES)

    # Fast path: the cached token is still usable, nothing to persist.
    if creds and creds.valid:
        return creds

    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = Flow.from_client_secrets_file(client_secrets_path, SCOPES)
        # NOTE(review): the out-of-band ("oob") redirect has been deprecated
        # by Google -- confirm this OAuth client is still allowed to use it.
        flow.redirect_uri = 'urn:ietf:wg:oauth:2.0:oob'

        auth_url, _ = flow.authorization_url(prompt='consent')

        print(f"Please go to this URL and authorize the application: {auth_url}")
        auth_code = input("Enter the authorization code: ")

        flow.fetch_token(code=auth_code)
        creds = flow.credentials

    # Cache the new/refreshed credentials for the next run.
    with open(token_path, 'w') as token:
        token.write(creds.to_json())
    return creds

def get_gmail_service():
    """Build a Gmail API v1 client from the credentials ``authenticate`` returns."""
    return build('gmail', 'v1', credentials=authenticate())

def exponential_backoff(func):
    """Decorator that retries ``func`` when it raises an HTTP 429 ``HttpError``.

    The wrapped function is attempted up to five times. After a 429 the
    wrapper sleeps ``2 ** attempt`` seconds plus a random fraction of a
    second before trying again. An ``HttpError`` with any other status is
    re-raised immediately; exceptions of other types are not caught.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same name/docstring as ``func``. It returns
        whatever ``func`` returns, or ``None`` (after printing
        "Max retries reached.") when every attempt hit a 429.
    """
    import functools  # local import keeps this block self-contained

    max_retries = 5

    @functools.wraps(func)  # preserve __name__/__doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        for attempt in range(max_retries):
            try:
                return func(*args, **kwargs)
            except HttpError as error:
                if error.resp.status != 429:
                    raise
                # No point sleeping (or announcing a retry) after the
                # final attempt -- nothing is going to be retried.
                if attempt == max_retries - 1:
                    break
                wait = (2 ** attempt) + random.random()
                print(f"429 error. Retrying in {wait} seconds.")
                time.sleep(wait)
        print("Max retries reached.")
        return None
    return wrapper

@exponential_backoff
def fetch_emails(service, user_id='me', max_results=2000):
    """List up to ``max_results`` message stubs carrying the INBOX label.

    Pages through ``users().messages().list`` asking for at most 500
    entries per page and pausing one second between pages.

    Args:
        service: Gmail API client.
        user_id: Mailbox to read; ``'me'`` by default.
        max_results: Upper bound on the number of entries returned.

    Returns:
        A list of the message entries returned by the API, truncated to
        ``max_results``. If an ``HttpError`` is raised while listing, it is
        printed and an empty list is returned -- the error is handled here,
        so it does not propagate to the ``exponential_backoff`` decorator.
    """
    page_limit = 500
    try:
        collected = []
        page_token = None
        while len(collected) < max_results:
            remaining = max_results - len(collected)
            request = service.users().messages().list(
                userId=user_id,
                labelIds=['INBOX'],
                maxResults=min(page_limit, remaining),
                pageToken=page_token,
            )
            page = request.execute()

            collected.extend(page.get('messages', []))
            page_token = page.get('nextPageToken')

            if page_token:
                # Small delay before asking for the next page.
                time.sleep(1)
            else:
                # No further pages available.
                break

        print(f"Fetched {len(collected)} emails")
        return collected[:max_results]
    except HttpError as error:
        print(f'An error occurred: {error}')
        return []

def _find_body_data(part):
    """Return the first non-empty ``body.data`` string in a MIME part tree.

    Sub-parts are searched depth-first in order; if none carries data the
    part's own body is used. Returns ``''`` when nothing is found.
    """
    for child in part.get('parts') or []:
        found = _find_body_data(child)
        if found:
            return found
    return (part.get('body') or {}).get('data') or ''


def get_email_content(email):
    """Build a short text summary (subject, sender, body excerpt) of a message.

    Args:
        email: A message dict with a ``payload`` holding ``headers`` and
            either a ``body`` or nested ``parts``.

    Returns:
        ``"Subject: <subject>\\nFrom: <sender>\\n\\n<body>"`` where the body is
        limited to its first 500 characters. Missing headers or body yield
        empty strings in the corresponding position.
    """
    payload = email.get('payload') or {}

    # Header field names are case-insensitive, so normalise before lookup.
    headers = {
        header['name'].lower(): header['value']
        for header in payload.get('headers') or []
    }
    subject = headers.get('subject', '')
    sender = headers.get('from', '')

    # Walk nested multipart structures instead of only looking at parts[0].
    data = _find_body_data(payload)

    content = ''
    if data:
        # Restore any stripped '=' padding so the decoder accepts the input.
        padded = data + '=' * (-len(data) % 4)
        try:
            # errors='replace' keeps one oddly-encoded email from aborting
            # the whole run with UnicodeDecodeError.
            content = base64.urlsafe_b64decode(padded).decode('utf-8', errors='replace')
        except ValueError:
            # Malformed base64: fall back to a header-only summary.
            content = ''

    return f"Subject: {subject}\nFrom: {sender}\n\n{content[:500]}"  # Limit content to first 500 characters

@exponential_backoff
def categorize_email_with_gemini(email_content):
    """Ask the Gemini model to classify an email into one fixed category.

    Args:
        email_content: Text summary of the email to classify.

    Returns:
        One of ``'Study'``, ``'Work-related'``, ``'Personal'``,
        ``'Newsletter'``, ``'Promotion'``, ``'Social Media'`` or
        ``'Miscellaneous'``. ``'Miscellaneous'`` is also returned when the
        model produces no usable text or an answer outside that list, so a
        stray reply can never become a new label name.

    NOTE(review): the ``exponential_backoff`` decorator only retries
    ``googleapiclient`` ``HttpError`` -- confirm whether Gemini quota errors
    surface as that type.
    """
    categories = (
        'Study',
        'Work-related',
        'Personal',
        'Newsletter',
        'Promotion',
        'Social Media',
        'Miscellaneous',
    )
    fallback = 'Miscellaneous'

    prompt = f"""
    Categorize the following email into one of these categories:
    1. Study
    2. Work-related
    3. Personal
    4. Newsletter
    5. Promotion
    6. Social Media
    7. Miscellaneous

    Provide only the category name as the answer.

    Email:
    {email_content}
    """

    response = model.generate_content(
        prompt,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        },
    )

    try:
        # The `.text` accessor raises ValueError when the response carries
        # no text part (for example when the candidate was blocked).
        answer = response.text.strip()
    except ValueError:
        return fallback

    lowered = answer.lower()

    # Exact (case-insensitive) answer first ...
    for name in categories:
        if lowered == name.lower():
            return name
    # ... then tolerate decorations such as "2. Work-related" or "**Study**".
    for name in categories:
        if name.lower() in lowered:
            return name
    return fallback

def create_or_get_label(service, user_id, label_name):
    """Return the id of the label called ``label_name``, creating it if absent.

    Args:
        service: Gmail API client.
        user_id: Mailbox whose labels are inspected.
        label_name: Exact label name to look up or create.

    Returns:
        The label id, or ``None`` if the API raised ``HttpError`` (the error
        is printed).
    """
    not_found = object()
    try:
        listing = service.users().labels().list(userId=user_id).execute()
        existing_id = next(
            (entry['id'] for entry in listing['labels'] if entry['name'] == label_name),
            not_found,
        )
        if existing_id is not not_found:
            return existing_id

        # No label with that name yet: create one and hand back its id.
        created = service.users().labels().create(
            userId=user_id,
            body={'name': label_name},
        ).execute()
        return created['id']
    except HttpError as error:
        print(f'An error occurred: {error}')
        return None

def move_email(service, user_id, message_id, label_id):
    """Add ``label_id`` to one message and remove its INBOX label.

    Args:
        service: Gmail API client.
        user_id: Mailbox that owns the message.
        message_id: Id of the message to modify.
        label_id: Id of the label to add.

    An ``HttpError`` from the API is printed rather than raised.
    """
    label_changes = {'addLabelIds': [label_id], 'removeLabelIds': ['INBOX']}
    try:
        request = service.users().messages().modify(
            userId=user_id,
            id=message_id,
            body=label_changes,
        )
        request.execute()
    except HttpError as error:
        print(f'An error occurred: {error}')

@exponential_backoff
def move_emails_batch(service, user_id, email_moves):
    """Relabel several messages in a single batched HTTP request.

    Each message gets its target label added and the INBOX label removed.

    Args:
        service: Gmail API client.
        user_id: Mailbox that owns the messages.
        email_moves: Iterable of ``(message_id, label_id)`` pairs.

    Raises:
        HttpError: Re-raised when an individual request in the batch failed
            with status 429, so that the ``exponential_backoff`` decorator
            can retry the batch. Other per-request failures are printed.
    """
    email_moves = list(email_moves)
    if not email_moves:
        # Nothing to move; avoid building and executing an empty batch.
        return

    failures = []

    def _make_callback(email_id):
        # Batch callbacks receive (request_id, response, exception); bind
        # the message id so failures can be reported meaningfully.
        def _on_result(request_id, response, exception):
            if exception is not None:
                failures.append((email_id, exception))
        return _on_result

    batch = service.new_batch_http_request()

    for email_id, label_id in email_moves:
        batch.add(
            service.users().messages().modify(
                userId=user_id,
                id=email_id,
                body={'addLabelIds': [label_id], 'removeLabelIds': ['INBOX']}
            ),
            callback=_make_callback(email_id),
        )

    batch.execute()

    # Without a callback, per-request errors inside a batch are dropped
    # silently; surface them so failed moves are visible.
    for email_id, exception in failures:
        print(f"Failed to move email {email_id}: {exception}")

    # Let rate-limit failures reach the backoff decorator. Re-sending the
    # whole batch is safe because the label modification is idempotent.
    for _, exception in failures:
        if isinstance(exception, HttpError) and exception.resp.status == 429:
            raise exception

def main():
    """Fetch inbox emails, categorize each with Gemini, and relabel them.

    For every fetched message the full message is downloaded, summarized,
    classified, and queued for a move to the label named after its
    category. Queued moves are flushed in batches of 100 (and once more for
    a final partial batch). Progress is printed every 10 emails.
    """
    batch_size = 100

    service = get_gmail_service()
    # The backoff decorator yields None once its retries are exhausted;
    # treat that like "nothing fetched" rather than crashing on len().
    emails = fetch_emails(service) or []
    total = len(emails)
    total_batches = (total + batch_size - 1) // batch_size

    label_ids = {}    # category name -> label id, saves one labels.list call per email
    email_moves = []
    batches_done = 0

    print(f"Processing {total} emails...")

    for i, email in enumerate(emails):
        full_email = service.users().messages().get(userId='me', id=email['id']).execute()
        email_content = get_email_content(full_email)
        category = categorize_email_with_gemini(email_content)

        label_id = label_ids.get(category) if category else None
        if label_id is None and category:
            label_id = create_or_get_label(service, 'me', category)
            if label_id:
                # Only cache successful lookups so a transient failure is retried.
                label_ids[category] = label_id

        if label_id:
            email_moves.append((email['id'], label_id))
        else:
            print(f"Couldn't create or find label for category: {category}")

        # Process in batches of 100 emails
        is_last = i == total - 1
        if (i + 1) % batch_size == 0 or is_last:
            # Count flushed batches so a final partial batch is numbered
            # correctly (e.g. "3 of 3" rather than "2 of 3").
            batches_done += 1
            print(f"Processing batch {batches_done} of {total_batches}")
            move_emails_batch(service, 'me', email_moves)
            email_moves = []  # Reset for the next batch
            if not is_last:
                time.sleep(5)  # Add a delay between large batches

        # Print progress every 10 emails
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1} of {total} emails")

    print("Finished processing all emails.")

# Run the fetch -> categorize -> relabel pipeline when this cell executes.
main()

Fetched 2000 emails
Processing 2000 emails...
Processed 10 of 2000 emails
Processed 20 of 2000 emails
Processed 30 of 2000 emails
Processed 40 of 2000 emails
Processed 50 of 2000 emails
Processed 60 of 2000 emails
Processed 70 of 2000 emails
Processed 80 of 2000 emails
Processed 90 of 2000 emails
Processing batch 1 of 20
Processed 100 of 2000 emails
Processed 110 of 2000 emails
Processed 120 of 2000 emails
Processed 130 of 2000 emails
Processed 140 of 2000 emails
Processed 150 of 2000 emails
Processed 160 of 2000 emails
Processed 170 of 2000 emails
Processed 180 of 2000 emails
Processed 190 of 2000 emails
Processing batch 2 of 20
Processed 200 of 2000 emails
Processed 210 of 2000 emails
Processed 220 of 2000 emails
Processed 230 of 2000 emails
Processed 240 of 2000 emails
Processed 250 of 2000 emails
Processed 260 of 2000 emails
Processed 270 of 2000 emails
Processed 280 of 2000 emails
Processed 290 of 2000 emails
Processing batch 3 of 20
Processed 300 of 2000 emails
Processed 310 of 2