# Gathering Training Data using Google Photos API

This was the first part of the main notebook, which was to gather training data using Google Photos API and my Google Photos album. Since it was my first time using Google Photos API, I made it into a separate notebook.

The following code cell authenticates a user with the Google Photos API and list the first 10 media items from their Google Photos library. 

#### Functions

- The `authenticate_google_photos()` function ensures that the user is authenticated and the credentials are valid.
- The `list_photos()` function uses these credentials to interact with the Google Photos API and list media items.

This setup allows the user to authenticate once and reuse the saved credentials for subsequent runs.

In [1]:
import os
import pickle # Used for serializing and deserializing Python objects, in this case, to save and load authentication tokens
from google_auth_oauthlib.flow import InstalledAppFlow # for authenticating the user
from google.auth.transport.requests import Request # for refreshing the user's access token
from googleapiclient.discovery import build # for accessing the Google Photos API

# Set the scope of access to read-only access to the Google Photos library
SCOPES = ['https://www.googleapis.com/auth/photoslibrary.readonly'] 

def authenticate_google_photos():
    creds = None
    # The file token.pickle stores the user's access and refresh tokens
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'client_secret.json', SCOPES)
            creds = flow.run_local_server(port=0)

        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    return creds

def list_photos(pagesize=10):
    """ This function tests the connection to the Google Photos API by listing the first 10 photos in the user's library """

    creds = authenticate_google_photos()
    # service = build("photoslibrary", "v1", http=creds.authorize(Http()),static_discovery=False)
    service = build('photoslibrary', 'v1', credentials=creds, 
                    discoveryServiceUrl="https://photoslibrary.googleapis.com/$discovery/rest?version=v1")
    # Print the API library name and version
    print("Using Google API library:", service._baseUrl)
    results = service.mediaItems().list(pageSize=pagesize).execute()
    items = results.get('mediaItems', [])
    
    if not items:
        print('No media items found.')
    else:
        for item in items:
            print(f"Photo Title: {item['filename']} - URL: {item['baseUrl']}")

list_photos(pagesize=10)

Using Google API library: https://photoslibrary.googleapis.com/
Photo Title: 20241012_173652.mp4 - URL: https://lh3.googleusercontent.com/lr/AAJ1LKdz9Vlh2fGcLcEnZRcUZOtI0GLs9ZR24AgvnyXqP0wQK-8_NsJbuzA5XgE4Tgg-2lT4_W5mZpyObVOlUL0PohKUXgNKEKNWW_4Lf2jdELVsk_B_oqup6PNcWLBv4J4a3kzmiYdj2sFs4_KnVtUxAQjIpta1GwhAfdc1660s8ZBGJTwJkf9eMQzF4We5FW6F1DybSmPjZGiIr9nbTmjuJ9y2qgZRWfM4VxCTKGpAcF0AiYv1E3e4UgFm1e5MUGH1oWW026lpTonCoL3wE_Pz_0iHRqD8u7x12Dbmo07A41ii5tUlFkntd-8TgIBjgoGQBGaD6QjZsHRqal9JpDgdchYMG2c-ZqKTWb0zpl06TCK_YgJk2-J0DpipFFYPl3hmtINjfFEb9yrKwdG2R0VTDU-0l3NQGGqGpHU7QF9al46F9Gjb8lnIpdf_oAo-SFN1Q3QXRAybJ1jGajK7QNx-tWrufe1wIEUQjSWQAsXkIiYSfGJgC1Bjd_qI4WJE6wqiLjQArkDNDVdJOLcGJJEGjedLvF8wZckOe76md-HzS_6xz6T1fPaphNQ_ouMqu8lsrcreCA6Qwssab4cs7-pRmb4S_K6laQ8igXb2UdvwhcN5M_PGoNIM_9k7O59hh3Tovy0-5MzlyCpt6uhRms1HSd0goxSbKejjA3kOKJOQyQyJ3KfzmzfNiFZBrIYbrFt-4b1UuNyicrtI-UKg_4CJR3WX-7U1HuwDSVWYU1knKnibKfNooRQm0ObzLfraFLwrg3xn5zPsyyjeX4wGrJMAp2K72k52zo7lonAFceSZ4x0baDYSTkLRXuFHJlln80d1Zl7WjS0xNKvKIR-DjDTNO_n

The connection to the Google Photos API is successful.

## List photos of me using Google Photos API.

- I've prepared albums inside Google Photos named "husband" and "wife", which contain about 900 picture of me and 1200 pictures of my wife, respectively.

- Let's first see if the API correctly accesses my albums.


In [10]:
def list_albums():
    creds = authenticate_google_photos()
    service = build('photoslibrary', 'v1', credentials=creds, 
                    discoveryServiceUrl="https://photoslibrary.googleapis.com/$discovery/rest?version=v1")

    results = service.albums().list(pageSize=10).execute()
    albums = results.get('albums', [])
    
    if not albums:
        print('No albums found.')
        husband_album_id = None
        wife_album_id = None
    else:
        for album in albums:
            print(f"Album Title: {album['title']} - ID: {album['id']}")
            if album['title'] == 'husband':
                husband_album_id = album['id']
            elif album['title'] == 'wife':
                wife_album_id = album['id'] 
            elif album['title'] == 'baby':
                baby_album_id = album['id'] 
    
    return husband_album_id, wife_album_id, baby_album_id
          
husband_album_id, wife_album_id, baby_album_id = list_albums()

Album Title: juyeon - ID: ALV-wv03matWGqOVbZDelG_zK9PrgFo3YGN24rBo_zZBN85wdTIJJpZDSvRcxhWP5YScWSqoBT92
Album Title: baby - ID: ALV-wv2CvAgqxb-f8VJbzD7T1ZQH6YV1y3Mm8HJABkun0uZ9MSpXYOQADjh1U9z4KZJHT6re9sJq
Album Title: wife - ID: ALV-wv1e0wojtN8dyoGozicMS1SHjRvd73b5pdI7M-dmy0LSAwW0E0EaWI63S_Iy1mXjfSWPS7Z7
Album Title: husband - ID: ALV-wv3-YBgY3kJXZcxq4d3KBIFA2-S1Q8VjB8oLqubWuVGRKjsgH27xtqOJeFk7iKAwkJs4ONkr
Album Title: JJ 2022 - ID: ALV-wv1DhmLjqBP5JlYYcfmSjQelUBfVibPVjKFGVJkfibkutsifWsJAzeWnjGJs_T83AVWnCrjI
Album Title: jp, nk - ID: ALV-wv1WuJ8dFAxLrKijujmtvHG9JBEFFgp5gCKrSEdO8wym0oFkbeGr8snxCug37YGic7ZfkTUl
Album Title: zy, neil - hugging - ID: ALV-wv0NdnuqYvbKkgDLLpNzHzDjEbMpan-7QMqv4XQEIUrLyrrsKf1z8eeNp4WjBWG6sOVEpKNB
Album Title: Dali fam - ID: ALV-wv0VbVsz8--6-oE2G5-iDLTsZhCPR5pMgzFaeXfuJiU540mLe-9Qybed8I8D4SHZlECkWUaI


It correctly lists all the albums I have in my account.

Now let's see whether it correctly retrieves the photos within the album titled "husband" and "wife".

In [4]:
  
def list_photos_from_album(album_id, n_number=10):
    creds = authenticate_google_photos()
    service = build('photoslibrary', 'v1', credentials=creds, 
                    discoveryServiceUrl="https://photoslibrary.googleapis.com/$discovery/rest?version=v1")
    
    # List items in a specific album (e.g., "Me")
    results = service.mediaItems().search(body={
        'pageSize': n_number,
        'albumId': album_id  # Replace this with your album ID
    }).execute()
    
    items = results.get('mediaItems', [])
    
    if not items:
        print('No media items found.')
    else:
        filtered_items = [item for item in items if item['mimeType'] == 'image/jpeg']
        
        for item in filtered_items:
            print(f"Photo Title: {item['filename']} - URL: {item['baseUrl']}")

list_photos_from_album(husband_album_id, n_number=10)

print('---')


list_photos_from_album(wife_album_id, n_number=10)

NameError: name 'husband_album_id' is not defined

It successfully retrives the photos inside the album.

Now, let's download the cropped faces from each album, "husband" and "wife".
Only download the faces if the photo contains a single face.

In [21]:
import requests
from io import BytesIO
from PIL import Image
import face_recognition
from matplotlib import pyplot as plt
import numpy as np
from IPython.display import clear_output

def list_and_crop_photos(album_id, save_folder):
    creds = authenticate_google_photos()
    service = build('photoslibrary', 'v1', credentials=creds, 
                    discoveryServiceUrl="https://photoslibrary.googleapis.com/$discovery/rest?version=v1")
    
    page_size = 100 
    items = []
    next_page_token = None

    while True:
        body = {
            'pageSize': page_size,
            'albumId': album_id
        }
        if next_page_token:
            body['pageToken'] = next_page_token

        results = service.mediaItems().search(body=body).execute()
        items.extend(results.get('mediaItems', []))
        next_page_token = results.get('nextPageToken')

        if not next_page_token:
            break

    print(f"Number of retrieved pictures: {len(items)}")
    
    # items = results.get('mediaItems', [])
    
    if not items:
        print('No media items found.')
    else:
        cropped_faces = []
        output_count = 0
        process_count = 0
        for item in items:
            # Print progress
            process_count += 1
            clear_output(wait=True)
            print(f"Processing image {process_count}/{len(items)}", end='\r')
            # Filter to only include .jpg files
            if item['mimeType'] == 'image/jpeg':
                # Download the image
                response = requests.get(item['baseUrl'])
                img = Image.open(BytesIO(response.content))
                img_array = face_recognition.load_image_file(BytesIO(response.content))
                
                # Find all the faces in the image
                face_locations = face_recognition.face_locations(img_array)
                
                # Only process images with a single face
                if len(face_locations) == 1:
                    top, right, bottom, left = face_locations[0]
                    face_image = img.crop((left, top, right, bottom))
                    cropped_faces.append(face_image)
                    if (right - left) > 60:
                        face_image.save(os.path.join(save_folder, f'face_{output_count}.jpg'))
                        output_count += 1
                        if output_count >= 660:
                            break
        
        print(f"{len(cropped_faces)} picture had a single face of significant size")

        # # Display the first 5 cropped face images using matplotlib
        # _, axes = plt.subplots(1, 5, figsize=(15, 5))
        # for ax, face in zip(axes, cropped_faces[:5]):
        #     ax.imshow(face)
        #     ax.axis('off')
        # plt.show()



In [None]:

# Download cropped face images from the husband album
list_and_crop_photos(husband_album_id, 'husband')


![Husband Pictures](./screenshots/husband_pictures.png)

In [9]:
# Download cropped face images from the wife album
list_and_crop_photos(wife_album_id, 'wife')

Number of retrieved pictures: 8596


UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x000001F12CDB2C50>

![Wife Pictures](./screenshots/wife_pictures.png)

Of the saved files, I manually deleted the ones that are obviously not the "husband" or "wife" in each case.

As a result, I have 638 images of the wife's face and 510 images of the husband's face.

In [15]:
list_and_crop_photos(baby_album_id, 'baby')

Number of retrieved pictures: 718
266 picture had a single face of significant size


![Baby Pictures](./screenshots/baby_pictures.png)

In [18]:
def get_album_id(album_name):
    creds = authenticate_google_photos()
    service = build('photoslibrary', 'v1', credentials=creds, 
                    discoveryServiceUrl="https://photoslibrary.googleapis.com/$discovery/rest?version=v1")

    results = service.albums().list(pageSize=10).execute()
    albums = results.get('albums', [])
    
    if not albums:
        print('No albums found.')
    else:
        for album in albums:
            if album['title'] == album_name:
                print(f"Album Title: {album['title']} - ID: {album['id']}")
                return album['id']
          
h_dad_album_id = get_album_id('H-dad')
h_mom_album_id = get_album_id('H-mom')
w_dad_album_id = get_album_id('W-dad')
w_mom_album_id = get_album_id('W-mom')

Album Title: H-dad - ID: ALV-wv3mF6djjr0S_8izwqaV6pEKP5IFsUMIFtTevnbbOnef5FCo9AIO-ZxvDAYaSgWs6VDaDp10
Album Title: H-mom - ID: ALV-wv1741TqhtuQFvoKYUeZckToKdM4c-SdTDVqeT5utbedSegF_1KzenLud1fulCj8FMHgjm8n
Album Title: W-dad - ID: ALV-wv0wCQk-7jFCjLDEcc8UajdYyz-tfueIdGv2CTjYoMW8-kKYwiYbCLWL95G3WXFKsac2TbBm
Album Title: W-mom - ID: ALV-wv21U2gWvm5lMI_UMMGkCNEpUdA1aQ43zkpLu8f5JLQKgPw1SssikWfbxIyuf1kgdVandDyV


Since I don't have many pictures of our parents where they are the only person in it, I will lift the "single person picture" condition and save all faces and then manually erase resulting photos of other people.

In [30]:
def list_and_crop_photos(album_id, save_folder, require_single_person=True):
    creds = authenticate_google_photos()
    service = build('photoslibrary', 'v1', credentials=creds, 
                    discoveryServiceUrl="https://photoslibrary.googleapis.com/$discovery/rest?version=v1")
    
    page_size = 100 
    items = []
    next_page_token = None

    while True:
        body = {
            'pageSize': page_size,
            'albumId': album_id
        }
        if next_page_token:
            body['pageToken'] = next_page_token

        results = service.mediaItems().search(body=body).execute()
        items.extend(results.get('mediaItems', []))
        next_page_token = results.get('nextPageToken')

        if not next_page_token:
            break

    print(f"Number of retrieved pictures: {len(items)}")
    
    # items = results.get('mediaItems', [])
    
    if not items:
        print('No media items found.')
    else:
        cropped_faces = []
        output_count = 0
        process_count = 0
        for item in items:
            # Print progress
            process_count += 1
            clear_output(wait=True)
            print(f"Processing image {process_count}/{len(items)}", end='\r')
            # Filter to only include .jpg files
            if item['mimeType'] == 'image/jpeg':
                # Download the image
                response = requests.get(item['baseUrl'])
                img = Image.open(BytesIO(response.content))
                img_array = face_recognition.load_image_file(BytesIO(response.content))
                
                # Find all the faces in the image
                face_locations = face_recognition.face_locations(img_array)
                
                # Only process images with a single face
                if (not require_single_person and len(face_locations) > 0) or (require_single_person and len(face_locations) == 1):
                    top, right, bottom, left = face_locations[0]
                    face_image = img.crop((left, top, right, bottom))
                    cropped_faces.append(face_image)
                    if (right - left) > 60:
                        face_image.save(os.path.join(save_folder, f'face_{output_count}.jpg'))
                        output_count += 1
                        if output_count >= 660:
                            break
        
        print(f"{len(cropped_faces)} picture had a single face of significant size")

        # # Display the first 5 cropped face images using matplotlib
        # _, axes = plt.subplots(1, 5, figsize=(15, 5))
        # for ax, face in zip(axes, cropped_faces[:5]):
        #     ax.imshow(face)
        #     ax.axis('off')
        # plt.show()



In [31]:
list_and_crop_photos(h_dad_album_id, './other_data/h_dad', False)
list_and_crop_photos(h_mom_album_id, './other_data/h_mom', False)
list_and_crop_photos(w_mom_album_id, './other_data/w_mom', False)
list_and_crop_photos(w_dad_album_id, './other_data/w_dad', False)

68 picture had a single face of significant size
