In [1]:
# Environment setup via IPython shell escapes. The apt commands assume a
# Debian/Ubuntu-style host (e.g. a hosted notebook runtime) — TODO confirm target.
# NOTE(review): none of the installs below pin a version, so re-runs may pick up
# different releases.
# Install the Python port of google-play-scraper
# NOTE(review): google-play-scraper is not imported by the scraping cell visible
# in this notebook — confirm it is still needed.
!pip install google-play-scraper
# Selenium provides the webdriver used to drive Chrome in the scraping cell.
!pip install selenium
# Refresh the apt package index, then install the Chromium driver package.
!apt-get update
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin (presumably so it is on PATH for
# Selenium — verify on the target runtime).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [1]:
import time
from urllib.parse import parse_qs, quote_plus, urlsplit

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Build the shared Chrome WebDriver used by every helper below.
# Flags are applied in order: headless mode, sandbox disabled, /dev/shm usage disabled.
chrome_options = Options()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=chrome_options)

# Search terms submitted to the Play Store, one query per entry, in this order.
keywords = [
    "AI",
    "AI chatbot",
    "AI assistant",
    "ChatGPT",
    "AI girlfriend",
    "AI friend",
    "virtual assistant",
    "AI photo enhancer",
    "AI art",
    "AI avatar",
    "AI image generator",
    "AI face",
    "photo to anime",
    "AI writer",
    "AI writing assistant",
    "AI story generator",
    "copywriting AI",
    "text to speech AI",
    "AI camera",
    "AI filter",
    "deepfake",
    "face swap AI",
    "AI note taker",
    "AI summarizer",
    "AI study tool",
    "AI homework help",
    "AI tutor",
    "voice AI",
    "AI voice changer",
    "AI talking",
    "speech to text AI",
    "AI logo generator",
    "AI resume builder",
    "AI content maker",
    "AI scheduler",
    "AI tools",
    "machine learning",
    "AI app builder",
    "GPT",
    "Gemini",
    "Claude AI",
    "Llama AI",
    "Sora AI",
]

# Accumulators filled by the scraping loop.
apps = []         # one record (dict) per unique app, in the order first seen
seen_ids = set()  # app IDs already recorded, used to skip duplicates

# Helper functions

def get_full_description(app_link):
    """Open an app's store page and return its meta description text.

    The page is loaded through the module-level Selenium ``driver``; the
    rendered HTML is then parsed with BeautifulSoup.

    Args:
        app_link: URL of the app page. ``&hl=en`` is appended verbatim, so the
            link is expected to already contain a query string.

    Returns:
        The stripped ``content`` attribute of the page's
        ``<meta name="description">`` tag, or ``'No description found'`` when
        no such tag exists.
    """
    page_url = app_link + "&hl=en"
    driver.get(page_url)
    time.sleep(2)  # fixed pause before reading the page source

    page = BeautifulSoup(driver.page_source, 'html.parser')
    meta_description = page.find('meta', {'name': 'description'})
    if not meta_description:
        return 'No description found'
    return meta_description['content'].strip()

def extract_data_safety(app_id):
    """Scrape the data-safety page of one app into three comma-joined strings.

    The page is loaded through the module-level Selenium ``driver``. Every
    ``<h2 class="q1rIdc">`` heading is matched against three known title
    fragments; for a match, the ``<h3 class="aFEzEb">`` entries inside the next
    ``<div class="XgPdwe">`` become the values for that category. If the same
    category heading appears more than once, the last one wins.

    Args:
        app_id: Value placed in the ``id`` query parameter of the data-safety URL.

    Returns:
        A dict with the keys ``"Shared Data"``, ``"Collected Data"`` and
        ``"Security Practices"`` (in that order). Each value is a ``", "``-joined
        string of entry titles, or ``""`` when the category was not found.
    """
    data_safety_url = f"https://play.google.com/store/apps/datasafety?id={app_id}&hl=en"
    driver.get(data_safety_url)
    time.sleep(2)  # fixed pause before reading the page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # (heading fragment, output key) pairs; the first fragment contained in a
    # heading decides its category, mirroring an if/elif chain in this order.
    section_columns = (
        ("Data shared", "Shared Data"),
        ("Data collected", "Collected Data"),
        ("Security practices", "Security Practices"),
    )
    found = {column: [] for _, column in section_columns}

    for section in soup.find_all("h2", class_="q1rIdc"):
        title = section.get_text(strip=True)
        column = next(
            (col for fragment, col in section_columns if fragment in title),
            None,
        )
        if column is None:
            continue

        block = section.find_next("div", class_="XgPdwe")
        if block is None:
            # find_next returns None when no matching div follows the heading.
            # Skip this heading instead of raising AttributeError, so the
            # categories that did parse are still returned.
            continue

        found[column] = [
            item.get_text(strip=True)
            for item in block.find_all("h3", class_="aFEzEb")
        ]

    return {column: ", ".join(items) for column, items in found.items()}

# Scrape apps for each keyword
# For every search term: load the results page, scroll twice to trigger lazy
# loading, then visit each not-yet-seen app for its description and data-safety
# details. Records accumulate in `apps`; `seen_ids` prevents duplicates.
base_url = 'https://play.google.com'

try:
    for keyword in keywords:
        print(f"\n🔍 Searching: {keyword}")
        # Encode the keyword so spaces and reserved characters (&, #, +) cannot
        # alter the query string.
        search_url = f"https://play.google.com/store/search?q={quote_plus(keyword)}&c=apps&hl=en"
        driver.get(search_url)
        time.sleep(4)

        # Scroll to load more apps
        for _ in range(2):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        app_cards = soup.find_all('div', class_='VfPpkd-EScbFb-JIbuQc')

        for card in app_cards:
            name_tag = card.find('span', class_='DdYX5')
            publisher_tag = card.find('span', class_='wMUdtb')
            link_tag = card.find('a', class_='Si6A0c')

            # A card needs both a name and a link to be usable.
            if not (name_tag and link_tag):
                continue

            link = base_url + link_tag['href']
            # Read the `id` query parameter instead of splitting on "id=", so
            # trailing parameters never end up inside the app ID. Fall back to
            # the whole link as the dedupe key when the parameter is missing.
            app_id = parse_qs(urlsplit(link).query).get('id', [link])[0]

            if app_id in seen_ids:
                continue
            seen_ids.add(app_id)

            app = {
                'App Name': name_tag.text.strip(),
                'Publisher': publisher_tag.text.strip() if publisher_tag else '',
                'Link': link
            }

            print(f"→ Processing: {app['App Name']}")

            # The two fetches are guarded separately so a data-safety failure
            # does not discard a description that was retrieved successfully.
            try:
                app['Full Description'] = get_full_description(link)
            except Exception as e:
                print(f"⚠️ Error with {app['App Name']}: {e}")
                app['Full Description'] = 'Unavailable'

            try:
                app.update(extract_data_safety(app_id))
            except Exception as e:
                print(f"⚠️ Error with {app['App Name']}: {e}")
                app.update({
                    'Shared Data': 'Unavailable',
                    'Collected Data': 'Unavailable',
                    'Security Practices': 'Unavailable'
                })

            apps.append(app)
finally:
    # Always shut the browser down, even when the scrape is interrupted or raises.
    driver.quit()

# Save to CSV and preview the first rows
df = pd.DataFrame(apps)
df.to_csv("AI_apps_full_dataset.csv", index=False)
df.head(10)


Collecting selenium
  Downloading selenium-4.30.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.30.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

Unnamed: 0,App Name,Publisher,Link,Full Description,Shared Data,Collected Data,Security Practices
0,PolyBuzz:formerly Poly.AI,CLOUD WHALE INTERACTIVE TECHNOLOGY LLC.,https://play.google.com/store/apps/details?id=...,Chat & engage with your anime friends and star...,Location,"Financial info, Audio, App info and performanc...","Data is encrypted in transit, You can request ..."
1,Talkie: Creative AI Community,SUBSUP,https://play.google.com/store/apps/details?id=...,Unleash Your AI Imagination,App activity,,"Data is encrypted in transit, You can request ..."
2,Chai: Chat AI Platform,Chai Research Corp.,https://play.google.com/store/apps/details?id=...,Build and Share AI,"App info and performance, App activity, Person...","App info and performance, App activity, Person...","Data is encrypted in transit, You can request ..."
3,ChatGPT,OpenAI,https://play.google.com/store/apps/details?id=...,The official app by OpenAI,Device or other IDs,"Personal info, App activity, Location, Message...","Data is encrypted in transit, You can request ..."
4,"Character AI: Chat, Talk, Text",Character.AI,https://play.google.com/store/apps/details?id=...,"Super-intelligent AI chat bots that hear you, ...",,"Device or other IDs, App activity, Personal in...","Data is encrypted in transit, You can request ..."
5,BALA AI: Character AI Chat App,Pallar Media Limited,https://play.google.com/store/apps/details?id=...,"Character AI Chat, AI Assistant & Copilot App,...",App info and performance,"Personal info, Photos and videos","Data is encrypted in transit, You can request ..."
6,Google Gemini,Google LLC,https://play.google.com/store/apps/details?id=...,"Chat to start writing, planning, learning and ...",,"Contacts, Audio, Messages, Web browsing, Perso...","Data is encrypted in transit, You can request ..."
7,"Linky AI: Chat, Play, Connect",Skywork AI Pte. Ltd.,https://play.google.com/store/apps/details?id=...,"Linky, an unprecedented AI chatbot, brings inf...","App activity, Device or other IDs","Personal info, App info and performance, App a...","Data is encrypted in transit, You can request ..."
8,Question.AI - Chatbot&Math AI,D3 DIMENSION TECHNOLOGY PTE.LTD.,https://play.google.com/store/apps/details?id=...,Chatbot: Scan&Ask AI Assistant Anything and Ge...,Location,"Personal info, Location, Photos and videos, Fi...","Data is encrypted in transit, You can request ..."
9,Perplexity - Ask Anything,PerplexityAI,https://play.google.com/store/apps/details?id=...,The most powerful answer engine powered by AI.,"Device or other IDs, App info and performance","Personal info, App activity, Financial info, D...","Data is encrypted in transit, You can request ..."
