# ***Scrape links***

In [None]:
!pip install aiohttp backoff

Collecting backoff
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: backoff
Successfully installed backoff-2.2.1


In [None]:
import asyncio
import aiohttp
import random
import time
import pandas as pd
from bs4 import BeautifulSoup as bf
from aiohttp import ClientSession
from typing import Set, Tuple, Optional
import logging
import nest_asyncio
import asyncio

# Patch asyncio so an event loop can be re-entered; this is what allows the
# script to drive coroutines from an environment that already runs a loop
# (e.g. a notebook kernel).
nest_asyncio.apply()

# Setup logging
# force=True (Python 3.8+) removes handlers that the hosting environment may
# already have attached to the root logger. Without it basicConfig() is a
# silent no-op in that situation, so the INFO level and the timestamped
# format below would never take effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

# Product tokens (name/version plus info URL) of well-known crawlers. Every
# User-Agent value shares the same "Mozilla/5.0 (compatible; ...)" envelope,
# so only the varying part is listed here.
_crawler_tokens = (
    "bingbot/2.0; +http://www.bing.com/bingbot.htm",
    "YandexBot/3.0; +http://yandex.com/bots",
    "Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp",
    "Baiduspider/2.0; +http://www.baidu.com/search/spider.html",
    "MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php",
    "MJ12bot/v1.4.12; http://www.majestic12.co.uk/bot.php",
    "SemrushBot/7~bl; +http://www.semrush.com/bot",
    "YandexImages/5.0; +http://yandex.com/bots",
    "YandexVideo/5.0; +http://yandex.com/bots",
    "YandexSearch/5.0; +http://yandex.com/bots",
    "DuckDuckBot/1.0; +http://duckduckgo.com/duckduckbot.html",
    "AhrefsBot/6.1; +http://ahrefs.com/robot/",
    "Sogou web spider/4.0; +http://www.sogou.com/docs/help/webmasters.htm#07",
    "Exabot/3.0; +http://www.exabot.com/go/robot",
    "SeznamBot/3.2; +http://napoveda.seznam.cz/cz/seznambot-intro/",
    "DotBot/1.0; +http://www.opensiteexplorer.org/dotbot",
    "BLEXBot/1.0; +http://webmeup.com/crawler.html",
    "PetalBot/1.0; +https://aspiegel.com/petalbot",
    "CCBot/2.0; +http://commoncrawl.org/faq/",
)

# Full header values, in the same order as the tokens above.
user_agents = [f"Mozilla/5.0 (compatible; {token})" for token in _crawler_tokens]

def get_random_user_agent() -> str:
    """Return one entry of the module-level ``user_agents`` list, picked at random."""
    chosen = random.choice(user_agents)
    return chosen

async def get_html(url: str, session: ClientSession) -> Tuple[Optional[str], Optional[str]]:
    """Fetch *url* with a randomly chosen User-Agent and return the body text.

    Args:
        url: Address to request.
        session: Open aiohttp session used to issue the GET.

    Returns:
        ``(html, user_agent)`` on success, or ``(None, None)`` when the
        request fails: an HTTP error status (via ``raise_for_status``), any
        other ``aiohttp.ClientError``, or the 120-second timeout expiring.
    """
    user_agent = get_random_user_agent()
    print(f"User-Agent: {user_agent}")
    headers = {
        'User-Agent': user_agent,
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Referer': 'https://www.google.com/'
    }
    # An explicit ClientTimeout replaces the bare number, which aiohttp only
    # accepts as a deprecated shorthand for the same total timeout.
    timeout = aiohttp.ClientTimeout(total=120)
    try:
        async with session.get(url, headers=headers, timeout=timeout) as response:
            response.raise_for_status()
            html = await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
        # A timeout surfaces as asyncio.TimeoutError, which is not a
        # ClientError; catching it here keeps one slow page from propagating
        # an exception into the caller's gather().
        detail = str(e) or type(e).__name__  # TimeoutError has an empty message
        logger.error(f"Request error for {url} \n User-Agent: {user_agent}: {detail}")
        return None, None

    # Politeness delay, taken after the response context has exited so the
    # connection is not held while sleeping.
    await asyncio.sleep(random.uniform(1, 2))
    return html, user_agent

# Number of HTML documents parsed so far by extract_job_links(). It counts
# calls (including retries of the same page), not search-result page numbers.
page_count = 0


async def extract_job_links(html: str) -> Set[str]:
    """Collect the product URLs found in one search-result HTML document.

    Anchors are selected by the class string below, and each ``href`` is
    resolved against ``https://www.amazon.com``. As a side effect the
    module-level ``page_count`` is incremented and progress is printed.

    Args:
        html: Markup of a search-result page.

    Returns:
        The set of absolute URLs; empty when no matching anchor carries an
        ``href``.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    global page_count

    soup = bf(html, 'html.parser')
    product_links = soup.find_all('a', attrs={'class': 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})
    print(f'Found {len(product_links)} URLs -----------------------------------------------------')

    # urljoin yields the same result as plain concatenation for the usual
    # root-relative hrefs ("/dp/..."), but also leaves already-absolute hrefs
    # intact and inserts the missing "/" for bare relative ones.
    hrefs = (link.get('href') for link in product_links)
    urls_set = {urljoin('https://www.amazon.com', href) for href in hrefs if href}
    print(f'Found {len(urls_set)} unique URLs')

    page_count += 1
    print(f"Processed page {page_count}")

    return urls_set

async def scrape_page(session: ClientSession, page_num: int, max_retries: int = 6) -> Set[str]:
    """Scrape product links from one search-result page, retrying on failure.

    An attempt counts as failed when the download returns no HTML or when the
    HTML contains no product links. Failed attempts are followed by a random
    2-5 second pause, except after the last one, where no retry follows.

    Args:
        session: Open aiohttp session shared by all requests.
        page_num: Value substituted into the ``page`` query parameter.
        max_retries: Maximum number of attempts for this page.

    Returns:
        The set of product URLs, or an empty set once every attempt failed.
    """
    url = f"https://www.amazon.com/s?i=mobile&rh=n%3A7072561011%2Cp_n_feature_thirty-nine_browse-bin%3A113334727011%7C113334728011%7C113334729011%7C113334730011%7C113334731011%7C113334732011&s=featured-rank&dc&fs=true&page={page_num}&qid=1723567169&rnid=113334702011&ref=sr_pg_2"

    for attempt in range(1, max_retries + 1):
        html, _ = await get_html(url, session)  # the User-Agent is not needed here
        if html:
            urls = await extract_job_links(html)
            if urls:
                return urls
        if attempt == max_retries:
            # Nothing left to retry: skip the misleading "Retrying..." message
            # and the pointless delay, and report the failure below.
            break
        logger.warning(f"No product links found on page {page_num}. Retrying... (Attempt {attempt}/{max_retries})")
        await asyncio.sleep(random.uniform(2, 5))  # Longer delay before retrying

    logger.error(f"Failed to find product links on page {page_num} after {max_retries} attempts.")
    return set()

async def main(first_page: int = 1, last_page: int = 189, max_concurrency: int = 10) -> list:
    """Scrape a range of search-result pages and return the unique product URLs.

    Args:
        first_page: First page number to scrape (inclusive).
        last_page: Last page number to scrape (inclusive).
        max_concurrency: Upper bound on pages being scraped at the same time.
            Launching every page at once makes the server answer with 503s
            and empty result pages, so the fan-out is throttled.

    Returns:
        A list of the distinct URLs collected from all pages (unordered).

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    pages = range(first_page, last_page + 1)
    gate = asyncio.Semaphore(max_concurrency)
    all_urls = set()

    async with aiohttp.ClientSession() as session:

        async def scrape_bounded(page: int):
            # Hold a semaphore slot for the whole page, retries included.
            async with gate:
                return await scrape_page(session, page)

        # return_exceptions=True keeps one failing page from discarding the
        # results of all the others.
        results = await asyncio.gather(
            *(scrape_bounded(page) for page in pages),
            return_exceptions=True,
        )

    for page, outcome in zip(pages, results):
        if isinstance(outcome, BaseException):
            logger.error(f"Page {page} failed with an unexpected error: {outcome!r}")
            continue
        all_urls.update(outcome)

    logger.info(f"Total URLs scraped: {len(all_urls)}")
    print(f"Total URLs scraped: {len(all_urls)}")
    return list(all_urls)


# Run the scraper exactly once and persist the result. Executing main() a
# second time would repeat every request against the site only to throw the
# result away.
if __name__ == "__main__":
    urls = asyncio.run(main())

    # Create and save the DataFrame
    df = pd.DataFrame(urls, columns=['URL'])
    df.to_csv('links.csv', index=False)


User-Agent: Mozilla/5.0 (compatible; DuckDuckBot/1.0; +http://duckduckgo.com/duckduckbot.html)
User-Agent: Mozilla/5.0 (compatible; YandexSearch/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; DotBot/1.0; +http://www.opensiteexplorer.org/dotbot)
User-Agent: Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; YandexVideo/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)
User-Agent: Mozilla/5.0 (compatible; CCBot/2.0; +http://commoncrawl.org/faq/)
User-Agent: Mozilla/5.0 (compatible; YandexVideo/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; DotBot/1.0; +http://www.opensiteexplorer.org/dotbot)
User-Agent: Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)
User-Agent: Mozilla/5.0 (compatible; BLEXBot/1.0; +http://webmeup.com/crawler.html)
User-Agent: Mozilla/5.0 (compatible; Sogou web spider/4.0; +http://w

ERROR:__main__:Request error for https://www.amazon.com/s?i=mobile&rh=n%3A7072561011%2Cp_n_feature_thirty-nine_browse-bin%3A113334727011%7C113334728011%7C113334729011%7C113334730011%7C113334731011%7C113334732011&s=featured-rank&dc&fs=true&page=1&qid=1723567169&rnid=113334702011&ref=sr_pg_2 
 User-Agent: Mozilla/5.0 (compatible; DuckDuckBot/1.0; +http://duckduckgo.com/duckduckbot.html): 503, message='Service Unavailable', url='https://www.amazon.com/s?i=mobile&rh=n:7072561011,p_n_feature_thirty-nine_browse-bin:113334727011%7C113334728011%7C113334729011%7C113334730011%7C113334731011%7C113334732011&s=featured-rank&dc&fs=true&page=1&qid=1723567169&rnid=113334702011&ref=sr_pg_2'
ERROR:__main__:Request error for https://www.amazon.com/s?i=mobile&rh=n%3A7072561011%2Cp_n_feature_thirty-nine_browse-bin%3A113334727011%7C113334728011%7C113334729011%7C113334730011%7C113334731011%7C113334732011&s=featured-rank&dc&fs=true&page=2&qid=1723567169&rnid=113334702011&ref=sr_pg_2 
 User-Agent: Mozilla/5.0 

User-Agent: Mozilla/5.0 (compatible; CCBot/2.0; +http://commoncrawl.org/faq/)
User-Agent: Mozilla/5.0 (compatible; YandexImages/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; YandexImages/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; YandexImages/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; PetalBot/1.0; +https://aspiegel.com/petalbot)
User-Agent: Mozilla/5.0 (compatible; YandexVideo/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; DotBot/1.0; +http://www.opensiteexplorer.org/dotbot)
User-Agent: Mozilla/5.0 (compatible; CCBot/2.0; +http://commoncrawl.org/faq/)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
Found 28 -----------------------------------------------------
Found 24 unique URLs
Processed page 1
U



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 5
User-Agent: Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
User-Agent: Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.4.12; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; Sogou web spider/4.0; +http://www.sogou.com/docs/help/webmasters.htm#07)
User-Agent: Mozilla/5.0 (compatible; DotBot/1.0; +http://www.opensiteexplorer.org/dotbot)
User-Agent: Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)
User-Agent: Mozilla/5.0 (compatible; DuckDuckBot/1.0; +http://duckduckgo.com/duckduckbot.html)
User-Agent: Mozilla/5.0 (compatible; PetalBot/1.0; +https://as



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 15
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 16
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 17
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 18




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 19
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 20
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 21
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 22
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 23
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 24




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 25
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 26
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 27
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 28
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 29
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 30
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 31
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 32
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 33
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
F



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 48
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 49
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 50
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 51
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 52
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 53
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 54
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 55
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 56




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 57
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 58
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 59
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 60
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 61
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 62
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 63
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 64
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 65
Found 24 -----------------------------------------------------
Found 24 unique URLs




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 83
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 84
Found 34 -----------------------------------------------------
Found 33 unique URLs
Processed page 85
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 86
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 87
Found 35 -----------------------------------------------------
Found 33 unique URLs
Processed page 88
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 89
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 90
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 91
Found 24 -----------------------------------------------------
Found 24 unique URLs




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 120




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 121
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 122
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 123
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 124
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 125
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 126
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 127
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 128
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 129
Found 24 -----------------------------------------------------
Found 24 uni



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 152
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 153
User-Agent: Mozilla/5.0 (compatible; DotBot/1.0; +http://www.opensiteexplorer.org/dotbot)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; YandexImages/5.0; +http://yandex.com/bots)




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 154
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 155
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 156
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 157




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 158




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 159




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 160




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 161




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 162




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 163




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 164




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 165




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 166




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 167




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 168




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 169




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 170
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 171
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.4.12; http://www.majestic12.co.uk/bot.php)




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 172




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 173




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 174




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 175
User-Agent: Mozilla/5.0 (compatible; BLEXBot/1.0; +http://webmeup.com/crawler.html)
User-Agent: Mozilla/5.0 (compatible; SeznamBot/3.2; +http://napoveda.seznam.cz/cz/seznambot-intro/)




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 176




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 177




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 178




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 179




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 180




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 181




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 182




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 183




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 184




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 185




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 186




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 187




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 188




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 189
User-Agent: Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
User-Agent: Mozilla/5.0 (compatible; YandexSearch/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; Sogou web spider/4.0; +http://www.sogou.com/docs/help/webmasters.htm#07)
User-Agent: Mozilla/5.0 (compatible; CCBot/2.0; +http://commoncrawl.org/faq/)
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; DuckDuckBot/1.0; +http://duckduckgo.com/duckduckbot.html)




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 190
User-Agent: Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)
User-Agent: Mozilla/5.0 (compatible; PetalBot/1.0; +https://aspiegel.com/petalbot)
User-Agent: Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; YandexVideo/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)
User-Agent: Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
User-Agent: Mozilla/5.0 (compatible; PetalBot/1.0; +https://aspiegel.com/petalbot)
User-Agent: Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)
User-Agent: Mozilla/5.0 (compatible; PetalBot/1.0; +https://



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 202
User-Agent: Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 203
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 204
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 205
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 206
User-Agent: Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 207
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 208
Found 24 -----------



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 216




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 217




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 218
User-Agent: Mozilla/5.0 (compatible; YandexVideo/5.0; +http://yandex.com/bots)
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 219




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 220




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 221
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 222




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 223
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 224




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 225
Found 27 -----------------------------------------------------
Found 24 unique URLs
Processed page 226
User-Agent: Mozilla/5.0 (compatible; YandexImages/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; YandexSearch/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; CCBot/2.0; +http://commoncrawl.org/faq/)
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 227
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 228
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed pa



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 230
Found 28 -----------------------------------------------------
Found 24 unique URLs
Processed page 231
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 232
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 233
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 234




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 235
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 236
User-Agent: Mozilla/5.0 (compatible; DuckDuckBot/1.0; +http://duckduckgo.com/duckduckbot.html)




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 237
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.4.12; http://www.majestic12.co.uk/bot.php)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 238
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 239
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 240
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 241
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 242
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; YandexSearch/5.0; +http://yandex.com/bots)
Found 25 -------------------------------

ERROR:__main__:Request error for https://www.amazon.com/s?i=mobile&rh=n%3A7072561011%2Cp_n_feature_thirty-nine_browse-bin%3A113334727011%7C113334728011%7C113334729011%7C113334730011%7C113334731011%7C113334732011&s=featured-rank&dc&fs=true&page=1&qid=1723567169&rnid=113334702011&ref=sr_pg_2 
 User-Agent: Mozilla/5.0 (compatible; YandexSearch/5.0; +http://yandex.com/bots): 503, message='Service Unavailable', url='https://www.amazon.com/s?i=mobile&rh=n:7072561011,p_n_feature_thirty-nine_browse-bin:113334727011%7C113334728011%7C113334729011%7C113334730011%7C113334731011%7C113334732011&s=featured-rank&dc&fs=true&page=1&qid=1723567169&rnid=113334702011&ref=sr_pg_2'
ERROR:__main__:Request error for https://www.amazon.com/s?i=mobile&rh=n%3A7072561011%2Cp_n_feature_thirty-nine_browse-bin%3A113334727011%7C113334728011%7C113334729011%7C113334730011%7C113334731011%7C113334732011&s=featured-rank&dc&fs=true&page=2&qid=1723567169&rnid=113334702011&ref=sr_pg_2 
 User-Agent: Mozilla/5.0 (compatible; MJ

Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 245
User-Agent: Mozilla/5.0 (compatible; YandexImages/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 246
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)
User-Agent: Mozilla/5.0 (compatible; DotBot/1.0; +http://www.opensiteexplorer.org/dotbot)
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 247
Found 24 ------



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 252
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; SeznamBot/3.2; +http://napoveda.seznam.cz/cz/seznambot-intro/)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.4.12; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)
User-Agent: Mozilla/5.0 (compatible; PetalBot/1.0; +https://aspiegel.com/petalbot)
User-Agent: Mozilla/5.0 (compatible; SeznamBot/3.2; +http://napoveda.seznam.cz/cz/seznambot-intro/)
User-Agent: Mozilla/5.0 (compatible; YandexVideo/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)
User-Agent: Mozilla/5.0 



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 286
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 287
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 288
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 289
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 290
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 291
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 292
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 293
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 294
Found 27 -----------------------------------------------------
Found 24 uni



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 310
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 311
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 312
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 313
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 314
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 315
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 316
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 317
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 318
Found 26 -----------------------------------------------------
Found 24 uni



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 326
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 327
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 328
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 329
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 330
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 331
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 332
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 333
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 334
Found 24 -----------------------------------------------------
Found 24 uni



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 357
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 358
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 359
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 360
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 361
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 362
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 363
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 364




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 365
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 366
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 367
User-Agent: Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 368
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 369
Found 28 -----------------------------------------------------
Found 24 unique URLs
Processed page 370
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 371
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 372
Found 25 -----------------------------------------------------
Found 24 unique URL



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 377
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 378
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 379




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 380
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 381
Found 28 -----------------------------------------------------
Found 24 unique URLs
Processed page 382
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 383
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 384
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 385




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 386
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 387
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 388
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 389
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 390
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 391
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 392
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 393
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 394
Found 25 -----------------------------------------------------
Found 24 uni



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 411




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 412




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 413




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 414




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 415




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 416




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 417




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 418




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 419




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 420




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 421




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 422




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 423
User-Agent: Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; PetalBot/1.0; +https://aspiegel.com/petalbot)
User-Agent: Mozilla/5.0 (compatible; CCBot/2.0; +http://commoncrawl.org/faq/)




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 424




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 425




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 426




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 427




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 428




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 429




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 430




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 431




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 432
User-Agent: Mozilla/5.0 (compatible; YandexSearch/5.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; CCBot/2.0; +http://commoncrawl.org/faq/)
User-Agent: Mozilla/5.0 (compatible; BLEXBot/1.0; +http://webmeup.com/crawler.html)
User-Agent: Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)
User-Agent: Mozilla/5.0 (compatible; PetalBot/1.0; +https://aspiegel.com/petalbot)
User-Agent: Mozilla/5.0 (compatible; PetalBot/1.0; +https://aspiegel.com/petalbot)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 433
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 434
User-Agent: Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)
User-Agent: Mozilla/5.0 (compatible; 



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 435




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 436
User-Agent: Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 437
User-Agent: Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; SeznamBot/3.2; +http://napoveda.seznam.cz/cz/seznambot-intro/)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.4.12; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
User-Agent: Mozilla/5.0 (compatible; PetalBot/1.0; +https://aspiegel.com/petalbot)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 438
User-Agent: Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)
User-Agent: Mozilla/5.0 (compatible; SeznamBot/3.2; +http://napoveda.seznam.cz/cz/seznamb



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 439
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 440
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 441
User-Agent: Mozilla/5.0 (compatible; DotBot/1.0; +http://www.opensiteexplorer.org/dotbot)
User-Agent: Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
User-Agent: Mozilla/5.0 (compatible; DuckDuckBot/1.0; +http://duckduckgo.com/duckduckbot.html)
User-Agent: Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 442
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 443
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 444
Found 24 ------------------------------------



Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 455
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.4.12; http://www.majestic12.co.uk/bot.php)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 456
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 457
User-Agent: Mozilla/5.0 (compatible; DotBot/1.0; +http://www.opensiteexplorer.org/dotbot)
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 458




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 459
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 460
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 461
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 462




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 463
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 464
User-Agent: Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 465
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 466
Found 25 -----------------------------------------------------
Found 24 unique URLs
Processed page 467
User-Agent: Mozilla/5.0 (compatible; DotBot/1.0; +http://www.opensiteexplorer.org/dotbot)
User-Agent: Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 468
Found 26 -----------------------------------------------------
Found 24 unique URLs
Processed page 469
Found 24 -----------------------------------------------------
Found 24 unique URLs
Processed page 470
User-Agent: Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 471
User-Agent: Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)




Found 0 -----------------------------------------------------
Found 0 unique URLs
Processed page 472


ERROR:__main__:Failed to find product links on page 18 after 6 attempts.


Total URLs scraped: 4483


In [None]:
# Load the scraped product links back from disk for a quick sanity check.
# NOTE(review): presumably this CSV is the file written by the link-scraping
# cell above — confirm the path if the notebook is run outside Colab.
path= '/content/links.csv'
df = pd.read_csv(path)

In [None]:
# Number of rows loaded into df.
print(len(df))

4504


In [None]:
# Number of rows that are exact duplicates of an earlier row in df.
print(df.duplicated().sum())

0


# ***scrape products***


In [None]:
import requests
from bs4 import BeautifulSoup
import random
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import re

# Ordered (header substring, data key) pairs for the product-detail tables.
# The first substring found in a lower-cased row header wins, so the order is
# significant. A key of None marks the "sellers rank" row, which is parsed into
# two fields instead of being stored verbatim.
_DETAIL_FIELDS = (
    ('brand', 'brand'),
    ('color', 'color'),
    ('operating system', 'os'),
    ('cpu model', 'cpu_model'),
    ('resolution', 'resolution'),
    ('model name', 'name'),
    ('wireless carrier', 'wireless_carrier'),
    ('cellular technology', 'cellular_technology'),
    ('dimensions', 'dimensions'),
    ('asin', 'asin'),
    ('model number', 'model'),
    ('sellers rank', None),
    ('battery capacity', 'battery_capacity'),
    ('battery power', 'battery_power'),
    ('charging time', 'charging_time'),
    ('ram', 'ram'),
    ('storage', 'storage'),
    ('screen size', 'screen_size'),
    ('connectivity technologies', 'connectivity_technologies'),
    ('wireless network', 'wireless_network'),
    ('cpu speed', 'cpu_speed'),
)


def _clean_text(tag):
    """Return the stripped text of ``tag``, or None when the tag is missing."""
    return tag.text.strip() if tag else None


def _first_token(tag):
    """Return the first whitespace-separated token of ``tag``'s text, or None.

    None is returned both when the tag is missing and when its text is empty,
    so an empty element cannot raise IndexError.
    """
    tokens = tag.text.split() if tag else []
    return tokens[0] if tokens else None


def _parse_seller_ranks(value_text):
    """Split a "sellers rank" cell into (main rank, sub-category rank).

    Ranks are taken from the numbers that follow a ``#`` sign, with thousands
    separators removed, so ``#12,345`` yields ``'12345'`` and the ``100`` in
    "See Top 100" is never mistaken for a rank. When the text has no ``#``
    marker the first and third bare numbers are used instead.
    """
    ranks = [rank.replace(',', '') for rank in re.findall(r'#\s*(\d[\d,]*)', value_text)]
    if ranks:
        return ranks[0], (ranks[1] if len(ranks) > 1 else None)
    numbers = re.findall(r'\d+', value_text)
    return (numbers[0] if len(numbers) >= 1 else None,
            numbers[2] if len(numbers) >= 3 else None)


def _store_detail(data, header_text, value_text):
    """Store one product-detail row in ``data`` under the first matching field."""
    for needle, key in _DETAIL_FIELDS:
        if needle not in header_text:
            continue
        if key == 'color' and data['color'] is not None:
            # Colour was already read from the variation selector; fall
            # through so a later field can still match this header.
            continue
        if key is None:
            data['amazon_renewed'], data['renewed_smartphones'] = _parse_seller_ranks(value_text)
        else:
            data[key] = value_text
        return


def _extract_reviews(review_elements):
    """Turn review elements into a list of review dicts, or None if none parse.

    A review missing any of the required sub-elements is skipped. A review
    whose date line has no " on " separator is kept with ``date`` set to None
    instead of raising IndexError and aborting the whole page.
    """
    reviews = []
    for review in review_elements:
        try:
            name = review.find('span', {'class': 'a-profile-name'}).text.strip()
            rate = review.find('span', {'class': 'a-icon-alt'}).text.strip().replace(' out of 5 stars', '')
            date_country = review.find('span', {'class': 'a-size-base a-color-secondary review-date'}).text.strip()
            color = review.find('div', {'class': 'a-row a-spacing-mini review-data review-format-strip'}).find('span', attrs={'class': 'a-color-secondary'}).text.strip().replace(' Color:', '')
            content = review.find('div', {'class': 'a-expander-content reviewText review-text-content a-expander-partial-collapse-content'}).text.strip()
            support = review.find('span', {'class': 'a-size-base a-color-tertiary cr-vote-text'}).text.strip()
        except AttributeError:
            # find() returned None for a required sub-element.
            continue
        parts = date_country.split(" on ")
        reviews.append({
            'name': name, 'rating': rate, 'country': parts[0].replace("Reviewed in ", "").strip(),
            'date': parts[1].strip() if len(parts) > 1 else None,
            'color': color, 'content': content, 'support': support
        })
    return reviews if reviews else None


def scrap_page(soup):
    """Extract product fields from a parsed product page.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` / ``find_all``
            API. Elements are located by hard-coded class and id selectors.

    Returns:
        A dict with a fixed set of keys in a fixed order. A value is None when
        the corresponding element was not found. ``image_links`` is a list of
        ``src`` values; ``reviews_usa`` and ``reviews_other`` are lists of
        review dicts (at most five elements are inspected for each).
        ``you_save`` is never filled in by this function.
    """
    data = {
        'color': None, 'image_links': None, 'descriptions': None, 'kind_product': None,
        'ratings': None, 'number_of_ratings': None, 'status': None, 'number_of_buyers_last_month_more_than': None,
        'typical_price': None, 'price': None, 'you_save': None, 'discount': None,
        'brand': None, 'os': None, 'cpu_model': None, 'resolution': None, 'name': None,
        'wireless_carrier': None, 'cellular_technology': None, 'dimensions': None,
        'asin': None, 'model': None, 'amazon_renewed': None, 'renewed_smartphones': None,
        'battery_capacity': None, 'battery_power': None, 'charging_time': None,
        'ram': None, 'storage': None, 'screen_size': None, 'connectivity_technologies': None,
        'wireless_network': None, 'cpu_speed': None, 'reviews_usa': None, 'reviews_other': None
    }

    # Extract basic information
    span_selection = soup.find('span', {'class': 'selection'})
    label = soup.find('label', {'class': 'a-form-label'})
    label_text = label.text.strip().lower() if label else ''
    # The selected variation is only a colour when the form label says so.
    data['color'] = span_selection.text.strip() if span_selection and label_text == 'color:' else None
    imgs = soup.find('ul', {'class': 'a-unordered-list a-nostyle a-button-list a-vertical a-spacing-top-micro regularAltImageViewLayout'})
    data['image_links'] = [img.get('src') for img in imgs.find_all('img')] if imgs else None
    data['descriptions'] = _clean_text(soup.find('ul', {'class': 'a-unordered-list a-vertical a-spacing-mini'}))
    kind = soup.find_all('span', {'class': 'a-list-item'})
    data['kind_product'] = kind[2].text.strip() if len(kind) > 2 else None

    # Extract ratings and status
    rate_div = soup.find('div', {'id': 'averageCustomerReviews'})
    if rate_div:
        data['ratings'] = _clean_text(rate_div.find('span', {'class': 'a-size-base a-color-base'}))
        data['number_of_ratings'] = _first_token(rate_div.find('span', {'id': 'acrCustomerReviewText'}))
    data['status'] = _clean_text(soup.find('span', {'class': 'a-size-medium a-color-success'}))
    data['number_of_buyers_last_month_more_than'] = _first_token(
        soup.find('span', {'id': 'social-proofing-faceout-title-tk_bought'}))
    data['discount'] = _clean_text(
        soup.find('span', attrs={'class': 'a-size-medium-plus _product-comparison-desktop_priceStyle_savings-percent__3-qzG'}))
    data['price'] = _clean_text(soup.find('span', attrs={'class': 'a-price-whole'}))
    data['typical_price'] = _clean_text(
        soup.find('span', attrs={'class': 'a-size-mini a-color-secondary aok-nowrap a-text-strike'}))

    # Extract product details from both candidate tables; a field found in
    # the second table overwrites the value from the first.
    detail_tables = (
        soup.find('div', {'class': 'a-section a-spacing-small a-spacing-top-small'}),
        soup.find('table', {'id': 'productDetails_detailBullets_sections1'}),
    )
    for table in detail_tables:
        if not table:
            continue
        for row in table.find_all('tr'):
            header = row.find('td', {'class': 'a-span3'}) or row.find('th')
            value = row.find('td', {'class': 'a-span9'}) or row.find('td')
            if header and value:
                _store_detail(data, header.text.strip().lower(), value.text.strip())

    # Extract reviews
    data['reviews_usa'] = _extract_reviews(soup.find_all('div', {'class': 'a-section review aok-relative'})[0:5])
    data['reviews_other'] = _extract_reviews(soup.find_all('div', {'class': 'a-section review aok-relative cr-desktop-review-page-0'})[0:5])

    return data

# Pool of User-Agent header values; get_html() picks one at random for every
# request. Every entry presents itself as a well-known crawler, search-engine
# bot, or link-preview fetcher. This assignment rebinds the `user_agents`
# name that the link-scraping cell at the top of the notebook also defines.
user_agents = [
    "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
    "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)",
    "Mozilla/5.0 (compatible; MJ12bot/v1.4.12; http://www.majestic12.co.uk/bot.php)",
    "Mozilla/5.0 (compatible; SemrushBot/7~bl; +http://www.semrush.com/bot)",
    "Mozilla/5.0 (compatible; YandexImages/5.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; YandexVideo/5.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; YandexSearch/5.0; +http://yandex.com/bots)",
    "Mozilla/5.0 (compatible; DuckDuckBot/1.0; +http://duckduckgo.com/duckduckbot.html)",
    "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)",
    "Mozilla/5.0 (compatible; Sogou web spider/4.0; +http://www.sogou.com/docs/help/webmasters.htm#07)",
    "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)",
    "Mozilla/5.0 (compatible; SeznamBot/3.2; +http://napoveda.seznam.cz/cz/seznambot-intro/)",
    "Mozilla/5.0 (compatible; DotBot/1.0; +http://www.opensiteexplorer.org/dotbot)",
    "Mozilla/5.0 (compatible; BLEXBot/1.0; +http://webmeup.com/crawler.html)",
    "Mozilla/5.0 (compatible; PetalBot/1.0; +https://aspiegel.com/petalbot)",
    "Mozilla/5.0 (compatible; CCBot/2.0; +http://commoncrawl.org/faq/)",
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Mozilla/5.0 (compatible; Googlebot-News; +http://www.google.com/bot.html)",
    "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
    "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Mozilla/5.0 (compatible; Google-Site-Verification/1.0)",
    "Mozilla/5.0 (compatible; FacebookExternalHit/1.1; +http://www.facebook.com/externalhit_uatext.php)",
    "Mozilla/5.0 (Twitterbot/1.0)",
    "Mozilla/5.0 (compatible; Applebot/1.0; +http://www.apple.com/go/applebot)",
    "Mozilla/5.0 (compatible; LinkedInBot/1.0; +http://www.linkedin.com)",
    "Mozilla/5.0 (compatible; Pinterestbot/1.0; +http://www.pinterest.com/bot.html)",
    "Mozilla/5.0 (compatible; archive.org_bot/1.13; +http://www.archive.org/details/archive.org_bot)",
    "Mozilla/5.0 (compatible; Slackbot-LinkExpanding 1.0; +https://api.slack.com/robots)",
    "Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)",
    "Mozilla/5.0 (compatible; WhatsApp/2.19.51; +http://www.whatsapp.com)",
]




def get_session():
    """Create a requests session that retries transient server errors.

    Returns:
        A ``requests.Session`` whose ``http://`` and ``https://`` prefixes
        share one adapter configured for up to 5 retries, a backoff factor of
        0.1, and forced retries on HTTP 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.1,
        status_forcelist=[500, 502, 503, 504],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    for prefix in ('http://', 'https://'):
        http_session.mount(prefix, retrying_adapter)
    return http_session

def get_html(url, session):
    """Fetch ``url`` through ``session`` with a randomly chosen User-Agent.

    Args:
        url: Address to request.
        session: Object whose ``get`` method performs the request.

    Returns:
        ``(response, user_agent)`` on success, after a random pause of one to
        two seconds. ``(None, None)`` when the request raises a
        ``requests`` exception (``raise_for_status`` is called on the
        response); the error is printed together with the User-Agent that
        was used.
    """
    agent = random.choice(user_agents)
    try:
        # NOTE(review): the headers advertise "br" encoding — confirm a brotli
        # decoder is installed so such responses are decoded.
        reply = session.get(
            url,
            headers={
                'User-Agent': agent,
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Referer': 'https://www.google.com/'
            },
            timeout=15,
        )
        reply.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Request error for {url} \n User-Agent: {agent}: {err}")
        return None, None

    # Pause after every successful request before handing the response back.
    time.sleep(random.uniform(1, 2))
    return reply, agent

def scrap(link, session):
    """Download one product page and extract its data.

    Args:
        link: Product page URL.
        session: Session object passed through to ``get_html``.

    Returns:
        ``(data, None)`` when the page was fetched and contains a
        ``productTitle`` span, where ``data`` is the result of
        ``scrap_page``. ``(None, link)`` when the request failed, the title
        is missing, or any exception was raised along the way.
    """
    try:
        response, user_agent = get_html(link, session)
        print(f"User-Agent: {user_agent}")
        if response is None:
            return None, link

        soup = BeautifulSoup(response.text, 'lxml')
        title_tag = soup.find('span', attrs={'id': 'productTitle'})

        # A page without a product title is treated as a failed fetch.
        if not title_tag:
            print(f"No product title found for URL: {link}")
            return None, link

        print(f"(Title length: {len(title_tag.text.strip())})")
        return scrap_page(soup), None
    except Exception as exc:
        print(f"An error occurred while scraping {link}: {exc}")
        return None, link


def main():
    """Scrape every URL listed in ``/content/links.csv`` and save the rows.

    Reads the ``URL`` column, scrapes the pages on a pool of 8 worker
    threads (each URL is attempted up to 6 times), prints a summary that
    lists the URLs which never produced a result, and writes the collected
    rows to ``amazon_product_data.csv``.
    """
    import threading  # function-scope import: only the progress counter needs it

    links_df = pd.read_csv('/content/links.csv')
    links = links_df['URL'].tolist()

    results = []
    failed_urls = []
    # NOTE(review): this single session is shared by all worker threads;
    # confirm that is acceptable for the HTTP library in use.
    session = get_session()

    page_count = 0
    # The counter is updated from several worker threads, so guard it.
    page_count_lock = threading.Lock()
    max_retries = 6

    def report_processed(suffix=""):
        """Increment the shared page counter and print the progress line."""
        nonlocal page_count
        with page_count_lock:
            page_count += 1
            current = page_count
        print(f"Processed page {current}{suffix} ---------------------------------------------")

    def process_url(url):
        """Try ``url`` up to ``max_retries`` times.

        Returns ``(result, None)`` on success and ``(None, url)`` once every
        attempt has failed, so a URL is never silently dropped.
        """
        for attempt in range(max_retries):
            result, _ = scrap(url, session)
            if result:
                report_processed()
                return result, None
            if attempt < max_retries - 1:
                # No extra delay is applied between attempts.
                print(f"Retrying URL (attempt {attempt + 2}/{max_retries}): {url}")
        report_processed(" (failed)")
        return None, url

    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(process_url, url) for url in links]
        for future in as_completed(futures):
            result, failed_url = future.result()
            if result:
                results.append(result)
            if failed_url:
                failed_urls.append(failed_url)

    print("\nScraping completed.")
    print(f"Successfully scraped: {len(results)} products")
    print(f"Failed URLs: {len(failed_urls)}")

    if failed_urls:
        print("\nURLs that failed to scrape after all retries:")
        for url in failed_urls:
            print(url)

    if results:
        # Building the frame straight from the list of row dicts keeps the
        # column order of first appearance and fills keys missing from a row
        # with NaN instead of raising.
        df = pd.DataFrame(results)
        output_file = 'amazon_product_data.csv'
        df.to_csv(output_file, index=False)
        print(f"\nData saved to {output_file}")
    else:
        print("\nNo data was successfully scraped.")

if __name__ == "__main__":
    main()

Streaming output truncated to the last 5000 lines.
User-Agent: Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)User-Agent: Mozilla/5.0 (compatible; Pinterestbot/1.0; +http://www.pinterest.com/bot.html)

User-Agent: Mozilla/5.0 (compatible; Sogou web spider/4.0; +http://www.sogou.com/docs/help/webmasters.htm#07)
(Title length: 69)
(Title length: 74)
Processed page 3199 ---------------------------------------------
(Title length: 104)Processed page 3200 ---------------------------------------------

Processed page 3201 ---------------------------------------------
User-Agent: Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php)
(Title length: 127)
Processed page 3202 ---------------------------------------------
User-Agent: Mozilla/5.0 (compatible; YandexVideo/5.0; +http://yandex.com/bots)
Request error for https://www.amazon.com/Samsung-Galaxy-Factory-Unlock-512Gb/dp/B07GDSZ2KT/ref=sr_1_3699?dib=eyJ2IjoiMSJ9.gnNi28LuYdDoDUtZ

In [None]:
# Load the scraped product table and preview its first rows.
# NOTE(review): main() writes 'amazon_product_data.csv' relative to the
# working directory; this absolute path assumes that directory is /content.
path = '/content/amazon_product_data.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,color,image_links,descriptions,kind_product,ratings,number_of_ratings,status,number_of_buyers_last_month_more_than,typical_price,price,...,battery_power,charging_time,ram,storage,screen_size,connectivity_technologies,wireless_network,cpu_speed,reviews_usa,reviews_other
0,Black,['https://m.media-amazon.com/images/I/51E87drc...,"CARRIER: This phone is locked to Tracfone, whi...",Cell Phones,4.1,465,In Stock,,$119.99,72.0,...,3000,,2 GB,32 GB,5.5 Inches,"Wireless, Bluetooth",,1.4 GHz,,
1,Aura Black,['https://m.media-amazon.com/images/I/41FroksW...,Fully Unlocked: Fully unlocked and compatible ...,Cell Phones,4.2,3765,,200+,$314.00,189.0,...,3500,77 minutes,12 GB,256 GB,6.3 Inches,Bluetooth,,,"[{'name': 'ayalexander', 'rating': '5.0', 'cou...","[{'name': 'Nayhely', 'rating': '5.0', 'country..."
2,green,['https://m.media-amazon.com/images/I/41KGmEhD...,"FULL HD 6.26"" SCREEN: The large 6.26"" screen a...",Cell Phones,1.9,4,,,$79.95,49.0,...,,,,8 GB,6.3 Inches,"Wi-Fi, USB",,,,
3,Black,['https://m.media-amazon.com/images/I/41xJFoKI...,,Cell Phones,4.2,9,,50+,$89.99,69.0,...,,,4 GB,64 GB,6.5 Inches,Wireless,,,,
4,Black,['https://m.media-amazon.com/images/I/51mBltlO...,【MTK Dimensity 7050+Android 13】Powered by the ...,Cell Phones,5.0,2,,,,499.0,...,9600 Amp Hours,,24 GB,512 GB,6.58 Inches,"Bluetooth, Wi-Fi, USB, NFC",,,,


In [None]:
# Summarize each column's dtype and non-null count before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4499 entries, 0 to 4498
Data columns (total 35 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   color                                  4238 non-null   object 
 1   image_links                            4464 non-null   object 
 2   descriptions                           3969 non-null   object 
 3   kind_product                           4497 non-null   object 
 4   ratings                                4166 non-null   float64
 5   number_of_ratings                      4166 non-null   object 
 6   status                                 887 non-null    object 
 7   number_of_buyers_last_month_more_than  523 non-null    object 
 8   typical_price                          2839 non-null   object 
 9   price                                  4358 non-null   object 
 10  you_save                               0 non-null      float64
 11  disc

In [None]:
# Strip display formatting so the numeric columns can be cast to float.
# The regex patterns are raw strings (or plain literals) so that no invalid
# escape sequence such as '\$' appears in a normal string literal.

# Prices: drop the dollar sign and thousands separators.
df['typical_price'] = df['typical_price'].replace({r'\$': '', ',': ''}, regex=True).astype(float)
# Buyer counts: expand the 'K' suffix to three zeros and drop the trailing '+'.
# NOTE(review): a value with a decimal part before 'K' would not be scaled
# correctly by this textual substitution - confirm none occur in the data.
df['number_of_buyers_last_month_more_than'] = df['number_of_buyers_last_month_more_than'].replace({'K': '000', r'\+': ''}, regex=True).astype(float)
# Rating counts: drop thousands separators.
df['number_of_ratings'] = df['number_of_ratings'].replace({',': ''}, regex=True).astype(float)
df['price'] = df['price'].replace({r'\$': '', ',': ''}, regex=True).astype(float)

# Savings are the difference between the typical price and the current price.
df['you_save'] = df['typical_price'] - df['price']

# Overwrite the CSV that was loaded with the cleaned table.
df.to_csv(path, index=False)

df.dtypes

Unnamed: 0,0
color,object
image_links,object
descriptions,object
kind_product,object
ratings,float64
number_of_ratings,float64
status,object
number_of_buyers_last_month_more_than,float64
typical_price,float64
price,float64


In [None]:
# Preview the first rows again to check the cleaned numeric columns.
df.head()

Unnamed: 0,color,image_links,descriptions,kind_product,ratings,number_of_ratings,status,number_of_buyers_last_month_more_than,typical_price,price,...,battery_power,charging_time,ram,storage,screen_size,connectivity_technologies,wireless_network,cpu_speed,reviews_usa,reviews_other
0,Black,['https://m.media-amazon.com/images/I/51E87drc...,"CARRIER: This phone is locked to Tracfone, whi...",Cell Phones,4.1,465.0,In Stock,,119.99,72.0,...,3000,,2 GB,32 GB,5.5 Inches,"Wireless, Bluetooth",,1.4 GHz,,
1,Aura Black,['https://m.media-amazon.com/images/I/41FroksW...,Fully Unlocked: Fully unlocked and compatible ...,Cell Phones,4.2,3765.0,,200.0,314.0,189.0,...,3500,77 minutes,12 GB,256 GB,6.3 Inches,Bluetooth,,,"[{'name': 'ayalexander', 'rating': '5.0', 'cou...","[{'name': 'Nayhely', 'rating': '5.0', 'country..."
2,green,['https://m.media-amazon.com/images/I/41KGmEhD...,"FULL HD 6.26"" SCREEN: The large 6.26"" screen a...",Cell Phones,1.9,4.0,,,79.95,49.0,...,,,,8 GB,6.3 Inches,"Wi-Fi, USB",,,,
3,Black,['https://m.media-amazon.com/images/I/41xJFoKI...,,Cell Phones,4.2,9.0,,50.0,89.99,69.0,...,,,4 GB,64 GB,6.5 Inches,Wireless,,,,
4,Black,['https://m.media-amazon.com/images/I/51mBltlO...,【MTK Dimensity 7050+Android 13】Powered by the ...,Cell Phones,5.0,2.0,,,,499.0,...,9600 Amp Hours,,24 GB,512 GB,6.58 Inches,"Bluetooth, Wi-Fi, USB, NFC",,,,


In [None]:
# Re-check dtypes and non-null counts after the numeric conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4499 entries, 0 to 4498
Data columns (total 35 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   color                                  4238 non-null   object 
 1   image_links                            4464 non-null   object 
 2   descriptions                           3969 non-null   object 
 3   kind_product                           4497 non-null   object 
 4   ratings                                4166 non-null   float64
 5   number_of_ratings                      4166 non-null   float64
 6   status                                 887 non-null    object 
 7   number_of_buyers_last_month_more_than  523 non-null    float64
 8   typical_price                          2839 non-null   float64
 9   price                                  4358 non-null   float64
 10  you_save                               2838 non-null   float64
 11  disc

In [None]:
# Four sample Amazon product-page URLs used for quick manual tests.
links = [
    'https://www.amazon.com/Electronics-Unlocked-Smartphone-Pro-Grade-SM-G998UZKAXAA/dp/B08VL1Q7Z1/ref=pd_ci_mcx_di_int_sccai_cn_d_sccl_2_3/137-9844677-7401249?pd_rd_w=tlFXA&content-id=amzn1.sym.751acc83-5c05-42d0-a15e-303622651e1e&pf_rd_p=751acc83-5c05-42d0-a15e-303622651e1e&pf_rd_r=8VXBDV9S4ZWEVMM5J7WF&pd_rd_wg=tD3bN&pd_rd_r=2648defa-97bd-4581-9454-2e89bfe9d28b&pd_rd_i=B08VL1Q7Z1&psc=1',
    'https://www.amazon.com/Samsung-Electronics-Unlocked-Smartphone-Long-Lasting/dp/B08BX7N9SK/ref=pd_ci_mcx_di_int_sccai_cn_d_sccl_3_3/137-9844677-7401249?pd_rd_w=Ncv02&content-id=amzn1.sym.751acc83-5c05-42d0-a15e-303622651e1e&pf_rd_p=751acc83-5c05-42d0-a15e-303622651e1e&pf_rd_r=PCZ7W22Q2HAT4KVCCTQ2&pd_rd_wg=uKLAT&pd_rd_r=6b1d67c6-44c7-4573-a7ee-88a86666a091&pd_rd_i=B08BX7XBGN&th=1',
    'https://www.amazon.com/Samsung-Electronics-Unlocked-Smartphone-Long-Lasting/dp/B08HKRVZ4W/ref=pd_ci_mcx_di_int_sccai_cn_d_sccl_2_10/137-9844677-7401249?pd_rd_w=6NwkB&content-id=amzn1.sym.751acc83-5c05-42d0-a15e-303622651e1e&pf_rd_p=751acc83-5c05-42d0-a15e-303622651e1e&pf_rd_r=3P2S1RZFT25DWJM34AK5&pd_rd_wg=Iyqyd&pd_rd_r=62471bad-df94-4e6d-970a-3e597876d3a8&pd_rd_i=B08HQZC2XK&th=1',
    'https://www.amazon.com/Nokia-9500-Unlocked-Smartphone-MMC-U-S/dp/B0009VB0NG/ref=sr_1_4521?dib=eyJ2IjoiMSJ9.XJtBUAuXwJ71WsS74wvDTi0Dc0u7LFg4l94ZnQ35n_AeEjgaU4ImnEA2DeHmZqJlXciEJxwPAk4z5eHEG6TbptTrSrBz3xlCQa4tgAC1C5JP-VeV6RjuUjpfX_2-QrJMbnMBELXRLARPHDvAoF5ryw.v-ZdedqI3EN6oSZo7HaD3hT88NKQeg6IR2Zo7u7vwRM&dib_tag=se&qid=1726754973&refinements=p_n_feature_thirty-nine_browse-bin%3A113334727011%7C113334728011%7C113334729011%7C113334730011%7C113334731011%7C113334732011&rnid=113334702011&s=wireless&sr=1-4521',
]

