# Import Library

In [72]:
import pandas   as pd
import requests as rq
import numpy    as np

from selenium import webdriver     as wd
from bs4      import BeautifulSoup as BS
from tqdm     import tqdm

# Loading Dataset

In [2]:
df = pd.read_csv('Amazon_full_details.csv')
df.head()

Unnamed: 0,maincategory,subcategory,category,link,alt_link,iteration,qid,ref
0,0,"tv, audio & cameras",Televisions,https://www.amazon.in/gp/browse.html?node=1389...,https://www.amazon.in/s?rh=n%3A1389396031&fs=true,46,1678610000.0,sr_pg_1
1,0,"tv, audio & cameras",Home Entertainment Systems,https://www.amazon.in/gp/browse.html?node=1389...,https://www.amazon.in/s?rh=n%3A1389375031&fs=true,400,1678610000.0,sr_pg_1
2,0,"tv, audio & cameras",Headphones,https://www.amazon.in/gp/browse.html?node=1388...,https://www.amazon.in/s?rh=n%3A1388921031&fs=true,400,1678610000.0,sr_pg_1
3,0,"tv, audio & cameras",Speakers,https://www.amazon.in/gp/browse.html?node=1389...,https://www.amazon.in/s?rh=n%3A1389365031&fs=true,400,1678610000.0,sr_pg_1
4,0,"tv, audio & cameras",Home Audio & Theater,https://www.amazon.in/gp/browse.html?node=1389...,https://www.amazon.in/s?rh=n%3A1389387031&fs=true,18,1678610000.0,sr_pg_1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   maincategory  165 non-null    int64  
 1   subcategory   165 non-null    object 
 2   category      165 non-null    object 
 3   link          165 non-null    object 
 4   alt_link      160 non-null    object 
 5   iteration     165 non-null    int64  
 6   qid           150 non-null    float64
 7   ref           150 non-null    object 
dtypes: float64(1), int64(2), object(5)
memory usage: 10.4+ KB


# Preprocessing

In [4]:
df = df[df['qid'].isna() == False]

In [5]:
del df['ref']
del df['maincategory']
del df['link']

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 0 to 162
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   subcategory  150 non-null    object 
 1   category     150 non-null    object 
 2   alt_link     150 non-null    object 
 3   iteration    150 non-null    int64  
 4   qid          150 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 7.0+ KB


In [7]:
df.head()

Unnamed: 0,subcategory,category,alt_link,iteration,qid
0,"tv, audio & cameras",Televisions,https://www.amazon.in/s?rh=n%3A1389396031&fs=true,46,1678610000.0
1,"tv, audio & cameras",Home Entertainment Systems,https://www.amazon.in/s?rh=n%3A1389375031&fs=true,400,1678610000.0
2,"tv, audio & cameras",Headphones,https://www.amazon.in/s?rh=n%3A1388921031&fs=true,400,1678610000.0
3,"tv, audio & cameras",Speakers,https://www.amazon.in/s?rh=n%3A1389365031&fs=true,400,1678610000.0
4,"tv, audio & cameras",Home Audio & Theater,https://www.amazon.in/s?rh=n%3A1389387031&fs=true,18,1678610000.0


# Webscraping  Amazon Product

In [108]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
#driver = wb.Chrome('webdriver/chromedriver.exe')

products = { 
    'name'          :[],
    'main_category' :[],
    'sub_category'  :[],
    'image'         :[],
    'link'          :[],
    'ratings'       :[],
    'no_of_ratings' :[],
    'discount_price':[],
    'actual_price'  :[],
}


for main_category,sub_category,base_link,iteration,qid in tqdm(df.values):
    
    for i in range(1,iteration + 1):
        link = base_link + f"&page={i}&qid={qid}"
        #print(link)
        
        res = rq.get(link, headers = headers)
        #print(res.ok)
        soup = BS(res.content,'html.parser')
        
        data = soup.find('div',class_ = 's-main-slot s-result-list s-search-results sg-row')
        pros = data.find_all('div', class_= 'sg-col-4-of-24 sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col s-widget-spacing-small sg-col-4-of-20')
        #driver.get(link)
        
        for pro in pros:
            
            try:
                # Product details section
                product_detail = pro.find('a', class_ = 'a-link-normal s-no-outline')

                product_link   = 'https://www.amazon.in' + product_detail.get('href')
                product_img    = product_detail.find('img')

                product_name   = product_img   .get('alt')
                product_img    = product_img   .get('src')

                # rating section

                ratings        = pro.find('div', class_ ='a-section a-spacing-none a-spacing-top-micro')

                if ratings:
                    ratings    = [i.get('aria-label') for i in ratings.find_all('span')] 
                    ratings    = list(filter(lambda x: x is not None,ratings))
                else:
                    ratings    = []  

                # price section

                prices          = pro.find('div', class_ = 'a-section a-spacing-none a-spacing-top-small s-price-instructions-style') 

                prices          = [i.text for i in prices.find_all('span',class_ ='a-offscreen')] if prices else [np.nan] * 2

                products['main_category' ].append( main_category         )
                products['sub_category'  ].append( sub_category          )

                products['name'          ].append( product_name          )
                products['image'         ].append( product_img           )
                products['link'          ].append( product_link          )

                products['ratings'       ].append( ratings[0].split()[0] if len(ratings) == 2 else np.nan )
                products['no_of_ratings' ].append( ratings[1]            if len(ratings) == 2 else np.nan  )

                products['discount_price'].append(  prices[0] if len(prices) == 2 else np.nan    )
                products['actual_price'  ].append(  prices[1] if len(prices) == 2 else prices[0] )
                
            except Exception as e:
                with open(f'error_{qid}_{iteration}.txt','w') as file:
                    file.write(f"category: {category}")
                    file.write(f"link    : {link}")
                    file.write()
                    file.write(str(e))
                    file.close()
            
        # check point
    product = pd.DataFrame(products)
    product.to_csv(f"{sub_category}.csv", index = False)
#             print(product_link)
#             print(product_name)
#             print(product_img)
#             print(ratings) 
#             print(prices)
#             print()
#             break
#         break
        
        
#     break

  2%|█▌                                                                            | 3/150 [20:38<16:51:49, 412.99s/it]


KeyboardInterrupt: 

In [98]:
ratings

In [99]:
print(link)

https://www.amazon.in/s?rh=n%3A1389396031&fs=true&page=21&qid=1678609818.0


In [100]:
product_name

'Salora 98 cm (39 Inches) HD Ready Smart LED TV SLV-4392 SH (Black) (2021 Model)'

In [79]:
a = pro.find('div', class_ = 'a-section a-spacing-none a-spacing-top-small s-price-instructions-style')

<div class="a-section a-spacing-none a-spacing-top-small s-price-instructions-style"><div class="a-row a-size-base a-color-base"><a class="a-size-base a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal" href="/Mi-inches-Ready-Android-Black/dp/B084872DQY/ref=sr_1_94?qid=1678722349&amp;s=electronics&amp;sr=1-94" target="_blank"><span class="a-price" data-a-color="base" data-a-size="xl"><span class="a-offscreen">₹16,499</span><span aria-hidden="true"><span class="a-price-symbol">₹</span><span class="a-price-whole">16,499</span></span></span> </a> </div><div class="a-row a-size-base a-color-secondary"><span class="a-color-secondary"><span class="rush-component" data-component-type="s-truncate"><span class="a-truncate" data-a-max-rows="1" data-a-overflow-marker="&amp;hellip;" data-a-word-break="normal" style="line-height: 1.3em !important; max-height: 1.3em;"><span class="a-truncate-full">Save extra with Cashback</span><span aria-hidden="true" class="a-truncate-

In [81]:
[i.text for i in a.find_all('span',class_ ='a-offscreen')] if prices else [np.nan] * 2

['₹52,999', '₹85,000']

In [92]:
prices

['₹59,525', '₹89,400']

In [102]:
product = pd.DataFrame(products)

In [103]:
product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104 entries, 0 to 1103
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            1104 non-null   object
 1   image           1104 non-null   object
 2   link            1104 non-null   object
 3   ratings         577 non-null    object
 4   no_of_ratings   577 non-null    object
 5   discount_price  698 non-null    object
 6   actual_price    733 non-null    object
dtypes: object(7)
memory usage: 60.5+ KB
