In [27]:
from datetime import datetime
import datetime  # NOTE: must follow the line above so `datetime` names the module, as the script expects
import inspect
import os
import time
from time import sleep

import matplotlib.pyplot as plt
import pandas as pd
import requests
import schedule
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from matplotlib.dates import DateFormatter, DayLocator
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait



# Get the search term and tracking period from the user
search_term = input("Please enter the name of the product you want to search: ")

# Keep asking until the user supplies a positive whole number of days.
# str.isdecimal() (unlike isdigit()) rejects characters such as "²" that
# int() cannot parse, and 0 is refused because it leaves nothing to track.
while True:
    raw_tracking_days = input("Please enter the number of days you want to track the product: ").strip()
    if raw_tracking_days.isdecimal() and int(raw_tracking_days) > 0:
        tracking_days = int(raw_tracking_days)
        break
    print("Warning: Please enter a valid integer value for the number of days.")



# Start the web driver and go to the Hepsiburada homepage
options = uc.ChromeOptions()
options.add_argument('--blink-settings=imagesEnabled=false')  # skip images so pages load faster
options.add_argument('--disable-notifications')
# NOTE(review): 2 is presumably Chrome's "block" value for this content setting - confirm
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)
driver = uc.Chrome(options=options)

url = 'https://www.hepsiburada.com/'
driver.get(url)
wait = WebDriverWait(driver, 15)  # shared explicit wait, 15 s timeout

# Close the cookie bar if it shows up. The banner is not guaranteed to be
# rendered (e.g. consent already stored), so a timeout here is not fatal.
try:
    wait.until(EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))).click()
except TimeoutException:
    print("Cookie banner not found; continuing without dismissing it.")

# Enter the search term in the search box and press Enter
search_box = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'theme-IYtZzqYPto8PhOx3ku3c')))
search_box.send_keys(search_term + Keys.RETURN)



# Load all products by scrolling to the last loaded card until the page has
# rendered as many cards as the result summary announces.
summary_elements = wait.until(EC.visibility_of_all_elements_located(
    (By.CLASS_NAME, 'searchResultSummaryBar-AVnHBWRNB0_veFy34hco')))
# The second summary element carries the result count. Keep only its digits so
# a grouped figure such as "1.234" still parses.
# NOTE(review): assumes the count is the only number in that element - confirm.
number_of_products = int(''.join(ch for ch in summary_elements[1].text if ch.isdigit()))

# Give up after this many consecutive rounds without a new card, so a page that
# never delivers the advertised count cannot trap the script in an endless loop.
MAX_STALLED_ROUNDS = 10

number_of_loaded_products = 0
stalled_rounds = 0
while number_of_loaded_products < number_of_products and stalled_rounds < MAX_STALLED_ROUNDS:
    loaded_products = wait.until(EC.visibility_of_all_elements_located(
        (By.CSS_SELECTOR, 'li[class*=productListContent][id]')))
    if len(loaded_products) > number_of_loaded_products:
        stalled_rounds = 0
    else:
        stalled_rounds += 1
        time.sleep(1)  # let the lazy loader catch up before counting again
    number_of_loaded_products = len(loaded_products)
    driver.execute_script('arguments[0].scrollIntoView({block: "center", behavior: "smooth"});', loaded_products[-1])

def _parse_price(text):
    """Convert a displayed price such as '1.234,50 TL' to a float.

    The listing uses a comma as the decimal mark, so dots are treated as
    thousands separators and removed before the comma is converted.
    NOTE(review): assumes prices are never shown with a dot as decimal mark.
    """
    return float(text.replace('TL', '').replace('.', '').replace(',', '.').strip())


# Get the link, name, price and seller of all the products
product = {key: [] for key in ['name', 'price', 'seller', 'url']}
product['name'] = [h3.text for h3 in driver.find_elements(By.CSS_SELECTOR, 'h3[data-test-id=product-card-name]')]
product['url'] = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, 'a[class*=ProductCard]')]
product['price'] = [_parse_price(div.text) for div in driver.find_elements(By.CSS_SELECTOR, 'div[data-test-id=price-current-price]')]

# The seller is only shown on the product page, so each product is visited once.
# A dedicated loop variable keeps the global `url` (the homepage) intact.
for i, product_url in enumerate(product['url']):
    print(f'Search seller names {i+1}/{number_of_loaded_products}', end='\r')
    driver.get(product_url)
    product['seller'].append(wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.seller a'))).text)
    product['url'][i] = driver.current_url  # useful to replace some long urls
    

# Build the product table, cheapest first, with a fresh 0..n-1 index
column_order = ['name', 'price', 'seller', 'url']
product = (
    pd.DataFrame(product, columns=column_order)
    .sort_values(by='price')
    .reset_index(drop=True)
)

# Snapshot file name: <search term with underscores>_data_<timestamp>.csv
snapshot_stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = f"{search_term.replace(' ', '_')}_data_{snapshot_stamp}.csv"

# Save the product data to a CSV file
product.to_csv(filename, index=False, encoding='utf-8-sig')


def parse_snapshot_timestamp(file_name):
    """Return the timestamp embedded in a product snapshot file name.

    Snapshot files are named ``<term>_data_<YYYY-MM-DD>_<HH-MM-SS>.csv``.
    Returns a ``datetime.datetime`` for such names and ``None`` for anything
    else, so unrelated CSV files are never mistaken for snapshots.
    """
    stem, extension = os.path.splitext(file_name)
    _, marker, stamp = stem.rpartition("_data_")
    if extension != ".csv" or not marker:
        return None
    try:
        return datetime.datetime.strptime(stamp, "%Y-%m-%d_%H-%M-%S")
    except ValueError:
        return None


# Get the path to the directory containing the CSV files
dir_path = '.'

# Only timestamped snapshot files are candidates for cleanup; price-history
# files ('_price_data') and any unrelated CSV files are left untouched.
csv_files = [
    f for f in os.listdir(dir_path)
    if '_price_data' not in f and parse_snapshot_timestamp(f) is not None
]

# Find the newest snapshot (the first one wins on identical timestamps)
latest_file = max(csv_files, key=parse_snapshot_timestamp, default=None)
latest_file_datetime = (
    parse_snapshot_timestamp(latest_file) if latest_file is not None else datetime.datetime.min
)

# Delete every snapshot except the newest one
for file in csv_files:
    if file != latest_file:
        os.remove(os.path.join(dir_path, file))
        print(f"Removed old file: {file}")


# Times of day ("HH:MM") at which a price sample is scheduled
tracking_period = ['23:52']
# Number of finished sampling runs, and the per-run DataFrames collected so far
completed_days, price_data = 0, []

def get_price():
    """Read the price shown on the product page currently open in the driver.

    Waits up to 10 seconds for the element with id ``offering-price`` and
    combines its first child span (whole part) with its second child span
    (fractional part).

    Returns:
        The price as a float, or None when the element cannot be found or its
        text cannot be parsed as a number.
    """
    try:
        price_element = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.ID, 'offering-price')))
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # Selenium 4.3 removed; the old call raised AttributeError, which the
        # former bare except turned into None.
        whole_part = price_element.find_element(By.XPATH, './span[1]').text
        fraction_part = price_element.find_element(By.XPATH, './span[2]').text
    except WebDriverException:
        # Covers timeouts, missing or stale elements: price not available.
        return None

    # NOTE(review): assumes dots in the whole part are thousands separators
    # (e.g. "1.234") - confirm against the live page.
    whole_part = whole_part.replace('.', '').strip()
    try:
        return float(whole_part + '.' + fraction_part.strip())
    except ValueError:
        return None

def get_time():
    """Return the current local date and time as a naive ``datetime``."""
    current_moment = datetime.datetime.now()
    return current_moment

def track_product():
    """Sample the price of the first product in the table and record it.

    Opens the URL in row 0 of ``product``, reads the price with
    ``get_price()`` and appends a DataFrame with columns ``date`` and
    ``price`` to the global ``price_data`` list. The frame holds one row when
    a price was read and no rows otherwise.
    """
    print(f"Time just before tracking_period: {datetime.datetime.now()}")

    # Get the current price of the product
    driver.get(product['url'][0])
    observed_price = get_price()

    # A missing price yields an empty sample instead of a row holding None
    sample_rows = [] if observed_price is None else [[get_time(), observed_price]]

    sample_df = pd.DataFrame(sample_rows, columns=['date', 'price'])
    sample_df['date'] = pd.to_datetime(sample_df['date'])
    price_data.append(sample_df)

def job():
    """Scheduled task: take one price sample and report the remaining days.

    Increments the global ``completed_days`` counter after every sample and
    prints progress based on ``tracking_days`` and the number of entries in
    ``tracking_period``.
    """
    global completed_days
    print("Starting product tracking...")
    track_product()
    completed_days += 1

    runs_per_day = len(tracking_period)
    total_runs = tracking_days * runs_per_day

    # Final run: everything that was requested has been sampled
    if completed_days == total_runs:
        remaining_days = 0
        print(f"Remaining day: {remaining_days}")
        print("Product tracking finished for today.")
        return

    # Past the requested number of runs there is nothing to report
    if completed_days > total_runs:
        return

    # Days still to go: requested days minus the days already started (rounded up)
    started_days = (completed_days + runs_per_day - 1) // runs_per_day
    remaining_days = tracking_days - started_days

    if tracking_days == 1 and remaining_days == 1:
        print(f"Remaining day: {remaining_days}")
    elif tracking_days > 1:
        print(f"Waiting for the next day... Remaining days: {remaining_days}")
        
# Register one daily job per configured time; a bare hour such as "23" is
# scheduled at minute "00".
for time_str in tracking_period:
    hour, minute = time_str.split(':') if ':' in time_str else (time_str, '00')
    print("time_str:", time_str)
    schedule.every().day.at(":".join((hour, minute))).do(job)

print("Schedule started...")
# Poll the scheduler once per second until every requested run has completed
while True:
    if completed_days >= tracking_days * len(tracking_period):
        break
    schedule.run_pending()
    time.sleep(1)
    
    
# Concatenate all the per-run DataFrames into a single DataFrame.
# pd.concat() raises on an empty list, so fall back to an empty frame.
if price_data:
    price_df = pd.concat(price_data, ignore_index=True)
else:
    price_df = pd.DataFrame(columns=['date', 'price'])

# Split the timestamp into a calendar date and a time of day
price_df['date'] = pd.to_datetime(price_df['date'])
price_df['time'] = price_df['date'].dt.time
price_df['date'] = price_df['date'].dt.date

# Keep one row per calendar day: the sample taken latest on that day.
# This replaces the former row-by-row DataFrame.append() loop (append was
# removed in pandas 2.0), whose three branches all selected this same row.
final_price_df = (
    price_df.sort_values(['date', 'time'])
    .drop_duplicates(subset='date', keep='last')
    .loc[:, ['date', 'time', 'price']]
    .reset_index(drop=True)
)
      

# Save the price data to a CSV file
filename = search_term.replace(' ', '_') + '_price_data.csv'
final_price_df.to_csv(filename, index=False, encoding='utf-8-sig')


# Remove old _price_data.csv files from directory
# Get the path to the directory containing the CSV files
dir_path = '.'
current_price_file = os.path.join(dir_path, filename)

# Get a list of all price-data CSV files in the directory. The list is taken
# once and never modified while it is being walked.
csv_files = [f for f in os.listdir(dir_path) if f.endswith('.csv') and '_price_data' in f]

# Get the current date
today = datetime.date.today()

removed_any = False
for file in csv_files:
    file_path = os.path.join(dir_path, file)

    # Never delete the results that were just written
    if os.path.samefile(file_path, current_price_file):
        continue

    # Files dated in the future are skipped
    file_datetime = datetime.datetime.fromtimestamp(os.path.getctime(file_path))
    if file_datetime.date() > today:
        continue

    os.remove(file_path)
    print(f"Removed old file: {file}")
    removed_any = True

if not removed_any:
    print(f"No old files to remove for {filename}")

def _thicken_spines(ax, width=2):
    """Draw all four borders of *ax* with the given line width."""
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_linewidth(width)


def _format_date_axis():
    """Show one x tick per day on the current axes, as YYYY-MM-DD rotated 45 degrees."""
    ax = plt.gca()
    ax.xaxis.set_major_locator(DayLocator())
    ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d'))
    plt.xticks(rotation=45)


try:
    if not final_price_df.empty:
        dates = final_price_df['date']
        prices = final_price_df['price'].astype(float)
        lowest_price, highest_price = prices.min(), prices.max()
        unique_prices = prices.unique()

        # Computed only when data exists: min()/max() of an empty column
        # would not yield dates that can be subtracted.
        final_date = dates.max()  # latest day in data
        start_date = dates.min()  # first day in data
        elapsed_days = (final_date - start_date).days + 1

        # Create a line plot of the price data
        if len(unique_prices) > 1:
            plt.plot(dates, prices)
            plt.title(f"\n\nPrice Trend of {search_term}\n", fontsize=16, fontweight='bold')
            plt.xlabel("\nDate\n", fontsize=14)
            plt.ylabel("Price (TL)\n", fontsize=14)
            _format_date_axis()
            _thicken_spines(plt.gca())
            plt.show()
        else:
            print(f"The price didn't change over the {elapsed_days} days: {unique_prices[0]} TL")

        # Create a bar plot of the lowest and highest prices
        fig, ax = plt.subplots()
        ax.bar(['Lowest Price', 'Highest Price'], [lowest_price, highest_price])
        ax.set_title(f"\n\nLowest and Highest Prices of {search_term}\n", fontsize=16, fontweight='bold')
        ax.set_ylabel("Price (TL)\n", fontsize=14)
        _thicken_spines(ax)
        plt.show()

        # Create a line plot of the price data with the lowest and highest prices highlighted
        if highest_price != lowest_price:
            plt.plot(dates, prices)
            plt.title(f"\n\nHighlighted Representation of the Highest and Lowest Prices of {search_term}\n", fontsize=16, fontweight='bold')
            plt.xlabel("\nDate\n", fontsize=14)
            plt.ylabel("Price (TL)\n", fontsize=14)
            _format_date_axis()

            # Horizontal guides: red for the lowest price, green for the highest
            plt.axhline(y=lowest_price, color='r', linestyle='-')
            plt.axhline(y=highest_price, color='g', linestyle='-')
            _thicken_spines(plt.gca())

            # Mark the first day on which each extreme occurred. Positions are
            # used with .iloc so the lookup does not depend on index labels.
            lowest_pos = prices.to_numpy().argmin()
            highest_pos = prices.to_numpy().argmax()
            plt.plot(dates.iloc[lowest_pos], prices.iloc[lowest_pos], 'ro')
            plt.plot(dates.iloc[highest_pos], prices.iloc[highest_pos], 'go')

            plt.show()
        else:
            print("There is only one price data point. Cannot create the line plot with lowest and highest prices.")
    else:
        print("There is no price data to create a plot.")
finally:
    # Close the browser even if plotting fails, so no Chrome process is left behind
    driver.quit()



Please enter the name of the product you want to search:  pınar süt 1lt
Please enter the number of months you want to track the product:  1


Search seller names 35/35
The product selected from the search results is:
name:   Pınar Denge Laktozsuz Süt 1 L
price:  24.5
seller: PınarOnline
url:    https://www.hepsiburada.com/pinar-denge-laktozsuz-sut-1-l-p-ZYPINAR153103445?magaza=P%C4%B1narOnline


Would you like to save the product data as Excel file? (yes or no):  yes


The product data has been saved as Excel files.
[]
[]


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//a[@class='page-next']"}
  (Session info: chrome=110.0.5481.178)
