<h3>Web Scraping - First Coding with ChatGPT-4</h3>

In [19]:
import tkinter as tk
from tkinter import ttk, filedialog
import requests
from bs4 import BeautifulSoup
import csv
import json
import pandas as pd

class ScraperGUI:
    """Tkinter window that scrapes up to five CSS selectors from one page.

    The window offers a URL field, five selector fields, a "Scrape" button
    that downloads the page and lists the matches, and a "Save" button that
    lets the user choose which scraped rows to export as JSON or CSV.

    Attributes:
        result_list: Row dicts produced by the most recent ``scrape()``. Each
            has the keys listed in ``FIELD_NAMES``.
        select_entries: The five selector entry widgets, in display order.
            They are also reachable through the historical attribute names
            ``select_entry``, ``select2_entry`` ... ``select5_entry``.
    """

    SELECTOR_COUNT = 5
    # requests waits indefinitely when no timeout is given.
    REQUEST_TIMEOUT = 15
    REQUEST_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Content-Type": "text/html",
    }
    # Column order used for both the row dicts and the CSV export.
    FIELD_NAMES = [
        "Result Number",
        "Result1 Text",
        "Result2 Text",
        "Result3 Text",
        "Result4 Text",
        "Result5 Text",
    ]

    def __init__(self, master):
        """Build the widgets inside *master* (a ``tk.Tk`` or ``tk.Toplevel``)."""
        self.master = master
        master.title("EDA Web Scraper (Jupyter) - Marcelo C. Plaza")
        master.geometry("1280x720")

        # Defined up front so save() can be pressed before any scrape.
        self.result_list = []

        self.url_label = ttk.Label(master, text="URL:")
        self.url_entry = ttk.Entry(master, width=80)
        self.url_entry.insert(0, "https://")
        self.url_label.grid(row=0, column=0, padx=5, pady=5, sticky="w")
        self.url_entry.grid(row=0, column=1, padx=5, pady=5, sticky="we")

        self.select_entries = []
        for number in range(1, self.SELECTOR_COUNT + 1):
            label = ttk.Label(master, text=f"Select {number}:")
            entry = ttk.Entry(master, width=80)
            label.grid(row=number, column=0, padx=5, pady=5, sticky="w")
            entry.grid(row=number, column=1, padx=5, pady=5, sticky="we")
            # Preserve the original attribute names: the first pair has no
            # number (select_label / select_entry), the rest are numbered.
            suffix = "" if number == 1 else str(number)
            setattr(self, f"select{suffix}_label", label)
            setattr(self, f"select{suffix}_entry", entry)
            self.select_entries.append(entry)

        self.scrape_button = ttk.Button(master, text="Scrape", command=self.scrape)
        self.result_text = tk.Text(master, wrap="word")
        self.save_button = ttk.Button(master, text="Save", command=self.save)

        # Both buttons share one grid cell; the larger padx shifts "Save"
        # to the right of "Scrape".
        self.scrape_button.grid(row=6, column=1, padx=5, pady=5, sticky="w")
        self.save_button.grid(row=6, column=1, padx=100, pady=5, sticky="w")
        self.result_text.grid(
            row=7, column=0, columnspan=2, padx=5, pady=5, sticky="nsew"
        )

        master.rowconfigure(7, weight=1)
        master.columnconfigure(1, weight=1)

    def _show_error(self, message):
        """Show *message* in a modal error dialog."""
        # Imported locally: the surrounding cell imports only ttk and
        # filedialog from tkinter.
        from tkinter import messagebox

        messagebox.showerror("Error", message, parent=self.master)

    @staticmethod
    def _resolve_selectors(raw_values):
        """Turn the raw selector field texts into the selectors to use.

        Args:
            raw_values: One string per selector field, in display order.

        Returns:
            A list of the same length holding stripped selectors, with ``""``
            marking an unused field. For compatibility with the earlier
            input format, when only the first field is filled and it splits
            on commas into exactly one non-empty part per field, those parts
            are returned instead. Any other comma use is left intact, so a
            CSS selector group such as ``"h1, h2"`` stays a single selector.
        """
        cleaned = [value.strip() for value in raw_values]
        if cleaned and cleaned[0] and not any(cleaned[1:]):
            parts = [part.strip() for part in cleaned[0].split(",")]
            if len(parts) == len(cleaned) and all(parts):
                return parts
        return cleaned

    @staticmethod
    def _build_records(selectors, matches):
        """Combine per-selector matches into rows.

        Args:
            selectors: Selector per field, ``""`` for unused fields.
            matches: For each field, the list of matched elements (ignored
                for unused fields). Elements only need a ``text`` attribute.

        Returns:
            A list of ``(record, summary)`` pairs. ``record`` is the row dict
            stored in ``result_list``; ``summary`` is the text shown in the
            result pane. Unused fields get an empty string in the record.
        """
        active = [
            (position, selector, found)
            for position, (selector, found) in enumerate(zip(selectors, matches))
            if selector
        ]
        rows = []
        # zip() stops at the shortest list, so a row exists only where every
        # used selector produced a match at that index.
        grouped = zip(*(found for _, _, found in active))
        for number, elements in enumerate(grouped, start=1):
            record = {"Result Number": number}
            for position in range(len(selectors)):
                record[f"Result{position + 1} Text"] = ""
            lines = [f"#{number}"]
            for (position, selector, _), element in zip(active, elements):
                text = element.text.strip()
                record[f"Result{position + 1} Text"] = text
                lines.append(f"{selector}: {element} -> {text}")
            rows.append((record, "\n".join(lines) + "\n\n"))
        return rows

    def scrape(self):
        """Download the page, run every selector and list the matches.

        Replaces ``result_list`` and the contents of the result pane. Errors
        (no selector given, download failure, HTTP error status) are shown
        in a dialog and leave the previous results untouched.
        """
        url = self.url_entry.get().strip()
        # Previously all five selectors were unpacked from the first field
        # alone, which raised ValueError unless it held exactly five
        # comma-separated values and ignored the Select 2-5 fields.
        selectors = self._resolve_selectors(
            [entry.get() for entry in self.select_entries]
        )
        if not any(selectors):
            self._show_error("Enter at least one CSS selector.")
            return

        try:
            response = requests.get(
                url, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as error:
            self._show_error(f"Could not fetch {url}:\n{error}")
            return

        soup = BeautifulSoup(response.content, "html.parser")
        matches = [
            self.get_results(soup, selector) if selector else []
            for selector in selectors
        ]
        rows = self._build_records(selectors, matches)
        self.result_list = [record for record, _ in rows]

        # The pane is kept read-only except while it is being refreshed.
        self.result_text.configure(state="normal")
        self.result_text.delete("1.0", tk.END)
        for _, summary in rows:
            self.result_text.insert(tk.END, summary)
        self.result_text.configure(state="disabled")

    def get_results(self, soup, select):
        """Return every element of *soup* matching the CSS selector *select*."""
        # The earlier heuristic sent any selector without "." or " " to
        # find_all(), which looks for a tag of that name, so selectors such
        # as "#main" or "a[href]" could never match. select() also accepts
        # plain tag names, so one call covers every case.
        return soup.select(select)

    @classmethod
    def _write_results(cls, filename, records):
        """Write *records* to *filename* as CSV (``.csv``) or JSON (otherwise).

        Raises:
            OSError: If the file cannot be created or written.
        """
        if filename.lower().endswith(".csv"):
            # newline="" lets the csv module control line endings; without
            # it every row is followed by a blank line on Windows.
            with open(filename, "w", newline="", encoding="utf-8") as handle:
                writer = csv.DictWriter(handle, fieldnames=cls.FIELD_NAMES)
                writer.writeheader()
                writer.writerows(records)
        else:
            # JSON is the dialog's default extension, so it is also used for
            # any unrecognised extension instead of leaving an empty file.
            with open(filename, "w", encoding="utf-8") as handle:
                json.dump(records, handle, ensure_ascii=False, indent=4)

    def save(self):
        """Ask for a target file, let the user tick rows, then export them.

        Opens a second window with one pre-ticked checkbox per scraped row.
        Its "Save Selected Results" button writes the ticked rows, closes the
        window and prints the first rows as a DataFrame.
        """
        if not self.result_list:
            self._show_error("Nothing to save yet - run Scrape first.")
            return

        filetypes = [("JSON files", "*.json"), ("CSV files", "*.csv")]
        filename = filedialog.asksaveasfilename(
            defaultextension=".json", filetypes=filetypes
        )
        if not filename:
            return

        # Snapshot, so a new scrape while the dialog is open cannot shift
        # the rows the checkboxes refer to.
        records = list(self.result_list)

        top = tk.Toplevel(self.master)
        top.title("Select Results to Save")
        top.geometry("1280x720")

        canvas = tk.Canvas(top)
        canvas.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

        yscrollbar = ttk.Scrollbar(top, orient=tk.VERTICAL, command=canvas.yview)
        yscrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        canvas.configure(yscrollcommand=yscrollbar.set)

        # The checkboxes live in a frame embedded in the canvas so that the
        # scrollbar can move them.
        checkbox_frame = tk.Frame(canvas)
        checkbox_frame.columnconfigure(0, weight=1)
        canvas.create_window((0, 0), window=checkbox_frame, anchor=tk.NW)

        flags = []
        for row_index, record in enumerate(records):
            flag = tk.BooleanVar(master=top, value=True)
            caption = " | ".join(record[key] for key in self.FIELD_NAMES[1:])
            checkbox = ttk.Checkbutton(checkbox_frame, text=caption, variable=flag)
            # Rows start at 2; row 0 holds the save button.
            checkbox.grid(row=row_index + 2, column=0, padx=5, pady=5, sticky="w")
            flags.append(flag)

        def save_selected():
            chosen = [record for record, flag in zip(records, flags) if flag.get()]
            try:
                self._write_results(filename, chosen)
            except OSError as error:
                self._show_error(f"Could not write {filename}:\n{error}")
                return
            top.destroy()

            df = pd.DataFrame(chosen)
            print(df.head())

        save_button = ttk.Button(
            checkbox_frame, text="Save Selected Results", command=save_selected
        )
        save_button.grid(row=0, column=0, padx=5, pady=5, sticky="w")

        # Measure the frame only after everything is placed, so that the
        # scroll region covers every checkbox.
        checkbox_frame.update_idletasks()
        canvas.config(scrollregion=canvas.bbox(tk.ALL))


# Create the main window, apply the Windows "vista" theme when it exists, and
# run the Tk event loop until the window is closed.
root = tk.Tk()
style = ttk.Style()
# "vista" is a Windows-only ttk theme; asking for it where it is not
# installed raises TclError, so fall back to the platform default instead.
if "vista" in style.theme_names():
    style.theme_use("vista")
scraper = ScraperGUI(root)
root.mainloop()



In [37]:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import requests
from bs4 import BeautifulSoup
import csv
import json
import pandas as pd

class ScraperGUI:
    """Tkinter window for exploring a page with CSS selectors and exporting CSV.

    "Search" finds every element matching the search selector and, inside
    each one, the first match of every column selector; the rows are kept in
    ``result_list`` and can be written out with "Save". "Scrape" runs each
    column selector against the whole page and writes one CSV column per
    selector straight to a file.

    Attributes:
        result_list: Rows from the last ``search()``; each row is the
            container's text followed by one cell per column selector.
        result_columns: Header matching ``result_list`` (the search selector
            followed by the column selectors used for that search).
    """

    # requests waits indefinitely when no timeout is given.
    REQUEST_TIMEOUT = 15
    REQUEST_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Content-Type": "text/html",
    }

    def __init__(self, master):
        """Build the widgets inside *master* (a ``tk.Tk`` or ``tk.Toplevel``)."""
        self.master = master
        master.title("EDA Web Scraper (Jupyter) - Marcelo C. Plaza")
        master.geometry("1280x720")

        # Defined up front so save() can be pressed before any search.
        self.result_list = []
        self.result_columns = []

        self.url_label = ttk.Label(master, text="URL:")
        self.url_entry = ttk.Entry(master, width=80)
        self.url_entry.insert(0, "https://mercadolivre.com.br/")
        self.select_label = ttk.Label(master, text="Search:")
        self.select_entry = ttk.Entry(master, width=80)

        # Five selector fields, one per output column.
        self.save_labels = [ttk.Label(master, text=f"Column {i + 1}:") for i in range(5)]
        self.save_entries = [ttk.Entry(master, width=80) for _ in range(5)]

        self.scrape_button = ttk.Button(master, text="Scrape", command=self.scrape)
        self.search_button = ttk.Button(master, text="Search", command=self.search)
        self.result_text = tk.Text(master, wrap="word")
        self.save_button = ttk.Button(master, text="Save", command=self.save)

        self.url_label.grid(row=0, column=0, padx=5, pady=5, sticky="w")
        self.url_entry.grid(row=0, column=1, padx=5, pady=5, sticky="we")
        self.select_label.grid(row=1, column=0, padx=5, pady=5, sticky="w")
        self.select_entry.grid(row=1, column=1, padx=5, pady=5, sticky="we")

        for i, (label, entry) in enumerate(zip(self.save_labels, self.save_entries)):
            label.grid(row=i + 2, column=0, padx=5, pady=5, sticky="w")
            entry.grid(row=i + 2, column=1, padx=5, pady=5, sticky="we")

        # The three buttons share one grid cell; increasing padx values
        # place them side by side.
        self.search_button.grid(row=7, column=1, padx=5, pady=5, sticky="w")
        self.scrape_button.grid(row=7, column=1, padx=100, pady=5, sticky="w")
        self.save_button.grid(row=7, column=1, padx=300, pady=5, sticky="w")
        self.result_text.grid(row=8, column=0, columnspan=2, padx=5, pady=5, sticky="nsew")

        master.rowconfigure(8, weight=1)
        master.columnconfigure(1, weight=1)

    def _column_selectors(self):
        """Return the stripped, non-blank texts of the column selector fields."""
        texts = (entry.get().strip() for entry in self.save_entries)
        return [text for text in texts if text]

    def _fetch_soup(self):
        """Download the page named in the URL field and parse it.

        Returns:
            The parsed ``BeautifulSoup`` document, or ``None`` after showing
            an error dialog when the download fails or returns an HTTP error.
        """
        url = self.url_entry.get().strip()
        try:
            response = requests.get(
                url, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as error:
            messagebox.showerror("Error", f"Could not fetch {url}:\n{error}")
            return None
        return BeautifulSoup(response.content, "html.parser")

    def get_results(self, soup, select):
        """Return every element of *soup* matching the CSS selector *select*."""
        # The earlier heuristic sent any selector without "." or " " to
        # find_all(), which looks for a tag of that name, so selectors such
        # as "#main" or "a[href]" could never match. select() also accepts
        # plain tag names, so one call covers every case.
        return soup.select(select)

    @staticmethod
    def _build_rows(results, column_selectors):
        """Return one row per element of *results*.

        Each row starts with the element's stripped text, followed by the
        stripped text of the first match of each column selector inside that
        element (``""`` when the selector matches nothing there).
        """
        rows = []
        for result in results:
            row = [result.text.strip()]
            for selector in column_selectors:
                cell = result.select_one(selector)
                row.append(cell.text.strip() if cell is not None else "")
            rows.append(row)
        return rows

    @staticmethod
    def _pad_columns(column_results):
        """Transpose columns of unequal length into rows, padding with ``""``."""
        row_count = max((len(values) for values in column_results), default=0)
        return [
            [values[index] if index < len(values) else "" for values in column_results]
            for index in range(row_count)
        ]

    @staticmethod
    def _write_csv(filename, header, rows):
        """Write *header* and *rows* to *filename* as UTF-8 CSV.

        Raises:
            OSError: If the file cannot be created or written.
        """
        with open(filename, "w", newline="", encoding="utf-8") as handle:
            writer = csv.writer(handle)
            writer.writerow(header)
            writer.writerows(rows)

    def _export(self, header, rows):
        """Ask for a CSV path, write the data, then print a DataFrame preview.

        Does nothing when the file dialog is cancelled.
        """
        filetypes = [("CSV files", "*.csv")]
        filename = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=filetypes)
        if not filename:
            return
        try:
            self._write_csv(filename, header, rows)
        except OSError as error:
            messagebox.showerror("Error", f"Could not write {filename}:\n{error}")
            return

        # Read the file back so the preview shows exactly what was saved.
        df = pd.read_csv(filename)
        print(df.head())

    def search(self):
        """Run the search selector and show every match in the result pane.

        Also stores the rows and their header in ``result_list`` and
        ``result_columns`` for a later ``save()``.
        """
        select = self.select_entry.get().strip()
        if not select:
            messagebox.showerror("Error", "Enter a search selector.")
            return

        soup = self._fetch_soup()
        if soup is None:
            return

        results = self.get_results(soup, select)
        column_selectors = self._column_selectors()

        self.result_list = self._build_rows(results, column_selectors)
        # Remember the header now: the fields may be edited before Save.
        self.result_columns = [select] + column_selectors

        # The pane is kept read-only except while it is being refreshed.
        self.result_text.configure(state="normal")
        self.result_text.delete("1.0", tk.END)
        for count, result in enumerate(results, start=1):
            self.result_text.insert(
                tk.END, f"{select}  #{count}: {result} -> {result.text.strip()}\n\n"
            )
        self.result_text.configure(state="disabled")

    def scrape(self):
        """Run each column selector on the whole page and export the columns.

        Columns are independent: row *n* holds the *n*-th match of every
        selector, and shorter columns are padded with empty cells.
        """
        columns = self._column_selectors()
        if not columns:
            messagebox.showerror("Error", "No columns to save.")
            return

        soup = self._fetch_soup()
        if soup is None:
            return

        column_results = [
            [result.text.strip() for result in self.get_results(soup, column)]
            for column in columns
        ]
        self._export(columns, self._pad_columns(column_results))

    def save(self):
        """Export the rows collected by the last ``search()`` as CSV."""
        # The previous version indexed each row with result['Result Text'],
        # but search() stores plain lists, so every save raised TypeError.
        if not self.result_list:
            messagebox.showerror("Error", "No results to save. Run Search first.")
            return
        self._export(self.result_columns, self.result_list)

# Create the main window, apply the Windows "vista" theme when it exists, and
# run the Tk event loop until the window is closed.
root = tk.Tk()
style = ttk.Style()
# "vista" is a Windows-only ttk theme; asking for it where it is not
# installed raises TclError, so fall back to the platform default instead.
if 'vista' in style.theme_names():
    style.theme_use('vista')
scraper = ScraperGUI(root)
root.mainloop()

# CSS selector for the price on mercadolivre.com.br: div.dynamic-carousel__item-container span.dynamic-carousel__price span
# CSS selector for the title on mercadolivre.com.br: div.dynamic-carousel__item-container h3.dynamic-carousel__title

In [None]:
import pandas as pd

# Load a CSV written by the scraper and preview its first 40 rows; as the
# last expression of the cell, the preview is what the notebook displays.
saved_csv = 'ttttt.csv'
df = pd.read_csv(saved_csv)
df.head(40)

<H1>WORKING VERSION</H1>

In [44]:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import requests
from bs4 import BeautifulSoup
import csv
import json
import pandas as pd

class ScraperGUI:
    """Tkinter window for exploring a page with CSS selectors and exporting CSV.

    Each of the five column rows pairs an optional custom column name (left)
    with a CSS selector (right). "Search" finds every element matching the
    search selector and, inside each one, the first match of every column
    selector; "Save" writes those rows to CSV. "Scrape" runs each column
    selector against the whole page and writes one CSV column per selector.

    Attributes:
        result_list: Rows from the last ``search()``; each row is the
            container's text followed by one cell per column selector.
        result_search: Search selector used for ``result_list``.
        result_fields: ``(position, selector)`` for each column selector used
            for ``result_list``; ``position`` is the index of its entry row.
    """

    # requests waits indefinitely when no timeout is given.
    REQUEST_TIMEOUT = 15
    REQUEST_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Content-Type": "text/html",
    }

    def __init__(self, master):
        """Build the widgets inside *master* (a ``tk.Tk`` or ``tk.Toplevel``)."""
        self.master = master
        master.title("EDA Web Scraper (Jupyter) - Marcelo C. Plaza")
        master.geometry("1280x720")

        # Defined up front so save() can be pressed before any search.
        self.result_list = []
        self.result_search = ""
        self.result_fields = []

        self.url_label = ttk.Label(master, text="URL:")
        self.url_entry = ttk.Entry(master, width=80)
        self.url_entry.insert(0, "https://mercadolivre.com.br/")
        self.select_label = ttk.Label(master, text="Search:")
        self.select_entry = ttk.Entry(master, width=80)

        # Five selector fields, one per output column.
        self.save_entries = [ttk.Entry(master, width=80) for _ in range(5)]
        # Five optional custom names, one per selector field.
        self.column_name_entries = [ttk.Entry(master, width=20) for _ in range(5)]

        self.scrape_button = ttk.Button(master, text="Scrape", command=self.scrape)
        self.search_button = ttk.Button(master, text="Search", command=self.search)
        self.result_text = tk.Text(master, wrap="word")
        self.save_button = ttk.Button(master, text="Save", command=self.save)

        self.url_label.grid(row=0, column=0, padx=5, pady=5, sticky="w")
        self.url_entry.grid(row=0, column=1, padx=5, pady=5, sticky="we")
        self.select_label.grid(row=1, column=0, padx=5, pady=5, sticky="w")
        self.select_entry.grid(row=1, column=1, padx=5, pady=5, sticky="we")

        # Name field on the left, its selector field on the right.
        for i, (name_entry, selector_entry) in enumerate(
            zip(self.column_name_entries, self.save_entries)
        ):
            name_entry.grid(row=i + 2, column=0, padx=5, pady=5, sticky="w")
            selector_entry.grid(row=i + 2, column=1, padx=5, pady=5, sticky="we")

        # The three buttons share one grid cell; increasing padx values
        # place them side by side.
        self.search_button.grid(row=7, column=1, padx=5, pady=5, sticky="w")
        self.scrape_button.grid(row=7, column=1, padx=100, pady=5, sticky="w")
        self.save_button.grid(row=7, column=1, padx=300, pady=5, sticky="w")
        self.result_text.grid(row=8, column=0, columnspan=2, padx=5, pady=5, sticky="nsew")

        master.rowconfigure(8, weight=1)
        master.columnconfigure(1, weight=1)

    @staticmethod
    def _active_columns(selector_texts):
        """Return ``(position, selector)`` for every non-blank selector text.

        ``position`` is the index within *selector_texts*, so a selector can
        later be matched with the custom name typed on the same row.
        """
        stripped = [text.strip() for text in selector_texts]
        return [(position, text) for position, text in enumerate(stripped) if text]

    @staticmethod
    def _column_titles(active_columns, name_texts):
        """Return the header title for each ``(position, selector)`` pair.

        The custom name at the same position in *name_texts* wins when it is
        non-blank; otherwise the selector itself is used as the title.
        """
        titles = []
        for position, selector in active_columns:
            custom = name_texts[position].strip() if position < len(name_texts) else ""
            titles.append(custom or selector)
        return titles

    @staticmethod
    def _build_rows(results, column_selectors):
        """Return one row per element of *results*.

        Each row starts with the element's stripped text, followed by the
        stripped text of the first match of each column selector inside that
        element (``""`` when the selector matches nothing there).
        """
        rows = []
        for result in results:
            row = [result.text.strip()]
            for selector in column_selectors:
                cell = result.select_one(selector)
                row.append(cell.text.strip() if cell is not None else "")
            rows.append(row)
        return rows

    @staticmethod
    def _pad_columns(column_results):
        """Transpose columns of unequal length into rows, padding with ``""``."""
        row_count = max((len(values) for values in column_results), default=0)
        return [
            [values[index] if index < len(values) else "" for values in column_results]
            for index in range(row_count)
        ]

    @staticmethod
    def _write_csv(filename, header, rows):
        """Write *header* and *rows* to *filename* as UTF-8 CSV.

        Raises:
            OSError: If the file cannot be created or written.
        """
        with open(filename, "w", newline="", encoding="utf-8") as handle:
            writer = csv.writer(handle)
            writer.writerow(header)
            writer.writerows(rows)

    def _custom_names(self):
        """Return the current text of every custom column name field."""
        return [entry.get() for entry in self.column_name_entries]

    def _fetch_soup(self):
        """Download the page named in the URL field and parse it.

        Returns:
            The parsed ``BeautifulSoup`` document, or ``None`` after showing
            an error dialog when the download fails or returns an HTTP error.
        """
        url = self.url_entry.get().strip()
        try:
            response = requests.get(
                url, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as error:
            messagebox.showerror("Error", f"Could not fetch {url}:\n{error}")
            return None
        return BeautifulSoup(response.content, "html.parser")

    def get_results(self, soup, select):
        """Return every element of *soup* matching the CSS selector *select*."""
        # The earlier heuristic sent any selector without "." or " " to
        # find_all(), which looks for a tag of that name, so selectors such
        # as "#main" or "a[href]" could never match. select() also accepts
        # plain tag names, so one call covers every case.
        return soup.select(select)

    def _export(self, header, rows):
        """Ask for a CSV path, write the data, then print a DataFrame preview.

        Does nothing when the file dialog is cancelled.
        """
        filetypes = [("CSV files", "*.csv")]
        filename = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=filetypes)
        if not filename:
            return
        try:
            self._write_csv(filename, header, rows)
        except OSError as error:
            messagebox.showerror("Error", f"Could not write {filename}:\n{error}")
            return

        # Read the file back so the preview shows exactly what was saved.
        df = pd.read_csv(filename)
        print(df.head())

    def search(self):
        """Run the search selector and show every match in the result pane.

        Also stores the rows, the search selector and the column selectors
        that produced them, so ``save()`` can write a matching header.
        """
        select = self.select_entry.get().strip()
        if not select:
            messagebox.showerror("Error", "Enter a search selector.")
            return

        soup = self._fetch_soup()
        if soup is None:
            return

        results = self.get_results(soup, select)
        fields = self._active_columns([entry.get() for entry in self.save_entries])

        self.result_list = self._build_rows(results, [selector for _, selector in fields])
        # Remember what produced these rows: the selector fields may be
        # edited before Save is pressed.
        self.result_search = select
        self.result_fields = fields

        # The pane is kept read-only except while it is being refreshed.
        self.result_text.configure(state="normal")
        self.result_text.delete("1.0", tk.END)
        for count, result in enumerate(results, start=1):
            self.result_text.insert(
                tk.END, f"{select}  #{count}: {result} -> {result.text.strip()}\n\n"
            )
        self.result_text.configure(state="disabled")

    def scrape(self):
        """Run each column selector on the whole page and export the columns.

        Columns are independent: row *n* holds the *n*-th match of every
        selector, and shorter columns are padded with empty cells. Custom
        column names, when given, replace the selectors in the header.
        """
        fields = self._active_columns([entry.get() for entry in self.save_entries])
        if not fields:
            messagebox.showerror("Error", "No columns to save.")
            return

        soup = self._fetch_soup()
        if soup is None:
            return

        column_results = [
            [result.text.strip() for result in self.get_results(soup, selector)]
            for _, selector in fields
        ]
        header = self._column_titles(fields, self._custom_names())
        self._export(header, self._pad_columns(column_results))

    def save(self):
        """Export the rows collected by the last ``search()`` as CSV.

        The header is the search selector (for the leading container-text
        cell) followed by one title per column selector. Custom names are
        read at save time, so they may be typed after searching.
        """
        if not self.result_list:
            messagebox.showerror("Error", "No results to save. Run Search first.")
            return

        # Two fixes over the previous header: it now has a title for the
        # leading container-text cell (rows used to be one cell longer than
        # the header), and custom names are matched to selectors by row
        # position rather than zipped against the list of filled selectors,
        # which misaligned them whenever an earlier selector was blank.
        titles = self._column_titles(self.result_fields, self._custom_names())
        self._export([self.result_search] + titles, self.result_list)


# Create the main window, apply the Windows "vista" theme when it exists, and
# run the Tk event loop until the window is closed.
root = tk.Tk()
style = ttk.Style()
# "vista" is a Windows-only ttk theme; asking for it where it is not
# installed raises TclError, so fall back to the platform default instead.
if 'vista' in style.theme_names():
    style.theme_use('vista')
scraper = ScraperGUI(root)
root.mainloop()

# CSS selector for the price on mercadolivre.com.br: div.dynamic-carousel__item-container span.dynamic-carousel__price span
# CSS selector for the title on mercadolivre.com.br: div.dynamic-carousel__item-container h3.dynamic-carousel__title

  div.dynamic-carousel__item-container h3.dynamic-carousel__title  \
0  Kit 10 Cuecas Box Boxer Estampadas Masculino A...                
1  Smart TV Philco PTV50G10AG11SK DLED Android TV...                
2  Tapete 2,00x1,50 Tay Day Shaggy Mega Promoção ...                
3  Smartphone Motorola Moto G22 Dual 6,5 128gb 4g...                
4  Maquininha Point Smart - A Máquina De Cartão D...                

  div.dynamic-carousel__item-container span.dynamic-carousel__price span  
0                                              R$ 43                      
1                                           R$ 2.177                      
2                                              R$ 86                      
3                                           R$ 1.039                      
4                                             R$ 217                      
