### Imports

In [1]:
import os
import sys
import re
import datetime
import importlib
import pandas as pd
from time import sleep
from time import strptime
from datetime import datetime
from selenium.webdriver.common.by import By

### Logger

In [2]:
# Run file logger.py to create logger object with prefix 'gsm'
# exec() runs the file's source in this notebook's namespace, so the names it
# defines at top level (get_logger is expected to be one of them) become
# available to the cells below.
# NOTE(review): the relative path is resolved against the kernel's current
# working directory — confirm the notebook is always started from its own folder.
with open('../common/logger.py') as f:
    exec(f.read())
logger = get_logger(name='gsm')  # presumably a logging.Logger-like object — confirm in logger.py
logger.info('Start crawl gsm-forum')

In [3]:
# Import a module from another directory
def get_module(folder_name, file_name):
    """Load a Python source file from a sibling directory and return it as a module.

    The file is looked up at ``<current working directory>/../<folder_name>/<file_name>``.
    The module name is the part of ``file_name`` before its first dot.

    Args:
        folder_name: Name of the directory (next to the working directory's
            parent) that contains the file.
        file_name: File name of the module source, e.g. ``'web_driver.py'``.

    Returns:
        The module object after its source has been executed.

    Raises:
        ImportError: If no import spec/loader can be built for the file
            (for example, the file name has no importable extension).
        Exception: Anything raised while executing the module source
            (including ``FileNotFoundError`` for a missing ``.py`` file) propagates.
    """
    # ``import importlib`` alone does not guarantee that the ``util`` submodule
    # is loaded, so import it explicitly where it is used.
    import importlib.util

    module_name = file_name.split('.')[0]
    module_path = os.path.join(os.getcwd(), '..', folder_name, file_name)
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # spec_from_file_location returns None when no loader matches the file type.
    if spec is None or spec.loader is None:
        raise ImportError(f'Cannot load module {module_name!r} from {module_path}')
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

In [4]:
# Load the project's web_driver.py helper and obtain the driver object used by
# the crawl cells below.
# NOTE(review): on failure the error is only logged and `driver` stays
# undefined, so the first later use of `driver` will raise.
try:
    driver_module = get_module('common', 'web_driver.py')
    get_driver = driver_module.get_driver
    driver = get_driver()  # presumably a Selenium WebDriver — confirm in web_driver.py
except Exception as e:
    logger.error(f'Import web driver fail: {e}')

### Crawl with Selenium

In [5]:
# Go to "Product Support Sections" category on GSM-forum
# Any failure here (including `driver` not being defined) is logged and the
# run is stopped via sys.exit().
# NOTE(review): sys.exit() without an argument reports exit status 0 when this
# code is run as a script — confirm whether a non-zero status is wanted.
try:
    driver.get("https://forum.gsmhosting.com/vbb/f209/")
except Exception as e:
    logger.error(f'Init driver fail: {e}')
    sys.exit()

In [6]:
# Build the list of product topics shown on the category page: one dict per
# element carrying the "alt1Active" class, holding the element's visible text
# and the href of the first anchor inside it.
topic_products = []
try:
    product_links = driver.find_elements(By.CLASS_NAME, "alt1Active")
    for link in product_links:
        a_tag = link.find_element(By.TAG_NAME, "a")
        title, href = link.text, a_tag.get_attribute("href")
        topic_products.append(dict(title=title, link=href))
except Exception as e:
    # Entries appended before the failure are kept.
    logger.error(f'Crawl selenium: {e}')

In [7]:
# Earliest thread date to keep; get_products_from_topic combines these three
# values into an MM-DD-YYYY deadline and drops threads dated before it.
DAY = 1
MONTH = 7
YEAR = 2024
# A thread is kept only when its lower-cased title contains at least one of
# these substrings, so entries must be lower-case.
KEY_WORDS = ["unlock", "hack", "samsung", "s2", "flip", "fold", "knox"]

In [None]:
def get_products_from_topic(driver, link):
    """Collect recent, keyword-matching threads from one forum topic page.

    Opens the topic page, pairs each thread-title element with the date cell
    found at the same position, and keeps the threads whose date is on or
    after the DAY/MONTH/YEAR deadline and whose lower-cased title contains at
    least one entry of KEY_WORDS.

    Args:
        driver: Selenium WebDriver (or compatible object) used to load and
            query the page.
        link: dict with the topic's 'title' (printed as a progress line) and
            'link' (the URL to open).

    Returns:
        list of dicts with keys 'title', 'link' and 'creatAt' (the date as the
        MM-DD-YYYY string found on the page), in page order.
    """
    driver.get(link['link'])
    print(link['title'])

    # NOTE(review): pairing by position assumes both lookups return one element
    # per thread in the same order — confirm against the page markup.
    rows = driver.find_elements(By.XPATH, "//*[contains(@id, 'thread_title')]")
    times = driver.find_elements(By.CSS_SELECTOR, "div.smallfont[style*='text-align:right; white-space:nowrap']")

    date_pattern = re.compile(r'(\d{2}-\d{2}-\d{4})')
    date_format = '%m-%d-%Y'

    # Parse the deadline once instead of once per thread.
    deadline_text = '-'.join([str(MONTH).zfill(2), str(DAY).zfill(2), str(YEAR)])
    deadline = strptime(deadline_text, date_format)

    product_links_by_keywords = []
    for row, time_cell in zip(rows, times):
        # NOTE(review): cells without an MM-DD-YYYY date (e.g. relative labels,
        # if the forum uses them) are skipped — confirm this is intended.
        date_match = date_pattern.search(time_cell.text)
        if not date_match:
            continue
        created_at = date_match.group(1)

        try:
            created = strptime(created_at, date_format)
        except ValueError:
            # Digits matched the pattern but are not a valid month/day; skip
            # this thread instead of aborting the whole crawl.
            print(f"Skip thread with unparsable date: {created_at}")
            continue
        if created < deadline:
            continue

        title = row.text
        lowered_title = title.lower()
        if not any(keyword in lowered_title for keyword in KEY_WORDS):
            continue

        product_links_by_keywords.append(
            {'title': title, 'link': row.get_attribute('href'), 'creatAt': created_at}
        )

    for i, thread_link in enumerate(product_links_by_keywords):
        print(f"Link {i+1}: {thread_link['title']} \n{thread_link['link']} \n{thread_link['creatAt']}\n")

    return product_links_by_keywords

In [9]:
# Set the driver's implicit wait to 20 seconds for the element lookups in the
# cells below. NOTE(review): lookups for elements that are legitimately absent
# may then take up to that long to give up — confirm the value is intended.
driver.implicitly_wait(20)

In [None]:
# Collect the matching threads of every topic. Each topic is handled in its
# own try block so that a failure in one topic is logged and skipped instead
# of discarding all remaining topics.
total_products = []
for link in topic_products:
    try:
        total_products.extend(get_products_from_topic(driver, link))
    except Exception as e:
        logger.error(f'Get all product: {e}')

AMT-dongle
ART (Android Root Tool)
Link 1: Samsung M215F Binary 3 FRP Remove via Test Mode method 
https://forum.gsmhosting.com/vbb/f1134/samsung-m215f-binary-3-frp-remove-via-test-mode-method-3321538/ 
07-21-2024

Android Multi Tool (AMT)
7ICE Team
Mkey - Modem Unlock Key  
Avengers Box
Link 1: Avengers Box / UMT Pro Samsung Module v0.6 Released - [08/10/2024] 
https://forum.gsmhosting.com/vbb/f820/avengers-box-umt-pro-samsung-module-v0-6-released-08-10-2024-a-3341252/ 
10-08-2024

Link 2: Avengers Box / UMT Pro Samsung Module v0.5 Released - [07/10/2024] 
https://forum.gsmhosting.com/vbb/f820/avengers-box-umt-pro-samsung-module-v0-5-released-07-10-2024-a-3340904/ 
10-19-2024

Link 3: Avengers Box / UMTPro Samsung Module v0.4 Released - [14/08/2024] 
https://forum.gsmhosting.com/vbb/f820/avengers-box-umtpro-samsung-module-v0-4-released-14-08-2024-a-3328613/ 
08-14-2024

Link 4: Avengers Box / UMTPro Samsung Tool v0.3 Released - [11/08/2024] 
https://forum.gsmhosting.com/vbb/f820/aveng

### Get comments

In [11]:
def get_comments_by_link(driver, url='', max_comments=5):
    """Return the text of the posts at the end of a topic's last page.

    Opens ``url``, follows the "Last" pagination link when one is found, then
    reads the post tables on the resulting page. Only the final
    ``max_comments`` posts of the page are read, and they are returned in
    reverse page order (the bottom-most post first).

    Args:
        driver: Selenium WebDriver instance.
        url: URL of the topic.
        max_comments: Maximum number of posts to return (default 5).

    Returns:
        list of str: Text of each post whose message element could be read.
        Posts whose message element is missing are skipped.
    """
    driver.get(url)

    # Navigate to the last page of the topic when a "Last" link exists.
    # NOTE(review): with an implicit wait configured on the driver, this lookup
    # may block for the full wait on single-page topics.
    try:
        last_page_a = driver.find_element(By.XPATH, "//a[@class='smallfont' and contains(text(), 'Last ')]")
        last_page_a.click()
        sleep(3)
    except Exception:
        print("Only 1 page of comments found")

    # Select every post table and take the tail in Python. A CSS
    # ':nth-last-of-type(-n+5)' counts sibling <table> elements rather than
    # matches of the attribute selector, so it cannot express "last 5 posts".
    post_id_pattern = re.compile(r'post(\d+)')   # ex: post14872555 -> 14872555
    posts = []
    for table in driver.find_elements(By.CSS_SELECTOR, "table[id^='post']"):
        id_match = post_id_pattern.fullmatch(table.get_attribute("id") or '')
        if id_match:
            posts.append((table, id_match.group(1)))

    tail = posts[-max_comments:] if max_comments > 0 else []

    comments = []
    for table, comment_id in reversed(tail):
        try:
            content = table.find_element(By.ID, f"post_message_{comment_id}").text
        except Exception:
            # A post without a readable message element is skipped rather than
            # failing the whole topic.
            continue
        comments.append(content)

    return comments

In [None]:
# Fetch the comments of every collected thread. `comments` is kept aligned
# with `total_products` (one entry per thread): when a thread fails, the error
# is logged and an empty list is stored so later threads are still fetched.
comments = []
for i, post in enumerate(total_products):
    try:
        print(f"Post {i}: {post['link']}")
        comments.append(get_comments_by_link(driver, post['link']))
    except Exception as e:
        logger.error(f'Get data by link: {e}')
        comments.append([])

Thread 0: https://forum.gsmhosting.com/vbb/f1134/samsung-m215f-binary-3-frp-remove-via-test-mode-method-3321538/
Only 1 page of comments found
Thread 1: https://forum.gsmhosting.com/vbb/f820/avengers-box-umt-pro-samsung-module-v0-6-released-08-10-2024-a-3341252/
Only 1 page of comments found
Thread 2: https://forum.gsmhosting.com/vbb/f820/avengers-box-umt-pro-samsung-module-v0-5-released-07-10-2024-a-3340904/
Only 1 page of comments found
Thread 3: https://forum.gsmhosting.com/vbb/f820/avengers-box-umtpro-samsung-module-v0-4-released-14-08-2024-a-3328613/
Only 1 page of comments found
Thread 4: https://forum.gsmhosting.com/vbb/f820/avengers-box-umtpro-samsung-tool-v0-3-released-11-08-2024-a-3327543/
Only 1 page of comments found
Thread 5: https://forum.gsmhosting.com/vbb/f820/mobicel-legend-pro-simunlock-done-3323729/
Only 1 page of comments found
Thread 6: https://forum.gsmhosting.com/vbb/f898/24-10-2024-vivo-exynos-support-samsung-qualcomm-improvements-2819592/
Thread 7: https://foru

In [None]:
# Column names shared by the row dicts in `data_output` and by the exported
# Excel sheet.
COL_TYPE = 'Type'
COL_LINK = 'Link'
COL_PUBLISHED = 'Published at'
COL_TITLE = 'Title'
COL_CONTENT = 'Content'
COL_SUMMARY = 'Summary'

In [None]:
# collect data
MIN_WORDS = 20
data_output = []

for i, post in enumerate(total_products):
    comment = ''.join(comments[i][0:5])
    row = {
        COL_TYPE:'gsm',
        COL_LINK: post["link"],
        COL_PUBLISHED: post["creatAt"], 
        COL_TITLE: post["title"], 
        COL_CONTENT: comment
    }
    if len(comment) >= MIN_WORDS:
        data_output.append(row)

In [None]:
# Display the collected comment lists (one list per thread) for inspection.
comments

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [None]:
# Crawling is finished; shut the driver down. The cells below no longer use it.
driver.quit()

### Genai

In [None]:
# Load the project's genai.py helper the same way as the web driver module and
# take its Genai class.
genai_module = get_module('common', 'genai.py')
Genai = genai_module.Genai

In [None]:
# Create the Genai helper whose search() method is used below to summarise
# thread content. NOTE(review): what the constructor loads or connects to is
# defined in genai.py — confirm its cost before re-running.
genai = Genai()

KeyboardInterrupt: 

In [None]:
# Display the rows collected so far (before summaries are attached).
data_output

[{'Type': 'gsm',
  'Link': 'https://forum.gsmhosting.com/vbb/f820/avengers-box-umtpro-samsung-module-v0-4-released-14-08-2024-a-3328613/',
  'Published at': '08-14-2024',
  'Title': 'Avengers Box / UMTPro Samsung Module v0.4 Released - [14/08/2024]',
  'Content': 'Added Samsung models\nGalaxy A52 SM-A525M BIT6\nGalaxy Z Flip6 5G SM-F741B BIT1\nGalaxy Z Fold 5G SM-F946B BIT3\nGalaxy S21 FE 5G SM-G990U BITC\nGalaxy S21 FE SM-G990U2 BITA\nGalaxy S22 Ultra 5G SM-S908E BITA\nGalaxy S23 Ultra 5G SM-S918B BIT6\nGalaxy S24 Ultra 5G SM-S928B BIT3\nGalaxy S24 Ultra 5G SM-S928N BIT2\nGalaxy A05s SM-A057F BIT3\nGalaxy S21 FE 5G SM-G990B2 BIT7\nGalaxy S21 5G SM-G991U BIT10\nGalaxy M23(Buddy 2) 5G 2022 SM-M236L BIT6\nGalaxy M23(Buddy 2) 5G 2022 SM-M236L BIT3\nGalaxy M23(Buddy 2) 5G 2022 SM-M236L BIT4\nGalaxy M23(Buddy 2) 5G 2022 SM-M236L BIT5\nGalaxy M23(Buddy 2) 5G 2022 SM-M236L\nGalaxy Note 10+ SM-N975U BIT8\nGalaxy S22 Ultra 5G SM-S908U BIT6\nGalaxy Tab A7 10.4 (2020) SM-T500 BIT8\nGalaxy J6 Plus

### Summarize comments

In [None]:
# Summarise the content of every output row. `summaries` stays aligned with
# `data_output`: when a request fails, the error is logged and an empty
# summary is stored so later rows are still summarised.
summaries = []
for row in data_output:
    try:
        summary = genai.search(row[COL_CONTENT])
    except Exception as e:
        logger.error(f'Genai summary: {e}')
        summary = ''
    summaries.append(summary)
    # Pause between requests.
    sleep(1)

Summary: Samsung has added several new models to its lineup, including the Galaxy A52, Galaxy Z Flip6 5G, Galaxy Z Fold 5G, Galaxy S21 FE 5G, Galaxy S22 Ultra 5G, Galaxy S23 Ultra 5G, Galaxy S24 Ultra 5G, Galaxy A05s, Galaxy S21 FE 5G, Galaxy S21 5G, Galaxy M23Buddy 2 5G 2022, Galaxy Note 10, Galaxy Tab A7 104 2020, Galaxy J6 Plus, Galaxy M01, and Galaxy M11. These models come with various features such as reading information in EDL normal download mode, resetting FRP in EDL mode, resetting user data in EDL mode, flashing in download mode using normal TAR MD5 firmware, displaying partition information in download mode, and reading phone information and FRP state information in normal mode. However, it is important to note that there is still no NCK available at this time. If you have a UMT card, you can now activate NCK Box Premium v2 on it, allowing you to use it without any shipping costs or limitations. For more information, you can visit the official website, Avengers Box, and UMT 

In [None]:
# Attach a summary to every output row; rows beyond the end of `summaries`
# receive an empty string.
summary_count = len(summaries)
for i, row in enumerate(data_output):
    row[COL_SUMMARY] = ''.join(summaries[i]) if i < summary_count else ''

#### Save data

In [None]:
# Display the rows that are about to be exported.
data_output

[{'Type': 'gsm',
  'Link': 'https://forum.gsmhosting.com/vbb/f820/avengers-box-umtpro-samsung-module-v0-4-released-14-08-2024-a-3328613/',
  'Published at': '08-14-2024',
  'Title': 'Avengers Box / UMTPro Samsung Module v0.4 Released - [14/08/2024]',
  'Content': 'Added Samsung models\nGalaxy A52 SM-A525M BIT6\nGalaxy Z Flip6 5G SM-F741B BIT1\nGalaxy Z Fold 5G SM-F946B BIT3\nGalaxy S21 FE 5G SM-G990U BITC\nGalaxy S21 FE SM-G990U2 BITA\nGalaxy S22 Ultra 5G SM-S908E BITA\nGalaxy S23 Ultra 5G SM-S918B BIT6\nGalaxy S24 Ultra 5G SM-S928B BIT3\nGalaxy S24 Ultra 5G SM-S928N BIT2\nGalaxy A05s SM-A057F BIT3\nGalaxy S21 FE 5G SM-G990B2 BIT7\nGalaxy S21 5G SM-G991U BIT10\nGalaxy M23(Buddy 2) 5G 2022 SM-M236L BIT6\nGalaxy M23(Buddy 2) 5G 2022 SM-M236L BIT3\nGalaxy M23(Buddy 2) 5G 2022 SM-M236L BIT4\nGalaxy M23(Buddy 2) 5G 2022 SM-M236L BIT5\nGalaxy M23(Buddy 2) 5G 2022 SM-M236L\nGalaxy Note 10+ SM-N975U BIT8\nGalaxy S22 Ultra 5G SM-S908U BIT6\nGalaxy Tab A7 10.4 (2020) SM-T500 BIT8\nGalaxy J6 Plus

In [None]:
# Export the collected rows to today's workbook, one sheet per crawler and day.
# An existing workbook is opened in append mode and a sheet with the same name
# is replaced; otherwise a new workbook is created. Failures are logged.
try:
    columns = [COL_TYPE, COL_LINK, COL_PUBLISHED, COL_TITLE, COL_CONTENT, COL_SUMMARY]
    df = pd.DataFrame(data_output, columns=columns)

    today = datetime.today().date()
    file_path = f'..//output//output_{today}.xlsx'
    sheet_name = f'gsm_{today}'

    # Make sure the output directory exists so a first run does not fail.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    if os.path.exists(file_path):
        # Let pandas replace a same-named sheet instead of removing it by hand.
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_options = {'mode': 'w'}

    with pd.ExcelWriter(file_path, engine='openpyxl', **writer_options) as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

    logger.info(f'Export {len(data_output)} data successful')
except Exception as e:
    logger.error(f'Save data fail: {e}')