SCRIPT: Get lang of the website based on the source URL.

---
Script sends a request and based on the server responses the python lib 'langdetect' and its module 'detect' analyses a response body return site language.
- Script works in async mode.
- Script result execution is strong in CSV file and format.
- CSV headers: "Web Site URL", "Site Lang", "Eng_Yes_No", "Comment", "URL Valid"
- Script cath the exceptions:
  - SSL certificate verification
  - invalid URL
  - status code of server response

In [2]:
!pip install aiohttp langdetect nest_asyncio pandas


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993224 sha256=48ed5175b5fb6f9ce598d99ef7130c259760b8bfbed0f0ab074a41f9651d28fb
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [8]:
import asyncio
import aiohttp
import nest_asyncio
import pandas as pd
from langdetect import detect
from urllib.parse import urlparse, urlunparse
from aiohttp import ClientConnectorCertificateError

nest_asyncio.apply()

# Function to detect language, check if it's English, and validate the URL
async def detect_language(url):
    try:
        # Check if the URL has "http://" or "https://", and add "https://" if missing
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Validate the URL
        parsed_url = urlparse(url)
        if not all([parsed_url.scheme, parsed_url.netloc]):
            return "n/a", 0, "URL Invalid"

        async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=5)) as session:
            async with session.get(url) as response:
                if response.status == 200:
                    try:
                        text = await response.text()
                        detected_language = detect(text)
                        eng_yes_no = 1 if detected_language == 'en' else 0
                        comment = "OK"  # Site responded within 5 seconds
                    except Exception as e:
                        detected_language = "Error"
                        eng_yes_no = 0
                        comment = f"Error parsing content: {str(e)}"
                else:
                    detected_language = "Error"
                    eng_yes_no = 0
                    comment = f"HTTP Error {response.status}"  # Site returned a non-200 status code
    except asyncio.TimeoutError:
        detected_language = "n/a"
        eng_yes_no = 0
        comment = "n/a"  # Site did not respond within 5 seconds
    except ClientConnectorCertificateError as cert_error:
        # Handle SSLCertVerificationError by marking the URL as invalid
        detected_language = "n/a"
        eng_yes_no = 0
        comment = "SSL Verification Error"  # SSL certificate verification failed
    except Exception as conn_error:
        # Handle other client-related errors, including hostname resolution errors
        detected_language = "n/a"
        eng_yes_no = 0
        comment = "Client Error"  # An error occurred during the request

    return detected_language, eng_yes_no, comment

# Function to read URLs from a CSV file and return them as a list
def read_urls_from_csv(file_path):
    try:
        df = pd.read_csv(file_path)
        if 'URL' in df.columns:
            return df['URL'].tolist()
        else:
            print("CSV file must have a 'URL' column")
    except Exception as e:
        print(f"Error reading CSV file: {str(e)}")
    return []

# Create an asyncio event loop
async def main():
    # Read URLs from the CSV file
    csv_file_path = '/content/urls_.csv'  # Change the path as needed
    list_lang = read_urls_from_csv(csv_file_path)

    tasks = [detect_language(site_url) for site_url in list_lang]
    results = await asyncio.gather(*tasks)

    data = []
    for (lang, eng_yes_no, comment), url in zip(results, list_lang):
        valid = 0 if comment == "URL Invalid" or comment == "SSL Verification Error" or comment == "Client Error" else 1
        data.append([url, lang, eng_yes_no, comment, valid])

    # Create a DataFrame with the data
    df = pd.DataFrame(data, columns=["Web Site URL", "Site Lang", "Eng_Yes_No", "Comment", "URL Valid"])

    # Set option to display all rows
    pd.set_option("display.max_rows", None)

    # Format and center-align the columns
    df["Web Site URL"] = df["Web Site URL"].apply(lambda x: f"{x:>{max(len(x), 20)}}")
    df["Site Lang"] = df["Site Lang"].apply(lambda x: f"{x:^10}")
    df["Eng_Yes_No"] = df["Eng_Yes_No"].apply(lambda x: f"{x:^10}")
    df["Comment"] = df["Comment"].apply(lambda x: f"{x:^20}")
    df["URL Valid"] = df["URL Valid"].apply(lambda x: f"{x:^10}")

    # Print the formatted table
    print(df)

    # Save the DataFrame to a CSV file
    df.to_csv("website_data.csv", index=False)  # Change the filename as needed

if __name__ == "__main__":
    asyncio.run(main())

                                Web Site URL   Site Lang  Eng_Yes_No  \
0                                    welt.de     n/a          0        
1                                destatis.de     n/a          0        
2                                      mz.de     n/a          0        
3                                  kurier.at      en          1        
4                               7news.com.au      en          1        
5                                 abc.net.au     n/a          0        
6                             skynews.com.au     n/a          0        
7                                  report.az     n/a          0        
8                                   youtu.be     n/a          0        
9                                 bgonair.bg     n/a          0        
10                             belnovosti.by     n/a          0        
11                                 axios.com     n/a          0        
12                                   bbc.com     n/a          0 

In [None]:
import os

# Get the current working directory
current_directory = os.getcwd()

# Define the filename you used when saving the CSV file
filename = "website_data.csv"  # Change this to the actual filename if different

# Construct the full path to the CSV file
full_path = os.path.join(current_directory, filename)

# Print the full path
print("Full Path:", full_path)

Full Path: /content/website_data.csv
