In [None]:
import requests
import json
import os
import pandas as pd
# For parsing URLs:
from urllib.parse import quote_plus
import nltk
from langdetect import detect


In [None]:
def search_cc_index(url, index_name):
    """
    Search the Common Crawl Index for a given URL.
 
    This function queries the Common Crawl Index <a href="https://www.jcchouinard.com/api/">API</a> to find records related to the specified URL. 
    It uses the index specified by `index_name` to retrieve the data and returns a list of JSON objects, 
    each representing a record from the index.
 
    Arguments:
        url (str): The URL to search for in the Common Crawl Index.
        index_name (str): The name of the Common Crawl Index to search (e.g., "CC-MAIN-2024-10").
 
    Returns:
        list: A list of JSON objects representing records found in the Common Crawl Index. 
              Returns None if the request fails or no records are found.
 
    Example:
        >>> search_cc_index("example.com", "CC-MAIN-2024-10")
        [{...}, {...}, ...]
    """
    encoded_url = quote_plus(url)
    index_url = f'http://index.commoncrawl.org/{index_name}-index?url={encoded_url}&output=json'
    response = requests.get(index_url)
 
    if response.status_code == 200:
        records = response.text.strip().split('\n')
        return [json.loads(record) for record in records]
    else:
        return None
 

In [None]:
msmarco = pd.read_table("../data/external/msmarco-tiny.tsv", header=None)


In [None]:
# The URL you want to look up in the Common Crawl index
target_urls = msmarco[1].to_list()[:20]

# list of indexes https://commoncrawl.org/get-started
indexes  = ['CC-MAIN-2020-10', 'CC-MAIN-2021-31'] #['CC-MAIN-2022-05', 'CC-MAIN-2024-33','CC-MAIN-2024-30','CC-MAIN-2024-26']
 
record_dfs = []
for target_url in target_urls:
    # Fetch each index and store into a dataframe
    for index_name in indexes:
        print('Running: ', index_name)
        records = search_cc_index(target_url,index_name)
        record_df = pd.DataFrame(records)
        record_df['index_name'] = index_name
        record_dfs.append(record_df)
 
# Combine individual dataframes
all_records_df = pd.concat(record_dfs)
all_records_df = all_records_df.sort_values(by='index_name', ascending=False)
all_records_df = all_records_df.reset_index()
 
# Create columns where to store data later
all_records_df['success_status'] = 'not processed'
all_records_df['html'] = ''

In [None]:
all_records_df

In [None]:
df = tfds.as_dataframe(ds.take(10), ds_info)

In [5]:
from warcio.archiveiterator import ArchiveIterator
from bs4 import BeautifulSoup
import sys
import nltk
from langdetect import detect
import re
import pandas as pd

# Regular expression to detect non-Latin characters
non_latin_pattern = re.compile(r'[^\x00-\x7F]+')

allowed_domains = {'com', 'gov', 'edu', 'co', 'uk', 'net', 'mil', 'ai', 'ca'}

def is_english(text):
    try:
        return detect(text) == 'en' and not non_latin_pattern.search(text)
    except:
        return False

def is_latin_not_english(text):
    try:
        return detect(text) != 'en' and not non_latin_pattern.search(text)
    except:
        return False


def get_last_domain_part(url:str):
    return url.split("/")[2].split(".")[-1]

def is_error_response(input:str):
    block_words = {"404"}
    input = input.lower()
    words = input.split()
    for word in words:
        if word in block_words:
            return True
    if input.find("no response") >=0:
        return True
    if input.find("not found") >=0:
        return True
    return False
    

def extract_english_files(warc_file):
    count = 0

    results = []
    with open(warc_file, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == 'response' and 'text/html' in record.http_headers.get('Content-Type', ''):
                payload = record.content_stream().read()
                soup = BeautifulSoup(payload, 'html.parser')
                
                html_tag = soup.find('html')
                if html_tag and html_tag.get('lang', '').startswith('en'):
                    url = record.rec_headers.get('WARC-Target-URI')
                    if not get_last_domain_part(url) in allowed_domains:
                        continue
                    title_tag = soup.find('title')
                    title = title_tag.text.strip() if title_tag else None
                    if title is None or is_error_response(title):
                        continue
                    og_desc_tag = soup.find('meta', attrs={'property': 'og:description'})
                    description = og_desc_tag.get('content', '').strip() if og_desc_tag else None
                    if description is None:
                        meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
                        description = meta_desc_tag.get('content', '').strip() if meta_desc_tag else 'No Description'
                    
                    if not is_english(title):
                        continue
                    if count%20 == 0:
                        print(count)
                    count += 1
                    results.append({"url": url, "description": description, "title": title})
    return results

def extract_non_english_latin(warc_file):
    count = 0
    results = []
    with open(warc_file, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type == 'response' and 'text/html' in record.http_headers.get('Content-Type', ''):
                payload = record.content_stream().read()
                soup = BeautifulSoup(payload, 'html.parser')
                
                html_tag = soup.find('html')
                if html_tag and not html_tag.get('lang', '').startswith('en'):
                    url = record.rec_headers.get('WARC-Target-URI')
                    if not get_last_domain_part(url) in allowed_domains:
                        continue
                    title_tag = soup.find('title')
                    title = title_tag.text.strip() if title_tag else None
                    if title is None or is_error_response(title):
                        continue
                    og_desc_tag = soup.find('meta', attrs={'property': 'og:description'})
                    description = og_desc_tag.get('content', '').strip() if og_desc_tag else None
                    if description is None:
                        meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
                        description = meta_desc_tag.get('content', '').strip() if meta_desc_tag else 'No Description'
                    
                    if not is_latin_not_english(title):
                        continue
                    if count%20 == 0:
                        print(count)
                    count += 1
                    if count > 800:
                        break
                    results.append({"url": url, "description": description, "title": title})
    return results    



In [None]:
r = extract_english_files('/Users/Rrando/crawl/out/CC-MAIN-20250218081924-20250218111924-00893.warc.gz')
cc_corpus = pd.DataFrame(r)
cc_corpus.to_csv("../data/external/common_crawl.csv")

In [6]:
r = extract_non_english_latin('/Users/Rrando/crawl/out/CC-MAIN-20250218081924-20250218111924-00893.warc.gz')
cc_corpus = pd.DataFrame(r)
cc_corpus.to_csv("../data/external/common_crawl_non_english.csv")

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


0


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


20


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


40


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


60


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


80
100
120


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


140


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


160
180
200
220
240


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


260
280
300
320
340
360
380
400
420


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


440
460
480
500


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


520
540


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


560
580
600
620


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


640
660



Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(payload, 'html.parser')


680
700
720


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


740
760
780
800


In [10]:
cc_corpus.columns

Index(['url', 'description', 'title'], dtype='object')

In [None]:
Index(['Unnamed: 0.1', 'Unnamed: 0', 'input_titles', 'input_keywords',
       'input_description', 'output', 'output_orig', 'before_none'],
      dtype='object')

In [12]:
cc_corpus["input_titles"] = cc_corpus.title

In [23]:
cc_corpus["input_keywords"] = ""

In [24]:
cc_corpus["input_description"] = cc_corpus.description

In [25]:
cc_corpus.output = "None"

In [27]:
cc_corpus = cc_corpus.drop(['description', 'input_keywords', 'title'], axis=1)

In [29]:
cc_corpus["output"] = "None"

In [31]:
cc_corpus.to_csv("common_corpus_noise_none_3_12.csv")

In [None]:
get_last_domain_part("https://cnn.com")

In [11]:
cc_corpus

Unnamed: 0,url,description,title
0,http://24bet-ind.com/easy-access-to-24-betting...,No Description,game guide - 24 betting
1,http://4strokeswimming.co.uk/,No Description,4strokeswimming.co.uk
2,http://8xxx.net/category/ChangingRoom/,,ChangingRoom
3,http://agencia-mexico.com/details.php?gid=3036...,"Isabel Madow, Mark Tacher, MarÃ­a Rojo, Julio ...",ISABEL MADOW + POSADA - CLAQUETAZO CINTA LA PRIMA
4,http://alpha.2findlocal.com/sitemap_2_14067.html,Free Advertising on 2FINDLOCAL.COM,Sitemap | 2FINDLOCAL.COM
...,...,...,...
795,https://www.dekowerk.net/online-shop/carr%C3%A...,Unsere Produktlinie CARRÉ bietet Ihnen eine um...,Maison - dekowerk | Premium Design-Artikel aus...
796,https://www.delfiero.net/eshop/scheda.asp?id=1...,No Description,Delfiero s.r.l.
797,https://www.deporclub.com/horarios/gimnasia-2,Horario Gimnasia 2,Horario Gimnasia 2
798,https://www.derooysteeldoors.com/voorbeelden-s...,In deze woning in Ede hebben we in de woonkame...,Eenheid door stalen taatsdeuren - De Rooy Stee...
