In [13]:
start = "https://data.commoncrawl.org/"

In [26]:
import sqlite3

# Connect to the database (or create it if it doesn't exist)
conn = sqlite3.connect('names_database.db')

# Create a cursor object
cursor = conn.cursor()

# Create the table
cursor.execute('''
  CREATE TABLE IF NOT EXISTS names (
    name TEXT PRIMARY KEY,
    hits INTEGER DEFAULT 1
  )
''')

# Commit the changes and close the connection
conn.commit()
conn.close()

In [14]:
import requests
import re

In [15]:
!zcat warc.paths.gz > urls_to_fetch.txt

In [23]:
import requests

def warc_paths():
  with open('urls_to_fetch.txt', 'r') as f:
    for line in f:
      url = line.strip()
      yield url

def fetch_and_save(url, filename):
  print(f"Fetching {url} and saving to {filename}")
  response = requests.get(url)
  if response.status_code == 200:
    response.raise_for_status()
    with open(filename, 'wb') as f:
      f.write(response.content)


for count, path in enumerate(warc_paths()):
  fetch_and_save(start + path, f"/content/temp/{count}.warc.gz")
  break

Fetching https://data.commoncrawl.org/crawl-data/CC-MAIN-2013-20/segments/1368696381249/warc/CC-MAIN-20130516092621-00000-ip-10-60-113-184.ec2.internal.warc.gz and saving to /content/temp/0.warc.gz


In [46]:
import re
import gzip
import os
from tqdm import tqdm
from time import time
import multiprocessing

regex = r"^[A-Z][a-z]+(ski|sky|ska)$"
names = re.compile(regex)

def process_line(line):
  match = names.search(line)
  if match:
    return match.group(0)
  else:
    return None

def find_matches_in_warc(warc_path):
  start = time()
  matches = []
  total_size = 0

  with gzip.open(warc_path, 'rt', encoding='utf-8', errors='replace') as f:
    with multiprocessing.Pool() as pool:
      results = list(tqdm(pool.imap(process_line, f), total=os.path.getsize(warc_path), desc="Processing lines"))
    matches = [match for match in results if match is not None]
    total_size = os.path.getsize(warc_path)

  end = time()
  search_time = end - start

  if matches:
    seconds_per_name = search_time / len(matches)
  else:
    seconds_per_name = 0

  print(f"Search time: {search_time}")
  print(f"Matches found: {len(matches)}")
  print(f"Seconds per name: {seconds_per_name}")
  print(f"Total data searched: {total_size} bytes")

  return matches

def find_matches_in_temp():
  all_matches = []
  for filename in os.listdir('/content/temp'):
    if filename.endswith('.warc.gz'):
      filepath = os.path.join('/content/temp', filename)
      matches = find_matches_in_warc(filepath)
      all_matches.extend(matches)
  return all_matches

In [None]:
# Example usage
all_matches = find_matches_in_temp()

Processing lines:   0%|          | 383154/783739200 [00:35<29:38:51, 7339.52it/s]

In [31]:
import sqlite3

def update_names_database(names):
  conn = sqlite3.connect('names_database.db')
  cursor = conn.cursor()

  for name in names:
    # Check if the name already exists
    cursor.execute("SELECT * FROM names WHERE name=?", (name,))
    existing_entry = cursor.fetchone()

    if existing_entry:
      # Update the hits count
      new_hits = existing_entry[1] + 1
      cursor.execute("UPDATE names SET hits=? WHERE name=?", (new_hits, name))
    else:
      # Insert a new entry
      cursor.execute("INSERT INTO names (name) VALUES (?)", (name,))

  conn.commit()
  conn.close()

In [32]:
update_names_database(all_matches)

In [33]:
import sqlite3

def get_name_data(name):
  conn = sqlite3.connect('names_database.db')
  cursor = conn.cursor()

  cursor.execute("SELECT * FROM names WHERE name=?", (name,))
  name_data = cursor.fetchone()

  conn.close()

  if name_data:
    return name_data
  else:
    return "Name not found!"

In [38]:
import sqlite3
import random

def get_random_names(count):
  conn = sqlite3.connect('names_database.db')
  cursor = conn.cursor()

  cursor.execute("SELECT name, hits FROM names ORDER BY RANDOM() LIMIT ?", (count,))
  random_names = cursor.fetchall()

  conn.close()

  return random_names

In [42]:
get_name_data('Latviski')

('Latviski', 5)

In [41]:
get_random_names(10)

[('Polanski', 1),
 ('Nebraska', 5),
 ('Hrvatski', 10),
 ('Srpski', 5),
 ('Slovenski', 5),
 ('Svenska', 10),
 ('Franziska', 1),
 ('Polski', 10),
 ('Chomsky', 1),
 ('Latviski', 5)]