In [1]:
!pip install internetarchive

Defaulting to user installation because normal site-packages is not writeable
Collecting internetarchive
  Downloading internetarchive-3.5.0.tar.gz (102 kB)
     |████████████████████████████████| 102 kB 10.7 MB/s           
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing wheel metadata (pyproject.toml) ... [?25ldone
Collecting schema>=0.4.0
  Downloading schema-0.7.5-py2.py3-none-any.whl (17 kB)
Collecting jsonpatch>=0.4
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting docopt<0.7.0,>=0.6.0
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting jsonpointer>=1.9
  Downloading jsonpointer-2.4-py2.py3-none-any.whl (7.8 kB)
Collecting contextlib2>=0.5.5
  Downloading contextlib2-21.6.0-py2.py3-none-any.whl (13 kB)
Building wheels for collected packages: internetarchive, docopt
  Building wheel for internetarchive (pyproject.toml) ... [?2

In [7]:
import pandas as pd 
from uuid import uuid4
from datetime import datetime
from bs4 import BeautifulSoup
import internetarchive as ia

In [3]:
# Kept for compatibility with any later cell that may reference it; nothing in
# this cell fills it.
file_links = []

# Identifier of the Archive.org collection
collection_identifier = "schwurbel-archiv"

# Search for items in the collection
items = ia.search_items(f'collection:{collection_identifier}')

# One bookkeeping record per item of the collection. "status" starts as
# "Added" and is advanced by the processing loop; the remaining fields are
# placeholders. The last key is "crawl_date" so that it matches the column the
# processing loop writes to (it was previously spelled "crawl_data", which left
# an always-empty column behind and created a second one on the first update).
data = [
    {
        "ID": uuid4(),
        "identifier": item['identifier'],
        "status": "Added",
        "downloaded": "",
        "group_name": "",
        "crawl_date": ""
    }
    for item in items
]


In [5]:
import pandas as pd 

# Build the bookkeeping table from the list of per-item records.
df = pd.DataFrame(data)

# Write without the pandas index, exactly like update_status() does when it
# rewrites this same file; otherwise the file's columns would differ depending
# on whether a status update has already happened.
df.to_csv('2023-07-06-Schwurbelarchiv-Liste.csv', index=False)

In [6]:
def update_status(index, status, csv_path='2023-07-06-Schwurbelarchiv-Liste.csv'):
    """Set the status of one row in the global ``df`` and persist the table.

    Args:
        index: Index label of the row in ``df`` to update.
        status: New value for that row's ``status`` column.
        csv_path: CSV file the whole DataFrame is written to (without the
            pandas index). Defaults to the notebook's item list file.
    """
    df.at[index, 'status'] = status
    # The complete table is rewritten on every call so the file on disk always
    # reflects the latest progress, even if a later step fails.
    df.to_csv(csv_path, index=False)

In [8]:
def _inner_html(tag):
    """Return the serialized children of ``tag``, or "" when ``tag`` is None."""
    if tag is None:
        return ""
    return ''.join(str(child) for child in tag.contents)


def _extract_parts(container):
    """Return ``(text_html, photo_src, video_duration)`` found inside ``container``."""
    text_div = container.find('div', class_='text')
    image = container.find('img')
    duration_div = container.find('div', class_='video_duration')
    return (
        _inner_html(text_div),
        image.get('src', "") if image is not None else "",
        duration_div.get_text(strip=True) if duration_div is not None else "",
    )


def scrape_content(filepath):
    """Parse an exported HTML chat file into a DataFrame with one row per message.

    Every ``div.message`` that contains a ``div.from_name`` produces one row;
    message blocks without a sender name are skipped.

    Args:
        filepath: Path of the HTML file; it is read as UTF-8.

    Returns:
        pandas.DataFrame with the columns ``ID`` (a fresh uuid4 per row),
        ``From Name``, ``Datetime``, ``Message Text``, ``Photo``,
        ``Video Duration``, ``Forwarded From Name``,
        ``Forwarded Message Text``, ``Forwarded Photo`` and
        ``Forwarded Video Duration``. Parts that are absent from a message are
        stored as empty strings, an absent date as ``None``.

    Raises:
        ValueError: If a date title does not match ``%d.%m.%Y %H:%M:%S``.
    """
    # Read the HTML file
    with open(filepath, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Create a Beautiful Soup object
    soup = BeautifulSoup(html_content, 'lxml')

    data = []

    # Iterate over the messages
    for message_div in soup.find_all('div', class_='message'):
        from_name_div = message_div.find('div', class_='from_name')
        if from_name_div is None:
            continue
        from_name = from_name_div.get_text(strip=True)

        # All forwarded fields default to "" and are only filled when the
        # message contains a forwarded body.
        forwarded_from_name = ""
        forwarded_message_text = ""
        forwarded_message_photo = ""
        forwarded_video_duration = ""

        forwarded_div = message_div.find('div', class_='forwarded body')
        if forwarded_div is not None:
            forwarded_from_name_div = forwarded_div.find('div', class_='from_name')
            if forwarded_from_name_div is not None:
                forwarded_from_name = forwarded_from_name_div.get_text(strip=True)
                # The sender div may embed a "details" span; strip its text so
                # only the name remains.
                details_span = forwarded_from_name_div.find('span', class_='details')
                if details_span is not None:
                    forwarded_from_name = forwarded_from_name.replace(
                        details_span.get_text(strip=True), ''
                    )

            (
                forwarded_message_text,
                forwarded_message_photo,
                forwarded_video_duration,
            ) = _extract_parts(forwarded_div)

        # Text, photo and video duration are looked up independently, so a
        # message without text still keeps its photo/duration and never
        # inherits values from the previously processed message.
        # NOTE(review): find() searches all descendants, so for a forwarded
        # message these lookups can also match elements inside the
        # "forwarded body" div - confirm whether that duplication is wanted.
        message_text, message_photo, video_duration = _extract_parts(message_div)

        date_element = message_div.find('div', class_='pull_right date details')
        date_title = date_element.get('title') if date_element is not None else None
        datetime_object = (
            datetime.strptime(date_title, '%d.%m.%Y %H:%M:%S') if date_title else None
        )

        # Append the extracted data to the list
        data.append({
            'ID': uuid4(),
            'From Name': from_name,
            'Datetime': datetime_object,
            'Message Text': message_text,
            'Photo': message_photo,
            'Video Duration': video_duration,
            'Forwarded From Name': forwarded_from_name,
            'Forwarded Message Text': forwarded_message_text,
            'Forwarded Photo': forwarded_message_photo,
            'Forwarded Video Duration': forwarded_video_duration
        })

    # Create a pandas DataFrame from the extracted data
    return pd.DataFrame(data)

In [11]:
import zipfile
import os
import re
import shutil

In [None]:
temp_folder_path = "tmp"

# The per-chat CSV files are written into "data/"; create the folder up front
# so the first to_csv() call cannot fail because it is missing.
os.makedirs("data", exist_ok=True)

# NOTE: only the slice [11:20] of the item list is processed by this cell.
for index, row in df[11:20].iterrows():
    identifier = row['identifier']
    status = row['status']
    if status == 'Done':
        continue

    # Remove archives left over from a previous item. os.remove() does not
    # expand wildcards, so every matching file is deleted individually.
    for leftover in os.listdir("."):
        if (
            leftover.startswith("export")
            and leftover.endswith(".zip")
            and os.path.isfile(leftover)
        ):
            os.remove(leftover)

    update_status(index, 'Downloading')

    item = ia.get_item(identifier)

    # The item title carries both the group name and the crawl date.
    name = item.metadata.get('title') or ""

    # Extract the group name (the text between the quotation marks)
    name_pattern = r'„(.*?)“'
    name_match = re.search(name_pattern, name)
    group_name = name_match.group(1) if name_match else ""
    df.at[index, 'group_name'] = group_name

    # Extract the crawl date; the column is only updated when the title
    # actually contains one, because parsing an empty string would raise.
    date_pattern = r'vom (\d{2}\.\d{2}\.\d{4})'
    date_match = re.search(date_pattern, name)
    if date_match:
        df.at[index, 'crawl_date'] = datetime.strptime(date_match.group(1), "%d.%m.%Y")

    print(f"Working on {identifier} -- {group_name}")

    download_files = []
    for f in ia.get_files(identifier):
        if f.format == "ZIP":
            download_files.append(f)
            print(f"Downloading {f.name}")
            f.download()

    if not download_files:
        # Nothing to extract: record that and move on instead of failing on a
        # ZIP file that was never downloaded.
        update_status(index, 'No ZIP found')
        print(f"No ZIP file found for {identifier}. Skipping.")
        continue

    # A single archive is named export.zip; multi-part exports start with
    # export-part1.zip, which is the part that gets extracted.
    if len(download_files) == 1:
        filename = "export.zip"
    else:
        filename = "export-part1.zip"

    update_status(index, 'Extracting')

    print("Downloaded. Extracting Now.")

    # Start from an empty temp folder so files from an earlier, failed item
    # cannot be scraped by mistake.
    if os.path.exists(temp_folder_path):
        shutil.rmtree(temp_folder_path)

    try:
        # Extract the archive into the temp folder
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(temp_folder_path)

        update_status(index, 'Scraping')
        scraped_df = scrape_content(f"{temp_folder_path}/messages.html")
        scraped_df.to_csv(f"data/2023-07-Telegram-{row['ID']}.csv")
    finally:
        # Remove the temp folder even when extraction or scraping failed
        if os.path.exists(temp_folder_path):
            shutil.rmtree(temp_folder_path)

    if len(download_files) > 1:
        if os.path.exists(identifier):
            shutil.rmtree(identifier)

    update_status(index, 'Done')

print("Processing complete.")

Working on schwurbelarchiv-06hyV5GgkR -- Siegfried Daebritz
Downloading export-part1.zip
