In [None]:
# Mount Google Drive under /content/drive; every later cell reads from and writes to
# paths below this mount point. force_remount=True is passed so the mount is redone
# even when a previous mount is still registered in this runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!wget -c "https://github.com/raingo/TGIF-Release/archive/master.zip" -P "/content/drive/MyDrive/TumblrGIFs"
zip_path = "/content/drive/My Drive/TumblrGIFs/master.zip"
!unzip "{zip_path}" -d "/content/drive/MyDrive/TumblrGIFs"
!ls "/content/drive/MyDrive/TumblrGIFs"

--2023-11-02 19:05:15--  https://github.com/raingo/TGIF-Release/archive/master.zip
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/raingo/TGIF-Release/zip/refs/heads/master [following]
--2023-11-02 19:05:15--  https://codeload.github.com/raingo/TGIF-Release/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 192.30.255.120
Connecting to codeload.github.com (codeload.github.com)|192.30.255.120|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘/content/drive/MyDrive/TumblrGIFs/master.zip’

master.zip              [      <=>           ]  11.82M  5.29MB/s    in 2.2s    

2023-11-02 19:05:17 (5.29 MB/s) - ‘/content/drive/MyDrive/TumblrGIFs/master.zip’ saved [12396861]



In [None]:
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import requests

# Shared tally of download outcomes; every counter starts at zero and is
# updated in place by the download functions.
download_counts = dict.fromkeys(
    ('total_downloads', 'failed_downloads', 'skipped_removed'),
    0,
)

# Content-Length values that cause a URL to be reported as "removed content".
# NOTE(review): presumably the sizes of the placeholder images served in place of
# deleted GIFs — confirm against the host before changing.
_REMOVED_CONTENT_LENGTHS = frozenset({3973, 3395})

# download_gif runs on several worker threads that share one counters dict;
# `d[key] += 1` is a read-modify-write, so it is serialized with this lock.
_DOWNLOAD_COUNTS_LOCK = threading.Lock()


def _bump_count(download_counts, key):
    """Increment ``download_counts[key]`` by one while holding the counters lock."""
    with _DOWNLOAD_COUNTS_LOCK:
        download_counts[key] += 1


def _discard_partial(partial_path):
    """Best-effort removal of a leftover partial download file."""
    try:
        os.remove(partial_path)
    except OSError:
        # Nothing was written yet, or the file cannot be removed right now.
        # Either way the final GIF path was never created, so it is safe to go on.
        pass


def download_gif(url, download_folder, download_counts, retries=3, timeout=30):
    """Download one GIF into ``download_folder`` unless it is already there.

    The file name is the last ``/``-separated component of ``url``. A HEAD request
    is made first; if its Content-Length matches one of the known "removed content"
    sizes the URL is skipped. Otherwise the body is streamed to a temporary
    ``<name>.part`` file and renamed to its final name only after the whole body
    has been written, so an interrupted transfer never leaves a truncated file
    that a later run would mistake for a finished download.

    Args:
        url: Address of the GIF.
        download_folder: Existing directory that receives the file.
        download_counts: Dict with the integer keys ``'total_downloads'``,
            ``'failed_downloads'`` and ``'skipped_removed'``; updated in place.
        retries: Maximum number of attempts before the download counts as failed.
        timeout: Timeout in seconds handed to every ``requests`` call so a stalled
            connection cannot block a worker indefinitely.

    Returns:
        None. Outcomes are reported through ``download_counts`` and printed messages.
    """
    gif_name = url.split('/')[-1]
    gif_path = os.path.join(download_folder, gif_name)

    if Path(gif_path).exists():
        return  # Skip if already downloaded

    partial_path = gif_path + '.part'

    for attempt in range(retries):
        try:
            head_response = requests.head(url, allow_redirects=True, timeout=timeout)
            if not head_response.ok:
                print(f"Failed to download (status code {head_response.status_code}): {gif_name}")
                continue

            try:
                content_length = int(head_response.headers.get('Content-Length', 0))
            except ValueError:
                content_length = 0  # Header present but not an integer

            if content_length in _REMOVED_CONTENT_LENGTHS:
                print(f"Skipped removed content: {gif_name}")
                _bump_count(download_counts, 'skipped_removed')
                return

            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code != 200:
                    print(f"Failed to download (status code {response.status_code}): {gif_name}")
                    continue
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # ignore empty keep-alive chunks
                            f.write(chunk)

            # Publish the file under its final name only once it is complete.
            os.replace(partial_path, gif_path)
            print(f"Downloaded {gif_name}")
            _bump_count(download_counts, 'total_downloads')
            return
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for {gif_name}: {e}")
            _discard_partial(partial_path)

    print(f"Failed to download after {retries} attempts: {gif_name}")
    _bump_count(download_counts, 'failed_downloads')

def download_all_gifs(tsv_path, download_folder, download_counts, max_workers=10):
    """Download every GIF listed in a TSV file using a pool of worker threads.

    The first tab-separated field of each non-blank line is taken as the GIF URL.
    Duplicate URLs are submitted only once so that two workers never write the
    same target file at the same time.

    Args:
        tsv_path: Path of the TSV file listing the GIFs.
        download_folder: Directory that receives the files; created if missing.
        download_counts: Dict with the integer keys ``'total_downloads'``,
            ``'failed_downloads'`` and ``'skipped_removed'``; handed to every
            ``download_gif`` call and updated in place.
        max_workers: Size of the thread pool.

    Returns:
        None. A summary of ``download_counts`` is printed at the end.
    """
    Path(download_folder).mkdir(parents=True, exist_ok=True)

    # Read the URL list up front so the TSV file is not held open for the whole
    # download; dict.fromkeys removes duplicates while keeping file order.
    with open(tsv_path, 'r') as file:
        urls = list(dict.fromkeys(
            line.strip().split('\t')[0] for line in file if line.strip()
        ))

    crashed = 0
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(download_gif, url, download_folder, download_counts): url
            for url in urls
        }

        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                # result() re-raises anything that escaped download_gif; without
                # this call such errors would vanish silently.
                future.result()
            except Exception as e:
                crashed += 1
                print(f"Unhandled error while downloading {url}: {e}")

    # All workers have finished once the executor block exits, so the shared
    # counter can be adjusted here without racing against them.
    download_counts['failed_downloads'] += crashed

    # Print out the final counts
    print(f"Total downloads: {download_counts['total_downloads']}")
    print(f"Failed downloads: {download_counts['failed_downloads']}")
    print(f"Skipped removed GIFs: {download_counts['skipped_removed']}")

# Locations on the mounted Drive: the TGIF TSV index and the folder that
# receives the downloaded GIFs.
tsv_path = '/content/drive/My Drive/TumblrGIFs/TGIF-Release-master/data/tgif-v1.0.tsv'
download_folder = '/content/drive/My Drive/TumblrGIFs/cleaned_gifs'

# Kick off the bulk download; the shared counters dict is updated in place.
download_all_gifs(
    tsv_path=tsv_path,
    download_folder=download_folder,
    download_counts=download_counts,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Downloaded tumblr_ndb4qpI4AV1sc377wo1_500.gif
Downloaded tumblr_nfg4ybyIMU1tc3kbpo1_250.gif
Downloaded tumblr_nmps4quOhz1r1pstjo1_400.gif
Downloaded tumblr_nes13br5Sk1r76160o1_500.gif
Downloaded tumblr_n9wjcxEfe01sqo9eko1_250.gif
Downloaded tumblr_ncmpi75fnV1s1s6foo3_r1_500.gif
Downloaded tumblr_n9n5d01I2q1tiu36po1_250.gif
Downloaded tumblr_nbuj95YP1L1soot9ho1_500.gif
Downloaded tumblr_nrl57jyCp61snzjsgo1_400.gif
Downloaded tumblr_nazqa1sP6r1tckhoro1_1280.gif
Downloaded tumblr_nbwjx31anF1rkfzaho1_400.gif
Downloaded tumblr_ncl4ftDRjE1tjmovwo1_250.gif
Downloaded tumblr_n8rfby6LNF1qapt05o1_500.gif
Downloaded tumblr_nffzhaqPyk1tkoeqqo1_500.gif
Downloaded tumblr_ngipx3UXAX1rnuooxo1_250.gif
Downloaded tumblr_nbzuhyMSAp1rj2d1ho1_250.gif
Downloaded tumblr_naf3n9tRJN1td8ijso1_500.gif
Downloaded tumblr_nrk5sacgG71ubnwu0o1_500.gif
Downloaded tumblr_nbj56iP7bu1s0r79po1_1280.gif
Downloaded tumblr_n9pdzrGI7S1tp0tv0o1_250.gif
Downloaded

OSError: ignored

In [None]:
import os

# Count the entries of the GIF folder whose names end in ".gif".
# NOTE(review): this folder is not the `cleaned_gifs` folder the download cell
# writes to — confirm which directory is meant to be counted.
gif_folder_path = '/content/drive/MyDrive/Video-to-Text/gifs'  # Update this path to your GIFs folder path
gif_files = list(
    filter(lambda entry: entry.endswith('.gif'), os.listdir(gif_folder_path))
)
print(f"Total downloaded GIFs: {len(gif_files)}")

OSError: ignored

In [None]:
import glob

# Imported here as well: this cell remounts Drive to recover from a failed
# session, so it must also work on a fresh runtime where the first cell has not
# been run (calling drive.mount without this import raised NameError).
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

# Count the files matching *.gif directly inside the GIF folder.
gif_folder_path = '/content/drive/MyDrive/Video-to-Text/gifs'  # Update this path to your GIFs folder path
gif_files = glob.glob(gif_folder_path + '/*.gif')
print(f"Total downloaded GIFs: {len(gif_files)}")

NameError: ignored

In [None]:
# Recover from a broken Drive session: unmount, mount again, then count the
# folder's entries from the shell instead of from Python.
from google.colab import drive
drive.flush_and_unmount()  # Flush pending writes, then unmount the drive
drive.mount('/content/drive', force_remount=True)  # Mount the drive again

# `ls | wc -l` prints one line per non-hidden directory entry, so this counts
# every entry in the folder, not only names ending in .gif.
!ls "/content/drive/MyDrive/Video-to-Text/gifs" | wc -l

Mounted at /content/drive
100669
