In [1]:
"""
This module provides functionality to download images from Imgur. It supports the following types of Imgur URLs:
1. Direct image links (i.imgur.com)
2. Post links (imgur.com/...)
3. Albums and galleries (imgur.com/a/... or imgur.com/gallery/...)

Functions:
- clasify_url: Classifies an Imgur URL as an image, post, or album/gallery.
- find_image_url_extension: Determines the correct extension for an Imgur image URL.
- download_imgur_image: Downloads an image from a direct Imgur URL.
- download_imgur_post: Converts a post URL to a direct image URL and downloads it.
- get_imgur_album_info: Retrieves image information from an album or gallery URL.
- download_imgur_album: Downloads images from an Imgur album or gallery.
"""

"\nThe purpose of this notebook is to write a function that can download most images from imgur.\n\nApproach:\nThere are three main types of imgur urls:\nimage links directly (i.imgur)\npost links (imgur.com/...)\nalbums & galleries (imgur.com/a/.. or imgur.com/gallery/...)\n\n\nNotes:\ndf_imgur[df_imgur['id']=='dtbz10u']]\nthis is a gif!\n"

In [2]:
import re
import os
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

folder = "test_download_images"

def clasify_url(imgur_link:str):
    '''
    This function takes in a url as input and outputs its type
    '''
    if 'i.imgur.com' in imgur_link:
        return 'image'
    elif '/a/' in imgur_link or '/gallery/' in imgur_link:
        return 'album'
    elif 'imgur.com' in imgur_link:
        return 'post'
    return -1

def find_image_url_extension(imgur_url: str) -> str:
    '''
    For finding the correct extension.
    '''
    match = re.search(r'\.([a-zA-Z0-9]+)(?:\?.*)?$', imgur_url)
    if match:
        return match.group(1)
    return 'jpeg' #in case we dont find an extension


In [3]:
test_cases = ['https://i.imgur.com/zPghwK5.png',
'https://i.imgur.com/WbUZKUQ.jpg',
'https://i.imgur.com/WbUZKUQ.j',
'https://i.imgur.com/WbUZKUQ.',
'https://i.imgur.com/FMVFDWq.beep.boop',
'https://i.imgur.com/4Q6gk06.beepboop',
'https://i.imgur.com/cnx5J3q.beep']

results = [find_image_url_extension(url) for url in test_cases]
results

['png', 'jpg', 'j', 'jpeg', 'boop', 'beepboop', 'beep']

In [4]:
def download_imgur_image(url: str, image_id: str, download_folder: str, verbose: bool = True):
    '''
    For downloading imgur URLs that are classified as images.
    '''
    headers = {'User-Agent': 'Mozilla/5.0'}
    retry_strategy = Retry(
        total=3,  # Total number of retries
        status_forcelist=[429, 500, 502, 503, 504],  # Status codes to retry
        backoff_factor=2  # Wait 2, 4, 8 seconds between retries
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    http = requests.Session()
    http.mount("https://", adapter)
    http.mount("http://", adapter)
    status = 'Not Attempted'

    try:
        response = http.get(url, headers=headers, stream=True)
        if response.status_code == 200:
            content_type = response.headers.get('Content-Type', '')
            if 'image' in content_type:
                status = 'success'
                extension = find_image_url_extension(url)
                filename = f"{image_id}.{extension}"  # Use the correct extension
                filepath = os.path.join(download_folder, filename)
                
                with open(filepath, 'wb') as out_file:
                    out_file.write(response.content)
                    
                # Check if we got the placeholder 404 image:
                downloaded_image = Image.open(filepath)
                placeholder_image = Image.open('/Users/petrsushko/Desktop/placeholder.jpg')
                if list(downloaded_image.getdata()) == list(placeholder_image.getdata()):
                    os.remove(filepath)
                    status = 'Failure: Placeholder image downloaded'
                    
            else:
                status = 'Failure: Not an image'
        else:
            status = f"Error {response.status_code}"
    except Exception as e:
        status = 'Failure: ' + str(e)

    if url == 'unrecovered':
        status = 'URL wasnt recovered'
    if verbose:
        print(image_id, status)
    return status

In [5]:
# Ensure the folder exists
folder = "test_download_images"
os.makedirs(folder, exist_ok=True)

# Test Scenario 1: Successful Image Download
image_url = 'https://i.imgur.com/6a1ORz9.jpg'
image_id = 'valid_image'
status = download_imgur_image(image_url, image_id, folder, verbose=True)
print('Test 1:', status)  # Should print the filename and 'success'

# Test Scenario 2: Invalid Image URL
image_url = 'https://i.imgur.com/6a1ORz96a1ORz9.jpg'
image_id = 'invalid_image'
status = download_imgur_image(image_url, image_id, folder, verbose=True)
print('Test 2:', status)  # Should print 'Error 404'

# Test Scenario 3: URL Returning Non-Image Content
image_url = 'https://i.imgur.com/non_image_content'
image_id = 'non_image'
status = download_imgur_image(image_url, image_id, folder, verbose=True)
print('Test 3:', status)  # Should print 'Failure: Not an image'

# Test Scenario 4: Placeholder Image
image_url = 'https://i.imgur.com/XOcqMti.jpg'
image_id = 'placeholder_image'
status = download_imgur_image(image_url, image_id, folder, verbose=True)
print('Test 4:', status)  # Should print 'Failure: Placeholder image downloaded'

# Test Scenario 5: Valid Image with Different Extension
image_url = 'https://i.imgur.com/6a1ORz9.png'
image_id = 'valid_png_image'
status = download_imgur_image(image_url, image_id, folder, verbose=True)
print('Test 5:', status)  # Should print the filename and 'success'


valid_image success
Test 1: success
invalid_image Failure: Not an image
Test 2: Failure: Not an image
non_image Failure: Not an image
Test 3: Failure: Not an image
placeholder_image Failure: Placeholder image downloaded
Test 4: Failure: Placeholder image downloaded
valid_png_image success
Test 5: success


In [6]:
def download_imgur_post(url: str, image_id: str, download_folder: str, verbose: bool = True):
    '''
    This function changes the url to the url we can download directly.
    '''
    pattern = r'://imgur\.'
    if re.search(pattern, url):
        image_url = re.sub(pattern, '://i.imgur.', url, count=1)
        attempt = download_imgur_image(image_url + '.jpeg', image_id, download_folder, verbose=verbose)
        return attempt            
    return 'failed_to_convert_to_image_url'

In [7]:
# Test Scenario 1: Successful Image Download
url = 'https://imgur.com/XSbjmRz'
image_id = 'valid_image'
status = download_imgur_post(url, image_id, folder)
print('Test 1:', status)  # Expected: status indicating success or failure

# Test Scenario 2: Invalid Image URL
url = 'https://imgur.com/XSbjmRzXSbjmRz'
image_id = 'invalid_image'
status = download_imgur_post(url, image_id, folder)
print('Test 2:', status)  # Expected: 'Error 404' or similar

# Test Scenario 3: URL Returning Non-Image Content
url = 'https://imgur.com/gallery/me-gonna-just-format-this-then-im-done-microsoft-word-kXB3OFu#/t/gifs'
image_id = 'non_image'
status = download_imgur_post(url, image_id, folder)
print('Test 3:', status)  # Expected: 'Failure: Not an image'

# Test Scenario 4: Placeholder Image
url = 'https://imgur.com/XOcqMti'
image_id = 'placeholder_image'
status = download_imgur_post(url, image_id, folder)
print('Test 4:', status)  # Expected: 'Failure: Placeholder image downloaded'

valid_image success
Test 1: success
invalid_image Failure: Not an image
Test 2: Failure: Not an image
non_image Failure: Not an image
Test 3: Failure: Not an image
placeholder_image Failure: Placeholder image downloaded
Test 4: Failure: Placeholder image downloaded


In [8]:
result = download_imgur_image('https://i.imgur.com/XOcqMti.jpg', 'new_test', folder)
#https://i.imgur.com/XOcqMti.jpg	return placeholder
print(result)

new_test Failure: Placeholder image downloaded
Failure: Placeholder image downloaded


In [9]:
import requests
from bs4 import BeautifulSoup
import json

def get_imgur_album_info(album_url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    image_url = 'unrecovered'
    response = requests.get(album_url, headers=headers)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Get image URL
    meta_tag = soup.find('meta', property='og:image')
    image_url = meta_tag.get('content', None).split('?')[0]
    if str(image_url)=='':
        image_url = 'unrecovered'

    return image_url

In [23]:
def download_imgur_album(url: str, image_id: str, download_folder: str, verbose: bool = True):
    url_to_download = get_imgur_album_info(url)
    attempt = download_imgur_image(url_to_download , image_id, download_folder, verbose=verbose)
    return attempt            

In [11]:
#============================================================================================================================================

In [26]:
import pandas as pd
df_exploded = pd.read_pickle('restoration_comments_ready_to_download.pkl')
df_exploded.head(3)

Unnamed: 0,id,subreddit,author,body,parent_id,created_utc,score,extracted_urls,root,main_website,unique_id
0,ds0wih9,estoration,pshopper,https://www.dropbox.com/s/91elcy19l9apfns/p37b...,t3_7ncnsv,1514778073,5,https://www.dropbox.com/s/91elcy19l9apfns/p37b...,7ncnsv,www.dropbox.com,ds0wih9_1
1,ds10xp6,estoration,Vulfpek,https://imgur.com/a/VwOlc\n\nTried to fix it u...,t3_7n9xju,1514786512,3,https://imgur.com/a/VwOlc,7n9xju,imgur.com,ds10xp6_1
2,ds10xrc,estoration,imguralbumbot,"^(Hi, I'm a bot for linking direct images of a...",t1_ds10xp6,1514786516,1,https://np.reddit.com/message/compose/?to=imgu...,7n9xju,np.reddit.com,ds10xrc_1


In [27]:
len(df_exploded)

46819

In [28]:
df_imgur = df_exploded[df_exploded['main_website'].str.contains('imgur.com')]
df_imgur.loc[:, 'url_classification'] = df_imgur['extracted_urls'].apply(clasify_url)
df_imgur['url_classification'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_imgur.loc[:, 'url_classification'] = df_imgur['extracted_urls'].apply(clasify_url)


url_classification
image    7682
album    5990
post     4350
Name: count, dtype: int64

In [15]:
#checking the outputs manually, this seems good

#pd.set_option('display.max_rows', None)
#df_imgur[['extracted_urls','url_classification']].head(30)

In [30]:
pd.reset_option('display.max_rows')
df_imgur.head(50)

Unnamed: 0,id,subreddit,author,body,parent_id,created_utc,score,extracted_urls,root,main_website,unique_id,url_classification
1,ds10xp6,estoration,Vulfpek,https://imgur.com/a/VwOlc\n\nTried to fix it u...,t3_7n9xju,1514786512,3,https://imgur.com/a/VwOlc,7n9xju,imgur.com,ds10xp6_1,album
5,ds10xrc,estoration,imguralbumbot,"^(Hi, I'm a bot for linking direct images of a...",t1_ds10xp6,1514786516,1,https://i.imgur.com/6a1ORz9.jpg,7n9xju,i.imgur.com,ds10xrc_4,image
9,ds1azk7,estoration,elicaaaash,"I saw that no one had attempted these for you,...",t3_7nd02g,1514814847,2,https://i.imgur.com/XtCBDkK.jpg,7nd02g,i.imgur.com,ds1azk7_1,image
10,ds1azk7,estoration,elicaaaash,"I saw that no one had attempted these for you,...",t3_7nd02g,1514814847,2,https://imgur.com/a/nk8lL,7nd02g,imgur.com,ds1azk7_2,album
13,ds1om3p,estoration,manolietus,https://i.imgur.com/GuEKsgX.jpg,t3_7nehl9,1514834780,3,https://i.imgur.com/GuEKsgX.jpg,7nehl9,i.imgur.com,ds1om3p_1,image
15,ds1oyul,estoration,livethroughthis4,[Happy New Year.](https://i.imgur.com/Dmug1e5....,t3_7nehl9,1514835190,3,https://i.imgur.com/Dmug1e5.jpg,7nehl9,i.imgur.com,ds1oyul_1,image
18,ds1sk8y,estoration,dutchart,and my try: [Imgur](https://i.imgur.com/CaejlG...,t3_7ngujy,1514839450,2,https://i.imgur.com/CaejlGp.jpg,7ngujy,i.imgur.com,ds1sk8y_1,image
26,ds2o3c8,estoration,aangnesiac,Here's my go at it: https://i.imgur.com/XOcqM...,t3_7njgl3,1514885915,2,https://i.imgur.com/XOcqMti.jpg,7njgl3,i.imgur.com,ds2o3c8_1,image
29,ds2te1l,estoration,dutchart,My try: [Imgur](https://i.imgur.com/GP4YDna.jpg),t3_7njgl3,1514899262,1,https://i.imgur.com/GP4YDna.jpg,7njgl3,i.imgur.com,ds2te1l_1,image
30,ds2ydml,estoration,TheBrownieTitan,Here's my shot at it: https://imgur.com/a/34FV...,t3_7nd02g,1514906925,2,https://imgur.com/a/34FVZ,7nd02g,imgur.com,ds2ydml_1,album


In [31]:
df_imgur[:30]

Unnamed: 0,id,subreddit,author,body,parent_id,created_utc,score,extracted_urls,root,main_website,unique_id,url_classification
1,ds10xp6,estoration,Vulfpek,https://imgur.com/a/VwOlc\n\nTried to fix it u...,t3_7n9xju,1514786512,3,https://imgur.com/a/VwOlc,7n9xju,imgur.com,ds10xp6_1,album
5,ds10xrc,estoration,imguralbumbot,"^(Hi, I'm a bot for linking direct images of a...",t1_ds10xp6,1514786516,1,https://i.imgur.com/6a1ORz9.jpg,7n9xju,i.imgur.com,ds10xrc_4,image
9,ds1azk7,estoration,elicaaaash,"I saw that no one had attempted these for you,...",t3_7nd02g,1514814847,2,https://i.imgur.com/XtCBDkK.jpg,7nd02g,i.imgur.com,ds1azk7_1,image
10,ds1azk7,estoration,elicaaaash,"I saw that no one had attempted these for you,...",t3_7nd02g,1514814847,2,https://imgur.com/a/nk8lL,7nd02g,imgur.com,ds1azk7_2,album
13,ds1om3p,estoration,manolietus,https://i.imgur.com/GuEKsgX.jpg,t3_7nehl9,1514834780,3,https://i.imgur.com/GuEKsgX.jpg,7nehl9,i.imgur.com,ds1om3p_1,image
15,ds1oyul,estoration,livethroughthis4,[Happy New Year.](https://i.imgur.com/Dmug1e5....,t3_7nehl9,1514835190,3,https://i.imgur.com/Dmug1e5.jpg,7nehl9,i.imgur.com,ds1oyul_1,image
18,ds1sk8y,estoration,dutchart,and my try: [Imgur](https://i.imgur.com/CaejlG...,t3_7ngujy,1514839450,2,https://i.imgur.com/CaejlGp.jpg,7ngujy,i.imgur.com,ds1sk8y_1,image
26,ds2o3c8,estoration,aangnesiac,Here's my go at it: https://i.imgur.com/XOcqM...,t3_7njgl3,1514885915,2,https://i.imgur.com/XOcqMti.jpg,7njgl3,i.imgur.com,ds2o3c8_1,image
29,ds2te1l,estoration,dutchart,My try: [Imgur](https://i.imgur.com/GP4YDna.jpg),t3_7njgl3,1514899262,1,https://i.imgur.com/GP4YDna.jpg,7njgl3,i.imgur.com,ds2te1l_1,image
30,ds2ydml,estoration,TheBrownieTitan,Here's my shot at it: https://imgur.com/a/34FV...,t3_7nd02g,1514906925,2,https://imgur.com/a/34FVZ,7nd02g,imgur.com,ds2ydml_1,album


In [32]:
for _, row in df_imgur.head(100).iterrows():
    if row['url_classification'] == 'image':
        download_imgur_image(row['extracted_urls'], row['unique_id'], folder, verbose=True)
    elif row['url_classification'] == 'album':
        download_imgur_album(row['extracted_urls'], row['unique_id'], folder, verbose=True)
    elif row['url_classification'] == 'post':
        download_imgur_post(row['extracted_urls'], row['unique_id'], folder, verbose=True)


ds10xp6_1 success
ds10xrc_4 success
ds1azk7_1 success
ds1azk7_2 success
ds1om3p_1 success
ds1oyul_1 success
ds1sk8y_1 success
ds2o3c8_1 Failure: Placeholder image downloaded
ds2te1l_1 success
ds2ydml_1 success
ds3quw1_1 success
ds3qv1a_2 success
ds4qtvu_1 URL wasnt recovered
ds4qu1q_2 Failure: Placeholder image downloaded
ds4rgnu_1 success
ds5occ2_2 success
ds60pmt_1 success
ds67xxr_1 success
ds67y1o_2 success
ds6cj27_1 success
ds6djen_1 success
ds6htlk_1 success
ds71on7_1 success
ds79plc_1 success
ds7hcv6_1 success
ds7jmll_1 success
ds7noyg_1 success
ds81n1r_2 success
ds86ich_1 success
ds8sn59_1 success
ds9ej1m_1 Failure: Placeholder image downloaded
ds9ho3w_1 success
ds9ho6h_2 success
ds9mc4l_1 success
dsa1o6a_1 success
dsa5r55_1 Failure: Placeholder image downloaded
dsacsb9_1 success
dsb3gun_1 success
dsb61jo_1 success
dsb69jn_1 success
dsbzkaw_1 success
dscnwe8_1 Failure: Placeholder image downloaded
dscs1r3_1 Failure: Placeholder image downloaded
dsdz6ag_1 success
dseprow_1 succes

In [19]:
#===========