# Image Retrieval From Instagram

**Goal:** collect image data from instagram and then preprocess it, extract information (image files) from a user's Instagram profile

**Constraints:** the user has no way of setting the image size (in KB), the resolution (1080x1080) of the images found on Instagram. The images are extracted from the Instagram page in raw form.  

#### Websites: 

This notebook's code is based on the following tutorials: 

https://medium.com/@srujana.rao2/scraping-instagram-with-python-using-selenium-and-beautiful-soup-8b72c186a058

https://edmundmartin.com/scraping-instagram-with-python/

https://michaeljsanders.com/2017/05/12/scrapin-and-scrollin.html

**Important Note:** *Remember to respect user’s rights when you download copyrighted content. Do not use images/videos from Instagram for commercial intent.*

### 1. Import dependencies

Install non-standard libraries: requests, BeautifulSoup 

In [104]:
import os
from random import choice
import json
import pandas as pd
import csv

# to install
import requests
from bs4 import BeautifulSoup

### 2. Build InstagramScraper class
based on: https://edmundmartin.com/scraping-instagram-with-python/

Switching user agents is often a best practice when web scraping and can help you avoid detection. Should the caller of our class have provided their own list of user agents we take a random agent from the provided list.  Otherwise we will return our default user agent.

Define a class called InstagramScraper: 

In [122]:
# url header for requests.get()
headers={'User-Agent':  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
         ,  'content-type': 'application/json'
         , 'accept-encoding': 'gzip, deflate, br'
         , 'cache-control': 'no-cache'
         , 'accept' : '*/*'
         , 'accept-language' : 'de-DE, de; q=0.9,en-US; q=0.8,en;q=0.7'
         #, 'referer' : url
         , 'connection' : 'keep-alive'
         , 'cookie' : 'ig_cb=1; ig_did=DA66C494-9DFE-48F6-BA63-66F11DF8EC03; csrftoken=ukE8jYSjQxVs1YGPYddEkAXsN6WZ4Qmw; mid=XoChrAALAAG78Upva7Ld0TAzeTtm; rur=ASH; urlgen="{\"2a04:ee41:4:95:91f9:b9d4:8aab:41c\": 15796\054 \"213.55.241.7\": 15796\054 \"2a04:ee41:4:95:60ae:def3:2fd7:3633\": 15796}:1jIpww:PTjjrSzpjC6dWww8-AVOnfdQAFA"'
        }
_user_agents = [
   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
]

In [123]:
class InstagramScraper:

    def __init__(self, user_agents=None, proxy=None):
        self.user_agents = user_agents
        self.proxy = proxy

    def __random_agent(self):
        if self.user_agents and isinstance(self.user_agents, list):
            return choice(self.user_agents)
        return choice(_user_agents)

    def __request_url(self, url):
        """Our second helper method is simply a wrapper around requests. 
        We pass in a URL and try to make a request using the provided user agent and proxy. 
        If we are unable to make the request or Instagram responds with a non-200 status code we simply re-raise the error. 
        If everything goes fine, we return the page in questions HTML."""
        try:
            response = requests.get(url, headers={'User-Agent': self.__random_agent()}, proxies={'http': self.proxy, 'https': self.proxy})
            #response = requests.get(url, headers=headers, proxies={'http': self.proxy, 'https': self.proxy})
            response.raise_for_status()
        except requests.HTTPError:
            raise requests.HTTPError('Received non 200 status code from Instagram')
        except requests.RequestException:
            raise requests.RequestException
        else:
            return response.text


    @staticmethod
    def extract_json_data(html):
        """Instagram serve’s all the of information regarding a user in the form of JavaScript object. 
        This means that we can extract all of a users profile information and their recent posts by just 
        making a HTML request to their profile page. We simply need to turn this JavaScript object into 
        JSON, which is very easy to do."""
        soup = BeautifulSoup(html, 'html.parser')
        body = soup.find('body')
        script_tag = body.find('script')
        raw_string = script_tag.text.strip().replace('window._sharedData =', '').replace(';', '')
        return json.loads(raw_string)

    def profile_page_metrics(self, profile_url):
        results = {}
        try:
            response = self.__request_url(profile_url)
            json_data = self.extract_json_data(response)
            metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']
        except Exception as e:
            raise e
        else:
            for key, value in metrics.items():
                #print('key:', key, '-value:', value)
                if key != 'edge_owner_to_timeline_media':
                    if value and isinstance(value, dict):
                        value = value['count']
                        results[key] = value
                    elif value:
                        results[key] = value
        return results

    #TODO
    def hash_page_metrics(self, profile_url):
        results = {}
        try:
            response = self.__request_url(profile_url)
            json_data = self.extract_json_data(response)
            metrics = json_data['entry_data']['TagPage'][0]['graphql']['hashtag']
         
        except Exception as e:
            raise e
        else:
            for key, value in metrics.items():
                #print('metrics:', metrics)
                if key != 'edge_hashtag_to_media' and key != 'edge_hashtag_to_top_posts' and key != 'profile_pic_url':
                    results[key] = value
                    if value and isinstance(value, dict):
                        try: 
                            value = value['count']            
                            results[key] = value
                        except: 
                            results[key] = value
                        try: 
                            sigma = []
                            for i in range(0,5): 
                                #print(i)
                                value = value['edges'][i]['node']['name']  
                                #print(i)
                            sigma.append(value)
                            print(len(value['edges']['node']))
                            
                            #results[key] = sigma
                        except: 
                            results[key] = value 
                    elif value:
                        results[key] = value
        return results
    
    def profile_page_posts(self, profile_url):
        results = []
        try:
            response = self.__request_url(profile_url)
            json_data = self.extract_json_data(response)
            metrics = json_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']["edges"]
            pprint(metrics)
        except Exception as e:
            raise e
        else:
            for node in metrics:
                node = node.get('node')
                #if node and isinstance(node, dict): #this line only gets most recent post out
                results.append(node)
        return results
    
    def hashtag_page_posts(self, hashtag_url):
        results = []
        try:
            response = self.__request_url(hashtag_url)
            json_data = self.extract_json_data(response)
            #pprint(json_data)
            metrics = json_data['entry_data']['TagPage'][0]['graphql']['hashtag']['edge_hashtag_to_media']["edges"]
            #pprint(metrics)
        except Exception as e:
            raise e
        else:
            for node in metrics:
                node = node.get('node')
                #if node and isinstance(node, dict): #this line only gets most recent post out
                results.append(node)
        return results

### 3.1 Load URLS of Brand Names Data

Specify instragram USERNAME profile whose page you want to scrape. Get a dictionary with all information (image, comments, etc.) from that Instagram profile.

In [107]:
# to specify
directory= r'C:\Users\Anonym\Documents\GitHub\DLfM_BrandManagement\data\instagram_urls'
os.chdir(directory)

Get out all apparel brands. 

In [108]:
import csv

data = pd.read_csv("firm_usernames.csv", header=None)

firm_usernames = data[0].tolist()
firm_usernames[:5]

['abercrombie', 'adidas', 'anntaylor', 'bacardiusa', 'bananarepublic']

In [109]:
data = pd.read_csv("instagram_hashtags.csv", header=None)

instagram_hashtags = data[0].tolist()
instagram_hashtags[:5]

['fanta', 'coorslight', 'greygoose', 'corona', 'monsterenergy']

### 3.2 Specify Instagram page(s)

Specify instragram USERNAME profile whose page you want to scrape. Get a dictionary with all information (image, comments, etc.) from that Instagram profile. 

#### User-profile Page

If you want to scrape a user-profile page, specify the username as:

In [110]:
# pages that have access denial because of age limit
# are you 18/21 or over? 
#urls.remove('https://www.instagram.com/bacardiusa/?hl=en')

In [111]:
# items to be removed from list
agelimited_brands = {'bacardiusa', 'budlight', 'budweiser', 'coorslight', 'corona', 'greygoose', 'jackdaniels_us', 'korbel_1882'} 
  
firm_usernames = [ele for ele in firm_usernames if ele not in agelimited_brands] 

In [113]:
# for multiple firms  
urls = []
hashtag = False

for username in firm_usernames: 
    url = 'https://www.instagram.com/'+username+'/?hl=en'
    urls.append(url)

urls[:2]

['https://www.instagram.com/abercrombie/?hl=en',
 'https://www.instagram.com/adidas/?hl=en']

In [87]:
# for one firm only 

# to specify
#username='cailler_suisse'
#hashtag = False
#url = 'https://www.instagram.com/'+username+'/?hl=en'

#### Hashtag Page

If you want to open a hashtag page (instead of a user profile): 

In [114]:
# for multiple brands  
hash_urls = []
username = False

for hashtag in instagram_hashtags: 
    url = 'https://www.instagram.com/explore/tags/'+hashtag
    hash_urls.append(url)

hash_urls[:2]

['https://www.instagram.com/explore/tags/fanta',
 'https://www.instagram.com/explore/tags/coorslight']

In [89]:
# for one brand only 

# to specify
#hashtag='cailler'
#username = False
#url = 'https://www.instagram.com/explore/tags/'+hashtag

### 3. Get information from Instagram page(s) [optional]

Now that the url of the Instagram page is defined, it will extract out all the posts or meta-information from the website usinge the InstagramScraper class. 

Get meta-information metrics by using a class method. 

In [114]:
# get profile page metrics
from pprint import pprint

k = InstagramScraper()
results = k.profile_page_metrics(url) 
pprint(results)

{'biography': '🌱plant-based recipes & wholesome living \n'
              '🍒nourish the cells & the soul \n'
              '🌱a YouTube community of 2M friends 👩🏻\u200d🌾\n'
              '👇 NEW VIDEO 👇',
 'business_category_name': 'Publishers',
 'category_id': '2707',
 'edge_felix_video_timeline': 0,
 'edge_follow': 127,
 'edge_followed_by': 531071,
 'edge_media_collections': 0,
 'edge_mutual_followed_by': 0,
 'edge_saved_media': 0,
 'external_url': 'https://youtu.be/0Kgi-H2W7Hk',
 'external_url_linkshimmed': 'https://l.instagram.com/?u=https%3A%2F%2Fyoutu.be%2F0Kgi-H2W7Hk&e=ATM5rZNI8I5aBiZz3RAszJWMkhflagAU_QiH_SQDII3ITWclaigcQbJHAT__clKn0V1x15eE&s=1',
 'full_name': 'Sadia Badiei, BSc Dietetics',
 'highlight_reel_count': 1,
 'id': '2072931271',
 'is_business_account': True,
 'is_verified': True,
 'profile_pic_url': 'https://instagram.fzrh2-1.fna.fbcdn.net/v/t51.2885-19/s150x150/84057956_823380854858266_527460638654464000_n.jpg?_nc_ht=instagram.fzrh2-1.fna.fbcdn.net&_nc_ohc=RvJ85_MOJB4AX_

In [179]:
# get hashtag page metrics
from pprint import pprint

k = InstagramScraper()
#TODO
results = k.hash_page_metrics(url) 
#pprint(results)

### 4. Get image posts from Instagram page(s)

Get all posts on an Instagram **profile page** that are visible on the landing page (more items only load as you scroll downwards). The page loads 12 items at a time, and I need to scroll to load all entries (for a total of 120).

#### User-profile Page

In [117]:
# get posts (images) from multiple profile pages 

resultz = []
for url in urls: 
    k = InstagramScraper()
    results = k.profile_page_posts(url)
    resultz.append(results)
    print('Instagram page: ', url)

print('Total number of Instagram pages: ', len(resultz))
print('Total number of images: ', len(resultz)*len(resultz[0]))
#print('Average number of images per Instagram hashtag page: ', len(hash_result)*len(hash_result[0])/len(hash_result) )

RequestException: 

In [124]:
# get posts (images) from single profile page 

k = InstagramScraper()
results = k.profile_page_posts(url)

print('Instagram page: ', url)
#print('Posts on Instagram profile page: ', len(results))
#print('Second image url on instagram profile: ', results[1]['display_url'])

RequestException: 

#### Hashtag Page

Get all posts on an Instagram **hashtag page** that are visible on the landing page. 

In [102]:
# get posts (images) from multiple hashtag pages 
from pprint import pprint

hash_result = []
for url in hash_urls: 
    k = InstagramScraper()
    results = k.hashtag_page_posts(url)
    hash_result.append(results)
    print('Instagram page: ', url)

print('Total number of Instagram hashtag pages: ', len(hash_result))
print('Total number of hashed images: ', len(hash_result)*len(hash_result[0]))
#print('Average number of images per Instagram hashtag page: ', len(hash_result)*len(hash_result[0])/len(hash_result) )

RequestException: 

In [103]:
# get posts (images) from a hashtag page 

k = InstagramScraper()
results = k.hashtag_page_posts(url)

#pprint(results)
print('Instagram page: ', url)
print('Posts on Instagram hashtag page: ', len(results))
print('Second image url on instagram hashtag: ', results[1]['display_url'])

RequestException: 

### 5. Save images from list of dict 

Use requests library to download images from the ‘display_url’ in pandas ‘result’ data frame and store them with respective shortcode as file name.

Specify the directory for storing the images. 

In [45]:
# load modules
import os
import requests
import shutil

Some functions...

In [49]:
def set_root_path_images(): 
    
    # to specify
    directory= r"C:\Users\Anonym\Documents\GitHub\DLfM_BrandManagement\data"
    folder = 'instagram_images' #image root folder, all subfolders' name are firmnames

    os.chdir(directory)

    try: 
        os.mkdir(folder)
    except: 
        pass

    path = os.path.join(directory, folder)
    os.chdir(path)
    return path 

In [47]:
def build_folders_images(account, folder, path): 
        try: 
            os.mkdir(os.path.join(path, account))
        except: 
            pass
        
        # set directory 
        directory = os.path.join(path, account)
        os.chdir(directory)   
        try: 
            os.mkdir(folder)
            print('new folder created for: ', account)
        except: 
            pass
        path = os.path.join(directory, folder)
        os.chdir(path)
        return path 


 #### User-profile page
 
 Save all images from user-profile Instagram pages to your folder. 

In [50]:
# download all visible images from multiple Instagram pages 

for i, username in enumerate(firm_usernames): 
    path = set_root_path_images()
    build_folders_images(username, 'user_profile', path)

    # get image url 
    for j in range(len(resultz[i])): 
        r = requests.get(resultz[i][j]['display_url'], stream=True)
        with open(resultz[i][j]['shortcode']+".jpg", 'wb') as f:
            # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
            r.raw.decode_content = True
            # Copy the response stream raw data to local image file.
            shutil.copyfileobj(r.raw, f)
            # Remove the image url response object.
            del r
            
    print('processed: ', username, ' .', i, ' out of ', len(firm_usernames))

 #### Hashtag page
 
 Save all images from hashtag Instagram pages to your folder. 

In [None]:
# download all visible images from multiple Instagram pages 

for i, hashtag in enumerate(instagram_hashtags):
    path = set_root_path_images()
    build_folders_images(hashtag, 'hashtag', path)

    # get image url 
    for j in range(len(hash_result[i])): 
        r = requests.get(hash_result[i][j]['display_url'], stream=True)
        with open(hash_result[i][j]['shortcode']+".jpg", 'wb') as f:
            # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
            r.raw.decode_content = True
            # Copy the response stream raw data to local image file.
            shutil.copyfileobj(r.raw, f)
            # Remove the image url response object.
            del r
    print('processed: ', hashtag, ' .', i, ' out of ', len(instagram_hashtags))

 #### Both pages
 
 Save all images from both user profile and hashtag Instagram pages to your folder. 

In [22]:
# download all visible images from multiple Instagram pages 

if firm_usernames: 
    for i, username in enumerate(firm_usernames): 
        path = set_root_path_images()
        build_folders_images(username, 'user_profile', path)
       
        # get image url 
        for j in range(len(resultz[i])): 
            r = requests.get(resultz[i][j]['display_url'], stream=True)
            with open(resultz[i][j]['shortcode']+".jpg", 'wb') as f:
                # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
                r.raw.decode_content = True
                # Copy the response stream raw data to local image file.
                shutil.copyfileobj(r.raw, f)
                # Remove the image url response object.
                del r

elif instagram_hashtags: 
    for i, hashtag in enumerate(instagram_hashtags):
        path = set_root_path_images()
        build_folders_images(hashtag, 'hashtag', path)

        # get image url 
        for j in range(len(hash_result[i])): 
            r = requests.get(hash_result[i][j]['display_url'], stream=True)
            with open(hash_result[i][j]['shortcode']+".jpg", 'wb') as f:
                # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
                r.raw.decode_content = True
                # Copy the response stream raw data to local image file.
                shutil.copyfileobj(r.raw, f)
                # Remove the image url response object.
                del r

In [7]:
# download all visible images from an Instagram page 

path = set_root_path_images()

if username: 
    build_folders_images(username, 'user_profile', path)
elif hashtag: 
    build_folders_images(hashtag, 'hashtag', path)

for i in range(len(results)):
    r = requests.get(results[i]['display_url'], stream=True)
    with open(results[i]['shortcode']+".jpg", 'wb') as f:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
        r.raw.decode_content = True
        # Copy the response stream raw data to local image file.
        shutil.copyfileobj(r.raw, f)
        # Remove the image url response object.
        del r

KeyboardInterrupt: 

In [43]:
# download one image only

path = set_root_path_images()

r = requests.get(url, stream=True)

with open(directory+"B-Tckr0AgrH"+".jpg", 'wb') as f:
    # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
    r.raw.decode_content = True
    # Copy the response stream raw data to local image file.
    shutil.copyfileobj(r.raw, f)
    # Remove the image url response object.
    del r