In [1]:
# First cell - imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import config
import time
import json
from bs4 import BeautifulSoup

# Run this cell first

In [2]:
# Second cell - initialize driver
driver = webdriver.Chrome()

In [3]:
# Third cell - login
def login():
    driver.get("https://www.instagram.com")
    
    username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
    password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))
    
    username.clear()
    username.send_keys(config.username)
    password.clear()
    password.send_keys(config.password)
    
    button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']")))
    button.click()

# Run login when ready
login()

In [4]:
# Fourth cell - search function
def search(query):
    search_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "svg[aria-label='Search']")))
    search_button.click()
    
    search_input = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[placeholder='Search']"))
    )
    
    search_input.clear()
    search_input.send_keys(query)
    search_input.send_keys(Keys.RETURN)

# Try different searches
keyword = "sandyi01314"
search(keyword)  # You can change this and run the cell multiple times

In [5]:
# Fifth cell - click on the firstsearch result

#remove the @ symbol if present 
if keyword.startswith("@"):
    keyword = keyword[1:]

try:
    # Find the span with exact text content and click its parent link
    clickable_element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, f"//span[text()='{keyword}']/ancestor::a"))
    )
    driver.execute_script("arguments[0].click();", clickable_element)
except TimeoutException:
    print(f"Could not click on result for '{keyword}'")

In [6]:
# Sixth cell - scroll loop

#Get the initial page height
initial_height = driver.execute_script("return document.body.scrollHeight")

#create a list to store htmls
soups = []

#scroll loop
while True:
    #scroll to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    #wait for the page to load
    time.sleep(5)

    current_height = driver.execute_script("return document.body.scrollHeight")
    
    if current_height == initial_height:
        break   #exit the loop when you can't scroll further

    initial_height = current_height


#list to store the post image urls
post_urls = []
#Parse the HTML
html = driver.page_source

# Create a BeautilfulSoup object from scraped HTML
soup = BeautifulSoup(html, 'html.parser')


KeyboardInterrupt: 

In [15]:


#find all image elements that match the specific class in the current soup
elements = soup.find_all('a', class_="x1i10hfl xjbqb8w x1ejq31n xd10rxx x1sy0etr x17r0tee x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz _a6hd")


#Extract the href attributes and filter URLs that start with "/p/" or "/reel/"
post_urls.extend([element['href'] for element in elements if '/p/' in element['href'] or '/p/' in element['href']])

#conver the list to a set to remove duplicates
unique_post_urls = list(set(post_urls))


In [16]:
if len(unique_post_urls) > 0:
    unique_post_urls[0]
print('numer of posts: ', len(unique_post_urls))

numer of posts:  42


In [9]:
#Create a list to store the json for each post
json_list =[]

# Define the query parameters to add
query_params = "__a=1&__d=dis"

#go through all urls
for url in unique_post_urls:

    #Error handling
    try:

        #Get the current URL of the page
        current_url = driver.current_url
        
        #Append the query parameters to the URL
        modified_url = "https://www.instagram.com" + url + "?" + query_params

        #get URL
        driver.get(modified_url)

        #wait for a moment to allow new content to load (adjust as needed)
        time.sleep(1)

        #Find the <pre> tag containing the JSON data
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//pre"))
        )
        pre_tag = driver.find_element(By.XPATH, '//pre')  # Updated this line

        #Extract the JSON data from the <pre> tag
        json_script = pre_tag.text
        
        #Parse the JSON data
        json_parsed = json.loads(json_script)

        #Add json to the list
        json_list.append(json_parsed)

    except (NoSuchElementException, TimeoutException, json.JS) as e:
        print(f"Error processing URL {url}: {e}")


In [10]:
#lists to store URLs and corresponding dates
all_urls = []
all_dates = []

#iterate through each JSON data in the list
for json_data in json_list:

    #Extract the list from the 'items' key
    item_list = json_data.get('items',[])

    #iterate through each item in 'items' list
    for item in item_list:

        #extract the date the item was taken
        date_taken = item.get('taken_at')

        #check if the carousel media is present
        carousel_media = item.get('carousel_media',[])

        #iterate through each item in the 'carousel_media' list
        for media in carousel_media:

            #extract the image url from the media
            image_url = media.get('image_versions2', {}).get('candidates', [])[0].get('url')

            #check if the image_url field is found inside the 'carousel_media' list
            if image_url:

                #Add the image url and corresponding date to the lists
                all_urls.append(image_url)
                all_dates.append(date_taken)
                print("carousel image added")

            #Extract the video URL from the media
            video_versions = media.get('video_versions', [])
            if video_versions:
                video_url = video_versions[0].get('url')
                if video_url:
                    #Add the video URL and corresponding date to the lists
                    all_urls.append(video_url)
                    all_dates.append(date_taken)
                    print("carousel video added")
        
        #handle cases of unique image, instead of carousel
        image_url = item.get('image_versions2', {}).get('candidates', [])[0].get('url')
        if image_url:
            #Add the image URL and corresponding date to the lists
            all_urls.append(image_url)
            all_dates.append(date_taken)
            print("single image added")

        #check if 'video_versions' key exists
        video_versions =  item.get('video_versions', [])
        if video_versions:
            video_url = video_versions[0].get('url')
            if video_url:
                #Add the video URL and corresponding date to the lists
                all_urls.append(video_url)
                all_dates.append(date_taken)
                print("single video added")

print(len(all_urls))

#create a dataframe from the lists


carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
single image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel video added
carousel image added
carousel image added
carousel image added
carousel video added
single image added
carousel image added
carousel image added
carousel image added
carousel image added
single image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
single image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
carousel image added
single image added
carousel image added
carousel image added
carousel image added
carousel image added
ca

In [11]:
all_dates


[1688733793,
 1688733793,
 1688733793,
 1688733793,
 1688733793,
 1688733793,
 1688733793,
 1669984692,
 1669984692,
 1669984692,
 1669984692,
 1669984692,
 1669984692,
 1669984692,
 1669984692,
 1669984692,
 1669984692,
 1669984692,
 1669984692,
 1669984692,
 1679664484,
 1679664484,
 1679664484,
 1679664484,
 1679664484,
 1634905370,
 1634905370,
 1634905370,
 1634905370,
 1634905370,
 1634905370,
 1634905370,
 1634905370,
 1672402806,
 1672402806,
 1672402806,
 1672402806,
 1672402806,
 1672402806,
 1672402806,
 1672402806,
 1672402806,
 1672402806,
 1672402806,
 1735129315,
 1735129315,
 1735129315,
 1735129315,
 1735129315,
 1735129315,
 1735129315,
 1735129315,
 1667564572,
 1667564572,
 1667564572,
 1667564572,
 1667564572,
 1667564572,
 1667564572,
 1667564572,
 1667564572,
 1667564572,
 1667564572,
 1663938240,
 1663938240,
 1663938240,
 1663938240,
 1663938240,
 1663938240,
 1663938240,
 1663938240,
 1663938240,
 1663938240,
 1663938240,
 1655642992,
 1662123565,
 1662123565,

In [12]:
# create a directory to store downloaded files
base_dir = 'scraped_data'
import os
import requests
from urllib.parse import urlparse

#Create the base directory for alll scapped data
os.makedirs(base_dir, exist_ok=True)
download_dir = os.path.join(base_dir, keyword)

#Create subfolders for images and videos
image_dir = os.path.join(download_dir, "images")
video_dir = os.path.join(download_dir, "videos")
os.makedirs(image_dir, exist_ok=True)
os.makedirs(video_dir, exist_ok=True)

# Initialize counters for images and videos
image_counter = 1
video_counter = 1

#iterate through URLs in the all_urls list and download media
for index, url in enumerate(all_urls, 0):
    response =  requests.get(url, stream=True)

    #Extract file extension from the URL
    url_path =  urlparse(url).path
    file_extension = os.path.splitext(url_path)[1]

    #Determine the file name based on the URL
    if file_extension.lower() in {'.jpg', '.jpeg', '.png', '.gof'}:
        file_name = f"{all_dates[index]}-img-{image_counter}.png"
        destination_folder = image_dir
        image_counter += 1
    elif file_extension.lower() in {'.mp4', '.avi', '.mkv', '.mov'}:
        file_name = f"{all_dates[index]}-img-{video_counter}.mp4"
        destination_folder = video_dir
        video_counter += 1
    else:
        #Default to the main download directory for other file types
        file_name = f"{all_dates[index]}{file_extension}"
        destination_folder = download_dir

    #Save the file to the appropriate folder
    file_path = os.path.join(destination_folder, file_name)

    #Write the content of the response to the file
    with open(file_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                file.write(chunk)

    print(f'Downloaded: {file_path}')

#Print a message indicating the number of downloaded files and the download directory
print(f'Download {len(all_urls)} files to {download_dir}')


Downloaded: scraped_data\sandyi01314\images\1688733793-img-1.png
Downloaded: scraped_data\sandyi01314\images\1688733793-img-2.png
Downloaded: scraped_data\sandyi01314\images\1688733793-img-3.png
Downloaded: scraped_data\sandyi01314\images\1688733793-img-4.png
Downloaded: scraped_data\sandyi01314\images\1688733793-img-5.png
Downloaded: scraped_data\sandyi01314\images\1688733793-img-6.png
Downloaded: scraped_data\sandyi01314\images\1688733793-img-7.png
Downloaded: scraped_data\sandyi01314\images\1669984692-img-8.png
Downloaded: scraped_data\sandyi01314\images\1669984692-img-9.png
Downloaded: scraped_data\sandyi01314\images\1669984692-img-10.png
Downloaded: scraped_data\sandyi01314\images\1669984692-img-11.png
Downloaded: scraped_data\sandyi01314\images\1669984692-img-12.png
Downloaded: scraped_data\sandyi01314\images\1669984692-img-13.png
Downloaded: scraped_data\sandyi01314\images\1669984692-img-14.png
Downloaded: scraped_data\sandyi01314\videos\1669984692-img-1.mp4
Downloaded: scraped_