In [4]:
import argparse
import datetime
import logging
import os
import time
from getpass import getpass

import pandas as pd
import pytz
from bs4 import BeautifulSoup
from dateutil.parser import parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm

def __login(username_ig, password_ig, timeout=10):
    '''
    Log in to Instagram through the login form currently shown by ``driver``.

    Relies on the module-level ``driver``; the login page must already be
    loaded in it.

    Parameters
    ----------
    username_ig : str
        Text typed into the ``username`` input.
    password_ig : str
        Text typed into the ``password`` input.
    timeout : int or float, optional
        Seconds to wait for each element to become clickable (default 10,
        the value that used to be hard-coded).

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If an input, the submit button or the "Not Now" button does not
        become clickable within ``timeout`` seconds.
    '''
    # One wait object is enough: every ``until`` call starts its own countdown.
    wait = WebDriverWait(driver, timeout)

    username = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "input[name='username']")))
    password = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "input[name='password']")))

    # Clear the fields first so the credentials are not appended to any
    # text already present in them.
    username.clear()
    username.send_keys(username_ig)
    password.clear()
    password.send_keys(password_ig)

    # Submit the form. ``click()`` returns None, so there is nothing to keep.
    wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "button[type='submit']"))).click()

    # Dismiss the dialog that offers a "Not Now" button after logging in.
    wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//button[contains(text(), "Not Now")]'))).click()


def search(keyword, pause=5):
    '''
    Open the search box and navigate to the hashtag ``#keyword``.

    Relies on the module-level ``driver`` showing a logged-in Instagram page.

    Parameters
    ----------
    keyword : str
        Hashtag to search for. A leading ``#`` is optional; exactly one is
        typed into the search box either way.
    pause : int or float, optional
        Seconds to sleep after typing and after each ENTER key press
        (default 5, the value that used to be hard-coded).

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the search link or the search input does not become clickable
        within 10 seconds.
    '''
    wait = WebDriverWait(driver, 10)

    # Open the search panel. ``click()`` returns None, so nothing is stored.
    wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@href='#']"))).click()

    searchbox = wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//input[@placeholder='Search']")))
    searchbox.clear()
    searchbox.send_keys('#' + keyword.lstrip('#'))

    # Give the suggestion list time to load, then press ENTER twice with a
    # pause after each press.
    # NOTE(review): presumably the first ENTER selects the top suggestion and
    # the second opens it - confirm against the live page.
    time.sleep(pause)
    for _ in range(2):
        searchbox.send_keys(Keys.ENTER)
        time.sleep(pause)


def __get_links(nscrolls, scroll_pause_time):
    '''
    Scroll the page shown by ``driver`` and collect the post links on it.

    After every scroll all ``<a>`` elements are read and filtered with
    ``filter_links``. Scrolling stops after ``nscrolls`` rounds or as soon
    as the page height stops growing.

    Parameters
    ----------
    nscrolls : int
        Maximum number of scroll-to-bottom rounds.
    scroll_pause_time : int or float
        Seconds to sleep after each scroll so new content can load.

    Returns
    -------
    dict
        Maps each post URL to its rank, i.e. the 0-based order in which the
        URL was first seen.

    Side effects
    ------------
    Writes ``str()`` of the returned dict to ``./interim_data/links.txt``,
    creating the directory when it does not exist.
    '''
    saved_links = {}
    last_height = driver.execute_script("return document.body.scrollHeight")

    for _ in tqdm(range(nscrolls)):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        anchors = driver.find_elements(by=By.TAG_NAME, value='a')

        for anchor in filter_links(anchors):
            href = anchor.get_attribute('href')
            if href not in saved_links:
                # The next free rank is simply the number of links saved so far.
                saved_links[href] = len(saved_links)

        # Wait for the page to load more content.
        time.sleep(scroll_pause_time)

        # An unchanged height means nothing new was loaded: stop scrolling.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Create the folder up front; otherwise a missing directory would raise
    # here, after all the scrolling, and the collected links would be lost.
    os.makedirs("./interim_data", exist_ok=True)
    with open("./interim_data/links.txt", 'w') as fo:
        fo.write(str(saved_links))
    return saved_links
                

def filter_links(links):
    '''
    Keep only the elements whose ``href`` points at a post.

    Parameters
    ----------
    links : iterable
        Objects exposing ``get_attribute('href')`` (the caller in this file
        passes the ``<a>`` elements found by Selenium).

    Returns
    -------
    list
        The elements, in their original order, whose ``href`` is a string
        containing ``'.com/p/'``. Elements without an ``href`` and elements
        whose attribute lookup raises are skipped.
    '''
    post_links = []
    for link in links:
        try:
            href = link.get_attribute('href')
        except Exception:
            # Best effort: an element that can no longer be queried is just
            # not a usable link. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt / SystemExit stop a long scrape.
            continue
        # ``href`` is None for anchors that have no href attribute.
        if isinstance(href, str) and '.com/p/' in href:
            post_links.append(link)
    return post_links


def filter_links_new(links):
    '''
    Keep only the URL strings that point at a post.

    Parameters
    ----------
    links : iterable
        Candidate URLs. Entries that are not strings (for example ``None``
        or the NaN pandas uses for empty cells) are dropped.

    Returns
    -------
    list of str
        The strings containing ``'.com/p/'``, in their original order.
    '''
    # An explicit type check replaces the former bare ``except:``, which hid
    # every error (including KeyboardInterrupt) just to skip non-strings.
    return [link for link in links
            if isinstance(link, str) and '.com/p/' in link]

def get_links_from_file(filename):
    '''
    Load ranked post links from ``./GSERP_links/<filename>.csv``.

    The CSV must have a ``url`` column. Everything from the first ``"/?"``
    onwards is cut off each URL, and only post URLs (as decided by
    ``filter_links_new``) are kept.

    Parameters
    ----------
    filename : str
        CSV file name without the ``.csv`` extension.

    Returns
    -------
    dict
        Maps each distinct post URL to its rank: the 0-based position of
        its first occurrence among the kept URLs. Ranks have no gaps, and
        the first occurrence wins, as in ``__get_links``.
    '''
    g1 = pd.read_csv(f'./GSERP_links/{filename}.csv')
    # Empty cells are read as NaN, which has no ``split``; drop them first.
    urls = [str(url).split("/?")[0] for url in g1['url'].dropna()]
    # Stripping the query string often turns different rows into the same
    # URL. ``dict.fromkeys`` removes repeats and keeps first-seen order.
    unique_posts = dict.fromkeys(filter_links_new(urls))
    return {url: rank for rank, url in enumerate(unique_posts)}

def _split_in_two(text, separator):
    '''Return the two parts of ``text`` when ``separator`` occurs exactly once, else ``None``.'''
    pieces = text.split(separator)
    return pieces if len(pieces) == 2 else None


def _parse_og_description(content):
    '''
    Split an ``og:description`` string into ``(likes, comments, user_handle, remaining)``.

    Each field is ``""`` when its marker is not found exactly once.
    ``remaining`` is whatever text is left after the recognised parts, with
    newlines removed. A ``None`` content is treated as an empty string.
    '''
    remaining = content or ""
    likes = comments = user_h = ""

    pieces = _split_in_two(remaining, " likes, ")
    if pieces is not None:
        likes, remaining = pieces

    pieces = _split_in_two(remaining, " comments")
    if pieces is not None:
        comments, remaining = pieces

    pieces = _split_in_two(remaining, " (@")
    if pieces is not None:
        # The text before " (@" is discarded; the handle follows it.
        remaining = pieces[1]
        pieces = _split_in_two(remaining, ") on Instagram: ")
        if pieces is not None:
            user_h, remaining = pieces

    return likes, comments, user_h, remaining.replace('\n', '')


def get_data_v(nscrolls, scroll_pause_time, filename=None) -> list:
    '''
    Scrape every post linked from the page currently shown by ``driver``.

    The links come from ``__get_links``. Each post is then opened in turn,
    and its date, like/comment counts, author handle and caption are read.

    Parameters
    ----------
    nscrolls : int
        Maximum number of scroll rounds, passed to ``__get_links``.
    scroll_pause_time : int or float
        Pause after each scroll, passed to ``__get_links``.
    filename : str, optional
        Not used at present; accepted so existing callers that pass it keep
        working. (Links could instead be loaded with
        ``get_links_from_file(filename)``.)

    Returns
    -------
    list of dict
        One record per scraped post with the keys ``Publication_date``,
        ``#likes``, ``#comments``, ``rank``, ``Caption``, ``PostLink``,
        ``PostID``, ``ImageLink`` and ``user_handle``. Posts without a
        parsable ``<time>`` element or without an ``og:description`` meta
        tag are skipped; the number skipped is logged as a warning.
    '''
    log = logging.getLogger(__name__)
    # Absolute path to the caption heading; any layout change breaks it, in
    # which case the text left over from og:description is used instead.
    desc_xpath = '/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/section/main/div/div[1]/div/div[2]/div/div[2]/div/div/ul/div/li/div/div/div[2]/div[1]/h1'

    links = __get_links(nscrolls, scroll_pause_time)
    processed_data = []
    skipped = 0

    for link, rank in tqdm(links.items()):
        # Open the post and give it a moment to render.
        driver.get(link)
        time.sleep(2)

        try:
            date_elem = driver.find_elements(By.XPATH, "//time")
            post_date = parse(date_elem[0].get_attribute("datetime"))
        except Exception as exc:
            skipped += 1
            log.debug("Skipping %s: no parsable publication date (%r)", link, exc)
            continue

        try:
            likes_elem = driver.find_element(
                By.XPATH, "//meta[@property='og:description']")
            likes_elem_cont = likes_elem.get_attribute("content")
        except Exception as exc:
            skipped += 1
            log.debug("Skipping %s: no og:description meta tag (%r)", link, exc)
            continue

        likes, comments, user_h, remaining = _parse_og_description(likes_elem_cont)

        try:
            desc_elem = driver.find_element(By.XPATH, desc_xpath)
            cont = desc_elem.get_attribute('innerHTML')
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            caption = BeautifulSoup(cont, "html.parser").get_text()
        except Exception:
            caption = remaining

        # ``links`` is keyed by URL, so every record is unique by PostLink
        # and no duplicate check is needed before appending.
        processed_data.append({
            "Publication_date": post_date,
            "#likes": likes,
            "#comments": comments,
            "rank": rank,
            "Caption": caption,
            "PostLink": link,
            "PostID": link.split("/p/")[1].rstrip("/"),
            # Normalise the trailing slash so links without one still get
            # a valid ".../media?size=l" URL.
            "ImageLink": link.rstrip("/") + "/media?size=l",
            "user_handle": "@" + user_h,
        })
        time.sleep(1)

    if skipped:
        log.warning("%d of %d posts were skipped (missing date or description)",
                    skipped, len(links))
    return processed_data

In [2]:
# Open a Chrome session on the Instagram login page.
driver = webdriver.Chrome()
driver.get("https://www.instagram.com")

# Credentials must never be stored in the notebook: they are taken from the
# IG_USERNAME / IG_PASSWORD environment variables, or asked for interactively
# when those are not set. Any password that was committed here earlier should
# be treated as compromised and changed.
ig_username = os.environ.get("IG_USERNAME") or input("Instagram username: ")
ig_password = os.environ.get("IG_PASSWORD") or getpass("Instagram password: ")
__login(ig_username, ig_password)

In [5]:
# Scrape each brand hashtag by scrolling its results page and write one CSV
# per brand into OUTPUT_DIR.
OUTPUT_DIR = "./outputs_recent"
OUTPUT_COLUMNS = ['Brand_name', 'PostID', 'PostLink', 'ImageLink',
                  'Publication_date', 'user_handle', "Caption",
                  '#likes', '#comments']

# Names may carry a suffix such as 'gucci_c' / 'gucci_n'; only the part
# before '_' is used as the brand name below.
filenames = ["chanel", "hermes", "dior", "gucci", "prada"]

os.makedirs(OUTPUT_DIR, exist_ok=True)
for fname in filenames:
    search(fname)
    print(f"working on file {fname}")
    data = get_data_v(nscrolls=30, scroll_pause_time=5, filename=fname)
    if not data:
        # Without records the frame has no columns and the column selection
        # below would raise KeyError, so report the problem and move on.
        print(f"no posts scraped for {fname}; skipping CSV")
        continue
    df = pd.DataFrame(data)
    brand_name = "#" + fname.split('_')[0]
    df['Brand_name'] = brand_name
    df = df[OUTPUT_COLUMNS]
    df.to_csv(f"{OUTPUT_DIR}/{fname}_output.csv", index=False)

working on file gucci


100%|██████████| 30/30 [02:56<00:00,  5.88s/it]
100%|██████████| 309/309 [22:02<00:00,  4.28s/it]


working on file prada


100%|██████████| 30/30 [02:48<00:00,  5.61s/it]
100%|██████████| 311/311 [22:24<00:00,  4.32s/it]


In [26]:
# Show the records returned by the last get_data_v call of the scraping loop
# (assuming no other cell reassigned `data` since). An empty list means no
# links were found or every post was skipped.
data

[]