In [1]:
#Selenium imports here
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

In [2]:
#Other imports here
import os
import wget
import time
import json
import copy
import random
import datetime
import os

In [3]:
class Post:
    """
    A post information class
    ------------------------
    
    Plain container for the data of a single Instagram post. As of
    now, it only stores the post's link, datetime and caption.
    
        self.href: Instagram's link for the post;
        self.datetime: the post's datetime (None until it is set);
        self.caption: the post's caption (None until it is set).
    """
    
    
    def __init__(self, href):
        """
        Given a href (Instagram's link for the post),
        it creates a Post object for this given link
        
        :Args:
            - href :: String: Instagram's link for the post
        """
        
        self.href = href
        # Both fields start empty; this class never fills them in itself
        self.datetime = None
        self.caption = None
        
    def __str__(self):
        """
        (self.href, self.datetime, self.caption)
        
        :Returns:
            A string "(href, datetime, caption)"; fields which are
            still None are rendered as the text 'None'
        """
        
        # An f-string is used instead of '+' concatenation because datetime
        # and caption are None right after construction, and str + None
        # raises TypeError
        return f"({self.href}, {self.datetime}, {self.caption})"
    

In [4]:
# Interactive check: print the documentation generated from Post's docstrings
help(Post)

Help on class Post in module __main__:

class Post(builtins.object)
 |  Post(href)
 |  
 |  A post information class
 |  ------------------------
 |  
 |  As of now, it only scrapes its datetime and caption
 |  
 |  Methods defined here:
 |  
 |  __init__(self, href)
 |      Given a href (Instagram's link for the post),
 |      it creates an Post object for this given link
 |  
 |  __str__(self)
 |      (self.href, self.dateime, self.caption)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [5]:
class User:
    """
    An user information class
    -------------------------
    
    It contains all user's information, such as
    
        self.handle: the user's handle;
        self.url: Instagram's user link (url);
        self.n_posts: how many posts the user posted;
        self.followers: how many followers the user has;
        self.following: how many other users are being 
        followed by this user;
        self.name: the user's chosen name (shown next 
        to the bio);
        self.bio: the user's bio.
        
        self.followers_list: a list of user's followers;
            (see InstaBot.get_users_info for more infor-
            mation)
            
        self.following_list: a list of other users be-
        ing followed by this user
            (see InstaBot.get_users_info for more info-
            rmation)
            
        self.posts: a dictionary of the user's posts
            (key=href::str, values=posts::Post)
            
    Every attribute except handle and url starts as None.
    """
    
    
    def __init__(self, handle=None):
        """
        Creates an user with the handle and the url pro-
        vided (handle may not be valid)
        
        :Args:
            - handle :: String: the user's Instagram handle
        
        :Raises: 
            Exception("User handle expected"): if a han-
            dle is not provided
        """
        
        # Guard clause first, so the attribute set-up below is not nested
        if handle is None:
            raise Exception("User handle expected")
        
        self.handle = handle
        self.url = "https://www.instagram.com/" + self.handle + "/"
        
        self.n_posts = None
        self.followers = None
        self.following = None
        self.name = None
        self.bio = None
        
        self.followers_list = None
        self.following_list = None
        
        self.posts = None
    

    def to_json(self):
        """
        Creates a json file named '<handle>.json' (in the
        current working directory) with the current user
        information:
        
        :Usage:
            random_user.to_json()
            
        :Returns:
            None, although it creates a json file with 
            the following structure:
        
                {
                  "handle"         : String,
                  "url"            : String,
                  "n_posts"        : Number,
                  "followers"      : Number,
                  "following"      : Number,
                  "name"           : String,
                  "bio"            : String,
                  "followers_list" : Array.of(String),
                  "following_list" : Array.of(String),
                  "posts"          : Array.of(Object)
                }
                
            Each entry of "posts" is the attribute dictionary
            of one post object; attributes which are still
            None are written as null
            
        :Raises:
            - TypeError (from json.dumps): if an attribute
            is not JSON serializable
            - OSError (from open): if the file cannot be
            written
        """
        
        # A shallow copy is enough: the values are only read by json.dumps,
        # and 'posts' is replaced by a freshly built list
        obj_dict = dict(vars(self))
        
        if self.posts is not None:
            # Post objects are not JSON serializable; export their attributes
            obj_dict['posts'] = [vars(post) for post in self.posts.values()]
        
        # Serialize before opening the file, so a serialization error does not
        # leave behind an empty (truncated) '<handle>.json'
        payload = json.dumps(obj_dict)
        
        with open(self.handle + ".json", "w", encoding="utf-8") as out:
            out.write(payload)

In [6]:
# Interactive check: print the documentation generated from User's docstrings
help(User)

Help on class User in module __main__:

class User(builtins.object)
 |  User(handle=None)
 |  
 |  An user information class
 |  -------------------------
 |  
 |  It contains all user's information, such as
 |  
 |      self.handle: the user's handle;
 |      self.url: Instagram's user link (url);
 |      self.n_posts: how many posts the user posted;
 |      self.followers: how many followers the user has;
 |      self.following: how many other users are being 
 |      following by this user;
 |      self.name: the user's choosen name (shown next 
 |      to the bio);
 |      self.bio: the user's bio.
 |      
 |      self.followers_list: a list of user's followers;
 |          (see InstaBot.get_users_info for more infor-
 |          mation)
 |          
 |      self.following_list: a list of others users be-
 |      ing following by this user
 |          (see InstaBot.get_users_info for more info-
 |          rmation)
 |          
 |      self.posts: a dictionary of the users posts
 

In [7]:
class InstaStructure:
    """
    It defines Instagram's structure to be used by an in-
    stance of InstaBot
    ----------------------------------------------------------------------
    
    Because Instagram is continuously updating their ele-
    ments structure and names, the InstaBot may be extremely
    brittle: if just one element is changed the bot breaks.
    Every selector (class name, XPath, text fragment) is kept
    in this class so that it is the only place which needs to
    be changed when that happens.
    
    Element look-ups use find_element(By.<STRATEGY>, value) /
    find_elements(...), which exists in both Selenium 3 and
    Selenium 4 (the find_element_by_* shortcuts were depreca-
    ted in Selenium 4 and later removed).
    """
    
    
    # ---------------- Used in InstaDriver.login -------------------------
    
    # Locator tuples (strategy, value), ready for expected_conditions
    username = (By.CSS_SELECTOR, "input[name='username']")
    password = (By.CSS_SELECTOR, "input[name='password']") 
    submit = (By.CSS_SELECTOR, "button[type='submit']")
    # "Not now" answers (Portuguese / English) to the two dialogs InstaDriver.login dismisses
    save_information = (By.XPATH, "//div[contains(text(), 'Agora não') or contains(text(), 'Not now')]") 
    not_now_notification = (By.XPATH, "//button[contains(text(), 'Agora não') or contains(text(), 'Not now')]") 
    
    # --------------------------------------------------------------------
    
    # -------------- Used in InstaBot.__get_list__ -----------------------
    
    # JScript to scroll a scroll bar: scrolls arguments[0] to its bottom
    # and returns its scrollHeight
    js_script_to_scroll = """ 
        arguments[0].scrollTo(0, arguments[0].scrollHeight);
        return arguments[0].scrollHeight; """
    
    # --------------------------------------------------------------------
    
    @staticmethod
    def __get_random_sleep__(minimum, maximum):
        """
        Get the system to sleep a random amount of seconds
        drawn uniformly between minimum and maximum
        Should *not* be called outside the class
        
        :Args:
            - minimum :: Float: the minimum of seconds to 
            sleep
            - maximum :: Float: the maximum of seconds to 
            sleep
         
        :Usage:
            In InstaStructure.sleep only*
            
        :Returns:
            None
            
        :Raises:
            ValueError (from time.sleep): if the drawn value
            is negative
        """
        
        time.sleep(random.uniform(minimum, maximum))
    
    
    @staticmethod
    def sleep(time):
        """
        Given time seconds, get the system to sleep a random
        period of seconds in the interval [time-2, time+2]
        (bounds below zero are raised to zero)
        
        :Args:
            - time :: Float: seconds to sleep, on average
            
        :Usage:
            InstaStructure.sleep(5): will get system to
            sleep between 3 and 7 seconds inside InstaBot
            
        :Returns:
            None
        """
        
        # NOTE: the parameter shadows the 'time' module inside this method
        # only; the real sleeping is delegated to __get_random_sleep__.
        # The bounds are clamped because time.sleep rejects negative values,
        # which time < 2 would otherwise be able to produce.
        lower = max(0, time - 2)
        upper = max(0, time + 2)
        InstaStructure.__get_random_sleep__(lower, upper)
    
    
    @staticmethod
    def is_valid_handle(driver, user):
        """
        Checks, given an open and valid driver and a user,
        if the user is a valid one. As of now, it checks
        if it is possible to scrape the number of posts
        from the current page (if that fails, the user is
        considered invalid)
        *Can be assumed the driver web browser is open in
        'https://instagram.com/user.handle'*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open
            and valid Selenium driver currently in use
            - user :: User: object instance with at least
            a handle (currently not used by the check)
            
        :Usage:
            InstaStructure.is_valid_handle(random_driver,
            random_user) inside InstaBot.get_users_info
            
        :Returns:
            True, if the number of posts could be scraped
            False, otherwise
        """
        
        try:
            InstaStructure.get_n_posts(driver)
        except Exception:
            # Any scraping failure means "not a valid profile page";
            # KeyboardInterrupt / SystemExit still propagate
            return False
        return True
    
    
    @staticmethod
    def is_public(driver, user):
        """
        Checks, given an open and valid driver and a user,
        if the user profile is public (it may be private),
        by looking for the "This account is private" text
        (Portuguese or English) in the current page
        *Can be assumed the driver web browser is open in 
        'https://instagram.com/user.handle'*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and
            valid Selenium driver currently in use
            - user :: User: object instance with at least a
            handle (currently not used by the check)
            
        :Usage:
            InstaStructure.is_public(random_driver, random_-
            user) inside InstaBot.get_users_info
            
        :Returns:
            True, if the private notice was not found
            False, if the private notice was found
        """
            
        xpath = "//*[contains(text(), 'Esta conta é privada') or contains(text(), 'This account is private')]"
        try:
            driver.find_element(By.XPATH, xpath)
        except Exception:
            # Notice not found (or look-up failed): treated as public
            return True
        return False
            
     
    @staticmethod
    def get_n_posts(driver):
        """
        Scrapes, given an open and valid driver, how many
        posts the current opened profile has      
        *Can be assumed the driver web browser is open in
        a valid user profile page*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open
            and valid Selenium driver currently in use
            
        :Usage:
            InstaStructure.get_n_posts(random_driver) in-
            side InstaBot.get_users_info
            
        :Returns:
            A string 'x' expressing for example the user
            has currently 'x' posts
            
        :Raises:
            NoSuchElementException: if the element wasn't
            found
            *Keep the selector up to date so that, on a va-
            lid profile page, this is never raised*
        """
        
        # Descendant of the "posts" label which does not itself contain the label text
        return driver.find_element(By.XPATH, "//*[contains(text(), 'publicaç') or contains(text(), 'post')]//*[not(contains(text(), 'publicaç')) and not(contains(text(), 'post'))]").text
    
    
    @staticmethod
    def get_followers_following_clickable(driver, which):
        """
        Scrapes, given an open and valid driver, the click-
        able web element for the followers or following section
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and
            valid Selenium driver currently in use
            - which :: String: can be either 'followers' or
            'following'
            
        :Usage:
            InstaStructure.get_followers_following_clickable-
            (random_driver, 'followers') inside InstaBot.__get-
            _list__
            
        :Returns:
            A clickable web browser element representing ei-
            ther the followers or following section
            
        :Raises:
            - Exception("Invalid clickable: followers or fol-
            lowing"): if invalid clickable is entered
            - NoSuchElementException: if the element wasn't
            found
        """
        
        if which == 'followers':
            return driver.find_element(By.XPATH, "//*[contains(text(), 'seguidor') or contains(text(), 'follower')]")
        elif which == 'following':
            return driver.find_element(By.XPATH, "//*[contains(text(), 'seguindo') or contains(text(), 'following')]")
        else:
            raise Exception("Invalid clickable: followers or following")
    
    
    @staticmethod
    def get_followers(driver):
        """
        Scrapes, given an open and valid driver, how many fol-
        lowers the current opened profile has  
        *Can be assumed the driver web browser is open in a va-
        lid user profile page*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and
            valid Selenium driver currently in use
            
        :Usage:
            InstaStructure.get_followers(random_driver) inside
            InstaBot.get_users_info
            
        :Returns:
            A string 'x' expressing for example the user has cu-
            rrently 'x' followers
            
        :Raises:
            NoSuchElementException: if the element wasn't found
            *Keep the selector up to date so that, on a valid
            profile page, this is never raised*
        """
        
        return driver.find_element(By.XPATH, "//*[contains(text(), 'seguidor') or contains(text(), 'follower')]//*[not(contains(text(), 'seguidor')) and not(contains(text(), 'follower'))]").text
    
    
    @staticmethod
    def get_following(driver):
        """
        Scrapes, given an open and valid driver, how many users
        the current opened profile is following 
        *Can be assumed the driver web browser is open in a valid
        user profile page*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and valid
            Selenium driver currently in use
            
        :Usage:
            InstaStructure.get_following(random_driver) inside
            InstaBot.get_users_info
            
        :Returns:
            A string 'x' expressing for example the user is fol-
            lowing 'x' users
            
        :Raises:
            NoSuchElementException: if the element wasn't found
            *Keep the selector up to date so that, on a valid
            profile page, this is never raised*
        """
        
        return driver.find_element(By.XPATH, "//*[contains(text(), 'seguindo') or contains(text(), 'following')]//*[not(contains(text(), 'seguindo')) and not(contains(text(), 'following'))]").text
    
    
    @staticmethod
    def get_name(driver):
        """
        Scrapes, given an open and valid driver, the name of the
        current opened profile (see User for more) 
        *Can be assumed the driver web browser is open in a valid
        user profile page*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and valid
            Selenium driver currently in use
            
        :Usage:
            InstaStructure.get_name(random_driver) inside InstaBot-
            .get_users_info
            
        :Returns:
            A string 'my_name' expressing that this given user
            chose 'my_name' as their profile name
            
        :Raises:
            NoSuchElementException: if the element wasn't found
            *Keep the selector up to date so that, on a valid
            profile page, this is never raised*
        """
        
        # First child of the '_aa_c' element
        return driver.find_element(By.XPATH, "//*[@class='_aa_c']/*[1]").text
    
    
    @staticmethod
    def get_bio(driver):
        """
        Scrapes, given an open and valid driver, the bio of the cu-
        rrent opened profile
        *Can be assumed the driver web browser is open in a valid
        user profile page*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and valid
            Selenium driver currently in use
            
        :Usage:
            InstaStructure.get_bio(random_driver) inside InstaBot
            .get_users_info
            
        :Returns:
            A string 'bio' expressing that this given user chose
            'bio' as their profile bio
            
        :Raises:
            NoSuchElementException: if the element wasn't found
            *Keep the selector up to date so that, on a valid
            profile page, this is never raised*
        """
        
        # Last child of the '_aa_c' element
        return driver.find_element(By.XPATH, "//*[@class='_aa_c']/*[last()]").text
    
    
    @staticmethod
    def get_scroll_bar(driver, which):
        """
        Scrapes, given an open and valid driver, the web element
        for the followers or following scroll bar
        *Can be assumed the driver web browser is open in a valid
        user followers or following page,
        ie, https://instagram.com/random_user/followers or
        https://instagram.com/random_user/following pages*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and valid
            Selenium driver currently in use
            - which :: String: can be either 'followers' or 'fol-
            lowing'
            
        :Usage:
            InstaStructure.get_scroll_bar(random_driver, 'fol-
            lowers') inside InstaBot.__get_list__
            
        :Returns:
            A web browser element representing either the
            followers or following scroll bar
            
        :Raises:
            - Exception("Invalid clickable: followers or following"):
            if invalid scroll bar is entered
            - NoSuchElementException: if the element wasn't found
        """
        
        if which not in ('followers', 'following'):
            raise Exception("Invalid clickable: followers or following")
        
        # Both lists currently share the same scroll container; 'which' is
        # still validated so the two can diverge again without changing callers
        return driver.find_element(By.XPATH, '//div[@class="_aano"]')
        
    
    @staticmethod
    def get_links(scroll_bar):
        """
        Scrapes, given an valid scroll bar element, a list of the
        link ('a') elements inside the given scroll bar element,
        which contain the handles of either the followers or fol-
        lowings of the current user being fetched
        *Can be assumed the driver web browser is open in a valid
        user followers or following page,
        ie, https://instagram.com/random_user/followers or
        https://instagram.com/random_user/following pages*
        
        :Args:
            - scroll_bar :: WebElement: the scroll bar element,
            as returned by InstaStructure.get_scroll_bar
            
        :Usage:
            user.followers_list = self.__get_list__(random_user)-
            inside InstaBot.get_users_info:
            
                Inside InstaBot.__get_list__(random_user):
                    
                    [...]
                    links = InstaStructure.get_links(scroll_bar)
                    names = [name.text for name in links if name.text != '']
                    [...]
            
        :Returns:
            A list of web elements, one per link found (empty
            if there is none)
        """
        
        return scroll_bar.find_elements(By.TAG_NAME, 'a')
    
    @staticmethod
    def get_visible_posts(driver):
        """
        Scrapes, given an open and valid driver, a list of visible
        posts elements
        *Can be assumed the driver web browser is open in a valid
        user profile page*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and valid
            Selenium driver currently in use
            
        :Usage:
            InstaStructure.get_visible_posts(random_driver) inside
            InstaBot.__get_posts__
            
        :Returns:
            A list containing all visible posts elements at that
            moment (empty if there is none)
        """
        
        # The class attribute is matched exactly, double space included
        return driver.find_elements(By.XPATH, "//*[@class='_aabd _aa8k  _al3l']")
    
    
    @staticmethod
    def get_post_href(post):
        """
        Scrapes, given a valid post element (which can be search-
        ed just like a driver), the post href (like
        https://instagram.com/p/href)
        
        :Args:
            - post :: WebElement: a post element, as returned
            by InstaStructure.get_visible_posts
            
        :Usage:
            InstaStructure.get_post_href(random_post) inside InstaBot.
            __get_posts__
            
        :Returns:
            A string 'href' containing the posts href 
            
        :Raises:
            NoSuchElementException: if the element wasn't found
            *Keep the selector up to date so that, for a valid
            post element, this is never raised*
        """
        
        # href of the first link inside the post element
        return post.find_element(By.TAG_NAME, 'a').get_attribute('href')
    
    
    @staticmethod
    def __get_caption_box__(driver):
        """
        Scrapes, given an open and valid driver, the posts caption
        element, this element needs to contain the user handle, the
        caption, datetime info and likes info, but not necessarily
        the "view replies" element
        *Can be assumed the driver web browser is open in a valid
        post page*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and valid
            Selenium driver currently in use
            
        :Usage:
            InstaStructure.__get_caption_box__(random_driver) inside
            InstaStructure.get_caption
            
        :Returns:
            A post caption box element, if one was found,
            None, otherwise
        """
        
        try:
            return driver.find_element(By.XPATH, "//*[@class='_a9zr']")
        except Exception:
            # No caption box (or the look-up failed): reported as None
            return None
            
    
    @staticmethod
    def get_caption(driver):
        """
        Scrapes, given an open and valid driver, the posts caption
        text, if one has it, because a post may be without a
        caption
        *Can be assumed the driver web browser is open in a valid
        post page*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and valid
            Selenium driver currently in use
            
        :Usage:
            InstaStructure.get_caption(random_driver) while scra-
            ping a post's information
            
        :Returns:
            A post caption text, if one was found,
            None, otherwise
        """
        
        caption_box = InstaStructure.__get_caption_box__(driver)
        
        # No caption box at all: nothing to read
        if caption_box is None:
            return None
        
        try:
            # The caption text is the 'h1' inside the caption box
            return caption_box.find_element(By.TAG_NAME, 'h1').text
        except Exception:
            # Box found but no caption inside it
            return None
    
    
    @staticmethod
    def get_datetime(driver):
        """
        Scrapes, given an open and valid driver, the posts datetime
        attribute text, usually found at the bottom of the post box,
        right below "liked by..."
        *Can be assumed the driver web browser is open in a valid
        post page*
        
        :Args:
            - driver :: selenium.driver.Chrome: an open and valid
            Selenium driver currently in use
            
        :Usage:
            InstaStructure.get_datetime(random_driver) while scra-
            ping a post's information
            
        :Returns:
            A string 'datetime' containing the value of the
            'datetime' attribute of the post's time element
            
        :Raises:
            NoSuchElementException: if the element wasn't found
            *Keep the selector up to date so that, on a valid
            post page, this is never raised*
        """
        
        return driver.find_element(By.XPATH, "//time[@class='_aaqe']").get_attribute('datetime')
        

In [8]:
# Interactive check: print the documentation generated from InstaStructure's docstrings
help(InstaStructure)

Help on class InstaStructure in module __main__:

class InstaStructure(builtins.object)
 |  It defines Instagram's structure to be used by an in-
 |  stance of InstaBot
 |  ----------------------------------------------------------------------
 |  
 |  Because Instagram is continuously updating their ele-
 |  ments structure and names,
 |  the InstaBot may be extremely brittle because for in-
 |  stance if just one element
 |  is changed the bot breaks (that's why this class needs
 |  to be constantly changed)
 |  
 |  Static methods defined here:
 |  
 |  __get_caption_box__(driver)
 |      Scrapes, given an open and valid driver, the posts caption
 |      element, this element needs to contain the user handle, the
 |      caption, datetime info and likes info, but not necessarily
 |      the "view replies" element
 |      *Can be assumed the driver web browser is open in a valid
 |      post page*
 |      
 |      :Args:
 |          - driver :: selenium.driver.Chrome: an open and val

In [9]:
class InstaDriver:
    """
    Defines a Selenium WebDriver which encapsulates opening and
    logging in Instagram
    ----------------------------------------------------------------------
    
    Given a driver application (.exe) and a Instagram user to be
    used as a bot account, it mounts the driver (open the web
    browser) and login at the bot account
    
        self.username / self.password: the bot account credentials
        (kept as plain attributes of the instance);
        self.driver_directory: path given to the last mount_driver;
        self.driver: the Selenium driver, None while not mounted;
        self.mounted: True after a successful mount_driver;
        self.logged_in: True after login ran to its end.
    """
    
    
    def __init__(self, username, password):
        """
        Creates an instance of InstaDriver given the username
        and password of a valid Instagram user account to be used
        as the bot for the InstaBot. No browser is opened here
        (see mount_driver)
        """
        
        self.username = username
        self.password = password
        self.driver_directory = None
        self.driver = None
        self.mounted = False
        self.logged_in = False
        
        
    def mount_driver(self, driver_directory=None, overwrite=False):
        """
        Get the web driver ready to begin the login in process
        (into a bot account)
        
        :Args:
            - driver_directory :: String: the directory path for
            a Chrome Driver executable (.exe)
            - overwrite :: Boolean: if a web driver is already
            opened, quit it and open a new one (when False, an
            already opened driver is an error)
            
        :Usage:
            random_driver.mount_driver('chromedriver.exe',
            overwrite=True)
            
        :Returns:
            None
            
        :Raises:
           Exception("Error found while mounting driver"): 
               - Driver directory not passed
               - A driver is already opened and overwrite
               is not set
        """
        
        already_open = self.driver is not None
        
        # If directory not passed or there is already an opened driver and overwrite is off.
        # Truthiness (not 'is True' / 'is False') is used so that a truthy
        # non-bool overwrite cannot skip both checks and leak the old browser.
        if driver_directory is None or (already_open and not overwrite):
            raise Exception("Error found while mounting driver")
        
        # Reaching here with an opened driver means overwrite was requested:
        # quit it before opening the new one
        if already_open:
            self.quit()
        
        self.driver_directory = driver_directory
        # NOTE(review): the executable path is passed positionally; confirm
        # this calling convention against the installed Selenium version
        self.driver = webdriver.Chrome(self.driver_directory)
        self.mounted = True
        
        
    def login(self, timeout=5):
        """
        Get the web driver to login with a bot account to
        begin the scraping process by InstaBot
        
        :Args:
            - timeout :: Float: seconds the WebDriver waits
            for each element until failure (in selenium.
            webdriver.support.wait.WebDriverWait.until)
            
        :Usage:
            random_driver.login()
            
        :Returns:
            None
            
        :Raises:
            - Exception("Driver not mounted")
            - Exception("Invalid timeout") 
            - Exceptions from selenium.webdriver.support.wait
            .WebDriverWait.until:
                - TimeoutException: if the username, password
                or submit elements do not become clickable
        """
        
        if self.driver is None:
            raise Exception("Driver not mounted")
            
        if timeout < 0:
            raise Exception("Invalid timeout")
        
        wait = WebDriverWait(self.driver, timeout)
        
        self.driver.get("https://www.instagram.com/")
        username = wait.until(EC.element_to_be_clickable(InstaStructure.username))
        password = wait.until(EC.element_to_be_clickable(InstaStructure.password))
        username.clear()
        password.clear()
        username.send_keys(self.username)
        password.send_keys(self.password)
        
        wait.until(EC.element_to_be_clickable(InstaStructure.submit)).click()
        
        # Two dialogs may show up after submitting ("save login information",
        # then "turn on notifications"). They are answered with "not now" on a
        # best-effort basis: a dialog which does not appear is not an error.
        optional_dialogs = (InstaStructure.save_information, InstaStructure.not_now_notification)
        for dialog_locator in optional_dialogs:
            try:
                wait.until(EC.presence_of_element_located(dialog_locator)).click()
            except Exception:
                # Dialog absent or not clickable: carry on
                # (KeyboardInterrupt / SystemExit are not swallowed)
                pass
        
        # NOTE: set once the steps above ran; the outcome of the login itself
        # is not checked here
        self.logged_in = True
        

    def quit(self):
        """
        Quits the driver (if one is opened) and resets the
        instance to its not mounted / not logged in state.
        Calling it without an opened driver is allowed
            
        :Usage:
            random_driver.quit()
            
        :Returns:
            None
            
        :Raises:
            - Exceptions from selenium.webdriver.Chrome (the
            state is reset even in this case)
        """
        
        try:
            if self.driver is not None:
                self.driver.quit()
        finally:
            # Always drop the reference, even when quitting failed, so a new
            # driver can be mounted afterwards
            self.driver = None
            self.mounted = False
            self.logged_in = False
        
        

In [10]:
# Interactive check: print the documentation generated from InstaDriver's docstrings
help(InstaDriver)

Help on class InstaDriver in module __main__:

class InstaDriver(builtins.object)
 |  InstaDriver(username, password)
 |  
 |  Defines a Selenium WebDriver which encapsulates opening and
 |  logging in Instagram
 |  ----------------------------------------------------------------------
 |  
 |  Given a driver application (.exe) and a Instagram user to be
 |  used as a bot account, it mounts the driver (open the web
 |  browser) and login at the bot account
 |  
 |  Methods defined here:
 |  
 |  __init__(self, username, password)
 |      Creates an instance of InstaDriver given the username
 |      and password of a valid Instagram user account to be used
 |      as the bot for the InstaBot
 |  
 |  login(self, timeout=5)
 |      Get the web driver to login with a bot account to
 |      begin the scraping process by InstaBot
 |      
 |      :Args:
 |          - timeout :: Float: seconds the WebDriver to wait
 |          ultil failure (in selenium.webdriver.support.wait
 |          .We

In [11]:
class InstaBot:
    """
    Defines an Instagram scraper bot which can get all of a
    user's information (all information of an User object)
    
    ----------------------------------------------------------------------
    
    Given a mounted and working InstaDriver, it scrapes info-
    rmation of Instagram users using Selenium and a Chrome
    Web Driver
    """
    
    def __init__(self, insta_driver):
        """
        Creates an instance of InstaBot given a valid Insta-
        Driver (valid AND working,
        ie, opened and logged in)
        """
        
        self.insta_driver = insta_driver
        # shortcut to the underlying Selenium driver used by every scraping method
        self.driver = self.insta_driver.driver
        
    
    def __get_list__(self, user=None, which='followers', sleep=5):
        """
        Get a list of followers/following of a User object 
        instance
        *Can be assumed the driver web browser is open in
        a valid user page*
        
        :Args:
            - user :: User: an User instance which contains
            at least a handle (accepted for symmetry with the
            other scrapers; this method does not read it)
            - which :: String: which list should the method
            scrape ('followers' or 'following')
            - sleep :: Float: time to sleep in InstaStructure
            .sleep(sleep)
            
        :Usage:
            Inside InstaBot.get_users_info:
                
                if get_followers:
                    user.followers_list = self.__get_list__(user)
            
        :Returns:
            A list containing all followers/following handles
            
        :Raises:
            - Exception if which is neither 'followers' nor
            'following'
        """        
        
        if which != 'followers' and which != 'following':
            raise Exception(f"Invalid list = {which}: choose either 'followers' or 'following'")
        
        # get the clickable scroll bar element and click it to open the followers/following page
        InstaStructure.get_followers_following_clickable(self.driver, which=which).click()
        InstaStructure.sleep(sleep)
        
        scroll_bar = InstaStructure.get_scroll_bar(self.driver, which)
        # height variables: start different so the loop runs at least once
        last_ht, ht = 0, 1
        
        # scroll the scroll bar until the reported height stops changing (bottom reached)
        while last_ht != ht:
            last_ht = ht
            InstaStructure.sleep(sleep)
            ht = self.driver.execute_script(InstaStructure.js_script_to_scroll, scroll_bar)
    
        # once at the bottom, scrapes all links and keeps the non-empty texts
        links = InstaStructure.get_links(scroll_bar)
        names = []
        for link in links:
            # read the text once per element: every access is a round trip to the browser
            text = link.text
            if text != '':
                names.append(text)
        
        # return to the users page
        self.driver.back()
        
        return names
    
               
    def __get_posts__(self, user=None, pause=5):
        """
        Get all posts inserted in a dictionary inside an
        User object
        *Can be assumed the driver web browser is open in
        a valid user page*
        
        :Args:
            - user :: User: an User instance which contains
            at least a handle; its posts attribute is reset
            to an empty dict and then filled as {href: Post}
            - pause :: Float: time to sleep in InstaStructure
            .sleep(pause)
            
        :Usage:
            Inside InstaBot.get_users_info:
                
                if get_posts:
                            self.__get_posts__(user=user)
            
        :Returns:
            None, but the information inserted inside the
            User object
        """ 
        
        # get driver internet page last height
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        user.posts = {}
        
        while True:
            
            # every pass re-reads the visible grid; posts already scraped are
            # recognised by their href, which is what keeps them from being
            # opened twice
            # NOTE(review): elements obtained before post.click()/driver.back()
            # may no longer be attached to the page afterwards - confirm that
            # InstaStructure.get_visible_posts results survive the navigation
            for post in InstaStructure.get_visible_posts(self.driver):
                
                # get post href (/p/href)
                post_href = InstaStructure.get_post_href(post)
                
                # skip posts fetched in a previous pass
                if post_href in user.posts:
                    continue
                
                # creates a post object with only the current post href
                current_post = Post(post_href)
                
                # open the post (post here is a page element)
                post.click()
                
                # inserts attributes for the current post
                self.__get_post_info__(current_post)
                
                # get back to user post grid
                self.driver.back()

                # adds the post to the users post dict
                user.posts[post_href] = current_post
            
            # scroll the page to make more posts visible
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            
            InstaStructure.sleep(pause)

            new_height = self.driver.execute_script("return document.body.scrollHeight")
            
            # stop once scrolling no longer increases the page height
            if new_height == last_height:
                break
                
            last_height = new_height
 

    def __get_post_info__(self, post=None, sleep=5):
        """
        Insert all posts information inside a Post object
        instance
        *Can be assumed the driver web browser is open in a
        valid post page*
        
        :Args:
            - post :: Post: a Post instance which contains
            at least its href
            - sleep :: Float: time to sleep in InstaStructure
            .sleep(sleep)
            
        :Usage:
            Inside InstaBot.__get_posts__:
                
                self.__get_post_info__(current_post)
            
        :Returns:
            None, but the information inserted inside the Post
            object (caption and datetime)
            
        :Raises:
            - Exception("Invalid post") if post is None or not
            a Post instance
        """ 
        
        if post is None or not isinstance(post, Post):
            raise Exception("Invalid post")
            
        # wait before reading the opened post page
        InstaStructure.sleep(sleep)
        
        post.caption = InstaStructure.get_caption(self.driver)
        post.datetime = InstaStructure.get_datetime(self.driver)
    
    
    def get_users_info(self, user=None, sleep=5, get_followers=True, get_following=True, get_posts=True):
        """
        Insert all users information inside a User object
        instance
        
        :Args:
            - user :: User: an User instance which contains at
            least a handle (*not* verified - could be invalid)
            - sleep :: Float: time to sleep in InstaStructure
            .sleep(sleep) after opening the user page
            - get_followers :: Boolean: should scrape followers
            list (default is True)
            - get_following :: Boolean: should scrape following
            list (default is True)
            - get_posts :: Boolean: should scrape posts dictionary
            (default is True)
            
        :Usage:
            random_bot.get_users_info(random_user)
            
        :Returns:
            None, but the information inserted inside the User
            object
            
        :Raises:
            - Exception("Invalid user") if user is None or not
            an User instance
            - Exception("Invalid user: handle invalid") if
            InstaStructure.is_valid_handle rejects the page
            - Exception("Invalid user: private user") if
            InstaStructure.is_public rejects the page
        """ 
        
        if user is None or not isinstance(user, User):
            raise Exception("Invalid user")
        
        self.driver.get(user.url)
        InstaStructure.sleep(sleep)
        
        if not InstaStructure.is_valid_handle(self.driver, user):
            raise Exception("Invalid user: handle invalid")
            
        if not InstaStructure.is_public(self.driver, user):
            raise Exception("Invalid user: private user")
                
        # the counters come as text; '.' is removed before int()
        # (presumably a thousands separator - confirm for other locales)
        user.n_posts = int(InstaStructure.get_n_posts(self.driver).replace('.',''))
        user.followers = int(InstaStructure.get_followers(self.driver).replace('.',''))
        user.following = int(InstaStructure.get_following(self.driver).replace('.',''))
        user.name = InstaStructure.get_name(self.driver)
        user.bio = InstaStructure.get_bio(self.driver)
        
        # NOTE(review): sleep is not forwarded below, so the list/post scrapers
        # always run with their own default waiting time
        if get_followers:
            user.followers_list = self.__get_list__(user)
            
        if get_following:
            user.following_list = self.__get_list__(user, which='following')
        
        if get_posts:
            self.__get_posts__(user=user)
            
            
    def batch_process(self, handles, sleep=5, get_followers=True, get_following=True, get_posts=True):
        """
        Scrapes the information for all users given a list of
        handles, creating a
        json file for each user titled user_handle.json and
        a log file
        
        A handle is considered already fetched when a file named
        <handle>.json exists in the current working directory; in
        that case it is only logged
        
        :Args:
            - handles :: List[String]: list of handles to be
            scraped. The list is consumed: handles are popped
            from its end, so it is empty when the method returns
            - sleep :: Float: time to sleep in InstaStructure
            .sleep(sleep)
            - get_followers :: Boolean: should scrape followers
            list (default is True)
            - get_following :: Boolean: should scrape following
            list (default is True)
            - get_posts :: Boolean: should scrape posts dictionary
            (default is True)
            
        :Usage:
            random_bot.batch_process(['user1', 'user2', 'user3'])
            
        :Returns:
            None, but jsons and log created
        """ 
        
        # one log file per run, named after the starting time
        log_name = datetime.datetime.now().strftime('%H_%M_%d_%m_%Y.log')

        with open(log_name, 'a', encoding="utf-8") as log:
            while handles:
                # get a handle to try to scrape
                handle = handles.pop()

                # handles already fetched successfully: every *.json file name
                # without its extension (splitext removes the suffix only,
                # unlike str.rstrip which strips a set of characters)
                # listed on every iteration so jsons written earlier in this
                # same run are taken into account
                fetched_handles = {
                    os.path.splitext(file_name)[0]
                    for file_name in os.listdir()
                    if file_name.endswith('.json')
                }

                # if handle already successfully fetched, just log it
                if handle in fetched_handles:
                    log.write(handle + ": already fetched before" + '\n')
                else:
                    # else, try to fetch the users information
                    random_user = User(handle)
                    
                    # if success in getting the users information, log success
                    try:
                        date_now = datetime.datetime.now().strftime('%H:%M of %m/%d/%Y')
                        self.get_users_info(random_user, sleep=sleep, get_followers=get_followers, get_following=get_following, get_posts=get_posts)
                        # write the json first so success is only logged once the
                        # result has actually been saved
                        random_user.to_json()
                        log.write(random_user.handle + ": fetched at " + date_now + '\n')
                    
                    # otherwise, log error and move on to the next handle
                    except Exception as ex:
                        log.write(random_user.handle + ": " + str(ex) + '\n')
        


In [12]:
# Print the documentation generated from the InstaBot docstrings
help(InstaBot)

Help on class InstaBot in module __main__:

class InstaBot(builtins.object)
 |  InstaBot(insta_driver)
 |  
 |  Defines a Instagram scraper bot which can get all of a
 |  user[s] information, (all information of an User object)
 |  
 |  ----------------------------------------------------------------------
 |  
 |  Given a mounted and working InstaDriver, it scrapes info-
 |  rmation of Instagram users using Selenium and a Chrome
 |  Web Driver
 |  
 |  Methods defined here:
 |  
 |  __get_list__(self, user=None, which='followers', sleep=5)
 |      Get a list of followers/following of a User object 
 |      instance
 |      *Can be assumed the driver web browser is open in
 |      a valid user page*
 |      
 |      :Args:
 |          - user :: User: an User instance which contains
 |          at least a handle
 |          - which :: String: which list should the method
 |          scrape ('followers' or 'following')
 |          - sleep :: Float: time to sleep in InstaStructure
 |     

In [21]:
# Handles to be scraped by bot.batch_process; the empty strings are placeholders to be filled in
handles = ['', '']

In [None]:
# Create the bot-account driver, open the browser through the given driver executable and log in
# NOTE(review): do not commit real credentials in this cell; consider reading them from the environment
driver = InstaDriver('', '') # insert username and password
driver.mount_driver('chromedriver.exe')
driver.login()

In [None]:
# Build the scraper on top of the InstaDriver created above (it reuses driver.driver)
bot = InstaBot(driver)

In [None]:
# Scrape only the posts of every handle (followers/following lists are skipped).
# batch_process pops from `handles`, so the list is empty after this cell runs.
bot.batch_process(handles, get_followers=False, get_following=False, get_posts=True)

In [88]:
import pandas as pd
import os
import random
import json
import datetime

def create_dataframes(first_id=1000, json_dir=None, seed=None):
    """
    Builds the users and posts DataFrames from the scraped
    user json files

    Every file ending in '.json' inside json_dir is read as one
    user. Files are shuffled before sequential ids are handed
    out, so an id does not reveal the file order. Posts whose
    caption is None are dropped.

    :Args:
        - first_id :: Int: id given to the first user; following
        users get first_id + 1, first_id + 2, ...
        - json_dir :: String: directory holding the json files
        (default is None, meaning the current working directory)
        - seed :: Int: seed for the shuffle; None (default) uses
        the global random state, any other value makes the id
        assignment reproducible

    :Usage:
        users, posts = create_dataframes()

    :Returns:
        (users, posts):
            users columns: user_id, bio, seguidos, seguidores,
            qtd_posts
            posts columns: user_id, contador, legenda, polaridade,
            emocao, qtd_likes, horario
        polaridade/emocao are filled with '' and qtd_likes with 0

    :Raises:
        - KeyError if a json lacks one of the expected keys
        - ValueError if a post datetime is not formatted as
        '%Y-%m-%dT%H:%M:%S.000Z'
    """
    user_columns = ['user_id', 'bio', 'seguidos', 'seguidores', 'qtd_posts']
    post_columns = ['user_id', 'contador', 'legenda', 'polaridade', 'emocao', 'qtd_likes', 'horario']

    if json_dir is None:
        json_dir = os.getcwd()

    # sorted first so that a given seed always yields the same order,
    # whatever order the file system lists the files in
    jsons = sorted(name for name in os.listdir(json_dir) if name.endswith('.json'))
    if seed is None:
        random.shuffle(jsons)
    else:
        random.Random(seed).shuffle(jsons)

    # rows are collected in plain lists and the frames are built once at the
    # end, instead of growing a DataFrame row by row inside the loop
    user_rows = []
    post_rows = []

    for user_id, js_file in enumerate(jsons, start=first_id):

        # context manager so the file handle is always closed
        with open(os.path.join(json_dir, js_file)) as f:
            data = json.load(f)

        # NOTE(review): user_id is stored as a string here and as an int in the
        # posts rows (kept as in the original) - align before joining on it
        user_rows.append([str(user_id), data['bio'], data['following'], data['followers'], data['n_posts']])

        captioned_posts = [post for post in data['posts'] if post['caption'] is not None]

        for counter, post in enumerate(captioned_posts):
            # re-format the timestamp from '2023-03-10T21:46:14.000Z' to '2023-03-10 21:46:14'
            posted_at = datetime.datetime.strptime(post['datetime'], '%Y-%m-%dT%H:%M:%S.000Z')
            post_rows.append([
                user_id,
                counter,
                post['caption'],
                '',   # polaridade: empty at this stage
                '',   # emocao: empty at this stage
                0,    # qtd_likes: always 0 here
                posted_at.strftime('%Y-%m-%d %H:%M:%S'),
            ])

    users = pd.DataFrame(user_rows, columns=user_columns)
    posts = pd.DataFrame(post_rows, columns=post_columns)

    return users, posts
        
    

In [89]:
# Build both frames from the *.json files in the current working directory (ids start at the default first_id)
users, posts = create_dataframes()

In [90]:
# Display the users frame as the cell output
users

Unnamed: 0,user_id,bio,seguidos,seguidores,qtd_posts
0,1000,,1492,1063,59
1,1001,• 🅑︎🅗︎ ✈︎ Uberlândia\n• SI | UFU 👨🏻‍💻,108,76,0
2,1002,🇧🇷❤🇧🇪,534,403,8
3,1003,,256,67,1
4,1004,🇧🇷🇵🇹\n@alekissimo\nUFU | Ciência da Computação,824,831,10
5,1005,"""Viver para ser melhor também é um jeito de le...",169,159,0
6,1006,São Carlos 🔄 Uberlândia\nUFU - Sistema de Info...,1389,750,10
7,1007,stanti.com.br/castraris,388,1101,64
8,1008,Vai Corinthians!!!,248,139,4
9,1009,Ciência da Computação - UFU,687,587,5


In [91]:
# Display the posts frame as the cell output
posts

Unnamed: 0,user_id,contador,legenda,polaridade,emocao,qtd_likes,horario
0,1000,0,{A arte de guardar fotos e só postar depois} •...,,,0,2023-03-10 21:46:14
1,1000,1,Imagine Dragons • Mercury Tour | 🎇🔥🐉❤️🤘🏻 | Mai...,,,0,2023-03-05 22:56:40
2,1000,2,"Última corrida do ano com eles, que me acompan...",,,0,2022-12-18 13:31:36
3,1000,3,Onde eu queria 🌟 nesse calor • Rio!,,,0,2022-11-27 22:19:05
4,1000,4,Turistando • 🦐🍃🐚☀️🌊,,,0,2022-11-13 23:04:44
...,...,...,...,...,...,...,...
2751,1035,11,,,,0,2021-01-10 11:59:32
2752,1035,12,,,,0,2019-02-19 21:43:15
2753,1036,0,Anivs.,,,0,2022-09-03 23:38:22
2754,1036,1,VACINADOS!!,,,0,2021-09-22 19:25:33


In [92]:
# Export the users frame to CSV without the row index ("para rotular" is Portuguese for "to be labelled")
users.to_csv('users_para_rotular.csv', index=False)

In [93]:
# Export the posts frame to CSV without the row index ("para rotular" is Portuguese for "to be labelled")
posts.to_csv('para_rotular.csv', index=False)