In [2]:
import json
import bs4
import requests
import re
import logging as log
from abc import ABCMeta, abstractmethod


class InstagramUser:
    def __init__(self, user_id, username=None, bio=None, followers_count=None, following_count=None, is_private=False):
        """
        A class to represent an Instagram User
        :param user_id: User ID of instagram user
        :param username: Username of Instagram user
        :param bio: Bio text for user
        :param followers_count: Number of followers
        :param following_count: Number of people following
        :param is_private: Boolean to indicate if account is private or not
        """
        self.id = user_id
        self.username = username
        self.bio = bio
        self.followers_count = followers_count
        self.following_count = following_count
        self.is_private = is_private


class InstagramPost:
    def __init__(self, post_id, code, user=None, caption="", display_src=None, is_video=False, created_at=None):
        """
        A class to represent a post on Instagram
        :param post_id: ID of the post
        :param code: Code of the post
        :param user: A user object representing the owner of the post
        :param caption: The caption/text of the post
        :param display_src: The URL of the image of the post
        :param is_video: A boolean value indicating it's a video
        :param created_at: The time it was created
        """
        self.post_id = post_id
        self.code = code
        self.caption = caption
        self.user = user
        self.display_src = display_src
        self.is_video = is_video
        self.created_at = created_at

    def processed_text(self):
        """
        Processes a caption to remove newlines in it.
        :return:
        """
        if self.caption is None:
            return ""
        else:
            text = re.sub('[\n\r]', ' ', self.caption)
            return text

    def hashtags(self):
        """
        Simple hashtag extractor to return the hastags in the post
        :return:
        """
        hashtags = []
        if self.caption is None:
            return hashtags
        else:
            for tag in re.findall("#[a-zA-Z0-9]+", self.caption):
                hashtags.append(tag)
            return hashtags


class HashTagSearch(metaclass=ABCMeta):
    instagram_root = "https://www.instagram.com"

    def __init__(self, ):
        """
        This class performs a search on Instagrams hashtag search engine, and extracts posts for that given hashtag.
        There are some limitations, as this does not extract all occurrences of the hash tag.
        Instead, it extracts the most recent uses of the tag.
        """
        super().__init__()

    def extract_recent_tag(self, tag):
        """
        Extracts Instagram posts for a given hashtag
        :param tag: Hashtag to extract
        """

        url_string = "https://www.instagram.com/explore/tags/%s/" % tag
        response = bs4.BeautifulSoup(requests.get(url_string).text, "html.parser")
        query_ids = self.get_query_id(response)
        shared_data = self.extract_shared_data(response)

        media = shared_data['entry_data']['TagPage'][0]['tag']['media']
        posts = []
        for node in media['nodes']:
            post = self.extract_recent_instagram_post(node)
            posts.append(post)
        self.save_results(posts)

        end_cursor = media['page_info']['end_cursor']
        while end_cursor is not None:
            try:
                url = "https://www.instagram.com/graphql/query/?query_id=%s&tag_name=%s&first=12&after=%s" % (
                query_ids[0], tag, end_cursor)
                data = json.loads(requests.get(url).text)
                end_cursor = data['data']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']
                posts = []
                for node in data['data']['hashtag']['edge_hashtag_to_media']['edges']:
                    posts.append(self.extract_recent_query_instagram_post(node['node']))
                self.save_results(posts)
            except Exception as e:
                query_ids = query_ids[1:]
                if len(query_ids) == 0:
                    log.error("Could not extract Query Id, exiting")
                    end_cursor = None

    @staticmethod
    def extract_shared_data(doc):
        for script_tag in doc.find_all("script"):
            if script_tag.text.startswith("window._sharedData ="):
                shared_data = re.sub("^window\._sharedData = ", "", script_tag.text)
                shared_data = re.sub(";$", "", shared_data)
                shared_data = json.loads(shared_data)
                return shared_data

    @staticmethod
    def extract_recent_instagram_post(node):
        return InstagramPost(
            post_id=node['id'],
            code=node['code'],
            user=InstagramUser(user_id=node['owner']['id']),
            caption=node['caption'],
            display_src=node['display_src'],
            is_video=node['is_video'],
            created_at=node['date']
        )

    @staticmethod
    def extract_recent_query_instagram_post(node):
        return InstagramPost(
            post_id=node['id'],
            code=node['shortcode'],
            user=InstagramUser(user_id=node['owner']['id']),
            caption=node['edge_media_to_caption']['edges'][0]['node']['text']
            if len(node['edge_media_to_caption']['edges']) > 0 else None,
            display_src=node['display_url'],
            is_video=node['is_video'],
            created_at=node['taken_at_timestamp']
        )

    @staticmethod
    def extract_owner_details(owner):
        """
        Extracts the details of a user object.
        :param owner: Instagrams JSON user object
        :return: An Instagram User object
        """
        username = None
        if "username" in owner:
            username = owner["username"]
        is_private = False
        if "is_private" in owner:
            is_private = is_private
        user = InstagramUser(owner['id'], username=username, is_private=is_private)
        return user

    def get_query_id(self, doc):
        query_ids = []
        for script in doc.find_all("script"):
            if script.has_attr("src") and "en_US_Commons" in script['src']:
                text = requests.get("%s%s" % (self.instagram_root, script['src'])).text
                for query_id in re.findall("(?<=c=\")[0-9]{1,}", text):
                    query_ids.append(query_id)
        return query_ids

    @abstractmethod
    def save_results(self, instagram_results):
        print instagram_results


class HashTagSearchExample(HashTagSearch):
    def __init__(self):
        super().__init__()
        self.total_posts = 0

    def save_results(self, instagram_results):
        super().save_results(instagram_results)
        for i, post in enumerate(instagram_results):
            self.total_posts += 1
            print("%i - %s" % (self.total_posts, post.processed_text()))


if __name__ == '__main__':
    log.basicConfig(level=log.INFO)
HashTagSearchExample().extract_recent_tag("christmas")

SyntaxError: invalid syntax (<ipython-input-2-6b2def13fef5>, line 73)

In [29]:
import json
import requests
import logging as log
import simplejson
import json
log.basicConfig(level=log.DEBUG)


class FollowerExtractor():
    """
    Extracts followers for a given profile
    """

    def __init__(self, username, password):
        self.csrf_token, self.cookie_string = FollowerExtractor.login_instagram(username, password)
        log.info("CSRF Token set to %s", self.csrf_token)
        log.info("Cookie String set to %s" % self.cookie_string)

    @staticmethod
    def get_csrf_and_cookie_string():
        resp = requests.head("https://www.instagram.com")
        return resp.cookies['csrftoken'], resp.headers['set-cookie']

    @staticmethod
    def login_instagram(username, password):
        csrf_token, cookie_string = FollowerExtractor.get_csrf_and_cookie_string()
        data = {"username": username, "password": password}
        resp = requests.post("https://www.instagram.com/accounts/login/ajax/",
                             data=data,
                             headers={
                                 "referer": "https://www.instagram.com/",
                                 "accept": "*/*",
                                 "Accept-Language": "en-GB,en;q=0.8",
                                 "cache-control": "no-cache",
                                 "content-length": "40",
                                 "Content-Type": "application/x-www-form-urlencoded",
                                 "cookie": cookie_string,
                                 "origin": "https://www.instagram.com",
                                 "pragma": "no-cache",
                                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
                                 "x-csrftoken": csrf_token,
                                 "x-instagram-ajax": "1",
                                 "X-Requested-With": "XMLHttpRequest"
                             })
        return resp.cookies['csrftoken'], resp.headers['set-cookie']

    def extract_followed_by(self, username, user_id=None):
        if user_id is None:
            user_id = json.loads(requests.get("https://www.instagram.com/%s?__a=1" % username).text)['user']['id']

        resp = self.query_followed_by(username, user_id)
        followers = resp['followed_by']['nodes']
        self.save_followed_by(followers)
        while resp['followed_by']['page_info']['has_next_page']:
            resp = self.query_followed_by(username, user_id, resp['followed_by']['page_info']['end_cursor'])
            followers = resp['followed_by']['nodes']
            self.save_followed_by(followers)
            followers += resp['followed_by']['nodes']

        return followers

    def extract_following(self, username, user_id=None):
        if user_id is None:
            user_id = json.loads(requests.get("https://www.instagram.com/%s?__a=1" % username).text)['user']['id']

        resp = self.query_following(username, user_id)
        followers = resp['follows']['nodes']
        self.save_following(followers)
        while resp['follows']['page_info']['has_next_page']:
            resp = self.query_following(username, user_id, resp['follows']['page_info']['end_cursor'])
            followers = resp['follows']['nodes']
            self.save_following(followers)
            followers += resp['follows']['nodes']

        return followers

    def query_following(self, username, user_id, end_cursor=None):
        headers = self.get_headers("https://www.instagram.com/%s" % username)
        post_data = self.get_following_params(user_id, end_cursor)
        req = requests.post("https://www.instagram.com/query/", data=post_data, headers=headers)
        return response.json()

    def query_followed_by(self, username, user_id, end_cursor=None):
        headers = self.get_headers("https://www.instagram.com/%s" % username)
        post_data = self.get_followed_by_params(user_id, end_cursor)
        req = requests.post("https://www.instagram.com/query/", data=post_data, headers=headers)
        return response.json()

    def get_headers(self, referrer):
        """
        Returns a bunch of headers we need to use when querying Instagram
        :param referrer: The page referrer URL
        :return: A dict of headers
        """
        return {
            "referer": referrer,
            "accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "en-GB,en;q=0.8,en-US;q=0.6",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "cookie": self.cookie_string,
            "origin": "https://www.instagram.com",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/49.0.2623.87 Safari/537.36",
            "x-csrftoken": self.csrf_token,
            "x-instagram-ajax": "1",
            "X-Requested-With": "XMLHttpRequest"
        }

    @staticmethod
    def get_followed_by_params(user_id, end_cursor=None):
        """
        Returns the query params required to load next page on Instagram.
        This can be modified to return less information.
        :param tag: Tag we're querying
        :param end_cursor: The end cursor Instagram specifies
        :return: A dict of request parameters
        """
        if end_cursor is None:
            start_query = "ig_user(%s) { followed_by.first(20) {" % user_id
        else:
            start_query = "ig_user(%s) { followed_by.after(%s, 20) {" % (user_id, end_cursor)
        return {
            'q':
                start_query +
                "  count," +
                "  nodes {" +
                "    id," +
                "    is_verified," +
                "    followed_by_viewer," +
                "    requested_by_viewer," +
                "    full_name," +
                "    profile_pic_url," +
                "    username" +
                "  }," +
                "  page_info {" +
                "    end_cursor," +
                "    has_next_page" +
                "  }" +
                "}" +
                " }",
            "ref": "relationships::follow_list"
        }

    @staticmethod
    def get_following_params(user_id, end_cursor=None):
        """
        Returns the query params required to load next page on Instagram.
        This can be modified to return less information.
        :param tag: Tag we're querying
        :param end_cursor: The end cursor Instagram specifies
        :return: A dict of request parameters
        """
        if end_cursor is None:
            start_query = "ig_user(%s) { follows.first(20) {" % user_id
        else:
            start_query = "ig_user(%s) { follows.after(%s, 20) {" % (user_id, end_cursor)
        return {
            'q':
                start_query +
                "  count," +
                "  nodes {" +
                "    id," +
                "    is_verified," +
                "    followed_by_viewer," +
                "    requested_by_viewer," +
                "    full_name," +
                "    profile_pic_url," +
                "    username" +
                "  }," +
                "  page_info {" +
                "    end_cursor," +
                "    has_next_page" +
                "  }" +
                "}" +
                " }",
            "ref": "relationships::follow_list"
        }

    def save_following(self, following):
        """
        Called when a new batch of following users has been extracted from Instagram
        :param following: Users who are following user
        """
        for user in following:
            print("Following: %s" % user['username'])

    def save_followed_by(self, followed_by):
        """
        Called when a new batch of followed_by users has been extracted from Instagram
        :param following: Users who are followed_by
        """
        for user in followed_by:
            print("Followed By: %s" % user['username'])

if __name__ == '__main__':
    instagram_username = "mitidesai_1692"
    instagram_password = "#one2three4"
    followed_extractor = FollowerExtractor(instagram_username, instagram_password)
    followed_extractor.extract_following("justintimberlake")
followed_extractor.extract_followed_by("justintimberlake")

DEBUG:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): www.instagram.com
DEBUG:requests.packages.urllib3.connectionpool:https://www.instagram.com:443 "HEAD / HTTP/1.1" 200 0
DEBUG:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): www.instagram.com
DEBUG:requests.packages.urllib3.connectionpool:https://www.instagram.com:443 "POST /accounts/login/ajax/ HTTP/1.1" 403 15684
INFO:root:CSRF Token set to GJx0tc8FCWV4Q482PxHkQPu9PLF6dAxy
INFO:root:Cookie String set to rur=PRN; Path=/, csrftoken=GJx0tc8FCWV4Q482PxHkQPu9PLF6dAxy; expires=Wed, 13-Jun-2018 15:08:05 GMT; Max-Age=31449600; Path=/; Secure, mid=WUFRVQALAAG5W-sI5XDxzq0NnvYb; expires=Tue, 09-Jun-2037 15:08:05 GMT; Max-Age=630720000; Path=/
DEBUG:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): www.instagram.com
DEBUG:requests.packages.urllib3.connectionpool:https://www.instagram.com:443 "GET /justintimberlake?__a=1 HTTP/1.1" 301 0
DEBUG:requests.packages.url

NameError: global name 'response' is not defined

In [35]:
#! /env/bin/python3

'''
BeautifulSoup Instagram Scraper that gets all pictures posted by celebrities yesterday (max 12 pictures per celeb)
'''

import requests, json, os, pprint, sys, tweepy
from collections import Counter
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from top100gram import top100
from emoji import emoji

#Configuration
instagram = 'http://www.instagram.com/' 
yesterday = (datetime.today()- timedelta(days=1)).strftime('%Y-%m-%d') 
pictureFolder = os.path.join('.','SelenaBot_Pictures',yesterday)
captionFolder = os.path.join('.','SelenaBot_Captions')
emojiFolder = os.path.join('.','SelenaBot_Emoji')
captionFilepath = os.path.join(captionFolder, yesterday +'_captions.txt')
emojiFilepath = os.path.join(emojiFolder, yesterday +'_emoji.txt')

for p in [pictureFolder, captionFolder, emojiFolder]: #makes the prequiste folders, if they don't already exits
    if os.path.exists(p) == False:
        os.makedirs(p) 

def make_twitter_api():
    AUTH_FILE = sys.argv[1]
    with open(AUTH_FILE, "r") as auth_file:
        auth_data = json.load(auth_file)

    auth = tweepy.OAuthHandler(auth_data["9fe8nBe9TiLFMYkchZrsPR1Ek"],
                               auth_data["mhh1v5Z7jKGlVFy4RM93FovnFe0CCdFp8M3CseWJ1BDtoo8x0H"])
    auth.set_access_token(auth_data["334700312-RbFnOySH2X103PiHyS7nbhDcqHIoj6luBSfiw5dx"],
                          auth_data["lcevOHD2IAZ3goXkr3DnIZHmLnMfKFAsuGjLDh969fSq5"])

    return tweepy.API(auth)

def DateStamp(ds):
    d = datetime.fromtimestamp(ds)
    return d.strftime('%y-%m-%d_%H-%M-%S')

def picFinder(account):    
    try:
        rgram = requests.get(instagram + account) #accesses the instagram account
        rgram.raise_for_status()
    except requests.exceptions.HTTPError:	#this handles exceptions if accounts get deleted or suspended. Does not handle exceptions for accounts made private
        print('\t \t ### ACCOUNT MISSING ###')
    else:        
        selenaSoup=BeautifulSoup(rgram.text,'html.parser')
        pageJS = selenaSoup.select('script') #selects all the JavaScript on the page
        for i, j in enumerate(pageJS): #Converts pageJS to list of strings so i can calculate length for below. If BS4 has a neater way of doing this, I haven't found it.
            pageJS[i]=str(j)
        picInfo= sorted(pageJS,key=len, reverse=True)[0] #finds the longest bit of JavaScript on the page, which always contains the image data
        allPics = json.loads(str(picInfo)[52:-10])['entry_data']['ProfilePage'][0]['user']['media']['nodes']

    return allPics

def captionDownloader(picture):
    captions = []
    captionFile = open(captionFilepath,'a')
    if 'caption' in picture.keys():
        print('\tcopying caption for picture '+DateStamp(picture['date']))
        captions.append(picture['caption'])
    captionFile.write(str(captions))
    captionFile.close()

def picDownloader(account):
    for picture in picFinder(account):
        if datetime.fromtimestamp(picture['date']).strftime('%Y-%m-%d') == yesterday: #finds pictures from yesterday
                print('\tDownloading picture '+DateStamp(picture['date']))
                picRes = requests.get(picture['display_src'])
                picFileName = os.path.join(pictureFolder, account+'_'+DateStamp(picture['date'])+'.jpg')
                picFile = open(picFileName,'wb')

                for chunk in picRes.iter_content(100000):
                    picFile.write(chunk)

                picFile.close()
                captionDownloader(picture)

def emojiCounter():
    print('Counting the emojis...')
    captions = open(captionFilepath,'r').read() #opens and read the captions file
    emojiList = []
    for c in captions:
        if c in emoji:
            emojiList.append(c)
    emojiCount= Counter(emojiList)
    topEmoji=emojiCount.most_common(1)
    pprint.pprint(emojiCount) #outputs the count in the terminal
    for i, j in topEmoji:
        print('##########\nthe emoji '+ i +' was used ' +str(j)+' times on instagram yesterday!')
    #saves the count in a text file
    emojiFile= open(emojiFilepath,'w')
    emojiFile.write(pprint.pformat(emojiCount))
    emojiFile.close()


    return topEmoji

    
def main():
    for account in top100:
        print('Pictures from today on '+account+'\'s Instagram')
        picDownloader(account)

    api = make_twitter_api()
    for i, j in emojiCounter():
        api.update_status('The most popular emoji on Instagram yesterday was: '+ i +' which was used ' +str(j)+' times')


if __name__ == '__main__':
    main()

ImportError: No module named top100gram