In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm #Still for graphics
import matplotlib.colors # For nice colours
import wordcloud
import numpy as np
import scipy
import glob
import seaborn as sns
import sklearn.manifold
import itertools
import json
import urllib.parse
from bs4 import BeautifulSoup as bs
import spacy #comp-linguistics
import gensim#For topic modeling

#These are from the standard library
import os.path
import zipfile
import subprocess
import io
import tempfile

#To process text data
import re
import time
import string
import statsmodels.formula.api as smf
from gensim import corpora, models
from collections import Counter
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
from langdetect import detect
from textblob import Word
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#Scrape Youtube
from googleapiclient import discovery
from googleapiclient import errors
from oauth2client.tools import argparser
from youtube_transcript_api import YouTubeTranscriptApi



## Instagram

In [3]:
# Scrape links to the posts published under a given hashtag
def scrape_hashtag(hashtag, num_posts, max_idle_scrolls=5):
    """Collect post URLs from an Instagram hashtag page.

    A Chrome window is opened on the login page and the function waits 30
    seconds so the user can log in manually. It then loads the hashtag page
    and keeps scrolling, harvesting post links, until enough were found.

    Parameters
    ----------
    hashtag : str
        Hashtag to browse, without the leading '#'.
    num_posts : int
        Maximum number of post links to return.
    max_idle_scrolls : int, optional
        Give up after this many consecutive scrolls that reveal no new link.
        Without this guard the loop would never end once the feed runs dry.

    Returns
    -------
    list of str
        Unique post URLs in the order they were discovered, at most
        ``num_posts`` of them.
    """
    browser = webdriver.Chrome(ChromeDriverManager().install())
    scroll_down = "window.scrollTo(0, document.body.scrollHeight);"
    links = []
    seen = set()  # constant-time duplicate check; `links` keeps discovery order

    try:
        browser.get('https://www.instagram.com/accounts/login/')
        time.sleep(30) #Time to log in for scraping
        browser.get('https://www.instagram.com/explore/tags/' + hashtag)
        browser.execute_script(scroll_down)

        idle_scrolls = 0
        # Repeatedly scroll down and scrape new data until num_posts is reached
        while len(links) < num_posts and idle_scrolls < max_idle_scrolls:
            data = bs(browser.page_source, 'html.parser')
            body = data.find('body')
            # The <article> element may be missing (page still loading, login
            # wall, layout change); treat that as "nothing new" instead of crashing.
            article = body.find('article') if body is not None else None
            anchors = article.find_all('a') if article is not None else []

            found_new = False
            for anchor in anchors:
                href = anchor.get('href')
                # Anchors without href return None; post permalinks start with /p/
                if not href or not href.startswith('/p/'):
                    continue
                url = 'https://www.instagram.com' + href
                if url not in seen:
                    seen.add(url)
                    links.append(url)
                    found_new = True

            idle_scrolls = 0 if found_new else idle_scrolls + 1
            browser.execute_script(scroll_down)
            time.sleep(3)
    finally:
        # Always release the Chrome process, even when scraping fails midway
        browser.quit()

    return links[:num_posts]

In [2]:
# Build a DataFrame of post metadata from a list of post links
_ADDITIONAL_DATA_PREFIX = "window.__additionalDataLoaded"


def scrape_post(post_lst, ins_data):
    """Visit every post link and gather its metadata into a DataFrame.

    A Chrome window is opened on the login page and the function waits 30
    seconds so the user can log in manually before the posts are visited.

    Parameters
    ----------
    post_lst : list of str
        Post URLs to visit.
    ins_data : dict
        Accumulator mapping the position of a link in ``post_lst`` to the
        metadata dict of that post. It is updated in place; pass ``{}`` to
        start from scratch.

    Returns
    -------
    pandas.DataFrame
        One row per post that could be parsed, indexed by the position of
        its link in ``post_lst``.
    """
    browser = webdriver.Chrome(ChromeDriverManager().install())
    try:
        browser.get('https://www.instagram.com/accounts/login/')
        time.sleep(30) #Time to log in for scraping

        for i, post_url in enumerate(post_lst):
            browser.get(post_url)
            time.sleep(25)
            soup = bs(browser.page_source, "html.parser")
            scripts = soup.find_all("script")

            # If the post link does not contain the post anymore, skip the link
            # (previously this returned early and abandoned all remaining links).
            if len(scripts) < 7:
                print('no')
                continue

            # NOTE(review): the 21:24 window is taken from the original code;
            # presumably that is where the data script sat in the page layout
            # at the time of writing - confirm against the current markup.
            for script in scripts[21:24]:
                info = _parse_post_script(script)
                if info is not None:
                    ins_data[i] = info
    finally:
        # Always release the Chrome process, even when scraping fails midway
        browser.quit()

    return pd.DataFrame(ins_data).T


def _parse_post_script(script):
    """Return the post metadata embedded in one <script> tag, or None if it has none."""
    # <script src=...> tags have no contents; indexing them would raise
    if not script.contents:
        return None
    text = str(script.contents[0])
    if not text.startswith(_ADDITIONAL_DATA_PREFIX):
        return None
    print("found")

    # Cut out the JSON object by its braces rather than by fixed offsets,
    # which only worked for one specific shortcode length.
    start, end = text.find('{'), text.rfind('}')
    if start == -1 or end < start:
        return None

    try:
        data = json.loads(text[start:end + 1])
        image_data = data["graphql"]["shortcode_media"]
        if not image_data:
            print('nooo')
            return None

        info = {}
        info["timestamp"] = datetime.fromtimestamp(image_data["taken_at_timestamp"])
        info["caption"] = str(image_data["edge_media_to_caption"]["edges"][0]["node"]["text"])
        info["user"] = str(image_data["owner"]["username"])
        info["full_name"] = str(image_data["owner"]["full_name"])
        info["likes"] = image_data["edge_media_preview_like"]["count"]
        info["image_url"] = str(image_data["display_url"])
        info["dimensions"] = image_data["dimensions"]

        # Store the post location, but some posts do not have a location
        location = image_data["location"]
        if location:
            if location["address_json"]:
                info["location"] = json.loads(location["address_json"])
                info["country"] = info["location"]["country_code"]

        # Store the accounts tagged in the post; the list is empty when there are none
        info["tagged_account"] = [
            edge["node"]["user"]["username"]
            for edge in image_data["edge_media_to_tagged_user"]["edges"]
        ]
    except (ValueError, KeyError, IndexError, TypeError):
        # Malformed JSON or a missing field: the post is left out, as before,
        # but unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
        return None
    return info

In [4]:
# Gather up to 3000 post links for the hashtag, scrape the first 50 posts
# and save the resulting table as <hashtag>_Instagram.csv.
hashtag = "lululemon"
raw_links = scrape_hashtag(hashtag, num_posts=3000)
ins_data = scrape_post(post_lst=raw_links[:50], ins_data={})
ins_data.to_csv('{}_Instagram.csv'.format(hashtag))

## YouTube

In [5]:
# The YouTube Data API key is read from the YOUTUBE_API_KEY environment
# variable so that it is never stored in the notebook. The key that used to
# be embedded here has been exposed and should be revoked.
DEVELOPER_KEY = os.environ["YOUTUBE_API_KEY"]
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
# Shared API client used by the search and statistics helpers below
youtube = discovery.build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                          developerKey=DEVELOPER_KEY)

In [6]:
#https://github.com/SeyiAgboola/YouTube-Mining/blob/master/youtube_search.py
# Search videos; results can be ordered by date, rating, relevance, title or viewCount
def youtubeSearch(query, max_results=50, order="viewCount", token=None, 
                  location=None, location_radius=None):
    """Run one page of a YouTube video search and return the raw response.

    Only videos that have closed captions are requested. The page token is
    printed before the request and a short summary afterwards.

    Parameters
    ----------
    query : str
        Search terms.
    max_results : int
        Number of results requested for this page.
    order : str
        Sort order passed to the API.
    token : str or None
        Page token of the page to fetch; None requests the first page.
    location, location_radius
        Optional geographic filter, forwarded unchanged to the API.

    Returns
    -------
    dict
        The response returned by the search request.
    """
    print(token)

    search_params = dict(
        q=query,
        type="video",
        pageToken=token,
        order=order,
        videoCaption='closedCaption',
        part="id,snippet",
        maxResults=max_results,
        location=location,
        locationRadius=location_radius,
    )
    request = youtube.search().list(**search_params)
    search_response = request.execute()

    print("Search Completed...")
    page_info = search_response['pageInfo']
    summary = "Total results: {0} \nResults per page: {1}".format(
        page_info['totalResults'], page_info['resultsPerPage'])
    print(summary)
    return search_response

In [7]:
def storeResults(response):
    """Turn one page of search results into a dict of equally long columns.

    For every video in ``response`` the statistics and snippet are fetched
    through the module-level ``youtube`` client and the English captions
    through ``YouTubeTranscriptApi``.

    Parameters
    ----------
    response : dict
        A search response as returned by ``youtubeSearch``.

    Returns
    -------
    dict
        Maps each column name ('tags', 'channelId', 'channelTitle',
        'categoryId', 'title', 'videoId', 'viewCount', 'likeCount',
        'dislikeCount', 'commentCount', 'favoriteCount', 'captions') to a
        list with one entry per video. Missing counts are stored as
        "Not available", a missing comment count as 0, missing tags as
        "No Tags" and missing captions as "".
    """
    columns = {name: [] for name in (
        'tags', 'channelId', 'channelTitle', 'categoryId', 'title', 'videoId',
        'viewCount', 'likeCount', 'dislikeCount', 'commentCount',
        'favoriteCount', 'captions')}

    for search_result in response.get("items", []):
        if search_result["id"]["kind"] != "youtube#video":
            continue
        video_id = search_result['id']['videoId']

        # Collect stats on each video using its videoId
        stats = youtube.videos().list(
            part='statistics, snippet',
            id=video_id).execute()
        items = stats.get('items') or []
        if not items:
            # No details came back for this id: skip it instead of crashing
            continue
        snippet = items[0].get('snippet', {})
        statistics = items[0].get('statistics', {})

        # Build the whole row first so that a failure can never leave the
        # columns with different lengths.
        row = {
            'title': search_result['snippet']['title'],
            'videoId': video_id,
            'channelId': snippet['channelId'],
            'channelTitle': snippet['channelTitle'],
            'categoryId': snippet['categoryId'],
            'favoriteCount': statistics.get('favoriteCount', "Not available"),
            'viewCount': statistics.get('viewCount', "Not available"),
            'likeCount': statistics.get('likeCount', "Not available"),
            'dislikeCount': statistics.get('dislikeCount', "Not available"),
            'commentCount': statistics.get('commentCount', 0),
            'tags': snippet.get('tags', "No Tags"),
            'captions': _fetch_english_caption(video_id),
        }
        for name, value in row.items():
            columns[name].append(value)

    return columns


def _fetch_english_caption(video_id):
    """Return the English caption text of a video, or "" if none can be retrieved."""
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    except Exception:
        # Transcripts disabled, video unavailable, network failure, ...
        return ""

    # Try the auto-generated transcript first, then the manually created one
    finders = (transcript_list.find_generated_transcript,
               transcript_list.find_manually_created_transcript)
    for find_transcript in finders:
        try:
            return extract_captions(find_transcript(['en']))
        except Exception:
            continue
    return ""

def extract_captions(transcript, separator=" "):
    """Join all segments of a transcript into a single caption string.

    Parameters
    ----------
    transcript
        Object whose ``fetch()`` yields segments exposing their text under
        the ``'text'`` key.
    separator : str, optional
        Text inserted between consecutive segments. The default single space
        keeps the last word of one segment from fusing with the first word
        of the next; pass ``""`` for plain concatenation.

    Returns
    -------
    str
        The joined caption text, ``""`` when the transcript has no segments.
    """
    return separator.join(segment['text'] for segment in transcript.fetch())

In [8]:
# Save a dict of columns as a CSV file
def writeCSV(results, filename):
    """Write ``results`` to ``filename`` as comma-separated values.

    The header row lists the keys of ``results`` in sorted order; each
    following row takes one value from every column. The file is opened in
    write mode, so an existing file is replaced. Rows are produced with
    ``zip``, hence they stop at the shortest column.
    """
    import csv

    header = sorted(results.keys())
    column_values = [results[name] for name in header]

    with open(filename, "w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle, delimiter=",")
        out.writerow(header)
        for row in zip(*column_values):
            out.writerow(row)

In [9]:
#Run YouTube Search
# Fetch the first page plus up to 10 further pages and keep ALL of them:
# every page is merged into `results` before the CSV is rewritten, so the
# file on disk always holds everything collected so far (previously each
# page overwrote the file and only the last page survived).
q ='nike'
response = youtubeSearch(q)
results = storeResults(response)
writeCSV(results, "{}_YouTube.csv".format(q))
for i in range(10):
    # The last page of a search carries no nextPageToken: stop there
    token = response.get('nextPageToken')
    if not token:
        break
    response = youtubeSearch(q, token=token)
    for column, values in storeResults(response).items():
        results.setdefault(column, []).extend(values)
    writeCSV(results, "{}_YouTube.csv".format(q))

## Reddit

In [10]:
# Reddit credentials are read from environment variables so that they are
# never stored in the notebook. The values that used to be embedded here
# have been exposed and should be rotated.
CLIENT_ID = os.environ["REDDIT_CLIENT_ID"]
SECRET_TOKEN = os.environ["REDDIT_CLIENT_SECRET"]
# Pass the credential variables themselves (the quoted names 'CLIENT_ID' and
# 'SECRET_TOKEN' were being sent as the literal id and secret before).
auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_TOKEN)
data = {'grant_type': 'password',
        'username': os.environ["REDDIT_USERNAME"],
        'password': os.environ["REDDIT_PASSWORD"]}

In [11]:
import praw
# Credentials come from environment variables so that they are never stored
# in the notebook; the values previously embedded here should be rotated.
reddit = praw.Reddit(client_id=os.environ["REDDIT_CLIENT_ID"],
                     client_secret=os.environ["REDDIT_CLIENT_SECRET"],
                     user_agent='gmt_project',
                     username=os.environ["REDDIT_USERNAME"],
                     password=os.environ["REDDIT_PASSWORD"])

In [12]:
def scrape_subreddit(subreddit, limit):
    """Collect submissions from the new, top and hot listings of a subreddit.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to read.
    limit : int
        Maximum number of submissions requested from each listing.

    Returns
    -------
    pandas.DataFrame
        One row per collected submission with the columns title, upvotes,
        upvote_ratio, url, num_commns, created and body. ``created`` is
        converted from a timestamp with ``datetime.fromtimestamp``. A
        submission present in several listings appears several times.
    """
    community = reddit.subreddit(subreddit)
    listings = (
        community.new(limit=limit),
        community.top(limit=limit),
        community.hot(limit=limit),
    )

    data_dict = {"title": [], "upvotes": [], "upvote_ratio": [], "url": [],
                 "num_commns": [], "created": [], "body": []}
    for listing in listings:
        data_dict = extract_post(data_dict, listing)

    data_df = pd.DataFrame(data_dict)
    data_df['created'] = data_df['created'].apply(datetime.fromtimestamp)
    return data_df

def extract_post(data_dict, subreddit):
    """Append the fields of every submission in ``subreddit`` to ``data_dict``.

    ``data_dict`` maps column names to lists; it is extended in place and
    also returned. ``subreddit`` is any iterable of submissions.
    """
    # (column in data_dict, attribute read from the submission), in append order
    column_sources = (
        ("title", "title"),
        ("upvotes", "score"),
        ("upvote_ratio", "upvote_ratio"),
        ("url", "url"),
        ("num_commns", "num_comments"),
        ("created", "created_utc"),
        ("body", "selftext"),
    )
    for post in subreddit:
        for column, attribute in column_sources:
            data_dict[column].append(getattr(post, attribute))
    return data_dict

In [13]:
# Scrape the subreddit, drop the rows repeated across listings
# and save the table as <q>_Reddit.csv.
q = "nike"
df = scrape_subreddit(subreddit=q, limit=5000).drop_duplicates()
df.to_csv('{}_Reddit.csv'.format(q))

## Twitter

In [14]:
import tweepy as tw
# Twitter credentials are read from environment variables so that they are
# never stored in the notebook. The keys and tokens that used to be embedded
# here have been exposed and should be regenerated.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit is enabled so long cursors are not cut short by rate limits
api = tw.API(auth,wait_on_rate_limit=True)

In [15]:
# Search up to 5000 English tweets containing the keyword and save them
# as <keyword>_Twitter.csv.
keyword = "gymshark"
tweets = tw.Cursor(api.search, q=keyword, lang='en').items(5000)
tweet_details = [
    [
        status.created_at,
        status.text,
        status.user.screen_name,
        status.user.location,
        status.retweet_count,
        status.favorite_count,
        status.entities['hashtags'],
        status.entities['user_mentions'],
    ]
    for status in tweets
]
columns = ['time', 'text', 'user_name', 'user_location', 'retweet_cnt', 'favorite_cnt', 
           'hashtags', 'user_mentions']
df = pd.DataFrame(tweet_details, columns=columns).assign(
    # Keep only the hashtag text / mentioned screen name of each entity
    hashtags=lambda frame: frame['hashtags'].apply(
        lambda tags: [tag['text'] for tag in tags]),
    user_mentions=lambda frame: frame['user_mentions'].apply(
        lambda mentions: [mention['screen_name'] for mention in mentions]),
    # A tweet is flagged as a retweet when its text begins with 'RT'
    is_retweet=lambda frame: frame['text'].apply(lambda text: text[:2] == 'RT'),
)
df.to_csv('{}_Twitter.csv'.format(keyword))