# Data Scraping

In this notebook we
1. Use Selenium & twitter search API to scrape twitter data.
2. Dump them in to a csv file for further analysis.

In [1]:
# Import packages
import pandas as pd
import json

# selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

In [2]:
# Define function for scraping tweets
class Scraper():
    driver_path = "/Users/Nicole/Courses/Springboard-data-science/Capstone 2/Exe/chromedriver"
    
    def __init__(self):
        self.driver = webdriver.Chrome(executable_path = self.driver_path)
        
    def __del__(self):
        self.driver.close()
        
    def get_tweets(self, url, key_word):
        print("URL: \n    " + url)
        self.driver.get(url)
        print("TITLE: \n    " + self.driver.title)
        
        sec = 0
        while True:
            # break after 10 minutes
            if sec > 1000:
                break
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(10) # delay for 10 seconds
            sec += 10
                
        elements = self.driver.find_elements_by_class_name('js-tweet-text')
        tweets = [element.text for element in elements]
        
        return tweets

In [3]:
# Scraping tweets using twitter search API.
# Define some keywords for searching.
keywords = ['movie', \
            'film', \
            'adrift', \
            'bookclub', \
            'thehappyprince', \
            'Oceans8', \
            'inthefade', \
            'crazyrichasians', \
            'jurassicworld', \
            'missionimpossible', \
            'mi6', \
            'americananimals', \
            'incredibles2', \
            'mammamia']

start_date, end_date = "2018-06-01", "2018-09-11"

dfs = []

for keyword in keywords:
    url = "https://twitter.com/search?q=" + keyword + "%20since%3A" + start_date + \
      "%20until%3A" + end_date + "&amp;amp;amp;amp;amp;amp;lang=tr%22"

    scraper = Scraper()
    tweets = scraper.get_tweets(url, keyword)
    
    df = pd.DataFrame({'Tweets' : tweets})
    dfs.append(df)

URL: 
    https://twitter.com/search?q=movie%20since%3A2018-06-01%20until%3A2018-09-11&amp;amp;amp;amp;amp;amp;lang=tr%22
TITLE: 
    News about movie since:2018-06-01 until:2018-09-11 on Twitter
URL: 
    https://twitter.com/search?q=film%20since%3A2018-06-01%20until%3A2018-09-11&amp;amp;amp;amp;amp;amp;lang=tr%22
TITLE: 
    News about film since:2018-06-01 until:2018-09-11 on Twitter
URL: 
    https://twitter.com/search?q=adrift%20since%3A2018-06-01%20until%3A2018-09-11&amp;amp;amp;amp;amp;amp;lang=tr%22
TITLE: 
    adrift since:2018-06-01 until:2018-09-11 - Twitter Search
URL: 
    https://twitter.com/search?q=bookclub%20since%3A2018-06-01%20until%3A2018-09-11&amp;amp;amp;amp;amp;amp;lang=tr%22
TITLE: 
    bookclub since:2018-06-01 until:2018-09-11 - Twitter Search
URL: 
    https://twitter.com/search?q=thehappyprince%20since%3A2018-06-01%20until%3A2018-09-11&amp;amp;amp;amp;amp;amp;lang=tr%22
TITLE: 
    thehappyprince since:2018-06-01 until:2018-09-11 - Twitter Search
URL: 
    h

In [4]:
for df in dfs:
    print(df.head(10))
    print(df.info())

                                              Tweets
0  I think there’s a first date going on near me ...
1  Sell all your houses. Stop flying private jets...
2  Starship Troopers (1997) is a deliciously ambi...
3  Then this & they never ran. I felt like I was ...
4                    Best Gary Oldman movie? (Pt. 2)
5  Find out how an unexpected practical effect br...
6  fauni just called me from jail and then he pas...
7  Jamie Dornan and Matt Bomer both stopped by a ...
8  Just saw that a Russian/South Korean team of s...
9  My brother, Benjamin Rice, engineered and co-p...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2018 entries, 0 to 2017
Data columns (total 1 columns):
Tweets    2018 non-null object
dtypes: object(1)
memory usage: 15.8+ KB
None
                                              Tweets
0  I just had to block a handful of Trolls who se...
1  Max Minghella just told @MTV that Dylan O’Brie...
2  Find out how an unexpected practical effect br...
3  I do hope this isn'

In [5]:
df = pd.concat(dfs)
df.head(20)

Unnamed: 0,Tweets
0,I think there’s a first date going on near me ...
1,Sell all your houses. Stop flying private jets...
2,Starship Troopers (1997) is a deliciously ambi...
3,Then this & they never ran. I felt like I was ...
4,Best Gary Oldman movie? (Pt. 2)
5,Find out how an unexpected practical effect br...
6,fauni just called me from jail and then he pas...
7,Jamie Dornan and Matt Bomer both stopped by a ...
8,Just saw that a Russian/South Korean team of s...
9,"My brother, Benjamin Rice, engineered and co-p..."


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15002 entries, 0 to 2014
Data columns (total 1 columns):
Tweets    15002 non-null object
dtypes: object(1)
memory usage: 234.4+ KB


In [7]:
# Dump tweets to csv file for further analysis.
df.to_csv('../Data/tweets.csv', index = False)