In [63]:
#forked from kennethreitz
import re
from requests_html import HTMLSession, HTML
from datetime import datetime

session = HTMLSession()


def get_tweets(user, pages=25):
    """Gets tweets for a given user, via the Twitter frontend API."""

    url = f'https://twitter.com/i/profiles/show/{user}/timeline/tweets?include_available_features=1&include_entities=1&include_new_items_bar=true'
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': f'https://twitter.com/{user}',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
        'X-Twitter-Active-User': 'yes',
        'X-Requested-With': 'XMLHttpRequest'
    }

    def gen_tweets(pages):
        r = session.get(url, headers=headers)

        while pages > 0:
            try:
                html = HTML(html=r.json()['items_html'],
                            url='bunk', default_encoding='utf-8')
            except KeyError:
                raise ValueError(
                    f'Oops! Either "{user}" does not exist or is private.')

            comma = ","
            dot = "."
            tweets = []
            for tweet in html.find('.stream-item'):
                text = tweet.find('.tweet-text')[0].full_text
                tweetId = tweet.find(
                    '.js-permalink')[0].attrs['data-conversation-id']
                time = datetime.fromtimestamp(
                    int(tweet.find('._timestamp')[0].attrs['data-time-ms'])/1000.0)
                interactions = [x.text for x in tweet.find(
                    '.ProfileTweet-actionCount')]
                replies = int(interactions[0].split(" ")[0].replace(comma, "").replace(dot,""))
                retweets = int(interactions[1].split(" ")[
                               0].replace(comma, "").replace(dot,""))
                likes = int(interactions[2].split(" ")[0].replace(comma, "").replace(dot,""))
                hashtags = [hashtag_node.full_text for hashtag_node in tweet.find('.twitter-hashtag')]
                urls = [url_node.attrs['data-expanded-url'] for url_node in tweet.find('a.twitter-timeline-link:not(.u-hidden)')]
                photos = [photo_node.attrs['data-image-url'] for photo_node in tweet.find('.AdaptiveMedia-photoContainer')]
                
                videos = []
                video_nodes = tweet.find(".PlayableMedia-player")
                for node in video_nodes:
                    styles = node.attrs['style'].split()
                    for style in styles:
                        if style.startswith('background'):
                            tmp = style.split('/')[-1]
                            video_id = tmp[:tmp.index('.jpg')]
                            videos.append({'id': video_id})
                tweets.append({'tweetId': tweetId, 'time': time, 'text': text,
                               'replies': replies, 'retweets': retweets, 'likes': likes, 
                               'entries': {
                                    'hashtags': hashtags, 'urls': urls,
                                    'photos': photos, 'videos': videos
                                }
                               })

            last_tweet = html.find('.stream-item')[-1].attrs['data-item-id']

            for tweet in tweets:
                if tweet:
                    tweet['text'] = re.sub('http', ' http', tweet['text'], 1)
                    yield tweet

            r = session.get(
                url, params = {'max_position': last_tweet}, headers = headers)
            pages += -1

    yield from gen_tweets(pages)

In [60]:
import pandas as pd

In [71]:
# Scrape from users
tweet_df_all = pd.DataFrame([])

users = ['nature','Dev_Cell']
for i in range(len(users)):
    tweet_gen = get_tweets(users[i], pages=1)
    # Make df of tweets, extract entries column, make df of entries, add these columns to dataframe
    tweet_df = pd.DataFrame(tweet_gen)
    entries_list=tweet_df.entries.values.tolist()
    entries_df = pd.DataFrame(entries_list)
    tweet_df_with_entries = pd.concat([tweet_df, entries_df], axis=1, sort=False)
    tweet_df_all = pd.concat([tweet_df_all, tweet_df_with_entries])

In [73]:
# Drop rows that don't have urls
df_tweet_urls=tweet_df_all[tweet_df_all.astype(str).urls!='[]']
df_tweet_urls.reset_index(inplace=True, drop=True)

In [74]:
df_tweet_urls.head()

Unnamed: 0,entries,likes,replies,retweets,text,time,tweetId,hashtags,photos,urls,videos
0,"{'hashtags': [], 'urls': ['https://www.nature....",56,0,22,On the Nature cover this week: Best laid plans...,2019-03-07 11:04:42,1103597332409528321,[],[https://pbs.twimg.com/media/D1DDyrIWkAAHzMI.jpg],[https://www.nature.com/nature/volumes/567/iss...,[]
1,"{'hashtags': [], 'urls': ['https://go.nature.c...",63,2,31,"Launching in 2020, Nature Food will publish re...",2019-03-11 09:07:03,1105017275785064448,[],[https://pbs.twimg.com/media/D1XPWxbWkAA7FID.jpg],[https://go.nature.com/2C8JiZ9],[]
2,"{'hashtags': ['#ScientistAtWork'], 'urls': ['h...",71,5,50,Send us your very best #ScientistAtWork photos...,2019-03-10 18:42:09,1104799617710022657,[#ScientistAtWork],[],[https://go.nature.com/2BPksNK],[{'id': '_EeiQtEZlJUuSMKC'}]
3,"{'hashtags': [], 'urls': ['https://go.nature.c...",218,2,99,"By repurposing two existing drugs, metformin a...",2019-03-09 11:07:04,1104322703203418112,[],[https://pbs.twimg.com/media/D1NXpVUWsAErQOz.jpg],[https://go.nature.com/2tUV1Gs],[]
4,"{'hashtags': [], 'urls': ['https://go.nature.c...",98,0,75,"In this Nature Review article, the authors dis...",2019-03-09 02:07:03,1104186804666798080,[],[https://pbs.twimg.com/media/D1LcDA1XcAEgycS.jpg],[https://go.nature.com/2IFA505],[]


In [75]:
import pickle
df_tweet_urls.to_pickle('twitter.pkl')

## Scraping abstracts and/or article paragraphs from websites from urls provided

In [76]:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver

In [92]:
# Read in summary (or whole article, if possible)
df_tweet_urls['summary'] = ''
for ix in range(len(df_tweet_urls)):
    this_url = str(df_tweet_urls.urls.iloc[ix]).replace('"', '').replace("'", "").replace("[", "").replace("]", "")
    service_args = [
    '--proxy=proxy-inst.upf.edu:9090',
    '--proxy-type=http',
    ]
    driver = webdriver.PhantomJS(executable_path='C://Users/lchen/phantomjs/bin/phantomjs', service_args=service_args)
    driver.set_window_size(1120, 550)
    driver.get(this_url)
    #driver.find_element_by_id('search_form_input_homepage').send_keys("realpython")
    soup = bs(driver.page_source, 'lxml')
    #for i in soup.body:
        #print(i)
    driver.quit()
    summary = ''
    if 'sciencmag.org' in this_url:
        summary = soup.find_all('span', attrs={'class': 'highwire-journal-article-marker-start'}).text #Science
    elif 'nature.com' in this_url and len(soup.find_all('p'))>17:
        summary_list = soup.find_all('p')[17:len(soup.find_all('p'))] #Nature
        summary_list_text = [x.text for x in summary_list]
        summary = ' '.join(summary_list_text)
    elif 'cell.com' in this_url:
        summary_list = soup.find_all('div', attrs={'class': 'section-paragraph'}) #Cell
        summary_list_text = [x.text for x in summary_list]
        summary = ' '.join(summary_list_text)

    else:
        pass
    df_tweet_urls.summary.iloc[ix]=summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [93]:
df_tweet_urls.head(10)

Unnamed: 0,entries,likes,replies,retweets,text,time,tweetId,hashtags,photos,urls,videos,summary
0,"{'hashtags': [], 'urls': ['https://www.nature....",56,0,22,On the Nature cover this week: Best laid plans...,2019-03-07 11:04:42,1103597332409528321,[],[https://pbs.twimg.com/media/D1DDyrIWkAAHzMI.jpg],[https://www.nature.com/nature/volumes/567/iss...,[],\nResearch Highlight\n | \n26 February 2019\n ...
1,"{'hashtags': [], 'urls': ['https://go.nature.c...",63,2,31,"Launching in 2020, Nature Food will publish re...",2019-03-11 09:07:03,1105017275785064448,[],[https://pbs.twimg.com/media/D1XPWxbWkAA7FID.jpg],[https://go.nature.com/2C8JiZ9],[],We publish a range of content types including ...
2,"{'hashtags': ['#ScientistAtWork'], 'urls': ['h...",71,5,50,Send us your very best #ScientistAtWork photos...,2019-03-10 18:42:09,1104799617710022657,[#ScientistAtWork],[],[https://go.nature.com/2BPksNK],[{'id': '_EeiQtEZlJUuSMKC'}],We’re interested in finding and celebrating ar...
3,"{'hashtags': [], 'urls': ['https://go.nature.c...",218,2,99,"By repurposing two existing drugs, metformin a...",2019-03-09 11:07:04,1104322703203418112,[],[https://pbs.twimg.com/media/D1NXpVUWsAErQOz.jpg],[https://go.nature.com/2tUV1Gs],[],
4,"{'hashtags': [], 'urls': ['https://go.nature.c...",98,0,75,"In this Nature Review article, the authors dis...",2019-03-09 02:07:03,1104186804666798080,[],[https://pbs.twimg.com/media/D1LcDA1XcAEgycS.jpg],[https://go.nature.com/2IFA505],[],
5,"{'hashtags': [], 'urls': ['https://www.science...",100,0,28,Check out our new @Dev_Cell paper by @DCheeram...,2019-03-01 04:51:59,1101329207827738624,[],[https://pbs.twimg.com/media/D0i0yDwV4AAdLmS.jpg],[https://www.sciencedirect.com/science/article...,[],
6,"{'hashtags': ['#CSTxnDev'], 'urls': ['http://b...",1,0,1,Developmental biologists! @CellSymposia Trans...,2019-02-28 20:00:40,1101195498772414464,[#CSTxnDev],[https://pbs.twimg.com/media/D0g7eE6XcAEjGiF.jpg],[http://bit.ly/2XsSzUS],[],
7,"{'hashtags': [], 'urls': ['http://ow.ly/YGhJ30...",3,0,1,A Perspective on the use of microfluidics in d...,2019-02-20 19:05:12,1098282434947354624,[],[],[http://ow.ly/YGhJ30nLRSB],[],
8,"{'hashtags': [], 'urls': ['http://ow.ly/U6VH30...",34,0,9,"On the cover, a fox eating spätzle. Read about...",2019-02-20 17:50:20,1098263594024751104,[],[https://pbs.twimg.com/media/Dz3Q6wXXgAEAoKK.jpg],[http://ow.ly/U6VH30nLRz5],[],
9,"{'hashtags': ['#RNA', '#CSRNA19'], 'urls': ['h...",1,0,0,RT @CellPressNews: Keynote Patrick Cramer @mpi...,2019-02-15 18:15:04,1096457882227363842,"[#RNA, #CSRNA19]",[https://pbs.twimg.com/media/DzdmogJWsAAKqJy.jpg],[http://bit.ly/2GQZd1F],[],


In [None]:
df_tweet_urls.to_pickle('twitter_with_summaries.pkl')