Skip to content
This repository has been archived by the owner on Jul 14, 2020. It is now read-only.

Commit

Permalink
Create py3.5_get_fb_comments_from_fb
Browse files Browse the repository at this point in the history
  • Loading branch information
eoehri committed Oct 19, 2016
1 parent 063d8a8 commit 2fff6be
Showing 1 changed file with 175 additions and 0 deletions.
175 changes: 175 additions & 0 deletions py3.5_get_fb_comments_from_fb
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import urllib.request
import json
import datetime
import csv
import time

# Facebook app credentials used to build the Graph API access token.
app_id = "<FILL IN>"
app_secret = "<FILL IN>"  # DO NOT SHARE WITH ANYONE!

# Prefix of the "<file_id>_facebook_statuses.csv" input file and the
# "<file_id>_facebook_comments.csv" output file.
file_id = "cnn"

# The token is the app id and the app secret separated by a pipe character.
access_token = "|".join((app_id, app_secret))

def request_until_succeed(url):
    """Fetch *url*, retrying every 5 seconds until an HTTP 200 is received.

    Args:
        url: The URL to request.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the server
        answers with HTTP 400 (the request itself is invalid, so retrying
        cannot help).

    Any other failure (network error, non-400 HTTP error, non-200 status)
    is printed and retried indefinitely.
    """
    # Local import so this function does not depend on an extra
    # module-level import being present in the file.
    from urllib.error import HTTPError

    req = urllib.request.Request(url)
    while True:
        try:
            # The context manager closes the connection on every path, and
            # reading inside the ``try`` means a failed read is retried too.
            with urllib.request.urlopen(req) as response:
                status = response.getcode()
                if status == 200:
                    return response.read().decode('utf-8')
                problem = "Unexpected HTTP status %s" % status
        except HTTPError as e:
            # Check the real status code rather than searching the error
            # text for "400", which also matches unrelated messages.
            if e.code == 400:
                print(e)
                print("Error for URL %s: %s" % (url, datetime.datetime.now()))
                return None
            problem = e
        except Exception as e:
            # Deliberately broad: this is a best-effort scraper that should
            # keep going through any transient failure.
            problem = e

        print(problem)
        print("Error for URL %s: %s" % (url, datetime.datetime.now()))
        print("Retrying.")
        time.sleep(5)

def unicode_normalize(text):
    """Replace typographic punctuation with ASCII and encode as UTF-8.

    Curly single quotes become ``'``, curly double quotes become ``"`` and
    the non-breaking space becomes a regular space.

    Args:
        text: The string to normalize.

    Returns:
        The normalized text as UTF-8 encoded ``bytes``.
    """
    ascii_equivalents = {
        0x2018: 0x27,  # left single quotation mark  -> apostrophe
        0x2019: 0x27,  # right single quotation mark -> apostrophe
        0x201C: 0x22,  # left double quotation mark  -> quotation mark
        0x201D: 0x22,  # right double quotation mark -> quotation mark
        0xa0: 0x20,    # non-breaking space          -> space
    }
    normalized = text.translate(ascii_equivalents)
    return normalized.encode('utf-8')

def getFacebookCommentFeedData(status_id, access_token, num_comments):
    """Request one page of the ``/comments`` edge of a Graph API object.

    Args:
        status_id: Id of the object whose comments are requested. The
            scraper passes a status id for top-level comments and a
            comment id for replies.
        access_token: Graph API access token appended to the query string.
        num_comments: Value of the ``limit`` query parameter.

    Returns:
        The decoded JSON response, or ``None`` when
        ``request_until_succeed`` gave up on the request.
    """
    endpoint = "https://graph.facebook.com/v2.6"
    edge = "/%s/comments" % status_id
    field_list = "?fields=id,message,like_count,created_time,comments,from,attachment"
    query = "&order=chronological&limit=%s&access_token=%s" % (
        num_comments, access_token)

    payload = request_until_succeed("".join((endpoint, edge, field_list, query)))
    if payload is not None:
        return json.loads(payload)
    return None

# Typographic characters replaced by their plain ASCII equivalents: curly
# single/double quotes and the non-breaking space.
_ASCII_PUNCTUATION = {
    0x2018: 0x27,
    0x2019: 0x27,
    0x201C: 0x22,
    0x201D: 0x22,
    0xa0: 0x20,
}


def _normalize_text(text):
    """Return *text* as ``str`` with typographic punctuation made ASCII."""
    return text.translate(_ASCII_PUNCTUATION)


def processFacebookComment(comment, status_id, parent_id=''):
    """Flatten one Graph API comment dictionary into a CSV row.

    Text fields are returned as ``str``. The bytes-returning
    ``unicode_normalize`` is deliberately not used here, because
    ``csv.writer`` in Python 3 writes a ``bytes`` value as the literal
    text ``b'...'``.

    Args:
        comment: Comment dictionary from the Graph API. ``id`` and
            ``created_time`` are required; ``message``, ``from``,
            ``like_count`` and ``attachment`` are optional.
        status_id: Id of the status the comment belongs to.
        parent_id: Id of the parent comment when *comment* is a reply,
            otherwise the empty string.

    Returns:
        A tuple ``(comment_id, status_id, parent_id, comment_message,
        comment_author, comment_published, comment_likes)``.
    """
    comment_id = comment['id']

    # Optional fields fall back to neutral defaults instead of raising.
    comment_message = _normalize_text(comment.get('message', ''))
    author = comment.get('from') or {}
    comment_author = _normalize_text(author.get('name', ''))
    comment_likes = comment.get('like_count', 0)

    if 'attachment' in comment:
        # Tag the attachment type so attachment-only comments are not blank.
        attach_tag = "[[%s]]" % comment['attachment']['type'].upper()
        if comment_message:
            comment_message = comment_message + " " + attach_tag
        else:
            comment_message = attach_tag

    # The API timestamp is UTC; shift it by a fixed -5 hours (EST, with no
    # daylight-saving adjustment) and format it for spreadsheet programs.
    published = datetime.datetime.strptime(
        comment['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    published += datetime.timedelta(hours=-5)
    comment_published = published.strftime('%Y-%m-%d %H:%M:%S')

    return (comment_id, status_id, parent_id, comment_message, comment_author,
            comment_published, comment_likes)

def _iter_comment_pages(first_page):
    """Yield *first_page* and each later page linked through ``paging.next``.

    Iteration ends when a page has no ``next`` link, or when a follow-up
    request was abandoned (``request_until_succeed`` returned ``None``).
    """
    page = first_page
    while page is not None:
        yield page
        next_url = page.get('paging', {}).get('next')
        if next_url is None:
            return
        raw = request_until_succeed(next_url)
        page = None if raw is None else json.loads(raw)


def _iter_comment_rows(status_id, access_token):
    """Yield one CSV row per comment, and per reply, under *status_id*."""
    top_level = getFacebookCommentFeedData(status_id, access_token, 100)
    for page in _iter_comment_pages(top_level):
        for comment in page['data']:
            yield processFacebookComment(comment, status_id)

            if 'comments' not in comment:
                continue

            # The comment has replies: fetch them through its own
            # /comments edge and emit them with the comment as parent.
            replies = getFacebookCommentFeedData(
                comment['id'], access_token, 100)
            for reply_page in _iter_comment_pages(replies):
                for reply in reply_page['data']:
                    yield processFacebookComment(
                        reply, status_id, comment['id'])


def scrapeFacebookPageFeedComments(page_id, access_token):
    """Write every comment on a page's scraped statuses to a CSV file.

    Reads status ids from the ``status_id`` column of
    ``<page_id>_facebook_statuses.csv`` and writes one row per comment and
    reply to ``<page_id>_facebook_comments.csv``. Progress is printed every
    1000 rows.

    Args:
        page_id: Prefix shared by the input and output CSV file names.
        access_token: Graph API access token used for every request.
    """
    statuses_path = '%s_facebook_statuses.csv' % page_id
    comments_path = '%s_facebook_comments.csv' % page_id

    num_processed = 0  # keep a count on how many we've processed
    scrape_starttime = datetime.datetime.now()

    print("Scraping %s Comments From Posts: %s\n" %
          (page_id, scrape_starttime))

    # The input is opened first so that a missing statuses file does not
    # truncate an existing comments file.
    with open(statuses_path, 'r', newline='', encoding='utf-8') as csvfile, \
            open(comments_path, 'w', newline='', encoding='utf-8') as out_file:
        reader = csv.DictReader(csvfile)
        w = csv.writer(out_file)
        w.writerow(["comment_id", "status_id", "parent_id", "comment_message",
                    "comment_author", "comment_published", "comment_likes"])

        for status in reader:
            for row in _iter_comment_rows(status['status_id'], access_token):
                w.writerow(row)

                # output progress occasionally to make sure code is not
                # stalling
                num_processed += 1
                if num_processed % 1000 == 0:
                    print("%s Comments Processed: %s" %
                          (num_processed, datetime.datetime.now()))

    print("\nDone!\n%s Comments Processed in %s" %
          (num_processed, datetime.datetime.now() - scrape_starttime))


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    scrapeFacebookPageFeedComments(page_id=file_id, access_token=access_token)


# The CSV can be opened in all major statistical programs. Have fun! :)

0 comments on commit 2fff6be

Please sign in to comment.