In [1]:
# -*- coding: utf-8 -*-
# See https://github.com/n-lo/Tweets_analysis_tryout for the other parts of this project
%matplotlib inline
from __future__ import print_function
import pprint
from pymongo import MongoClient
import pandas as pd
import numpy as np

import logging
# Emit timestamped, level-tagged log records at INFO and above.
# e.g. logging.info("%i tweets found..." % len(df))
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Pretty-printer for inspecting nested tweet documents.
pp = pprint.PrettyPrinter(indent=4)

# HDF5 file the final dataframe is written to.
#hdf5_file = 'timeline.h5'
hdf5_file = 'onp.h5'

In [2]:
# ==================================================
#  Connect to mongoDB database
#  note: make sure MongoDB is running (mongod)
# ==================================================
print("Open connection to MongoDB.")
client = MongoClient('localhost', 27017)
# NOTE: despite its name, `db` is the `tweets` collection of the `onp` database.
#db = client.timeline.tweets
db = client.onp.tweets
# Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0;
# count_documents({}) counts every document in the collection.
print("Total number of tweets in db: ", db.count_documents({}))

Open connection to MongoDB.
Total number of tweets in db:  8674


In [3]:
# ===================================================
#  Get tweets from database
# ===================================================

# Load every tweet. To keep only tweets written in English, use the
# commented-out filter below instead of the empty one.
print("Find tweets in database.")
#tweet_filter = {"lang" : "en"}
tweet_filter = {}  # empty filter: load all tweets
datain = db.find(tweet_filter)
# Cursor.count() is deprecated since PyMongo 3.7 and removed in 4.0, so the
# matching documents are counted on the collection with the same filter.
print("%i tweets found." % db.count_documents(tweet_filter))

Find tweets in database.
8674 tweets found.


In [4]:
# Materialise the cursor and build the dataframe from it: only the top-level
# json keys become columns; nested documents stay as single objects per cell
# at this stage.
print("Storing into dataframe...")
df = pd.DataFrame(data=list(datain))
print("Done.")

Storing into dataframe...
Done.


In [5]:
# De-duplicate on the tweet id ('id_str'), keeping the last occurrence of each.
df = df.drop_duplicates(subset='id_str', keep='last')
print("Number of tweets after dropping duplicates (using id_str tag): %d" % df.shape[0])

Number of tweets after dropping duplicates (using id_str tag): 8674


In [6]:
print("Storing nested 'user' items into separate dataframe...")
# json_normalize is exposed at the pandas top level since pandas 1.0 (the
# pandas.io.json location is deprecated); fall back to the old location on
# older installs.
_json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
df_users = _json_normalize(list(df['user']))
# json_normalize numbers its rows 0..n-1, whereas df keeps its original row
# labels after drop_duplicates. Re-use df's index so that the index-based
# df.join(df_users) further down pairs every tweet with its own user even
# when duplicates were dropped.
df_users.index = df.index

Storing nested 'user' items into separate dataframe...


In [7]:
# sanity check: there must be exactly one user row per tweet row
print("Number of 'user' entries in df: %d" % len(df_users))
assert len(df_users) == len(df), "number of user rows differs from number of tweets"

Number of 'user' entries in df: 8674


In [8]:
print("Tidy up dataframes.")
# Profile/appearance fields removed from the users table.
unwanted_user_cols = [
    'default_profile',
    'default_profile_image',
    'profile_background_color',
    'profile_background_image_url',
    'profile_background_image_url_https',
    'profile_background_tile',
    'profile_banner_url',
    'profile_image_url',
    'profile_image_url_https',
    'profile_link_color',
    'profile_sidebar_border_color',
    'profile_sidebar_fill_color',
    'profile_text_color',
    'profile_use_background_image',
]
df_users = df_users.drop(unwanted_user_cols, axis=1)

# Prefix every remaining column with "user_" so that no name collides with a
# tweet-level column when the tables are joined (the built-in join function
# can only disambiguate with a suffix).
df_users.columns = ["user_" + col for col in df_users.columns]

Tidy up dataframes.


In [9]:
print("Joining tweets and users dataframes...")
# DataFrame.join matches rows on the index; a left join keeps every tweet row.
df_join = df.join(df_users, how='left')

Joining tweets and users dataframes...


In [10]:
# sanity check: report the row count of the joined table
print("Joined df has %d entries." % df_join.shape[0])

Joined df has 8674 entries.


In [11]:
# Display the column labels of the joined tweets + users dataframe.
df_join.columns

Index([                           u'_id',                   u'contributors',
                          u'coordinates',                     u'created_at',
                             u'entities',              u'extended_entities',
                       u'favorite_count',                      u'favorited',
                                  u'geo',                             u'id',
                               u'id_str',        u'in_reply_to_screen_name',
                u'in_reply_to_status_id',      u'in_reply_to_status_id_str',
                  u'in_reply_to_user_id',        u'in_reply_to_user_id_str',
                      u'is_quote_status',                           u'lang',
                             u'metadata',                          u'place',
                   u'possibly_sensitive',                  u'quoted_status',
                     u'quoted_status_id',           u'quoted_status_id_str',
                        u'retweet_count',                      u'retweeted',

In [None]:
# store df to disk
print("Writing dataframe to disk...")
# The context manager closes the HDF5 file even if the write raises, so a
# failed write does not leave the file handle open in the kernel.
with pd.HDFStore(hdf5_file) as hdf:
    hdf['df'] = df_join
    #print(hdf['df'].shape)

In [13]:
# Disconnect from MongoDB: the tweets are now held in the dataframe, so the
# client connection is no longer needed.
print("Data loaded into dataframe, now disconnect from database.")
client.close()

Data loaded into dataframe, now disconnect from database.


In [13]:
# To reload the saved dataframe later, uncomment and run the lines below.
# They read from `hdf5_file`, the same file the dataframe was written to.
# print("Load df from hdf5 file.")
# with pd.HDFStore(hdf5_file) as hdf:
#     df = hdf['df']

Load df from hdf5 file.
