## How to start processing the data?
    - Online Json Viewer - useful webpage to play with the data: http://jsonviewer.stack.hu
    - Read each text file line by line (we will go through the code below).
    - pandas DataFrame: makes it easy to work with timestamps and to aggregate columns.
    - Reminder: You can store the processed data/features in pickle objects. 

In [56]:
'''
glob function: finds all the pathnames matching a specific pattern.
more info: https://docs.python.org/2/library/glob.html
'''
import glob

# glob returns matches in arbitrary (filesystem-dependent) order, so sort them:
# the file list, and every loop driven by it, is then identical on every run.
datafilenames = sorted(glob.glob('../data/ECE219_tweet_data/*.txt'))
# To work on a single hashtag only, narrow the pattern, e.g.:
#datafilenames = sorted(glob.glob('../data/ECE219_tweet_data/*gopatriots*.txt'))
datafilenames

['../data/ECE219_tweet_data/tweets_#gohawks.txt',
 '../data/ECE219_tweet_data/tweets_#gopatriots.txt',
 '../data/ECE219_tweet_data/tweets_#nfl.txt',
 '../data/ECE219_tweet_data/tweets_#patriots.txt',
 '../data/ECE219_tweet_data/tweets_#sb49.txt',
 '../data/ECE219_tweet_data/tweets_#superbowl.txt']

In [57]:
'''
Instead of loading the entire file, you can read the files, line by line and keep the information you need.
Store the processed data in pickle objects.
'''
import gzip
import json
import os
import pickle

# When True, the loops below rebuild every '<file>.pkl.gz' even if it already
# exists; set to False to skip files whose pickle is already on disk.
OVERWRITE_PICKLE_OBJECTS = True

def extract_feature_from_tweet(tweet_dict, feature_name):
    """Return the value of one named feature from a parsed tweet.

    Args:
        tweet_dict: the tweet as a (nested) dict, e.g. the result of json.loads.
        feature_name: one of "time", "time_ori", "re_count" or "fo_count".

    Returns:
        The value stored under the feature's key path, or "" when
        `feature_name` is not one of the known names.

    Raises:
        KeyError: if a key on the feature's path is missing from the tweet.
    """
    # Known feature names paired with the chain of keys that leads to the value.
    key_paths = (
        ("time", ("citation_date",)),
        ("time_ori", ("firstpost_date",)),
        ("re_count", ("metrics", "citations", "total")),
        ("fo_count", ("author", "followers")),
    )
    for known_name, keys in key_paths:
        if feature_name == known_name:
            node = tweet_dict
            for key in keys:
                node = node[key]
            return node
    return ""


# Features pulled out of every tweet; each name is understood by
# extract_feature_from_tweet.
feature_names = ["time", "time_ori", "re_count", "fo_count"]


# For every raw tweet file, extract the selected features from (a sample of) its
# lines and cache them next to the file as '<file>.pkl.gz'.
for fn in datafilenames:
    pickle_path = fn + '.pkl.gz'

    if os.path.isfile(pickle_path) and not OVERWRITE_PICKLE_OBJECTS:
        print(pickle_path + '\texists.')
        # To also load the cached features instead of only skipping the file:
        # with gzip.open(pickle_path, 'rb') as cached:
        #     tweets_features = pickle.load(cached)
        # print(pickle_path + '\tloaded.')
        continue

    # Column-oriented layout: {"time": [444, 444, ...], "re_count": [23, 34, ...]}
    # (the row-oriented alternative would be [{"time": 44, "re_count": 23}, ...]).
    tweets_features = dict((name, []) for name in feature_names)

    with open(fn, 'rb') as d:
        for ind, line in enumerate(d):
            # Sample cap: only lines with index 0..1000 are read from each file.
            if ind > 1000:
                break
            # A blank line (e.g. a trailing newline) is not a tweet and would
            # make json.loads raise, so skip it.
            if not line.strip():
                continue
            tweet_dict = json.loads(line)
            for name in feature_names:
                feature_value = extract_feature_from_tweet(tweet_dict, name)
                tweets_features[name].append(feature_value)

    with gzip.open(pickle_path, 'wb') as p:
        pickle.dump(tweets_features, p, protocol=pickle.HIGHEST_PROTOCOL)
    print(pickle_path + '\tsaved.')

../data/ECE219_tweet_data/tweets_#gohawks.txt.pkl.gz	saved.
../data/ECE219_tweet_data/tweets_#gopatriots.txt.pkl.gz	saved.
../data/ECE219_tweet_data/tweets_#nfl.txt.pkl.gz	saved.
../data/ECE219_tweet_data/tweets_#patriots.txt.pkl.gz	saved.
../data/ECE219_tweet_data/tweets_#sb49.txt.pkl.gz	saved.
../data/ECE219_tweet_data/tweets_#superbowl.txt.pkl.gz	saved.


In [23]:
# Display the whole `tweets_features` dict; every feature list is shown in full,
# so the output is long.
tweets_features

{'re_count': [2,
  15,
  2,
  2,
  7,
  34,
  3,
  10,
  6,
  11,
  3,
  3,
  2,
  10,
  4,
  6,
  4,
  4,
  2,
  33,
  2,
  2,
  3,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  2,
  10,
  1,
  2,
  3,
  6,
  5,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  2,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  4,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  4,
  2,
  4,
  1,
  1,
  4,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  4,
  1,
  3,
  1,
  1,
  1,
  1,
  3,
  6,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  4,
  1,
  3,
  1,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  2,
  1,
  2,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  2,
  1,
  20,
  1,
  1,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  2,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  10,
  1,
  1,
  1,

In [28]:
'''
An alternative solution.
'''

def dict_xpath_get(mydict, path, default=None):
    """Look up a nested value using a "/"-separated key path.

    Each path segment is applied to the result of the previous one. A segment
    that parses as an integer is used as an integer key/index (so it can index
    into lists); any other segment is used as a string key.

    Args:
        mydict: the nested structure (dicts and/or lists) to search.
        path: key path such as "/metrics/citations/total"; leading and
            trailing slashes are ignored.
        default: value returned when the path cannot be resolved.

    Returns:
        The value found at `path`, or `default` if any segment is missing, out
        of range, or applied to something that cannot be indexed.

    Raises:
        AttributeError: if `path` is not a string.
    """
    elem = mydict
    for segment in path.strip("/").split("/"):
        try:
            key = int(segment)
        except ValueError:
            key = segment
        try:
            elem = elem[key]
        except (KeyError, IndexError, TypeError):
            # Stop at the first segment that cannot be resolved. Returning the
            # partially resolved element here would hand the caller a parent
            # container as if it were the requested value.
            return default
    return elem

def feature_extract(tweet_dict, feature_selectors):
    """Extract several features from one tweet in a single call.

    Args:
        tweet_dict: the parsed tweet (nested dict).
        feature_selectors: mapping of feature name -> key path understood by
            dict_xpath_get.

    Returns:
        A new dict mapping each feature name to the value dict_xpath_get
        returns for its path.
    """
    return {
        name: dict_xpath_get(tweet_dict, feature_selectors[name])
        for name in feature_selectors
    }


# Feature name -> "/"-separated key path into the tweet dict. feature_extract
# hands each path to dict_xpath_get to fetch the value.
feature_selectors = {
    'time'         : '/citation_date',
    'time_ori'     : '/firstpost_date',
    're_count'     : '/metrics/citations/total',
    'fo_count'     : '/author/followers',
}



# Same caching loop as before, but driven by `feature_selectors`: for every raw
# tweet file, extract the selected features from (a sample of) its lines and
# store them next to the file as '<file>.pkl.gz'.
for fn in datafilenames:
    pickle_path = fn + '.pkl.gz'

    if os.path.isfile(pickle_path) and not OVERWRITE_PICKLE_OBJECTS:
        print(pickle_path + '\texists.')
        # To also load the cached features instead of only skipping the file:
        # with gzip.open(pickle_path, 'rb') as cached:
        #     tweets_features = pickle.load(cached)
        # print(pickle_path + '\tloaded.')
        continue

    # One list per feature, filled in file order.
    tweets_features = dict((name, []) for name in feature_selectors)

    with open(fn, 'rb') as d:
        for ind, line in enumerate(d):
            # Sample cap: only lines with index 0..1000 are read from each file.
            if ind > 1000:
                break
            # A blank line (e.g. a trailing newline) is not a tweet and would
            # make json.loads raise, so skip it.
            if not line.strip():
                continue
            tweet_dict = json.loads(line)
            feature_dict = feature_extract(tweet_dict, feature_selectors)
            for name in feature_dict:
                tweets_features[name].append(feature_dict[name])

    with gzip.open(pickle_path, 'wb') as p:
        pickle.dump(tweets_features, p, protocol=pickle.HIGHEST_PROTOCOL)
    print(pickle_path + '\tsaved.')
        

In [58]:
import pandas as pd
based_dir = '../data/ECE219_tweet_data/'

# Load the cached features of each hashtag into a DataFrame. `pd_df` is rebound
# on every iteration, so after the loop it holds the last hashtag in the list.
# NOTE: pickle.load can execute arbitrary code; only load pickles you created.
for hashtag in ['superbowl', 'nfl', 'gohawks', 'gopatriots', 'patriots', 'sb49']:
    pickle_name = 'tweets_#' + hashtag + '.txt.pkl.gz'
    with gzip.open(based_dir + pickle_name, 'rb') as f:
        features_superbowl = pickle.load(f)
    pd_df = pd.DataFrame(features_superbowl)
    # Start the analysis ...

In [59]:
# Preview the first rows of pd_df.
pd_df.head()

Unnamed: 0,fo_count,re_count,time,time_ori
0,59,2,1421238675,1421238675
1,21,1,1421244058,1421244058
2,53,1,1421246898,1421246898
3,533,2,1421249217,1421249217
4,260,1,1421249288,1421249288


In [60]:
# Add a `citetime` column: the integer `time` values parsed as a number of
# seconds (unit='s') into datetimes.
pd_df['citetime'] = pd.to_datetime(pd_df['time'], unit='s')

In [61]:
# Preview again; the frame now includes the `citetime` column.
pd_df.head()

Unnamed: 0,fo_count,re_count,time,time_ori,citetime
0,59,2,1421238675,1421238675,2015-01-14 12:31:15
1,21,1,1421244058,1421244058,2015-01-14 14:00:58
2,53,1,1421246898,1421246898,2015-01-14 14:48:18
3,533,2,1421249217,1421249217,2015-01-14 15:26:57
4,260,1,1421249288,1421249288,2015-01-14 15:28:08


In [62]:
import pytz
utc_tz = pytz.UTC
pst_tz = pytz.timezone('America/Los_Angeles')

# Parse `time` as seconds, mark the result as UTC, then convert it to
# US Pacific time. The vectorised `.dt` accessors do this for the whole column
# at once (no per-row Python lambda) and still produce a timezone-aware column
# when the frame is empty.
pd_df['citetime_pst'] = (
    pd.to_datetime(pd_df['time'], unit='s')
    .dt.tz_localize(utc_tz)
    .dt.tz_convert(pst_tz)
)

In [63]:
# Preview again; the frame now includes the `citetime_pst` column.
pd_df.head()

Unnamed: 0,fo_count,re_count,time,time_ori,citetime,citetime_pst
0,59,2,1421238675,1421238675,2015-01-14 12:31:15,2015-01-14 04:31:15-08:00
1,21,1,1421244058,1421244058,2015-01-14 14:00:58,2015-01-14 06:00:58-08:00
2,53,1,1421246898,1421246898,2015-01-14 14:48:18,2015-01-14 06:48:18-08:00
3,533,2,1421249217,1421249217,2015-01-14 15:26:57,2015-01-14 07:26:57-08:00
4,260,1,1421249288,1421249288,2015-01-14 15:28:08,2015-01-14 07:28:08-08:00


In [44]:
# Bucket the rows into 60-minute windows of `citetime_pst` and sum `re_count`
# within each window.
hourly_windows = pd.Grouper(key='citetime_pst', freq="60Min")
df_windowed = pd_df.groupby(hourly_windows).agg({'re_count': 'sum'})
# read more about the agg function: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.DataFrameGroupBy.agg.html

In [49]:
# Preview the first hourly windows.
df_windowed.head()

Unnamed: 0_level_0,re_count
citetime_pst,Unnamed: 1_level_1
2015-01-14 04:00:00-08:00,2.0
2015-01-14 05:00:00-08:00,
2015-01-14 06:00:00-08:00,2.0
2015-01-14 07:00:00-08:00,3.0
2015-01-14 08:00:00-08:00,2.0


In [50]:
# Show the first windows with missing sums replaced by 0. The result is not
# assigned, so `df_windowed` itself is left unchanged.
df_windowed.fillna(value=0).head()

Unnamed: 0_level_0,re_count
citetime_pst,Unnamed: 1_level_1
2015-01-14 04:00:00-08:00,2
2015-01-14 05:00:00-08:00,0
2015-01-14 06:00:00-08:00,2
2015-01-14 07:00:00-08:00,3
2015-01-14 08:00:00-08:00,2


## Innovation Part
    - Look through the literature, and list some potential tasks for this part. Start with the paper we referred to in the spec.
    - There are many different analysis tasks, try to be creative!
   
   

## Useful links/materials
    - Statsmodels -> Formulas : http://www.statsmodels.org/dev/examples/notebooks/generated/formulas.html
        - Enables R-like regression formulas
        - Easy to modify small feature sets. 
        - Can handle categorical features pretty well
        - More generally, DataFrames and other techniques can also handle categorical features (e.g. pd.get_dummies).
   