# Playground Notebook
## To try, validate and debug the code into the dashboard

In [259]:
import pandas as pd
from pandas.io.json import json_normalize
import json
import os

## Stocks

In [260]:
# Stock Names
stock_names = {'GS':'Goldman Sachs Group Inc',
               'LMT':'Lockheed Martin Corporation',
               'TSLA':'TESLA',
               'MSFT':'Microsoft Corporation',
               'AAPL':'Apple Inc.',
               'MCD':'McDonalds Corporation',
               'NKE':'Nike Inc',
               'PFE':'Pfizer Inc.',
               'FB':'Facebook, Inc.',
               'GOOGL':'Alphabet Inc.'
              }

In [261]:
# Name of timestamp to parse on the raw data file
stamp_name = '4. timestamp'

In [262]:
# Column names on the raw data file
header_names = {'1. symbol':'sym',
                '2. price':'$', 
                '3. volume':'vol'}

In [263]:
# Directories and Files
# Windows
if os.name == 'nt':
    home_dir = r'//userhome/users$/ksagilop/home/ZHAW/MAIN/04_Big_Data/'
    stock_dir = r'spark-stock-market-streaming/collected_data/'
    tweet_dir = r'spark-stock-market-streaming/collected_tweets_csv_raw/'
# Linux
if os.name == 'posix':
    home_dir = os.path.expanduser(r'~/Documents/ZHAW/MAIN/04_Big_Data/30_Project/')
    stock_dir = r'spark-stock-market-streaming/collected_data/'
    tweet_dir = r'spark-stock-market-streaming/collected_tweets_csv_raw/'

### Build Dataframe from json files

In [264]:
def buildDF(base_dir, data_dir, json_col):
    '''Construct a big dataframe by aggregating the individual json files
    located at the proper data directory
    Args:
        base_dir(str), the home or base directory
        data_dir(str), the directory containing the data
        json_col(str), which column to normalize from the json file
    return:
        df(dataframe), full dataframe iaw json structure'''
    folder = os.path.join(base_dir + data_dir)
    files = os.listdir(folder)
    count_files = 0
    for file in files:
        file_path = os.path.join(folder + file)
        with open(file_path) as data_file:
            data = json.load(data_file)
            if count_files == 0:
                df = json_normalize(data, json_col)
                print('---- Base Dataframe ----')
                print('Lenght of base dataframe is: ', len(df))
                print(file_path)
                print(df.head())
                count_files += 1
            else:
                df_temp = json_normalize(data, json_col)
                df = df.append(df_temp, ignore_index=True)
                count_files += 1
    print('------------------------')
    print('Total files read: ' + str(count_files))
    print('---- %d Dataframes appended ----' %count_files)
    print('Total lenght of dataframe is: ', len(df))
    return df

In [265]:
dfs = buildDF(home_dir, stock_dir, 'Stock Quotes')

---- Base Dataframe ----
Lenght of base dataframe is:  10
/home/hase/Documents/ZHAW/MAIN/04_Big_Data/30_Project/spark-stock-market-streaming/collected_data/20180611-221636.json
  1. symbol  2. price 3. volume         4. timestamp
0      TSLA  332.0500  12997109  2018-06-11 16:00:00
1      AAPL  191.1600  17974245  2018-06-11 16:09:39
2      MSFT  101.0500  22795861  2018-06-11 16:11:44
3       MCD  166.4400   3214869  2018-06-11 16:00:43
4       NKE   74.5700   4676668  2018-06-11 16:02:14
------------------------
Total files read: 2123
---- 2123 Dataframes appended ----
Total lenght of dataframe is:  21230


### Data Pre-processing

In [266]:
def transDF(df, stamp, header, names):
    '''Pre-processing of the data by doing some transformations
    and aggregations to the dataframe created previously
    Args:
        df(dataframe): input dataframe
        stamp(str): name column to parse for datetime
        header(dict): a dictionary with {'original col name':'new col name'}
        names(dict): a dictionary with the mapping {'stock abrev.':'stock full name'}
    return:
        df(dataframe): transformed dataframe'''
    print('---- Input Dataframe ----')
    print(df.head())
    # Parse date and time
    print('------------------------')
    print('Parsing datetimes...')
    df['stamp'] = pd.to_datetime(df[stamp])
    df['date'] = df['stamp'].dt.date
    #df['year'] = df['stamp'].dt.year
    #df['month'] = df['stamp'].dt.strftime('%b')
    #df['day'] = df['stamp'].dt.day
    df['time'] = df['stamp'].dt.time
    #print('Dropping auxiliary columns...')
    #df.drop([stamp], axis=1, inplace=True) # Not needed as 'stamp' is used for x-axis
    # Set dataframe index
    df.set_index(['date', 'time'], inplace=True)
    #df.set_index(['year', 'month', 'day', 'time'], inplace=True)
    print('Multi-index set from: ', stamp)
    print('Renaming columns...')
    df.rename(columns=header, inplace=True)
    print('Drop duplicates...')
    original_len = len(df)
    df.drop_duplicates(inplace=True)
    drop_len = len(df)
    print('Dataframe reduced from %d to %d rows' %(original_len, drop_len))
    df.sort_index(inplace=True)
    print('Renaming rows...')
    for key, value in names.items():
        mask = df.sym == key
        df.loc[mask, 'sym'] = value + ', ' + '(' + key + ')'
    print('---- Modified Dataframe ----')
    print(df.head())
    return df

In [267]:
dfss = transDF(dfs, stamp_name, header_names, stock_names)

---- Input Dataframe ----
  1. symbol  2. price 3. volume         4. timestamp
0      TSLA  332.0500  12997109  2018-06-11 16:00:00
1      AAPL  191.1600  17974245  2018-06-11 16:09:39
2      MSFT  101.0500  22795861  2018-06-11 16:11:44
3       MCD  166.4400   3214869  2018-06-11 16:00:43
4       NKE   74.5700   4676668  2018-06-11 16:02:14
------------------------
Parsing datetimes...
Multi-index set from:  4. timestamp
Renaming columns...
Drop duplicates...
Dataframe reduced from 21230 to 15067 rows
Renaming rows...
---- Modified Dataframe ----
                                                    sym         $ vol  \
date       time                                                         
2018-06-05 15:59:46       Goldman Sachs Group Inc, (GS)  228.3000  --   
           15:59:53  Lockheed Martin Corporation, (LMT)  320.1300  --   
           15:59:55                       TESLA, (TSLA)  290.9400  --   
           15:59:55       Microsoft Corporation, (MSFT)  102.2100  --   
        

In [268]:
dfss.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sym,$,vol,4. timestamp,stamp
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-06-05,15:59:46,"Goldman Sachs Group Inc, (GS)",228.3,--,2018-06-05 15:59:46,2018-06-05 15:59:46
2018-06-05,15:59:53,"Lockheed Martin Corporation, (LMT)",320.13,--,2018-06-05 15:59:53,2018-06-05 15:59:53
2018-06-05,15:59:55,"TESLA, (TSLA)",290.94,--,2018-06-05 15:59:55,2018-06-05 15:59:55
2018-06-05,15:59:55,"Microsoft Corporation, (MSFT)",102.21,--,2018-06-05 15:59:55,2018-06-05 15:59:55
2018-06-05,15:59:57,"Apple Inc., (AAPL)",193.36,--,2018-06-05 15:59:57,2018-06-05 15:59:57


In [269]:
s_hr_min = dfss.index.get_level_values('time').min()
s_hr_max = dfss.index.get_level_values('time').max()

In [270]:
s_hr_min, s_hr_max

(datetime.time(8, 19, 30), datetime.time(16, 59, 59))

## Tweets

In [271]:
hashtag_map = {'nike':'Nike Inc, (NKE)', 
               'facebook':'Facebook, Inc., (FB)', 
               'apple':'Apple Inc., (AAPL)', 
               'microsoft':'Microsoft Corporation, (MSFT)', 
               'mcdonalds':'McDonalds Corporation, (MCD)', 
               'tesla':'TESLA, (TSLA)',
               'n/a':'NA', 
               'goldmansachs':'Goldman Sachs Group Inc, (GS)', 
               'alphabet':'Alphabet Inc., (GOOGL)', 
               'pfizer':'Pfizer Inc., (PFE)'}

In [272]:
def buildDFbis(base_dir, data_dir):
    '''Construct a big dataframe by aggregating the individual csv files
    located at the proper data directory
    Args:
        base_dir(str), the home or base directory
        data_dir(str), the directory containing the data
    return:
        df(dataframe), full dataframe iaw csv structure'''
    folder = os.path.join(base_dir + data_dir)
    files = os.listdir(folder)
    file_path = os.path.join(folder + files[0])
    # Dataframe
    df = pd.read_csv(file_path,
                     sep=';',
                     parse_dates={'stamp':['timestamp']},
                     keep_date_col=False)
    print('Dataframe created from: ', file_path)
    print('with length: ', len(df))
    print('--- Dataframe processing ---')
    print('Creating date and time columns from timestamp')
    df['date'] = df['stamp'].dt.date
    df['time'] = df['stamp'].dt.time
    print('Setting date and time as Index...')
    df.set_index(['date', 'time'], inplace=True)
    return df

In [273]:
dft = buildDFbis(home_dir, tweet_dir)

Dataframe created from:  /home/hase/Documents/ZHAW/MAIN/04_Big_Data/30_Project/spark-stock-market-streaming/collected_tweets_csv_raw/tweets.csv
with length:  171273
--- Dataframe processing ---
Creating date and time columns from timestamp
Setting date and time as Index...


In [274]:
dft.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,stamp,hashtag,cnt,followers_count,log_followers_count
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-06-09,03:10:23,2018-06-09 03:10:23,apple,1,287,5.659482
2018-06-09,03:10:28,2018-06-09 03:10:28,apple,1,535,6.282267
2018-06-09,03:10:30,2018-06-09 03:10:30,nike,1,66,4.189655
2018-06-09,03:10:32,2018-06-09 03:10:32,apple,1,39384,10.581115
2018-06-09,03:10:33,2018-06-09 03:10:33,facebook,1,11822,9.377717


In [275]:
# Slice tweet dataframe to an equal stock timestamp
stock_hr_min = dfss.index.get_level_values('time').min()
stock_hr_max = dfss.index.get_level_values('time').max()
tweet_hr_min = dft.index.get_level_values('time').min()
tweet_hr_max = dft.index.get_level_values('time').max()
mask_hr_min = dft.index.get_level_values('time') > stock_hr_min
dft = dft.loc[mask_hr_min]
mask_hr_max = dft.index.get_level_values('time') < stock_hr_max
dft = dft.loc[mask_hr_max]

In [276]:
dft.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,stamp,hashtag,cnt,followers_count,log_followers_count
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-06-09,08:19:34,2018-06-09 08:19:34,nike,1,3,1.098612
2018-06-09,08:19:39,2018-06-09 08:19:39,nike,1,59,4.077537
2018-06-09,08:19:41,2018-06-09 08:19:41,nike,1,26,3.258097
2018-06-09,08:19:42,2018-06-09 08:19:42,mcdonalds,1,1,0.0
2018-06-09,08:19:47,2018-06-09 08:19:47,apple,1,51,3.931826


In [331]:
dft['stamp_round'] = dft.stamp.dt.round('1Min')

In [332]:
c = dft.groupby(['stamp_round'])['log_followers_count'].sum()

In [333]:
c.head()

stamp_round
2018-06-09 08:20:00    178.598008
2018-06-09 08:21:00    201.026977
2018-06-09 08:22:00    153.596335
2018-06-09 08:23:00    147.767965
2018-06-09 08:24:00    146.745374
Name: log_followers_count, dtype: float64

In [334]:
c.index

DatetimeIndex(['2018-06-09 08:20:00', '2018-06-09 08:21:00',
               '2018-06-09 08:22:00', '2018-06-09 08:23:00',
               '2018-06-09 08:24:00', '2018-06-09 08:25:00',
               '2018-06-09 08:26:00', '2018-06-09 08:27:00',
               '2018-06-09 08:28:00', '2018-06-09 08:29:00',
               ...
               '2018-06-12 16:53:00', '2018-06-12 16:54:00',
               '2018-06-12 16:55:00', '2018-06-12 16:56:00',
               '2018-06-12 16:57:00', '2018-06-12 16:58:00',
               '2018-06-12 16:59:00', '2018-06-12 17:00:00',
               '2018-06-13 10:12:00', '2018-06-13 10:13:00'],
              dtype='datetime64[ns]', name='stamp_round', length=2086, freq=None)

In [329]:
c

stamp_round
2018-06-09 08:20:00     533.221320
2018-06-09 08:25:00     675.804657
2018-06-09 08:30:00     739.337659
2018-06-09 08:35:00     601.800495
2018-06-09 08:40:00     726.595835
2018-06-09 08:45:00     660.147264
2018-06-09 08:50:00     734.997276
2018-06-09 08:55:00     660.550036
2018-06-09 09:00:00     828.862269
2018-06-09 09:05:00     820.147048
2018-06-09 09:10:00     813.141613
2018-06-09 09:15:00     830.164758
2018-06-09 09:20:00     788.429100
2018-06-09 09:25:00     657.491009
2018-06-09 09:30:00    1156.627323
2018-06-09 09:35:00    1100.735832
2018-06-09 09:40:00    1050.001003
2018-06-09 09:45:00     940.215354
2018-06-09 09:50:00     959.764449
2018-06-09 09:55:00     741.485257
2018-06-09 10:00:00    1382.112239
2018-06-09 10:05:00     922.506912
2018-06-09 10:10:00     818.679878
2018-06-09 10:15:00     885.207178
2018-06-09 10:20:00     911.483656
2018-06-09 10:25:00     749.291543
2018-06-09 10:30:00     877.760136
2018-06-09 10:35:00     673.057043
2018-06-

In [320]:
dft.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,stamp,hashtag,cnt,followers_count,log_followers_count,hora,minuto,stamp_round,stamp_rd
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-06-09,08:19:34,2018-06-09 08:19:34,nike,1,3,1.098612,8,19,2018-06-09 08:20:00,08:20:00
2018-06-09,08:19:39,2018-06-09 08:19:39,nike,1,59,4.077537,8,19,2018-06-09 08:20:00,08:20:00
2018-06-09,08:19:41,2018-06-09 08:19:41,nike,1,26,3.258097,8,19,2018-06-09 08:20:00,08:20:00
2018-06-09,08:19:42,2018-06-09 08:19:42,mcdonalds,1,1,0.0,8,19,2018-06-09 08:20:00,08:20:00
2018-06-09,08:19:47,2018-06-09 08:19:47,apple,1,51,3.931826,8,19,2018-06-09 08:20:00,08:20:00


In [None]:
dft.groupby(['stamp', 'time_min'])['log_followers_count'].sum()