# Playground Notebook
## To try, validate and debug the code into the dashboard

In [166]:
import pandas as pd
from pandas.io.json import json_normalize
import json
import os

In [167]:
# Stock Names
stock_names = {'GS':'Goldman Sachs Group Inc',
               'LMT':'Lockheed Martin Corporation',
               'TSLA':'TESLA',
               'MSFT':'Microsoft Corporation',
               'AAPL':'Apple Inc.',
               'MCD':'McDonalds Corporation',
               'NKE':'Nike Inc',
               'PFE':'Pfizer Inc.',
               'FB':'Facebook, Inc.',
               'GOOGL':'Alphabet Inc.'
              }

In [168]:
# Name of timestamp to parse on the raw data file
stamp_name = '4. timestamp'

In [169]:
# Column names on the raw data file
header_names = {'1. symbol':'sym',
                '2. price':'$', 
                '3. volume':'vol'}

In [170]:
# Home directory
home_dir = os.path.expanduser(r'~/Documents/ZHAW/MAIN/04_Big_Data/30_Project/')
# Stock market data directory
stock_dir = r'spark-stock-market-streaming/collected_data/'
# Tweet data directory
tweet_dir = r'???'

### Build Dataframe from json files

In [171]:
def buildDF(base_dir, data_dir, json_col):
    '''Construct a big dataframe by aggregating the individual json files
    located at the proper data directory
    Args:
        base_dir(str), the home or base directory
        data_dir(str), the directory containing the data
        json_col(str), which column to normalize from the json file
    return:
        df(dataframe), full dataframe iaw json structure'''
    folder = os.path.join(base_dir + data_dir)
    files = os.listdir(folder)
    count_files = 0
    for file in files:
        file_path = os.path.join(folder + file)
        with open(file_path) as data_file:
            data = json.load(data_file)
            if count_files == 0:
                df = json_normalize(data, json_col)
                print('---- Base Dataframe ----')
                print('Lenght of base dataframe is: ', len(df))
                print(file_path)
                print(df.head())
                count_files += 1
            else:
                df_temp = json_normalize(data, json_col)
                df = df.append(df_temp, ignore_index=True)
                count_files += 1
    print('------------------------')
    print('Total files read: ' + str(count_files))
    print('---- %d Dataframes appended ----' %count_files)
    print('Total lenght of dataframe is: ', len(df))
    return df

In [172]:
dfs = buildDF(home_dir, stock_dir, 'Stock Quotes')

---- Base Dataframe ----
Lenght of base dataframe is:  10
/home/hase/Documents/ZHAW/MAIN/04_Big_Data/30_Project/spark-stock-market-streaming/collected_data/20180611-221636.json
  1. symbol  2. price 3. volume         4. timestamp
0      TSLA  332.0500  12997109  2018-06-11 16:00:00
1      AAPL  191.1600  17974245  2018-06-11 16:09:39
2      MSFT  101.0500  22795861  2018-06-11 16:11:44
3       MCD  166.4400   3214869  2018-06-11 16:00:43
4       NKE   74.5700   4676668  2018-06-11 16:02:14
------------------------
Total files read: 1061
---- 1061 Dataframes appended ----
Total lenght of dataframe is:  10610


### Data Pre-processing

In [173]:
def transDF(df, stamp, header, names):
    '''Pre-processing of the data by doing some transformations
    and aggregations to the dataframe created previously
    Args:
        df(dataframe): input dataframe
        stamp(str): name column to parse for datetime
        header(dict): a dictionary with {'original col name':'new col name'}
        names(dict): a dictionary with the mapping {'stock abrev.':'stock full name'}
    return:
        df(dataframe): transformed dataframe'''
    print('---- Input Dataframe ----')
    print(df.head())
    # Parse date and time
    print('------------------------')
    print('Parsing datetimes...')
    df['stamp'] = pd.to_datetime(df[stamp])
    df['date'] = df['stamp'].dt.date
    #df['year'] = df['stamp'].dt.year
    #df['month'] = df['stamp'].dt.strftime('%b')
    #df['day'] = df['stamp'].dt.day
    df['time'] = df['stamp'].dt.time
    print('Dropping auxiliary columns...')
    df.drop([stamp], axis=1, inplace=True)
    # Set dataframe index
    df.set_index(['date', 'time'], inplace=True)
    #df.set_index(['year', 'month', 'day', 'time'], inplace=True)
    print('Multi-index set from: ', stamp)
    print('Renaming columns...')
    df.rename(columns=header, inplace=True)
    print('Drop duplicates...')
    original_len = len(df)
    df.drop_duplicates(inplace=True)
    drop_len = len(df)
    print('Dataframe reduced from %d to %d rows' %(original_len, drop_len))
    df.sort_index(inplace=True)
    print('Renaming rows...')
    for key, value in names.items():
        mask = df.sym == key
        df.loc[mask, 'sym'] = value + ', ' + '(' + key + ')'
    print('---- Modified Dataframe ----')
    print(df.head())
    return df

In [174]:
dfss = transDF(dfs, stamp_name, header_names, stock_names)

---- Input Dataframe ----
  1. symbol  2. price 3. volume         4. timestamp
0      TSLA  332.0500  12997109  2018-06-11 16:00:00
1      AAPL  191.1600  17974245  2018-06-11 16:09:39
2      MSFT  101.0500  22795861  2018-06-11 16:11:44
3       MCD  166.4400   3214869  2018-06-11 16:00:43
4       NKE   74.5700   4676668  2018-06-11 16:02:14
------------------------
Parsing datetimes...
Dropping auxiliary columns...
Multi-index set from:  4. timestamp
Renaming columns...
Drop duplicates...
Dataframe reduced from 10610 to 7455 rows
Renaming rows...
---- Modified Dataframe ----
                                                    sym         $ vol  \
date       time                                                         
2018-06-05 15:59:46       Goldman Sachs Group Inc, (GS)  228.3000  --   
           15:59:53  Lockheed Martin Corporation, (LMT)  320.1300  --   
           15:59:55                       TESLA, (TSLA)  290.9400  --   
           15:59:55       Microsoft Corporation, (MS

In [175]:
dfss.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sym,$,vol,stamp
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-06-05,15:59:46,"Goldman Sachs Group Inc, (GS)",228.3,--,2018-06-05 15:59:46
2018-06-05,15:59:53,"Lockheed Martin Corporation, (LMT)",320.13,--,2018-06-05 15:59:53
2018-06-05,15:59:55,"TESLA, (TSLA)",290.94,--,2018-06-05 15:59:55
2018-06-05,15:59:55,"Microsoft Corporation, (MSFT)",102.21,--,2018-06-05 15:59:55
2018-06-05,15:59:57,"Apple Inc., (AAPL)",193.36,--,2018-06-05 15:59:57
