# Playground Notebook
## For trying out, validating and debugging the code that goes into the dashboard

In [245]:
import pandas as pd
from pandas.io.json import json_normalize
import json
import os

In [246]:
# Home directory (machine-specific, '~' is expanded to the current user's home).
# The trailing separator is kept so that sub-directories can be appended to it.
home_dir = os.path.expanduser(r'~/Documents/ZHAW/MAIN/04_Big_Data/30_Project/')
# Stock market data directory, given relative to home_dir
stock_dir = r'spark-stock-market-streaming/collected_data/'
# Tweet data directory
# TODO: '???' is a placeholder -- set the real location before loading tweet data
tweet_dir = r'???'

### Build DataFrame from JSON files

In [247]:
def buildDF(base_dir, data_dir, json_col):
    '''Construct a big dataframe by aggregating the individual json files
    located at the proper data directory.

    Every regular file found in the data directory is parsed as JSON and the
    records stored under ``json_col`` are flattened with ``json_normalize``.
    Files are processed in sorted name order so that the row order of the
    result is reproducible, and the per-file frames are combined with one
    single concatenation at the end.

    Args:
        base_dir(str), the home or base directory
        data_dir(str), the directory containing the data, relative to base_dir
            (a trailing separator on either argument is optional)
        json_col(str), which column to normalize from the json file
    return:
        df(dataframe), full dataframe iaw json structure with a fresh
            0..n-1 index; an empty dataframe if no file was found
    raises:
        OSError, if the data directory or one of its files cannot be read
        ValueError, if a file does not contain valid JSON
        KeyError, if json_col is missing from a file'''
    # Join with a real separator instead of plain string concatenation
    folder = os.path.join(base_dir, data_dir)
    frames = []
    # os.listdir returns entries in arbitrary order, sort for reproducibility
    for file_name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)
        # Skip sub-directories (e.g. notebook checkpoint folders)
        if not os.path.isfile(file_path):
            continue
        with open(file_path) as data_file:
            data = json.load(data_file)
        frame = json_normalize(data, json_col)
        if not frames:
            # Show a preview of the very first file only
            print('---- Base Dataframe ----')
            print('Lenght of base dataframe is: ', len(frame))
            print(file_path)
            print(frame.head())
        frames.append(frame)
    count_files = len(frames)
    # One concat at the end: linear cost and no reliance on DataFrame.append
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame()
    print('------------------------')
    print('Total files read: ' + str(count_files))
    print('---- %d Dataframes appended ----' %count_files)
    print('Total lenght of dataframe is: ', len(df))
    return df

In [248]:
# Load every collected stock-quote file into one dataframe
dfs = buildDF(base_dir=home_dir, data_dir=stock_dir, json_col='Stock Quotes')

---- Base Dataframe ----
Lenght of base dataframe is:  10
/home/hase/Documents/ZHAW/MAIN/04_Big_Data/30_Project/spark-stock-market-streaming/collected_data/20180606-220010.json
  1. symbol  2. price 3. volume         4. timestamp
0      TSLA  319.4700  18529555  2018-06-06 16:00:00
1      AAPL  194.0000  19893045  2018-06-06 16:00:00
2      MSFT  102.4450  19485170  2018-06-06 16:00:00
3       MCD  162.3250   2723457  2018-06-06 16:00:00
4       NKE   74.7350   4346701  2018-06-06 16:00:00
------------------------
Total files read: 530
---- 530 Dataframes appended ----
Total lenght of dataframe is:  5300


### Data Pre-processing

In [249]:
def transDF(df, stamp, header):
    '''Pre-processing of the data by doing some transformations
    and aggregations to the dataframe created previously.

    The input dataframe is not modified: all the work is done on a copy, so
    the function can be called again on the same input and gives the same
    result.

    Steps: parse the ``stamp`` column to datetimes, add year / month / day /
    hour / minute / second columns, replace the ``stamp`` column by a
    datetime index named 'stamp', rename the columns iaw ``header`` and drop
    rows whose column values are fully duplicated.

    Args:
        df(dataframe): input dataframe
        stamp(str): name column to parse for datetime
        header(dict): a dictionary with {'original col name':'new col name'}
    return:
        df(dataframe): transformed dataframe (a new object)
    raises:
        KeyError: if ``stamp`` is not a column of ``df``'''
    print('---- Input Dataframe ----')
    print(df.head())
    # Work on a copy so the caller's dataframe stays intact
    out = df.copy()
    # Parse date and time
    print('------------------------')
    print('Parsing datetimes...')
    parsed = pd.to_datetime(out[stamp])
    out['year'] = parsed.dt.year
    out['month'] = parsed.dt.month
    out['day'] = parsed.dt.day
    out['hour'] = parsed.dt.hour
    out['minute'] = parsed.dt.minute
    out['second'] = parsed.dt.second
    # Drop unneeded columns
    print('Dropping columns...')
    # Also remove a pre-existing 'stamp' column: the parsed values take its
    # place as index. Using a set keeps this valid when stamp == 'stamp'.
    out = out.drop(columns=list({stamp, 'stamp'}), errors='ignore')
    # Set dataframe index
    out = out.set_index(parsed.rename('stamp'))
    # The message reports the source column; the index itself is named 'stamp'
    print('Index set to: ', stamp)
    # Rename columns
    print('Renaming columns...')
    out = out.rename(columns=header)
    print('Drop duplicates...')
    original_len = len(out)
    # Duplicates are judged on column values only (the index is ignored)
    out = out.drop_duplicates()
    drop_len = len(out)
    print('Dataframe reduced from %d to %d rows' %(original_len, drop_len))
    print('---- Modified Dataframe ----')
    print(out.head())
    return out

In [250]:
# Turn the quote timestamp into the index and shorten the column names
dfss = transDF(
    df=dfs,
    stamp='4. timestamp',
    header={'1. symbol': 'sym', '2. price': '$', '3. volume': 'vol'},
)

---- Input Dataframe ----
  1. symbol  2. price 3. volume         4. timestamp
0      TSLA  319.4700  18529555  2018-06-06 16:00:00
1      AAPL  194.0000  19893045  2018-06-06 16:00:00
2      MSFT  102.4450  19485170  2018-06-06 16:00:00
3       MCD  162.3250   2723457  2018-06-06 16:00:00
4       NKE   74.7350   4346701  2018-06-06 16:00:00
------------------------
Parsing datetimes...
Dropping columns...
Index set to:  4. timestamp
Renaming columns...
Drop duplicates...
Dataframe reduced from 5300 to 3695 rows
---- Modified Dataframe ----
                      sym         $       vol  year  month  day  hour  minute  \
stamp                                                                           
2018-06-06 16:00:00  TSLA  319.4700  18529555  2018      6    6    16       0   
2018-06-06 16:00:00  AAPL  194.0000  19893045  2018      6    6    16       0   
2018-06-06 16:00:00  MSFT  102.4450  19485170  2018      6    6    16       0   
2018-06-06 16:00:00   MCD  162.3250   2723457  2

In [None]:
# Number of rows in the pre-processed dataframe
len(dfss)