# Playground Notebook
## For trying out, validating and debugging code before it goes into the dashboard

In [1]:
import pandas as pd
from pandas.io.json import json_normalize
import json
import os

## Stocks

In [2]:
# Ticker symbol -> full company name, used to build readable labels
stock_names = {
    'GS': 'Goldman Sachs Group Inc',
    'LMT': 'Lockheed Martin Corporation',
    'TSLA': 'TESLA',
    'MSFT': 'Microsoft Corporation',
    'AAPL': 'Apple Inc.',
    'MCD': 'McDonalds Corporation',
    'NKE': 'Nike Inc',
    'PFE': 'Pfizer Inc.',
    'FB': 'Facebook, Inc.',
    'GOOGL': 'Alphabet Inc.',
}

In [3]:
# Name of the timestamp column in the raw data file; it is handed to
# transDF, which parses it with pd.to_datetime
stamp_name = '4. timestamp'

In [4]:
# Raw data file column name -> short column name used after renaming
header_names = {
    '1. symbol': 'sym',
    '2. price': '$',
    '3. volume': 'vol',
}

In [23]:
# Directories and Files
# The data folders are relative to home_dir and laid out the same way on
# every platform, so they are defined once (the Linux tweet_dir used to be
# a '?' placeholder, which made the tweet loader unusable there)
stock_dir = r'spark-stock-market-streaming/collected_data/'
tweet_dir = r'spark-stock-market-streaming/collected_tweets_csv_raw/'
# Windows
if os.name == 'nt':
    home_dir = r'//userhome/users$/ksagilop/home/ZHAW/MAIN/04_Big_Data/'
# Linux (and any other non-Windows platform)
else:
    home_dir = os.path.expanduser(r'~/Documents/ZHAW/MAIN/04_Big_Data/30_Project/')

### Build Dataframe from json files

In [6]:
def buildDF(base_dir, data_dir, json_col):
    '''Construct a big dataframe by aggregating the individual json files
    located at the proper data directory
    Every file in the directory is read (in sorted file-name order), the
    records under json_col are normalized to a dataframe and all the
    dataframes are concatenated once at the end.
    Args:
        base_dir(str), the home or base directory
        data_dir(str), the directory containing the data, relative to base_dir
        json_col(str), which column to normalize from the json file
    return:
        df(dataframe), full dataframe iaw json structure; an empty
        dataframe when the directory contains no files'''
    # json_normalize is available as pd.json_normalize on recent pandas;
    # older releases only expose it under pandas.io.json
    try:
        normalize = pd.json_normalize
    except AttributeError:
        from pandas.io.json import json_normalize as normalize
    # Join the two parts properly so a missing trailing separator on
    # base_dir or data_dir does not produce a wrong path
    folder = os.path.join(base_dir, data_dir)
    frames = []
    # os.listdir returns names in arbitrary order; sort for reproducibility
    for file_name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)
        with open(file_path) as data_file:
            data = json.load(data_file)
        frame = normalize(data, json_col)
        if not frames:
            print('---- Base Dataframe ----')
            print('Lenght of base dataframe is: ', len(frame))
            print(file_path)
            print(frame.head())
        frames.append(frame)
    count_files = len(frames)
    # A single concat replaces the removed DataFrame.append, which also
    # copied the whole frame on every iteration
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame()
    print('------------------------')
    print('Total files read: ' + str(count_files))
    print('---- %d Dataframes appended ----' %count_files)
    print('Total lenght of dataframe is: ', len(df))
    return df

In [7]:
# Load every collected stock-quote json file into one raw dataframe
dfs = buildDF(base_dir=home_dir, data_dir=stock_dir, json_col='Stock Quotes')

---- Base Dataframe ----
Lenght of base dataframe is:  10
//userhome/users$/ksagilop/home/ZHAW/MAIN/04_Big_Data/spark-stock-market-streaming/collected_data/20180606-140045.json
  1. symbol  2. price 3. volume         4. timestamp
0      TSLA  290.9400        --  2018-06-05 15:59:55
1      AAPL  193.3600        --  2018-06-05 15:59:57
2      MSFT  102.2100        --  2018-06-05 15:59:55
3       MCD  159.4800        --  2018-06-05 15:59:57
4       NKE   74.0600        --  2018-06-05 15:59:57
------------------------
Total files read: 2123
---- 2123 Dataframes appended ----
Total lenght of dataframe is:  21230


### Data Pre-processing

In [8]:
def transDF(df, stamp, header, names):
    '''Pre-processing of the data by doing some transformations
    and aggregations to the dataframe created previously
    The input dataframe is not modified: all the work is done on a copy,
    so the calling cell can be re-run against the same raw dataframe.
    Steps: parse the timestamp column, index the rows by (date, time),
    rename the columns, drop duplicated rows, sort by index and replace
    the stock abbreviations with 'full name, (abbrev.)' labels.
    Args:
        df(dataframe): input dataframe
        stamp(str): name column to parse for datetime
        header(dict): a dictionary with {'original col name':'new col name'};
            one of the new names must be 'sym' (the stock symbol column)
        names(dict): a dictionary with the mapping {'stock abrev.':'stock full name'}
    return:
        df(dataframe): transformed dataframe'''
    print('---- Input Dataframe ----')
    print(df.head())
    # Work on a copy so the caller's dataframe keeps its original columns
    # and index
    df = df.copy()
    # Parse date and time
    print('------------------------')
    print('Parsing datetimes...')
    df['stamp'] = pd.to_datetime(df[stamp])
    df['date'] = df['stamp'].dt.date
    df['time'] = df['stamp'].dt.time
    # The original stamp column is kept: 'stamp' is used for the x-axis
    # Set dataframe index
    df = df.set_index(['date', 'time'])
    print('Multi-index set from: ', stamp)
    print('Renaming columns...')
    df = df.rename(columns=header)
    print('Drop duplicates...')
    original_len = len(df)
    # Duplicates are detected on the column values (the index is ignored,
    # but the timestamp is still present as a column)
    df = df.drop_duplicates()
    drop_len = len(df)
    print('Dataframe reduced from %d to %d rows' %(original_len, drop_len))
    df = df.sort_index()
    print('Renaming rows...')
    # Symbols without an entry in names are left unchanged
    for key, value in names.items():
        mask = df['sym'] == key
        df.loc[mask, 'sym'] = value + ', ' + '(' + key + ')'
    print('---- Modified Dataframe ----')
    print(df.head())
    return df

In [9]:
# Pre-process the raw quotes: parse timestamps, index, rename, de-duplicate
dfss = transDF(df=dfs, stamp=stamp_name, header=header_names, names=stock_names)

---- Input Dataframe ----
  1. symbol  2. price 3. volume         4. timestamp
0      TSLA  290.9400        --  2018-06-05 15:59:55
1      AAPL  193.3600        --  2018-06-05 15:59:57
2      MSFT  102.2100        --  2018-06-05 15:59:55
3       MCD  159.4800        --  2018-06-05 15:59:57
4       NKE   74.0600        --  2018-06-05 15:59:57
------------------------
Parsing datetimes...
Multi-index set from:  4. timestamp
Renaming columns...
Drop duplicates...
Dataframe reduced from 21230 to 15057 rows
Renaming rows...
---- Modified Dataframe ----
                                                    sym         $ vol  \
date       time                                                         
2018-06-05 15:59:46       Goldman Sachs Group Inc, (GS)  228.3000  --   
           15:59:53  Lockheed Martin Corporation, (LMT)  320.1300  --   
           15:59:55                       TESLA, (TSLA)  290.9400  --   
           15:59:55       Microsoft Corporation, (MSFT)  102.2100  --   
        

In [10]:
# Preview the first rows of the pre-processed stock dataframe
dfss.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sym,$,vol,4. timestamp,stamp
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-06-05,15:59:46,"Goldman Sachs Group Inc, (GS)",228.3,--,2018-06-05 15:59:46,2018-06-05 15:59:46
2018-06-05,15:59:53,"Lockheed Martin Corporation, (LMT)",320.13,--,2018-06-05 15:59:53,2018-06-05 15:59:53
2018-06-05,15:59:55,"TESLA, (TSLA)",290.94,--,2018-06-05 15:59:55,2018-06-05 15:59:55
2018-06-05,15:59:55,"Microsoft Corporation, (MSFT)",102.21,--,2018-06-05 15:59:55,2018-06-05 15:59:55
2018-06-05,15:59:57,"Apple Inc., (AAPL)",193.36,--,2018-06-05 15:59:57,2018-06-05 15:59:57


## Tweets

In [32]:
# Tweet hashtag -> stock label, in the same 'full name, (abbrev.)' format
# that transDF writes into the 'sym' column
hashtag_map = {
    'nike': 'Nike Inc, (NKE)',
    'facebook': 'Facebook, Inc., (FB)',
    'apple': 'Apple Inc., (AAPL)',
    'microsoft': 'Microsoft Corporation, (MSFT)',
    'mcdonalds': 'McDonalds Corporation, (MCD)',
    'tesla': 'TESLA, (TSLA)',
    'n/a': 'NA',
    'goldmansachs': 'Goldman Sachs Group Inc, (GS)',
    'alphabet': 'Alphabet Inc., (GOOGL)',
    'pfizer': 'Pfizer Inc., (PFE)',
}

In [36]:
def buildDFbis(base_dir, data_dir):
    '''Construct a dataframe from the tweets csv file located at the
    proper data directory
    Only one file is read: the first '.csv' file of the directory in
    sorted file-name order (or, when no file has a '.csv' extension, the
    first file in sorted order). The file must be ';'-separated and have
    a 'timestamp' column, which is parsed to datetime, renamed to 'stamp'
    and placed as first column.
    Args:
        base_dir(str), the home or base directory
        data_dir(str), the directory containing the data, relative to base_dir
    return:
        df(dataframe), full dataframe iaw csv structure, indexed by
        (date, time)
    raises:
        FileNotFoundError, when the data directory contains no files'''
    # Join the two parts properly so a missing trailing separator on
    # base_dir or data_dir does not produce a wrong path
    folder = os.path.join(base_dir, data_dir)
    # os.listdir returns names in arbitrary order; sort so that the same
    # file is picked on every run
    files = sorted(os.listdir(folder))
    if not files:
        raise FileNotFoundError('No files found in: ' + folder)
    # Prefer real csv files over side files that may live in the folder
    csv_files = [name for name in files if name.lower().endswith('.csv')]
    file_path = os.path.join(folder, (csv_files or files)[0])
    # Dataframe
    # parse_dates as a plain list plus a rename replaces the deprecated
    # {'new name': [columns]} form and the deprecated keep_date_col flag
    df = pd.read_csv(file_path,
                     sep=';',
                     parse_dates=['timestamp'])
    df = df.rename(columns={'timestamp': 'stamp'})
    # Keep 'stamp' as the first column, as the dict form of parse_dates did
    df = df[['stamp'] + [col for col in df.columns if col != 'stamp']]
    print('Dataframe created from: ', file_path)
    print('with length: ', len(df))
    print('--- Dataframe processing ---')
    print('Creating date and time columns from timestamp')
    df['date'] = df['stamp'].dt.date
    df['time'] = df['stamp'].dt.time
    print('Setting date and time as Index...')
    df = df.set_index(['date', 'time'])
    return df

In [37]:
# Load the collected tweets csv into a dataframe indexed by (date, time)
dft = buildDFbis(base_dir=home_dir, data_dir=tweet_dir)

Dataframe created from:  //userhome/users$/ksagilop/home/ZHAW/MAIN/04_Big_Data/spark-stock-market-streaming/collected_tweets_csv_raw/tweets.csv
with length:  139773
--- Dataframe processing ---
Creating date and time columns from timestamp
Setting date and time as Index...


In [38]:
# Preview the first rows of the tweets dataframe
dft.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,stamp,hashtag,cnt,followers_count,log_followers_count
date,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-06-05,20:47:01,2018-06-05 20:47:01,nike,1,1379,7.229114
2018-06-05,20:47:01,2018-06-05 20:47:01,facebook,1,323,5.777652
2018-06-05,20:47:03,2018-06-05 20:47:03,apple,1,3930,8.276395
2018-06-05,20:47:03,2018-06-05 20:47:03,nike,1,85,4.442651
2018-06-05,20:47:04,2018-06-05 20:47:04,apple,1,6578,8.791486


In [64]:
# Scratch: take the timestamp of the first tweet
q = dft['stamp'].iloc[0]

In [66]:
# Scratch: display the extracted timestamp
q

Timestamp('2018-06-05 20:47:01')

In [72]:
# Scratch: (hour, minute) of the first tweet as a tuple
n = (q.hour, q.minute)

In [73]:
# Scratch: show the value together with its type
n, type(n)

((20, 47), tuple)

In [74]:
# NOTE(review): the recorded output of this cell is a DatetimeIndex, which a
# bare reference to the function cannot produce. The cell was presumably run
# as pd.to_datetime(n) and edited afterwards; confirm, then re-run or delete.
pd.to_datetime

DatetimeIndex(['1970-01-01 00:00:00.000000020', '1970-01-01 00:00:00.000000047'], dtype='datetime64[ns]', freq=None)