### Prepare Data for Calendar
The website will include a calendar control that shows, for each day, the predicted value and the actual value. The actual values have already been calculated and are stored in the file that also contains the article content, "3.News_With_Labels.json". The actual values are extracted from this file, combined with the predictions, and written to a new file.   
  
Create a JSON file that is a list of "CalendarData" objects, each with:
* "date": (int) date of the calculation; YYYYMMDD
* "predicted": (string) model calculation; "Buy", "Sell", "Hold", "Unknown"
* "actual": (string) calculated from actual stock results; "Buy", "Sell", "Hold", "Unknown"

In [1]:
import json
import datetime
import pandas as pd

from datetime import timedelta

#### 1 Read Actual Data

In [2]:
#- File that holds the articles together with their calculated labels
sourceFile = "3.News_With_Labels.json"

#- Open with an explicit encoding so the result does not depend on the
#- platform's default text encoding
with open(sourceFile, encoding="utf-8") as jsonFile:
    sourceData = json.load(jsonFile)

print("completed loading file")

completed loading file


In [3]:
#-- Determine count of articles
len(sourceData['articles'])

4980

In [4]:
#- Check date format conversion: parse the first article's publish date
#- with the "%Y-%m-%d" pattern to confirm it converts to a datetime
# sourceData['articles'][0]['publishdate']
datetime.datetime.strptime(sourceData['articles'][0]['publishdate'], "%Y-%m-%d")

datetime.datetime(2018, 1, 2, 0, 0)

#### 2 Read Predicted Data
The predicted data is stored in a CSV file, "y_pred.csv".

In [5]:
#- CSV file with the model predictions
sourcePredictedFile = "y_pred.csv"

#- Load the predictions into a DataFrame
predicted_df = pd.read_csv(sourcePredictedFile)

#- Preview the first rows; column '0' holds the label and 'date' the day
predicted_df.head()

Unnamed: 0.1,Unnamed: 0,0,date
0,0,Hold,2018-01-02
1,1,Hold,2018-01-02
2,2,Hold,2018-01-02
3,3,Hold,2018-01-02
4,4,Hold,2018-01-02


#### 2 Search for Date
Functions to look up, for a given date, the predicted label and the label calculated from the actual stock results

In [12]:
def getPredictedLabelForDate(searchDate, predicted_df):
    ''' Looks up the predicted label for a date. Uses the first row found.

    Accepts : searchDate (datetime) date to determine label
              predicted_df (DataFrame) contains predicted information
                      date: (string) date of the prediction, format "YYYY-MM-DD"
                      0: (string) label that was predicted

    Returns : (string) label that is predicted; "Buy", "Sell", "Hold" and "Unknown" when not found

    '''

    #- The 'date' column is compared as text, so format the search date the same way
    dateKey = searchDate.strftime("%Y-%m-%d")

    matchingRows = predicted_df.loc[predicted_df['date'] == dateKey]

    #- No prediction for this date
    if len(matchingRows) == 0:
        return "Unknown"

    #- Several rows can share a date; the first one is used
    return matchingRows.iloc[0]['0']

In [8]:
def getActualLabelForDate(searchDate, sourceData):
    ''' Searches the articles for the label that was calculated for a date. Gets the first value found.

    The match is made on the calendar day only, so a searchDate that carries a
    time of day (or is a plain date) is treated the same way as in
    getPredictedLabelForDate.

    Accepts : searchDate (datetime or date) date to determine the label
              sourceData (dictionary) data that contains labels
                      articles: (list) all of the articles
                          publishdate: (string) date when the article was published, format "YYYY-MM-DD"
                          label: (string) information on what actual stock result

    Returns : (string) label that was calculated; "Buy", "Sell", "Hold" and "Unknown" when not found

    Raises  : ValueError when a publishdate does not match the "%Y-%m-%d" format
    '''

    #- Reduce the search value to a calendar day once, outside the loop
    if isinstance(searchDate, datetime.datetime):
        searchDay = searchDate.date()
    else:
        searchDay = searchDate

    for article in sourceData['articles']:

        #- Convert Date
        articleDay = datetime.datetime.strptime(article['publishdate'], "%Y-%m-%d").date()

        #- First article published on that day decides the label
        if articleDay == searchDay:
            return article['label']

    return "Unknown"

#### 3 Get Results by Date
Get the results for the study period

In [13]:
#- Set Date Range (both the start and the end date are included)
startSearchDate = datetime.datetime(2017,7,1)
endSearchDate = datetime.datetime(2019,6,30)

calendarDates = []
searchDate = startSearchDate

#- Walk the range one day at a time; the loop condition replaces a manual continue flag
while searchDate <= endSearchDate:

    #- Get Actual Label
    actualLabel = getActualLabelForDate(searchDate, sourceData)

    #- Get Predictive Label
    predictiveLabel = getPredictedLabelForDate(searchDate, predicted_df)

    #- One entry per day; the date is stored as an int in YYYYMMDD form
    calendarDates.append({
        "date": int(searchDate.strftime('%Y%m%d')),
        "predicted": predictiveLabel,
        "actual": actualLabel
    })

    #- Update Date
    searchDate = searchDate + timedelta(days=1)


print(f"Completed getting values, number of items: {len(calendarDates)}")

Completed getting values, number of items: 730


#### 4 Save Results
Save the list of "CalendarData" objects to disk

In [14]:
#- Serialize the calendar entries and write them out as JSON
outputFile = 'calendarData_4.json'

with open(outputFile, 'w') as fp:
    fp.write(json.dumps(calendarDates))

print("Completed writing file to disk")

Completed writing file to disk
