In [1]:
import docx
import re
from datetime import datetime
import csv
import json

In [2]:
class Mission:
    '''One shuttle mission parsed from a header line of the NASA word document.

    Instances are built from the tuple of regex groups of a mission header:
    (name, start month, start day, end month or None, end day or None,
    year, trailing notes or None).

    Attributes:
        name: mission designation with surrounding whitespace removed.
        mission_start: launch date formatted as MM/DD/YYYY.
        mission_end: end date formatted as MM/DD/YYYY, or None when the
            header carries no end day.
        notes: stripped trailing text of the header, '' when there is none.
        days: list that other code fills with per-day entries; starts empty.
    '''
    # Class-level defaults; __init__ overrides them per instance when the
    # corresponding header field is present.
    name = None
    notes = ''
    mission_start = None
    mission_end = None

    def __init__(self, mission_data):
        '''Populate the mission from the seven header fields in mission_data.'''
        self.days = []
        self.name = mission_data[0].strip()

        def as_us_date(month, day, year):
            # Parse "<month> <day> <year>" and re-emit it as MM/DD/YYYY.
            stamp = datetime.strptime(' '.join([month, day, year]), MISSION_DATE_FORMAT)
            return stamp.strftime('%m/%d/%Y')

        launch_month = mission_data[1]
        launch_day = mission_data[2]
        year = mission_data[5]
        self.mission_start = as_us_date(launch_month, launch_day, year)

        landing_day = mission_data[4]
        if landing_day is not None:
            # A header without an end day (handles Columbia crash :-( ) keeps
            # mission_end at its None default.
            landing_month = mission_data[3]
            if landing_month is None:
                # No explicit end month: the mission ended in its launch month.
                landing_month = launch_month
            self.mission_end = as_us_date(landing_month, landing_day, year)

        remarks = mission_data[6]
        if remarks is not None:
            self.notes = remarks.strip()

In [3]:
def try_parsing_date(text):
    '''Parse a month/day/year string into a datetime.

    The four-digit-year layout is attempted first, then the two-digit-year
    layout; the first one that parses wins.

    Raises:
        ValueError: if text matches neither layout.
    '''
    parsed = None
    for layout in ('%m/%d/%Y', '%m/%d/%y'):
        try:
            parsed = datetime.strptime(text, layout)
        except ValueError:
            # This layout does not fit; fall through to the next one.
            continue
        else:
            break
    if parsed is None:
        raise ValueError('no valid date format found')
    return parsed

# strptime pattern used by Mission.__init__, which joins a month name, a day
# number and a year with single spaces before parsing (e.g. "April 12 1981").
MISSION_DATE_FORMAT = '%B %d %Y'

In [4]:
def getText(filename):
    '''Return the text of each paragraph in a Word document.

    Args:
        filename: path handed to docx.Document.

    Returns:
        A list of strings, one per entry of the document's paragraphs,
        in document order.
    '''
    document = docx.Document(filename)
    return [paragraph.text for paragraph in document.paragraphs]

In [5]:
# Load the paragraphs of the source Word document as a list of strings; the
# later cells scan this list and refer to paragraphs by their position in it.
full_text = getText('./original data/SPACE SHUTTLE.docx')

In [6]:
# Matches a mission header line. Mission.__init__ reads the seven capture
# groups by position:
#   0: designation - "STS-" + digits, optionally followed by "-" and/or one
#      word character
#   1: launch month (a word)
#   2: launch day (digits)
#   3: month of the end date plus one trailing whitespace character
#      (None when the header has no separate end month)
#   4: day of the end date (None when the header has no end day)
#   5: year (digits following ", ")
#   6: any text after the year, starting with whitespace (None when absent);
#      stored as the mission notes
mission_detail_regex = re.compile(r'''(STS-\d+-?\w?)\s*(\w+)\s(\d+)-?(\w+\s)?(\d+)?,\s(\d+)(\s.*)?''')

In [7]:
# Scan every paragraph for a mission header and build one Mission per match,
# remembering the paragraph position of the header in `first_line`.
mission_indices = []  # not used in this cell; kept in case another cell reads it
missions_list = []
for line_number, paragraph in enumerate(full_text):
    mission_regex = mission_detail_regex.search(paragraph.strip())
    if mission_regex is None:
        # Not a mission header line.
        continue
    mission_data = mission_regex.groups()
    mission = Mission(mission_data)
    # Use the position we are actually at. full_text.index(paragraph) would
    # return the first paragraph with the same text, which is the wrong line
    # whenever a header's text occurs more than once, and it rescans the
    # list for every match.
    mission.first_line = line_number
    missions_list.append(mission)

In [8]:
# For each mission, scan the paragraphs between its header and the next
# mission's header (or the end of the document) and record every line that
# contains a d/d/d date as {'date': MM/DD/YYYY, 'data': rest of the line}.

# Compiled once instead of once per scanned line.
song_data_regex = re.compile(r'(\d+/\d+/\d+)\s*(.*)')

for index, mission in enumerate(missions_list):
    first_song_line = mission.first_line + 1
    if index + 1 < len(missions_list):
        # Stop just before the next mission's header line.
        last_song_line = missions_list[index + 1].first_line
    else:
        # range() excludes its upper bound, so the bound must be
        # len(full_text) for the final paragraph of the document to be
        # scanned as part of the last mission.
        last_song_line = len(full_text)

    # Start from an empty list so re-running this cell does not append the
    # same entries a second time.
    mission.days = []
    for line in range(first_song_line, last_song_line):    #gets all the lines in the current mission
        text = full_text[line]
        song_regex = song_data_regex.search(text)
        if song_regex is not None:
            mission.days.append({
                'date': try_parsing_date(song_regex.group(1)).strftime('%m/%d/%Y'),
                'data': song_regex.group(2)
            })

In [9]:
# Export one tab-separated row per recorded day: mission name, date, raw text.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's default codec (the text comes from a Word document and may
# contain non-ASCII characters). newline='' is what the csv module requires
# for files it writes to.
with open('./formatted data/raw_song_data.tsv', 'w', newline='', encoding='utf-8') as outputFile:
    outputWriter = csv.writer(outputFile, delimiter='\t')
    outputWriter.writerow(['mission name', 'date','data'])
    outputWriter.writerows(
        [mission.name, day['date'], day['data']]
        for mission in missions_list
        for day in mission.days
    )

In [11]:
def _mission_to_record(mission):
    '''Build the JSON-serialisable dict written for one mission.'''
    daily_entries = []
    for day in mission.days:
        # Each day becomes a single-key mapping of date -> raw text.
        daily_entries.append({day['date']: day['data']})
    return {
        'name': mission.name,
        'start date': mission.mission_start,
        'end date': mission.mission_end,
        'notes': mission.notes,
        'days': daily_entries,
    }


# Dump every mission, with its per-day entries, as an indented JSON array.
with open('./formatted data/word_doc_data.json','w') as outputJson:
    mission_records = list(map(_mission_to_record, missions_list))
    json.dump(mission_records, outputJson, indent=4)