In [16]:
import pandas as pd

import dateutil
from dateutil import parser
#https://dateutil.readthedocs.io/en/stable/

import datetime
# https://docs.python.org/3/library/datetime.html
# http://strftime.org
# https://howchoo.com/g/ywi5m2vkodk/working-with-datetime-objects-and-timezones-in-python
# Includes how to replace fields with datetime.replace(year=?): https://pymotw.com/2/datetime/

# Parse a free-form timestamp, then show it both in its default str() form
# and rendered with the ISO-8601 style pattern used throughout this notebook.
dt = parser.parse("Aug 28 1999 12:00AM")
for rendered in (dt, dt.strftime("%Y-%m-%dT%H:%M:%SZ")):
    print(rendered)

1999-08-28 00:00:00
1999-08-28T00:00:00Z


In [42]:
# Demo: a DataFrame column holding a parsed datetime can be cast to plain strings.
parsed_stamp = parser.parse("Aug 28 1999 12:00AM")
pd.DataFrame({"datetime": [parsed_stamp]}).astype(str)

Unnamed: 0,datetime
0,1999-08-28


In [11]:
# https://stackoverflow.com/questions/466345/converting-string-into-datetime
# https://docs.python.org/3/library/datetime.html#datetime.datetime.strptime
def fix_datetime_UTC(data_df, date_columns=None, date_pattern="%Y-%m-%dT%H:%M:%SZ"):
    """
    Parse free-text date columns and return them as uniformly formatted strings.

    The string forms of the selected columns are joined with single spaces
    (in the order given) and each joined value is handed to dateutil's parser.
    Components the parser cannot find in the text are taken from
    1900-01-01 00:00:00 rather than from the current date.

    Desired datetime format: 2017-12-08T15:16:03Z
    Corresponding date_pattern for strftime: %Y-%m-%dT%H:%M:%SZ
    Note: the trailing "Z" in the default pattern is a literal character;
    no timezone conversion is performed here.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame holding the date columns.
    date_columns : str or list/tuple of str
        One column name, or several column names whose values are concatenated
        in the given order before parsing. At least one column is required.
    date_pattern : str
        strftime pattern applied to every parsed value.

    Returns
    -------
    pandas.Series
        One formatted string per row of data_df.

    Raises
    ------
    TypeError
        If date_columns is neither a string nor a list/tuple.
    ValueError
        If date_columns is empty, or if the parser rejects a value
        (for example "13 1" read as month 13).
    """
    # Imported here so the function does not rely on an import cell having
    # been executed first (the original docstring listed this as a dependency).
    from dateutil import parser

    if date_columns is None:
        date_columns = []
    if isinstance(date_columns, str):
        date_columns = [date_columns]
    elif isinstance(date_columns, (list, tuple)):
        date_columns = list(date_columns)
    else:
        raise TypeError("date_columns must be a column name or a list of column names")
    if not date_columns:
        raise ValueError("date_columns must name at least one column")

    date_expression = data_df[date_columns[0]].astype(str)
    for col in date_columns[1:]:
        date_expression = date_expression + " " + data_df[col].astype(str)

    # Provide a fixed default so missing components don't fall back to the
    # current date. Parsed once here instead of once per row.
    default_date = parser.parse("January 1 1900 00:00:00")
    date_col = date_expression.apply(
        lambda date: parser.parse(date, default=default_date).strftime(date_pattern)
    )

    return date_col

In [88]:
def structure_dttm_from_parts(row, dttm_elems, dttm_pattern):
    """
    Build a datetime from separate components of one row and format it.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Indexed with the column names stored in dttm_elems.
    dttm_elems : dict
        Maps component keys to column names.
        Required keys: "year_col", "month_col", "day_col".
        Optional keys: "hour_col", "min_col", "sec_col", "milli_col",
        "micro_col", "tzinfo_col" ("tz_col" is accepted as an alias).
        Numeric components are converted with int(); the timezone column is
        expected to hold a zone name such as "Europe/Paris".
    dttm_pattern : str
        strftime pattern used for the returned string.

    Returns
    -------
    str
        The formatted datetime.

    Raises
    ------
    KeyError
        If a required key is missing from dttm_elems.
    ValueError
        If a component is out of range for datetime, cannot be converted to
        int, or the timezone name is not recognised.
    """
    def component(key):
        # Every numeric component is read the same way: look up the column
        # name for `key`, pull that value from the row, coerce to int.
        return int(row[dttm_elems[key]])

    dt = datetime.datetime(year=component("year_col"),
                           month=component("month_col"),
                           day=component("day_col"))

    for key, field in (("hour_col", "hour"), ("min_col", "minute"), ("sec_col", "second")):
        if key in dttm_elems:
            dt = dt.replace(**{field: component(key)})

    # datetime only has a `microsecond` field (singular, no milliseconds),
    # so milliseconds are folded into it.
    if "milli_col" in dttm_elems or "micro_col" in dttm_elems:
        microsecond = 0
        if "milli_col" in dttm_elems:
            microsecond += component("milli_col") * 1000
        if "micro_col" in dttm_elems:
            microsecond += component("micro_col")
        dt = dt.replace(microsecond=microsecond)

    tz_key = next((key for key in ("tzinfo_col", "tz_col") if key in dttm_elems), None)
    if tz_key is not None:
        # dateutil is already a dependency of this notebook; imported locally
        # so the common no-timezone path needs nothing beyond the stdlib.
        from dateutil import tz
        tz_name = row[dttm_elems[tz_key]]
        zone = tz.gettz(tz_name)
        if zone is None:
            raise ValueError("Unknown timezone: {!r}".format(tz_name))
        # Attaches the zone to the wall-clock value; it does NOT convert the
        # time to UTC, so a literal "Z" in dttm_pattern is not adjusted.
        dt = dt.replace(tzinfo=zone)

    return dt.strftime(dttm_pattern)

def fix_datetime_UTC(data_df, dttm_elems_in_sep_columns=True, 
                     dttm_elems=None,
                     dttm_col=None, 
                     dttm_pattern="%Y-%m-%dT%H:%M:%SZ"):
    """
    Return one formatted datetime string per row of data_df.

    Desired datetime format: 2017-12-08T15:16:03Z
    Corresponding dttm_pattern for strftime: %Y-%m-%dT%H:%M:%SZ

    Two mutually exclusive modes:

    * dttm_elems_in_sep_columns=True (default): the datetime components live
      in separate columns. dttm_elems maps component keys to column names and
      is forwarded to structure_dttm_from_parts. Recognised keys:
        - year_col, month_col, day_col: when a key is absent, or the column it
          names is not in data_df, the component is filled with 1990 / 1 / 1.
        - hour_col, min_col, sec_col, milli_col, micro_col, tzinfo_col:
          optional, read by structure_dttm_from_parts.
      dttm_col must be None in this mode.
    * dttm_elems_in_sep_columns=False: dttm_col names a single column of
      date strings, parsed with dateutil's parser; components missing from
      the text come from 1900-01-01 00:00:00 instead of the current date.

    NOTE(review): the fallback year differs between the two modes (1990 vs
    1900) -- confirm which one is intended.
    NOTE(review): an earlier cell defines a function with the same name but a
    different signature (date_columns=...); running this cell replaces it.

    Returns
    -------
    pandas.Series of str

    Raises
    ------
    TypeError
        If dttm_elems is not a dict (component mode).
    ValueError
        If dttm_col is given in component mode or missing in free-text mode.
    """
    # Mutually exclusive to provide broken down datetime factors, 
    # and either a date, time, or datetime object
    if dttm_elems_in_sep_columns:
        if dttm_elems is None:
            dttm_elems = {}
        if not isinstance(dttm_elems, dict):
            raise TypeError("dttm_elems must be a dict mapping component keys to column names")
        if dttm_col is not None:
            raise ValueError("dttm_col must be None when dttm_elems_in_sep_columns=True")

        # Work on copies so neither the caller's dict nor the caller's frame
        # is modified by the placeholder keys/columns added below.
        elems = dict(dttm_elems)
        parts_df = data_df.copy()

        fallbacks = (("year_col", "year_tmp", 1990),
                     ("month_col", "month_tmp", 1),
                     ("day_col", "day_tmp", 1))
        for key, placeholder_col, fill_value in fallbacks:
            col = elems.setdefault(key, placeholder_col)
            if col not in parts_df.columns:
                parts_df[col] = fill_value

        return parts_df.apply(lambda row: structure_dttm_from_parts(row, elems, dttm_pattern), axis=1)

    if dttm_col is None:
        raise ValueError("dttm_col must name a column when dttm_elems_in_sep_columns=False")

    # Imported here so this branch does not rely on an import cell having been
    # executed first; the component branch above never needs it.
    from dateutil import parser

    # Need to provide the default parameter to parser.parse so that missing entries don't default to current date
    default_date = parser.parse("January 1 1900 00:00:00")
    return data_df.apply(lambda row: parser.parse(row[dttm_col], default=default_date).strftime(dttm_pattern), axis=1)

In [67]:
# A zero-padded month string is fine once cast to int, and any time fields
# that are left out default to midnight.
month_number = int("02")
print(datetime.datetime(1990, month_number, 10, 2, 10, 10))
dt = datetime.datetime(1990, month_number, 10)
print(dt)

1990-02-10 02:10:10
1990-02-10 00:00:00


In [77]:
# Small sample frame with one column per date component.
df = pd.DataFrame(dict(Year=[1990, 1991, 1992],
                       Month=[1, 2, 3],
                       Day=[12, 10, 4]))
df

Unnamed: 0,Day,Month,Year
0,12,1,1990
1,10,2,1991
2,4,3,1992


In [90]:
# When can this go wrong?
# Call the component-based function with only some of year/month/day supplied,
# then with all three, and print each result.
for component_spec in ({"year_col": "Year"},
                       {"month_col": "Month"},
                       {"day_col": "Day"},
                       {"year_col": "Year", "month_col": "Month", "day_col": "Day"}):
    print(fix_datetime_UTC(df, dttm_elems=component_spec))

0    1990-01-01T00:00:00Z
1    1991-01-01T00:00:00Z
2    1992-01-01T00:00:00Z
dtype: object
0    1990-01-01T00:00:00Z
1    1990-02-01T00:00:00Z
2    1990-03-01T00:00:00Z
dtype: object
0    1990-01-12T00:00:00Z
1    1990-01-10T00:00:00Z
2    1990-01-04T00:00:00Z
dtype: object
0    1990-01-12T00:00:00Z
1    1991-02-10T00:00:00Z
2    1992-03-04T00:00:00Z
dtype: object


In [31]:
# When can this go wrong?
# Feed the free-text parser different column subsets and orderings and print
# how each joined string gets interpreted.
for column_combo in (["Year", "Month", "Day"],
                     ["Month", "Day"],
                     ["Day", "Month"],
                     ["Year", "Day"],
                     ["Year", "Month"],
                     ["Day", "Year"]):
    print(fix_datetime_UTC(df, date_columns=column_combo))

0    1990-01-12T00:00:00Z
1    1991-02-10T00:00:00Z
2    1992-03-04T00:00:00Z
dtype: object
0    1900-01-12T00:00:00Z
1    1900-02-10T00:00:00Z
2    1900-03-04T00:00:00Z
dtype: object
0    1900-12-01T00:00:00Z
1    1900-10-02T00:00:00Z
2    1900-04-03T00:00:00Z
dtype: object
0    1990-12-01T00:00:00Z
1    1991-10-01T00:00:00Z
2    1992-04-01T00:00:00Z
dtype: object
0    1990-01-01T00:00:00Z
1    1991-02-01T00:00:00Z
2    1992-03-01T00:00:00Z
dtype: object
0    1990-12-01T00:00:00Z
1    1991-10-01T00:00:00Z
2    1992-04-01T00:00:00Z
dtype: object


In [35]:
# Same frame as before except the first day is 13. With "Month Day" ordering the
# values parse fine; with "Day Month" ordering the parser reads the leading 13
# as a month and rejects it.
df = pd.DataFrame({"Year":[1990, 1991, 1992], "Month":[1,2,3], "Day":[13,10,4]})
print(fix_datetime_UTC(df, date_columns=["Month", "Day"]))
try:
    print(fix_datetime_UTC(df, date_columns=["Day", "Month"]))
except ValueError as err:
    # The failure is the point of this demo: show it without aborting a full
    # top-to-bottom run of the notebook.
    print("{}: {}".format(type(err).__name__, err))

0    1900-01-13T00:00:00Z
1    1900-02-10T00:00:00Z
2    1900-03-04T00:00:00Z
dtype: object


ValueError: month must be in 1..12