## Pandas Date Time Operations

In [1]:
import os, sys, time
t_nb_start = time.time()
import pandas as pd
import numpy as np
import datetime as dt
!python --version

Python 3.9.12


In [2]:
# Sample frame used throughout the notebook: three timestamps, a label,
# and raw month/day/year date strings.
# Note: despite its name, 'dt_str' holds pandas Timestamps, not strings.
df = pd.DataFrame({
    'dt_str': [
        pd.to_datetime(stamp)
        for stamp in (
            '2019-05-15 13:25:43',
            '2018-03-14 13:25:43',
            '2017-01-13 13:25:43',
        )
    ],
    'label': list('ABC'),
    'date_str': ["1/1/2022", "2/1/2022", "3/31/2022"],
})

In [3]:
# Convert datetime to seconds since the Unix epoch (1970-01-01 00:00:00), as float.
# The elapsed time is divided by a one-second Timedelta instead of reinterpreting
# the raw integer storage and dividing by 10**9: this does not depend on the
# platform's `int` width or on the column being stored at nanosecond resolution.
# Assumes 'dt_str' is timezone-naive, as it is when built in this notebook.
df['dt_int'] = (
    pd.to_datetime(df['dt_str']) - pd.Timestamp("1970-01-01")
) / pd.Timedelta("1s")

In [4]:
# Show the container type, a divider line, then the frame itself.
print(type(df), "-" * 40, sep="\n")
display(df)

<class 'pandas.core.frame.DataFrame'>
----------------------------------------


Unnamed: 0,dt_str,label,date_str,dt_int
0,2019-05-15 13:25:43,A,1/1/2022,1557927000.0
1,2018-03-14 13:25:43,B,2/1/2022,1521034000.0
2,2017-01-13 13:25:43,C,3/31/2022,1484314000.0


In [5]:
# Only digits and the '/' separator may appear in a raw date string.
chars_allowed = set("0123456789/")


def clean_date_str(ss):
    """
    Convert a date string from 'm/d/y' to 'yyyy-mm-dd' format.

    Parameters
    ----------
    ss : str or missing value
        Date written as month/day/year using only digits and '/'.
        Accepted years:
          * 80-99       -> 1980-1999
          * 0-39        -> 2000-2039
          * 2000-2039   -> taken as is
        Any other year (including four-digit years before 2000) is rejected.

    Returns
    -------
    str or float
        The date as 'yyyy-mm-dd', or np.nan when the input is missing
        (None / NaN), is not a string, contains other characters, does not
        have exactly three non-empty '/'-separated parts, has a year outside
        the accepted windows, or is not a real calendar date.
    """
    # Covers None, NaN and any other non-string value in one check.
    if not isinstance(ss, str):
        return np.nan
    if not (set(ss) <= chars_allowed):
        return np.nan

    parts = ss.split("/")
    if len(parts) != 3:
        return np.nan

    try:
        # int('') raises ValueError, so empty components such as "1//2022"
        # are reported as malformed instead of crashing the caller.
        mm, dd, year = (int(part) for part in parts)
    except ValueError:
        return np.nan

    if 80 <= year < 100:
        yyyy = 1900 + year
    elif year < 40:
        yyyy = 2000 + year
    elif 2000 <= year < 2040:
        yyyy = year
    else:
        return np.nan

    try:
        # Constructing the date validates month/day ranges (e.g. rejects 2/30).
        dt.date(yyyy, mm, dd)
    except ValueError:
        return np.nan
    return "%4d-%02d-%02d" % (yyyy, mm, dd)

In [6]:
# Normalise the raw m/d/y strings to "YYYY-MM-DD" and store the result
# in a new column named after the source column with a "2" suffix.
col = "date_str"
df[f"{col}2"] = df[col].map(clean_date_str)
df

Unnamed: 0,dt_str,label,date_str,dt_int,date_str2
0,2019-05-15 13:25:43,A,1/1/2022,1557927000.0,2022-01-01
1,2018-03-14 13:25:43,B,2/1/2022,1521034000.0,2022-02-01
2,2017-01-13 13:25:43,C,3/31/2022,1484314000.0,2022-03-31


In [7]:
# Convert the cleaned "YYYY-MM-DD" strings to datetime64 values.
# The format is known, so it is passed explicitly rather than relying on
# `infer_datetime_format` (deprecated in pandas 2.x). Missing values (np.nan)
# in the source column become NaT. The Series is passed directly so the
# result stays aligned on the frame's index.
col = "date_str2"
df[col+"3"] = pd.to_datetime(df[col], format="%Y-%m-%d")
display(df)
print("-"*60)
display(df.dtypes)

Unnamed: 0,dt_str,label,date_str,dt_int,date_str2,date_str23
0,2019-05-15 13:25:43,A,1/1/2022,1557927000.0,2022-01-01,2022-01-01
1,2018-03-14 13:25:43,B,2/1/2022,1521034000.0,2022-02-01,2022-02-01
2,2017-01-13 13:25:43,C,3/31/2022,1484314000.0,2022-03-31,2022-03-31


------------------------------------------------------------


dt_str        datetime64[ns]
label                 object
date_str              object
dt_int               float64
date_str2             object
date_str23    datetime64[ns]
dtype: object

In [8]:
# Render each datetime64 value as a day-resolution string ("YYYY-MM-DD").
# np.datetime_as_string is applied to the whole array in one call.
mylist = list(np.datetime_as_string(df["date_str23"].to_numpy(), unit="D"))
mylist

['2022-01-01', '2022-02-01', '2022-03-31']

In [9]:
# Earliest and latest dates in the parsed column.
# mmin / mmax are pandas Timestamps; the *_str variants are plain "YYYY-MM-DD" strings.
mmin, mmax = df["date_str23"].min(), df["date_str23"].max()
mmin_str, mmax_str = (ts.date().strftime("%Y-%m-%d") for ts in (mmin, mmax))

display(mmin_str)

display(mmax_str)

'2022-01-01'

'2022-03-31'

In [10]:
# Split the timestamp column into its calendar components via the .dt accessor.
df["day"], df["month"], df["year"] = (
    df["dt_str"].dt.day,
    df["dt_str"].dt.month,
    df["dt_str"].dt.year,
)
df

Unnamed: 0,dt_str,label,date_str,dt_int,date_str2,date_str23,day,month,year
0,2019-05-15 13:25:43,A,1/1/2022,1557927000.0,2022-01-01,2022-01-01,15,5,2019
1,2018-03-14 13:25:43,B,2/1/2022,1521034000.0,2022-02-01,2022-02-01,14,3,2018
2,2017-01-13 13:25:43,C,3/31/2022,1484314000.0,2022-03-31,2022-03-31,13,1,2017


In [11]:
# Derive a monthly Period (displayed like "2001-05") from each timestamp.
df["period_month"] = (
    pd.to_datetime(df["dt_str"])
    .dt
    .to_period("M")
)
df

Unnamed: 0,dt_str,label,date_str,dt_int,date_str2,date_str23,day,month,year,period_month
0,2019-05-15 13:25:43,A,1/1/2022,1557927000.0,2022-01-01,2022-01-01,15,5,2019,2019-05
1,2018-03-14 13:25:43,B,2/1/2022,1521034000.0,2022-02-01,2022-02-01,14,3,2018,2018-03
2,2017-01-13 13:25:43,C,3/31/2022,1484314000.0,2022-03-31,2022-03-31,13,1,2017,2017-01


In [12]:
# Show the dtype of every column in the frame as it stands after the cells above.
df.dtypes

dt_str          datetime64[ns]
label                   object
date_str                object
dt_int                 float64
date_str2               object
date_str23      datetime64[ns]
day                      int64
month                    int64
year                     int64
period_month         period[M]
dtype: object