In [None]:
import pandas as pd
import numpy as np
import ast
import sys
import datetime as dt
from datetime import timedelta

In [None]:
# Load the input CSV from standard input into the working DataFrame.
# Background: https://stackoverflow.com/questions/18495846/pandas-data-from-stdin.
# To read from a file instead, swap the line below for e.g.
# df = pd.read_csv('sample-with-broken-utf8.csv', encoding='iso-8859-1').
df = pd.read_csv(filepath_or_buffer=sys.stdin)

In [None]:
# Parse the Timestamp column into pandas datetime values.
# See https://www.marsja.se/pandas-convert-column-to-datetime/.
# NOTE(review): a later cell calls tz_localize on this column, which presumably
# requires timezone-naive values -- confirm the input carries no UTC offsets.
timestamps_parsed = pd.to_datetime(df['Timestamp'])
df['Timestamp'] = timestamps_parsed

In [None]:
# Attach the US/Pacific timezone to the parsed timestamps.
# See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.tz_localize.html.
# To list other timezone names: pip install pytz, import pytz, then inspect
# pytz.common_timezones.
timestamps_pacific = df['Timestamp'].dt.tz_localize(tz='US/Pacific')
df['Timestamp'] = timestamps_pacific

In [None]:
# Re-express the timezone-aware timestamps in the US/Eastern timezone.
timestamps_eastern = df['Timestamp'].dt.tz_convert(tz='US/Eastern')
df['Timestamp'] = timestamps_eastern

If the DST transition produces nonexistent local times, `tz_localize` raises an error by default. To shift those times forward or backward instead, pass `nonexistent='shift_forward'`, `nonexistent='shift_backward'`, or a timedelta object to `tz_localize` (see https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.tz_localize.html).



In [None]:
# Values in a column can end up stored as different Python types; cast every
# non-missing Address value to str and store the result back on the frame
# (previously the converted Series was computed and then discarded).
# The built-in str replaces the np.unicode alias, which was only ever an alias
# for str and has been removed from recent NumPy releases.
# Missing values are kept as-is so they are not turned into the text 'nan'.
address = df['Address']
df['Address'] = address.where(address.isna(), address.astype(str))

In [None]:
# Create an AddressValidator column that is True where the Address value is a
# text (str) object and False otherwise (e.g. for missing values).
# The previous check cast the column to text first and then tested truthiness,
# so it flagged non-empty strings rather than testing the value's type.
df['AddressValidator'] = df['Address'].map(lambda value: isinstance(value, str))

In [None]:
# Store the AddressValidator flags as text so they can be matched with
# string methods.
validator_as_text = df.AddressValidator.astype(str)
df['AddressValidator'] = validator_as_text

In [None]:
# Where AddressValidator contains 'True', pass the Address value through.
# A single .loc assignment replaces the chained df[col][mask] = ... form,
# which pandas may apply to a temporary copy instead of the frame itself.
# Note: the selected rows are assigned their own current values, so this
# step leaves Address unchanged.
address_is_valid = df['AddressValidator'].str.contains('True')
df.loc[address_is_valid, 'Address'] = df.loc[address_is_valid, 'Address']

In [None]:
# Cast ZIP to text so that its character length can be measured in a later cell.
# (pandas stores Python strings with the object dtype.)
# NOTE(review): if ZIP was parsed as a number on load, any leading zeros were
# presumably lost before this cast -- confirm against the input data.
# TODO: Investigate how to refactor code further.
zip_as_text = df['ZIP'].astype(str)
df['ZIP'] = zip_as_text

In [None]:
# Record the number of characters in each ZIP value in a new ZipLength column.
zip_char_counts = df['ZIP'].str.len()
df['ZipLength'] = zip_char_counts

In [None]:
# Store ZipLength with the generic object dtype ahead of the length check.
zip_length_as_object = df['ZipLength'].astype('object')
df['ZipLength'] = zip_length_as_object

In [None]:
# Flag each row whose ZIP is exactly VALID_ZIP_LENGTH characters long:
# ZipLength becomes True for a valid length and False otherwise.
# A direct element-wise comparison replaces the helper array valid_len_arr,
# which was never created and therefore raised a NameError on a fresh kernel.
VALID_ZIP_LENGTH = 5
df['ZipLength'] = df['ZipLength'] == VALID_ZIP_LENGTH

In [None]:
# Store the ZipLength flags as text so they can be matched with string methods.
zip_flag_as_text = df.ZipLength.astype(str)
df['ZipLength'] = zip_flag_as_text

In [None]:
# Where ZipLength contains the word "False", set that row's ZIP value to '0'.
# A single .loc assignment replaces the chained df[col][mask] = ... form,
# which pandas may apply to a temporary copy instead of the frame itself.
# see https://stackoverflow.com/questions/23400743/pandas-modify-column-values-in-place-based-on-boolean-array
zip_length_invalid = df['ZipLength'].str.contains('False')
df.loc[zip_length_invalid, 'ZIP'] = '0'

In [None]:
# Begin converting the FullName column: cast every value to text (str).
# The built-in str replaces the np.unicode alias, which was only ever an alias
# for str and has been removed from recent NumPy releases.
# Note: the column's dtype is still reported as object after the cast, because
# that is how pandas stores Python strings by default.
# Note: a Series has no .decode/.encode methods of its own (hence the earlier
# "AttributeError: 'Series' object has no attribute 'decode'"); the
# element-wise equivalents live under the .str accessor.
# NOTE(review): astype(str) also turns missing values into the text 'nan' --
# confirm that this is acceptable for FullName.

df['FullName'] = df['FullName'].astype(str)

In [None]:
# Upper-case every FullName value.
full_names_upper = df['FullName'].str.upper()
df['FullName'] = full_names_upper

In [None]:
# Add a NameTest column holding the same Chinese sample text in every row,
# used to test how special characters behave.
# The sample is the Chinese equivalent of "Mary".
name_test_sample = '瑪麗'
df['NameTest'] = name_test_sample

In [None]:
# Apply str.upper() to the sample text to see whether Asian characters are
# affected by it.
name_test_upper = df['NameTest'].str.upper()
df['NameTest'] = name_test_upper

In [None]:
# Start conversion for FooDuration.
# Parse the column as timedeltas, then express each one as a float number of
# seconds. The vectorized .dt.total_seconds() accessor replaces the previous
# per-element apply(lambda ...).
# See https://stackoverflow.com/questions/40485246/pandas-convert-hhmmss-f-string-to-seconds-caveat-hh-sometimes-goes-over
df['FooDuration'] = pd.to_timedelta(df['FooDuration']).dt.total_seconds()

In [None]:
# Start conversion for BarDuration.
# Parse the column as timedeltas, then express each one as a float number of
# seconds. The vectorized .dt.total_seconds() accessor replaces the previous
# per-element apply(lambda ...).
df['BarDuration'] = pd.to_timedelta(df['BarDuration']).dt.total_seconds()

In [None]:
# TotalDuration is the row-wise sum of FooDuration and BarDuration.
df['TotalDuration'] = df['FooDuration'].add(df['BarDuration'])

In [None]:
# Display the three duration columns side by side.
duration_columns = ['FooDuration', 'BarDuration', 'TotalDuration']
df[duration_columns]

In [None]:
# Encode each Notes value as a UTF-8 bytes string.
notes_as_bytes = df['Notes'].str.encode('utf-8')
df['Notes'] = notes_as_bytes

In [None]:
# Replace invalid unicode characters with the unicode replacement character.
# Each bytes value in Notes is decoded as UTF-8 with errors='replace', which
# substitutes U+FFFD for any byte sequence that is not valid UTF-8.
# Values that are not bytes (e.g. missing values) are left untouched.
# The previous Series.replace('U+FFFD', 'replace') call only matched cells whose
# entire value was the literal text 'U+FFFD', so it never substituted anything
# inside a note.
df["Notes"] = df["Notes"].map(
    lambda note: note.decode('utf-8', errors='replace') if isinstance(note, bytes) else note
)

In [None]:
# Remove the NameTest and ZipLength columns from the dataframe.
# DataFrame.drop returns a new frame rather than modifying df in place, so the
# result has to be assigned back for the columns to actually be removed.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html
df = df.drop(columns=['NameTest', 'ZipLength'])

In [None]:
# Export the dataframe to a new csv named "output.csv".
# index=False keeps the positional row index out of the file; without it,
# to_csv writes the index as an extra, unnamed first column.
# See https://towardsdatascience.com/how-to-export-pandas-dataframe-to-csv-2038e43d9c03.
df.to_csv('output.csv', index=False)