## Prepare

In [1]:
import pandas as pd

import configparser
config = configparser.ConfigParser()

config.read('aws.cfg')
AWS_ACCESS_KEY = config['AWS']['aws_access_key']
AWS_SECRET_KEY = config['AWS']['aws_secret_key']

## Extract Data

In [10]:
# Read the CSV file 
source= 's3://techcatalyst-raw/SalesRaw/sales_data.csv'
df = pd.read_csv(source, storage_options={
                   'key' : AWS_ACCESS_KEY,
                   'secret' : AWS_SECRET_KEY
               })

In [11]:
df.head()

Unnamed: 0,DateTime,Time Zone,Sales Amount,Country,Sales Person
0,2024-06-15 05:56:00,Australia/Sydney,,Australia,David
1,2024-09-30 05:36:00,Europe/Berlin,23882.74,Germany,Alice
2,2024-02-01 15:04:00,Australia/Sydney,,Australia,Eve
3,2024-04-24 05:12:00,Australia/Sydney,20945.38,Australia,Alice
4,2024-07-11 00:33:00,Europe/Berlin,,Germany,David


In [12]:
df.describe(include='all')

Unnamed: 0,DateTime,Time Zone,Sales Amount,Country,Sales Person
count,10000,10000,5951,10000,10000
unique,9913,5,3905,5,5
top,2024-01-26 13:39:00,UTC,?,United Kingdom,Eve
freq,3,2046,2025,2046,2041


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   DateTime      10000 non-null  object
 1   Time Zone     10000 non-null  object
 2   Sales Amount  5951 non-null   object
 3   Country       10000 non-null  object
 4   Sales Person  10000 non-null  object
dtypes: object(5)
memory usage: 390.8+ KB


## Extract and Transform

In [14]:
# Read the file agian, but now by better identifying the potential NA values, also Parse the DateTime. Do it in one step while reading the file
df = pd.read_csv(source, storage_options={
                   'key' : AWS_ACCESS_KEY,
                   'secret' : AWS_SECRET_KEY
               }, parse_dates=['DateTime'], na_values=['?', 'None', ''])

In [15]:
df.head()

Unnamed: 0,DateTime,Time Zone,Sales Amount,Country,Sales Person
0,2024-06-15 05:56:00,Australia/Sydney,,Australia,David
1,2024-09-30 05:36:00,Europe/Berlin,23882.74,Germany,Alice
2,2024-02-01 15:04:00,Australia/Sydney,,Australia,Eve
3,2024-04-24 05:12:00,Australia/Sydney,20945.38,Australia,Alice
4,2024-07-11 00:33:00,Europe/Berlin,,Germany,David


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   DateTime      10000 non-null  datetime64[ns]
 1   Time Zone     10000 non-null  object        
 2   Sales Amount  3926 non-null   float64       
 3   Country       10000 non-null  object        
 4   Sales Person  10000 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 390.8+ KB


In [17]:
df.describe()

Unnamed: 0,DateTime,Sales Amount
count,10000,3926.0
mean,2024-07-01 02:47:21.606000128,15572.928839
min,2024-01-01 00:01:00,10.38
25%,2024-03-31 14:52:45,514.525
50%,2024-06-30 13:37:00,10891.17
75%,2024-09-30 05:46:45,30655.2175
max,2024-12-30 21:37:00,49996.82
std,,16922.555911


In [86]:
# Remove outlier values in Sales. Later you will learn how to statistically determine outliers. For now, an SME told you the threshold is 37,000
threshold = 37000

# now replace the outlier values with pd.NA
#df.loc[df['Sales Amount'] > threshold, 'Sales Amount'] = pd.NA
df['Sales Amount'] = df['Sales Amount'].apply(lambda x: pd.NA if x > threshold else x)
df
# Your code

Unnamed: 0,DateTime,Time Zone,Sales Amount,Country,Sales Person,DateTime_Localized,DateTime_UTC
0,2024-06-15 05:56:00,Australia/Sydney,9829.40,Australia,David,2024-06-15 05:56:00+10:00,2024-06-14 19:56:00+00:00
1,2024-09-30 05:36:00,Europe/Berlin,23882.74,Germany,Alice,2024-09-30 05:36:00+02:00,2024-09-30 03:36:00+00:00
2,2024-02-01 15:04:00,Australia/Sydney,9829.40,Australia,Eve,2024-02-01 15:04:00+11:00,2024-02-01 04:04:00+00:00
3,2024-04-24 05:12:00,Australia/Sydney,20945.38,Australia,Alice,2024-04-24 05:12:00+10:00,2024-04-23 19:12:00+00:00
4,2024-07-11 00:33:00,Europe/Berlin,9829.40,Germany,David,2024-07-11 00:33:00+02:00,2024-07-10 22:33:00+00:00
...,...,...,...,...,...,...,...
9995,2024-11-18 14:35:00,Europe/Berlin,9829.40,Germany,Bob,2024-11-18 14:35:00+01:00,2024-11-18 13:35:00+00:00
9996,2024-05-04 10:13:00,Asia/Tokyo,287.35,Japan,Bob,2024-05-04 10:13:00+09:00,2024-05-04 01:13:00+00:00
9997,2024-03-07 03:51:00,Australia/Sydney,9829.40,Australia,Bob,2024-03-07 03:51:00+11:00,2024-03-06 16:51:00+00:00
9998,2024-10-26 04:30:00,Australia/Sydney,175.50,Australia,Eve,2024-10-26 04:30:00+11:00,2024-10-25 17:30:00+00:00


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   DateTime      10000 non-null  datetime64[ns]
 1   Time Zone     10000 non-null  object        
 2   Sales Amount  3252 non-null   object        
 3   Country       10000 non-null  object        
 4   Sales Person  10000 non-null  object        
dtypes: datetime64[ns](1), object(4)
memory usage: 390.8+ KB


In [28]:
# Replace all null values with the overall mean 
average = df['Sales Amount'].mean()
average

9829.400738007385

In [31]:
# Now fill all NA values with that mean
# Your code here
df['Sales Amount'].fillna(df['Sales Amount'].mean(), inplace=True)

In [32]:
df.head()

Unnamed: 0,DateTime,Time Zone,Sales Amount,Country,Sales Person
0,2024-06-15 05:56:00,Australia/Sydney,9829.400738,Australia,David
1,2024-09-30 05:36:00,Europe/Berlin,23882.74,Germany,Alice
2,2024-02-01 15:04:00,Australia/Sydney,9829.400738,Australia,Eve
3,2024-04-24 05:12:00,Australia/Sydney,20945.38,Australia,Alice
4,2024-07-11 00:33:00,Europe/Berlin,9829.400738,Germany,David


In [34]:
# You will need to round the Sale AMount now to 2 decimal places 

# Your code here 
df['Sales Amount'] = df['Sales Amount'].round(2)
df.head()

Unnamed: 0,DateTime,Time Zone,Sales Amount,Country,Sales Person
0,2024-06-15 05:56:00,Australia/Sydney,9829.4,Australia,David
1,2024-09-30 05:36:00,Europe/Berlin,23882.74,Germany,Alice
2,2024-02-01 15:04:00,Australia/Sydney,9829.4,Australia,Eve
3,2024-04-24 05:12:00,Australia/Sydney,20945.38,Australia,Alice
4,2024-07-11 00:33:00,Europe/Berlin,9829.4,Germany,David


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   DateTime      10000 non-null  datetime64[ns]
 1   Time Zone     10000 non-null  object        
 2   Sales Amount  10000 non-null  float64       
 3   Country       10000 non-null  object        
 4   Sales Person  10000 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 390.8+ KB


In [36]:
df.describe()

Unnamed: 0,DateTime,Sales Amount
count,10000,10000.0
mean,2024-07-01 02:47:21.606000128,9829.40024
min,2024-01-01 00:01:00,10.38
25%,2024-03-31 14:52:45,9829.4
50%,2024-06-30 13:37:00,9829.4
75%,2024-09-30 05:46:45,9829.4
max,2024-12-30 21:37:00,36959.15
std,,7001.282356


In [37]:
df['Time Zone'].unique()

array(['Australia/Sydney', 'Europe/Berlin', 'Asia/Tokyo', 'UTC',
       'US/Eastern'], dtype=object)

In [42]:
# you will need to drop two rows have will cause an issue. They have been identified as index 9723 and index 9373

# df = df.drop(index=[9723, 9373])

In [65]:
# Localize each DateTime with the corresponding Time Zone.
# Hint, you can use df.apply and Lambda function
import datetime as dt
df['DateTime_Localized'] = df.apply(lambda row: row['DateTime'].tz_localize(row['Time Zone']), axis=1)
df.head()

Unnamed: 0,DateTime,Time Zone,Sales Amount,Country,Sales Person,DateTime_Localized,DateTime_UTC
0,2024-06-15 05:56:00,Australia/Sydney,9829.4,Australia,David,2024-06-15 05:56:00+10:00,2024-06-15 05:56:00+00:00
1,2024-09-30 05:36:00,Europe/Berlin,23882.74,Germany,Alice,2024-09-30 05:36:00+02:00,2024-09-30 05:36:00+00:00
2,2024-02-01 15:04:00,Australia/Sydney,9829.4,Australia,Eve,2024-02-01 15:04:00+11:00,2024-02-01 15:04:00+00:00
3,2024-04-24 05:12:00,Australia/Sydney,20945.38,Australia,Alice,2024-04-24 05:12:00+10:00,2024-04-24 05:12:00+00:00
4,2024-07-11 00:33:00,Europe/Berlin,9829.4,Germany,David,2024-07-11 00:33:00+02:00,2024-07-11 00:33:00+00:00


### Testing/Validation

In [66]:
df['DateTime'][0]

Timestamp('2024-06-15 05:56:00')

In [67]:
df['DateTime_Localized'][0]

Timestamp('2024-06-15 05:56:00+1000', tz='Australia/Sydney')

In [69]:
df['DateTime'][1]

Timestamp('2024-09-30 05:36:00')

In [70]:
df['DateTime_Localized'][1]

Timestamp('2024-09-30 05:36:00+0200', tz='Europe/Berlin')

In [71]:
# Now, that you have localized, convert each DateTime_Localized to UTC Format
# Hint, think about using df.apply and a lambda function

df['DateTime_UTC'] = df.apply(lambda row: row['DateTime_Localized'].tz_convert('UTC'), axis=1)
df.head()

Unnamed: 0,DateTime,Time Zone,Sales Amount,Country,Sales Person,DateTime_Localized,DateTime_UTC
0,2024-06-15 05:56:00,Australia/Sydney,9829.4,Australia,David,2024-06-15 05:56:00+10:00,2024-06-14 19:56:00+00:00
1,2024-09-30 05:36:00,Europe/Berlin,23882.74,Germany,Alice,2024-09-30 05:36:00+02:00,2024-09-30 03:36:00+00:00
2,2024-02-01 15:04:00,Australia/Sydney,9829.4,Australia,Eve,2024-02-01 15:04:00+11:00,2024-02-01 04:04:00+00:00
3,2024-04-24 05:12:00,Australia/Sydney,20945.38,Australia,Alice,2024-04-24 05:12:00+10:00,2024-04-23 19:12:00+00:00
4,2024-07-11 00:33:00,Europe/Berlin,9829.4,Germany,David,2024-07-11 00:33:00+02:00,2024-07-10 22:33:00+00:00


### Testing/ Validation

In [72]:
df['DateTime_Localized'][0]

Timestamp('2024-06-15 05:56:00+1000', tz='Australia/Sydney')

In [73]:
df['DateTime_UTC'][0]

Timestamp('2024-06-14 19:56:00+0000', tz='UTC')

In [74]:
df['DateTime_Localized'][1]

Timestamp('2024-09-30 05:36:00+0200', tz='Europe/Berlin')

In [75]:
df['DateTime_UTC'][1]

Timestamp('2024-09-30 03:36:00+0000', tz='UTC')

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9998 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   DateTime            9998 non-null   datetime64[ns]     
 1   Time Zone           9998 non-null   object             
 2   Sales Amount        9998 non-null   float64            
 3   Country             9998 non-null   object             
 4   Sales Person        9998 non-null   object             
 5   DateTime_Localized  9998 non-null   object             
 6   DateTime_UTC        9998 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(1), object(4)
memory usage: 882.9+ KB


### Analysis / Validation

In [77]:
SalesByCountry = df.groupby('Country')['Sales Amount'].sum()
SalesByCountry

Country
Australia         18548078.80
Germany           19862156.50
Japan             19638421.38
United Kingdom    19945753.23
United States     20289638.47
Name: Sales Amount, dtype: float64

In [79]:
import matplotlib
SalesByCountry.plot(kind='bar');

ModuleNotFoundError: No module named 'matplotlib'

In [80]:
SalesBySP = df.groupby('Sales Person')['Sales Amount'].sum()
SalesBySP

Sales Person
Alice      20216678.02
Bob        19432012.45
Charlie    18415351.28
David      20147755.70
Eve        20072250.93
Name: Sales Amount, dtype: float64

In [81]:
SalesBySP.plot(kind='bar')

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

## Load

In [85]:
dest = 's3://techcatalyst-transformed/SalesTransformed/peter'
for country, partitioned_df in df.groupby('Country'):
    partitioned_df.to_csv(f'{dest}/{country}.csv', storage_options={
                   'key' : AWS_ACCESS_KEY,
                   'secret' : AWS_SECRET_KEY
               })
