## Importing modules

In [29]:
import re
import warnings
import datetime
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

## Importing and examining data

- ```user_name``` has unstandardized capitalization
- ```login_time``` has mixed ```datetime``` formats
- ```duration``` needs to be formatted to trim away units and be converted to numerical (float) data

In [32]:
df = pd.read_csv('practice.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          8 non-null      int64 
 1   user_name   8 non-null      object
 2   login_time  8 non-null      object
 3   activity    8 non-null      object
 4   duration    8 non-null      object
dtypes: int64(1), object(4)
memory usage: 452.0+ bytes


Unnamed: 0,id,user_name,login_time,activity,duration
0,1,Alice,2024-02-17 9:05am,Logged in,10 min
1,2,BOB,17-02-2024 09:10 AM,Viewed Report,9.5minutes
2,3,Charlie,2024/02/17 09:15:30,Downloaded File,8mins
3,4,alice,17-02-2024 9:20,Logged out,"7,0 minutes"
4,5,Eve,17-Feb-2024 09:25AM,Uploaded File,6 mins


## Cleaning data

In [35]:
# clean columns that should contain numerical data
def clean_numeric(text:str):
	if pd.isna(text):
		return None
	# once the func is applied the column in the df will be converted to float
	# using replace on flat causes AttributeError
	try:
		text = text.replace(',','.')
	except AttributeError:
		return text # return the data as is, for in this case the data is already converted to float
	num_data = re.search(r"\d+(\.\d+)?", text)
	if num_data:
		return float(num_data.group())

# clean columns that should contain datetime data
def clean_datetime(date_series):
	# use errors='coerce' to return NaT (Not a Time) in case of error
	# dayfirst to make sure that the first number read is intepreted as days
	date_series = pd.to_datetime(date_series, format='mixed', dayfirst=True, errors='coerce')
	# if isna detects None (null val) or NaT and set NaT to None
	if pd.isna(date_series):
		return None
	return date_series

df['login_time'].apply(clean_datetime)
df['user_name']  = df['user_name'].str.title() # add .str to directly modify str data columns
df['duration'] = df['duration'].apply(clean_numeric)

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          8 non-null      int64  
 1   user_name   8 non-null      object 
 2   login_time  8 non-null      object 
 3   activity    8 non-null      object 
 4   duration    8 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 452.0+ bytes


Unnamed: 0,id,user_name,login_time,activity,duration
0,1,Alice,2024-02-17 9:05am,Logged in,10.0
1,2,Bob,17-02-2024 09:10 AM,Viewed Report,9.5
2,3,Charlie,2024/02/17 09:15:30,Downloaded File,8.0
3,4,Alice,17-02-2024 9:20,Logged out,7.0
4,5,Eve,17-Feb-2024 09:25AM,Uploaded File,6.0
