In [1]:
import pandas as pd
import altair as alt

In [2]:
# Load the cleaned transaction export. The CSV carries a leftover
# "Unnamed: 0" column, which is discarded straight after reading.
file_path = "./data/clean_data.csv"
truck_df = pd.read_csv(file_path).drop(columns=["Unnamed: 0"])

# Parse the timestamp strings into datetimes using the explicit format.
truck_df["timestamp"] = pd.to_datetime(
    truck_df["timestamp"],
    format="%Y-%m-%d %H:%M:%S",
)


In [30]:
# Peek at one randomly chosen transaction (no seed is set, so the row
# shown changes on every run).
truck_df.sample(n=1)

Unnamed: 0,timestamp,type,total,truck_id
3209,2024-08-29 13:19:00,card,1.99,3


In [61]:
# GroupBy handle over the transactions, keyed by truck.
transactions_per_truck = truck_df.groupby("truck_id")

# Number of transactions per truck, busiest truck first.
# Only the `total` column is counted, so the table keeps the same two
# columns (truck_id, total) no matter which extra columns have been added
# to `truck_df` by other cells. Note that `total` holds a row count here,
# not a monetary sum.
(
    transactions_per_truck["total"]
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

Unnamed: 0,truck_id,total
0,1,1621
1,5,1393
2,3,1207
3,6,779
4,2,694
5,4,299


In [32]:
# GroupBy handle over the transactions, keyed by truck.
lowest_trans_value = truck_df.groupby("truck_id")

# Total transaction value per truck, lowest-earning truck first.
# The `total` column is selected explicitly: GroupBy.sum() does not take a
# column name, its first positional parameter is `numeric_only`, so the
# previous `.sum('total')` only worked because the string is truthy.
(
    lowest_trans_value["total"]
    .sum()
    .sort_values(ascending=True)
    .reset_index()
)

Unnamed: 0,truck_id,total
0,4,792.01
1,6,4587.21
2,2,5594.2
3,3,7049.93
4,5,7581.8
5,1,12598.4


In [33]:
# Mean transaction value across every truck, rounded to 2 decimal places.
avg_trans_val = round(truck_df["total"].mean(), 2)
print(avg_trans_val)

6.37


In [34]:
# Mean transaction value per truck, rounded to 2 decimal places, as a
# two-column frame (truck_id, total) with a fresh integer index.
avg_val_per_truck = (
    truck_df.groupby("truck_id")["total"]
    .mean()
    .round(2)
    .reset_index()
)

# The frame is already reset and rounded, so it is displayed as-is: a second
# reset_index() would only add a meaningless `index` column, and there is one
# row per truck, so no head(n) cut-off is needed.
avg_val_per_truck

Unnamed: 0,truck_id,total
0,1,7.77
1,2,8.06
2,3,5.84
3,4,2.65
4,5,5.44
5,6,5.89


In [35]:
# Overall split of payment types across all trucks, with each type's share
# of all transactions that have a payment type recorded.
total_payment_methods = truck_df["type"].value_counts().reset_index()
total_payment_methods["Proportion (%)"] = (
    (total_payment_methods["count"] / truck_df["type"].count()) * 100
).round(2)

# Payment-type counts broken down by truck. The percentage uses the same
# denominator as above, i.e. it is each (truck, type) pair's share of ALL
# transactions, not its share within that truck.
payment_methods = (
    truck_df.groupby("truck_id")["type"]
    .value_counts()
    .reset_index()
)
payment_methods["Proportion (%)"] = (
    (payment_methods["count"] / truck_df["type"].count()) * 100
).round(2)

# Show the per-truck table first, then the overall table.
display(payment_methods, total_payment_methods)

Unnamed: 0,truck_id,type,count,Proportion (%)
0,1,card,830,13.85
1,1,cash,791,13.2
2,2,card,599,9.99
3,2,cash,95,1.59
4,3,card,619,10.33
5,3,cash,588,9.81
6,4,card,159,2.65
7,4,cash,140,2.34
8,5,cash,789,13.17
9,5,card,604,10.08


Unnamed: 0,type,count,Proportion (%)
0,cash,3182,53.1
1,card,2811,46.9


In [51]:
# Bar chart of the average transaction value for each truck.
# `avg_val_per_truck` already has `truck_id` and `total` as plain columns,
# so it is passed straight in; an extra reset_index() would only add an
# unused `index` column to the chart's data.
alt.Chart(avg_val_per_truck).mark_bar().encode(
    x=alt.X("truck_id:N", title=None),
    # Explicitly quantitative so the encoding does not rely on dtype inference.
    y=alt.Y("total:Q", title="Average transaction cost ($)"),
    color=alt.Color("truck_id:N", title="Truck ID"),
)

In [52]:
# Pie chart of the overall payment-type split: slice angle follows the
# percentage column, slice colour follows the payment type.
alt.Chart(total_payment_methods).mark_arc().encode(
    alt.Theta("Proportion (%)"),
    alt.Color("type"),
)

In [20]:
# Calendar date of each transaction as a "YYYY-MM-DD" string, derived with the
# vectorised datetime accessor instead of a Python-level loop over every row.
truck_df["str_date"] = truck_df["timestamp"].dt.strftime("%Y-%m-%d")

# Within each date, tally how many times every distinct combination of the
# remaining columns occurs.
# NOTE(review): this yields one row per distinct transaction record per date,
# not one row per date. If a per-day transaction count is what is wanted,
# `truck_df.groupby("str_date").size()` would give that - confirm the intent.
trans_by_date = truck_df.groupby("str_date").value_counts().reset_index()

trans_by_date.info()
trans_by_date.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5501 entries, 0 to 5500
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   str_date   5501 non-null   object        
 1   timestamp  5501 non-null   datetime64[ns]
 2   type       5501 non-null   object        
 3   total      5501 non-null   float64       
 4   truck_id   5501 non-null   int64         
 5   count      5501 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 258.0+ KB


Unnamed: 0,str_date,timestamp,type,total,truck_id,count
0,2024-08-25,2024-08-25 13:31:00,card,7.0,1,4
1,2024-08-25,2024-08-25 13:31:00,cash,7.0,1,4
2,2024-08-25,2024-08-25 13:57:00,cash,5.99,6,4
3,2024-08-25,2024-08-25 13:46:00,cash,7.0,1,3
4,2024-08-25,2024-08-25 13:53:00,card,7.0,1,3
