In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_cafe_dirty = pd.read_csv("/Users/maitanelopezsanchez/mi_proyecto-1/Datasets/dirty_cafe_sales.csv")

In [3]:
df_cafe_dirty

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
...,...,...,...,...,...,...,...,...
9995,TXN_7672686,Coffee,2,2.0,4.0,,UNKNOWN,2023-08-30
9996,TXN_9659401,,3,,3.0,Digital Wallet,,2023-06-02
9997,TXN_5255387,Coffee,4,2.0,8.0,Digital Wallet,,2023-03-02
9998,TXN_7695629,Cookie,3,,3.0,Digital Wallet,,2023-12-02


## Comenzando la limpieza

In [4]:
#Transformar los tipos de datos a int y str.
df_cafe_dirty['Quantity'] = pd.to_numeric(df_cafe_dirty['Quantity'], errors='coerce')
df_cafe_dirty['Quantity'] = df_cafe_dirty['Quantity'].astype('Int64') 

In [5]:
df_cafe_dirty['Price Per Unit'] = pd.to_numeric(df_cafe_dirty['Price Per Unit'], errors='coerce')


In [6]:
df_cafe_dirty['Transaction Date'] = pd.to_datetime(df_cafe_dirty['Transaction Date'], errors='coerce')

In [7]:
df_cafe_dirty['Total Spent'] = df_cafe_dirty['Total Spent'].astype(str)

df_cafe_dirty['Total Spent'] = df_cafe_dirty['Total Spent'].str.replace(r'[^0-9.]', '', regex=True)

df_cafe_dirty['Total Spent'] = pd.to_numeric(df_cafe_dirty['Total Spent'], errors='coerce')


In [8]:
cols = ['Transaction ID', 'Item', 'Payment Method', 'Location']
df_cafe_dirty[cols] = df_cafe_dirty[cols].astype('string')


In [9]:
df_cafe_dirty.dtypes

Transaction ID      string[python]
Item                string[python]
Quantity                     Int64
Price Per Unit             float64
Total Spent                float64
Payment Method      string[python]
Location            string[python]
Transaction Date    datetime64[ns]
dtype: object

In [10]:
df_cafe_dirty.head(10)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,2023-03-31
6,TXN_4433211,UNKNOWN,3,3.0,9.0,ERROR,Takeaway,2023-10-06
7,TXN_6699534,Sandwich,4,4.0,16.0,Cash,UNKNOWN,2023-10-28
8,TXN_4717867,,5,3.0,15.0,,Takeaway,2023-07-28
9,TXN_2064365,Sandwich,5,4.0,20.0,,In-store,2023-12-31


In [11]:
df_cafe_dirty.groupby("Item")["Price Per Unit"].first()


Item
Cake        3.0
Coffee      2.0
Cookie      1.0
ERROR       1.5
Juice       3.0
Salad       5.0
Sandwich    4.0
Smoothie    4.0
Tea         1.5
UNKNOWN     3.0
Name: Price Per Unit, dtype: float64

Tras observar el valor de cada Item, podemos ver como hay una coincidencia con el Price Per Unit del Item "ERROR"

In [12]:
df_cafe_dirty["Price Per Unit"].value_counts()

Price Per Unit
3.0    2429
4.0    2331
2.0    1227
5.0    1204
1.0    1143
1.5    1133
Name: count, dtype: int64

In [13]:
df_cafe_dirty[df_cafe_dirty["Price Per Unit"].isin(["Unknown", "ERROR"])]


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [14]:
df_cafe_dirty[df_cafe_dirty["Price Per Unit"].astype(str).str.contains("[A-Za-z]", regex=True)]
#De esta forma podemos ver si alguno de los valores del Price Per Unit tenía algún valor como "$" u otros.


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
56,TXN_3578141,Cake,5,,15.0,,Takeaway,2023-06-27
65,TXN_4987129,Sandwich,3,,,,In-store,2023-10-20
68,TXN_8427104,Salad,2,,10.0,,In-store,2023-10-27
85,TXN_8035512,Tea,3,,4.5,Cash,UNKNOWN,2023-10-29
104,TXN_7447872,Juice,2,,6.0,,,NaT
...,...,...,...,...,...,...,...,...
9924,TXN_5981429,Juice,2,,6.0,Digital Wallet,,2023-12-24
9926,TXN_2464706,Cake,4,,12.0,Digital Wallet,Takeaway,2023-11-09
9961,TXN_2153100,Tea,2,,3.0,Cash,,2023-12-29
9996,TXN_9659401,,3,,3.0,Digital Wallet,,2023-06-02


In [15]:
#A continuación, vamos a eliminar las filas que contengan valores nulos como son "NaN", "ERROR" o "UNKNOWN"
# tanto en la la columna Item como en la columna Price Per Unit, ya que, no nos aportan ninguna información.

In [16]:
invalid_values = ["ERROR", "UNKNOWN"]


In [17]:
#Función que elimine las filas donde AMBOS valores son invalid_values
mask_invalid_both = (
    (df_cafe_dirty["Item"].isin(invalid_values) | df_cafe_dirty["Item"].isna()) &
    (df_cafe_dirty["Price Per Unit"].isin(invalid_values) | df_cafe_dirty["Price Per Unit"].isna())
)


In [18]:
df_cafe_dirty = df_cafe_dirty[~mask_invalid_both]


In [19]:
df_cafe_dirty[
    (df_cafe_dirty["Item"].isin(invalid_values) | df_cafe_dirty["Item"].isna()) &
    (df_cafe_dirty["Price Per Unit"].isin(invalid_values) | df_cafe_dirty["Price Per Unit"].isna())
]



Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [20]:
df_cafe_dirty.shape

(9946, 8)

Hemos eliminado y guardado la primera fase de la limpieza, podemos ver que el tamaño de la tabla de ha reducido de 10.000 filas a 9.946.

El siguiente paso que quiero dar es transormar en valores tangibles aquellos invalid_values donde sabemos el precio por unidad y no el Item o a la inversa. Por ejemplo, sabemos que un Coffee vale 2.0 y la Cookie 1.0. Y al revés, donde el precio sea 1.0 y el Item un invalid_value poder decir que es una Cookie. 
    Nota: En algunos casos se complica ya que el precio de la Cake y del Juice es el mismo.

In [21]:
df_cafe_dirty.groupby("Item")["Price Per Unit"].first()

Item
Cake        3.0
Coffee      2.0
Cookie      1.0
ERROR       1.5
Juice       3.0
Salad       5.0
Sandwich    4.0
Smoothie    4.0
Tea         1.5
UNKNOWN     3.0
Name: Price Per Unit, dtype: float64

Como podemos observar, los Item: "ERROR" tienen un Price Per Unit de 1.5. El único Item que tiene ese Precio es el Té, por ello podemos transformar los Item que sean ERROR a Té.

In [22]:
df_cafe_dirty.loc[
    (df_cafe_dirty["Item"] == "ERROR") & (df_cafe_dirty["Price Per Unit"] == 1.5),
    "Item"
] = "Tea"


In [23]:
df_cafe_dirty_tea = df_cafe_dirty[df_cafe_dirty["Price Per Unit"] == 1.5]


In [72]:
df_cafe_dirty['Quantity'].mean()

np.float64(3.0233629836030436)

In [73]:
df_cafe_dirty.dtypes

Transaction ID       object
Item                 object
Quantity             object
Price Per Unit      float64
Total Spent         float64
Payment Method       object
Location             object
Transaction Date     object
dtype: object

In [112]:
df_cafe_dirty['Quantity'].value_counts()

Quantity
5.0    1976
2.0    1923
4.0    1813
1.0    1812
3.0    1807
Name: count, dtype: int64

### Segundo paso, transformar aquellos Item con invalid_values en un valor cuando se cumpla lo siguiente: 
Coffee → 2.0
Cookie → 1.0
Salad → 5.0
Tea → 1.5
En el caso de Items con Price Per Unit iguales no se podrá realizar este cambio.

In [113]:
mapping = {
    2.0: "Coffee",
    1.0: "Cookie",
    5.0: "Salad",
    1.5: "Tea"
}

In [114]:
for price_, item_name in mapping.items():
    df_cafe_dirty.loc[
        (df_cafe_dirty["Item"].isin(invalid_values)) &
        (df_cafe_dirty["Price Per Unit"] == price_),
        "Item"
    ] = item_name

In [115]:
#Verificación
df_cafe_dirty[df_cafe_dirty["Item"].isin(mapping.values())]


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,08/09/2023
2,TXN_4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,19/07/2023
3,TXN_7034554,Salad,2.0,5.0,10.0,Other,Not Saved,27/04/2023
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,11/06/2023
10,TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07/11/2023
...,...,...,...,...,...,...,...,...
9982,TXN_8567525,Cookie,2.0,1.0,2.0,Other,Takeaway,30/12/2023
9990,TXN_1538510,Coffee,5.0,2.0,10.0,Digital Wallet,Not Saved,22/05/2023
9995,TXN_7672686,Coffee,2.0,2.0,4.0,Other,Not Saved,30/08/2023
9997,TXN_5255387,Coffee,4.0,2.0,8.0,Digital Wallet,Not Saved,02/03/2023


### Tercer paso: Transformar los Price Per Unit que contengan invalid_values.

In [116]:
#Generamos un mapping con los precios de cada Item. Cake - 3.0, etc.
price_mapping = {
    "Cake": 3.0,
    "Coffee": 2.0,
    "Cookie": 1.0,
    "Juice": 3.0,
    "Salad": 5.0,
    "Sandwich": 4.0,
    "Smoothie": 4.0,
    "Tea": 1.5
}

In [117]:
for item_name, price_ in price_mapping.items():
    df_cafe_dirty.loc[
        (df_cafe_dirty["Price Per Unit"].isin(invalid_values) | df_cafe_dirty["Price Per Unit"].isna()) &
        (df_cafe_dirty["Item"] == item_name),
        "Price Per Unit"
    ] = price_


In [118]:
df_cafe_dirty[df_cafe_dirty["Price Per Unit"].isin(invalid_values)]
#Devuelve una tabla vacía por lo que ya no hay valores nulos en la columna Price Per Unit.


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


Debido a que los UNKNOWN que quedan tienen precio de 3.0 y de 4.0 y ese precio puede ser de 2 Items diferentes, considero que la mejor opción es eliminarlas ya que solo son 147 de 10.000 filas.

In [119]:
df_cafe_dirty[df_cafe_dirty["Item"] != "UNKNOWN"]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,08/09/2023
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,16/05/2023
2,TXN_4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,19/07/2023
3,TXN_7034554,Salad,2.0,5.0,10.0,Other,Not Saved,27/04/2023
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,11/06/2023
...,...,...,...,...,...,...,...,...
9993,TXN_4766549,Smoothie,2.0,4.0,8.0,Cash,Not Saved,20/10/2023
9995,TXN_7672686,Coffee,2.0,2.0,4.0,Other,Not Saved,30/08/2023
9997,TXN_5255387,Coffee,4.0,2.0,8.0,Digital Wallet,Not Saved,02/03/2023
9998,TXN_7695629,Cookie,3.0,1.0,3.0,Digital Wallet,Not Saved,02/12/2023


In [None]:
df_cafe_dirty_unknown = df_cafe_dirty[df_cafe_dirty["Item"] == "UNKNOWN"]
#Verificación  del tamaño de filas de Unkonws.
df_cafe_dirty_unknown

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [121]:
df_cafe_dirty['Quantity'].isna().sum()


np.int64(0)

In [122]:
df_cafe_dirty_quantity_nan = df_cafe_dirty[df_cafe_dirty["Quantity"].isna()]
df_cafe_dirty_quantity_nan


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


Por la tabla que nos devuelve nos muestra que para las 476 filas donde Quantity es un valor NA sabemos el precio y el total, por lo que podemos deducir el valor de Quantity ya que "Quantity * Price Per Unit = Total Spent"

Tomando en base que esa igualdad de Quantity * Price Per Unit = Total Spent, vamos a crear una función que en el caso de que falte 1 de los 3 valores lo calcule, que si faltan 2 no lo calcule y si alguno no encajase no lo modifique.

In [123]:
import numpy as np

In [124]:
def fix_row(row):
    q = row["Quantity"]
    p = row["Price Per Unit"]
    t = row["Total Spent"]
    
    # 1. Si solo falta Total Spent → Quantity * Price
    if pd.isna(t) and pd.notna(q) and pd.notna(p):
        row["Total Spent"] = q * p
    
    # 2. Si solo falta Quantity → Total / Price
    elif pd.isna(q) and pd.notna(t) and pd.notna(p) and p != 0:
        row["Quantity"] = t / p
    
    # 3. Si solo falta Price → Total / Quantity
    elif pd.isna(p) and pd.notna(t) and pd.notna(q) and q != 0:
        row["Price Per Unit"] = t / q
    
    return row

df_cafe_dirty = df_cafe_dirty.apply(fix_row, axis=1)


In [125]:
df_cafe_dirty.head(15)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,08/09/2023
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,16/05/2023
2,TXN_4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,19/07/2023
3,TXN_7034554,Salad,2.0,5.0,10.0,Other,Not Saved,27/04/2023
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,11/06/2023
5,TXN_2602893,Smoothie,5.0,4.0,20.0,Credit Card,Not Saved,31/03/2023
7,TXN_6699534,Sandwich,4.0,4.0,16.0,Cash,Not Saved,28/10/2023
9,TXN_2064365,Sandwich,5.0,4.0,20.0,Other,In-store,31/12/2023
10,TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07/11/2023
11,TXN_3051279,Sandwich,2.0,4.0,8.0,Credit Card,Takeaway,Not Time


In [126]:
df_cafe_dirty['Quantity'].isna().sum()

np.int64(0)

In [127]:
df_cafe_dirty_quantity_na = df_cafe_dirty[df_cafe_dirty["Quantity"].isna()]
df_cafe_dirty_quantity_na

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


Eliminaremos las 20 filas que contienen valores nulos en Quantity y en Total, ya que no nos aportan valor

In [128]:
df_cafe_dirty = df_cafe_dirty.dropna(subset=["Quantity"])
df_cafe_dirty[["Quantity"]].isna().sum()

Quantity    0
dtype: int64

In [129]:
df_cafe_dirty.shape

(9331, 8)

In [130]:
df_cafe_dirty['Price Per Unit'].isna().sum()

np.int64(0)

In [131]:
df_cafe_dirty['Total Spent'].isna().sum()

np.int64(0)

Finalmente, las columnas de Quantity, Price Per Unit y Total Spent están libres de nulos

In [132]:
#¿Item sigue teniendo algún valor nulo?
df_cafe_dirty['Item'].isna().sum()

np.int64(0)

In [133]:
#Como Item sigue teniendo valores nulos cuyo precio unitario puede coincidir con 2 items diferentes la mejor opción
#será eliminar esas filas, ya que, no son demasiadas.
df_cafe_dirty_item_nan = df_cafe_dirty[df_cafe_dirty['Item'].isna()]
df_cafe_dirty_item_nan

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [134]:
df_cafe_dirty = df_cafe_dirty.dropna(subset=["Item"])
df_cafe_dirty['Item'].isna().sum()

np.int64(0)

In [135]:
df_cafe_dirty.shape

(9331, 8)

In [136]:
df_cafe_dirty.head(15)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,08/09/2023
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,16/05/2023
2,TXN_4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,19/07/2023
3,TXN_7034554,Salad,2.0,5.0,10.0,Other,Not Saved,27/04/2023
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,11/06/2023
5,TXN_2602893,Smoothie,5.0,4.0,20.0,Credit Card,Not Saved,31/03/2023
7,TXN_6699534,Sandwich,4.0,4.0,16.0,Cash,Not Saved,28/10/2023
9,TXN_2064365,Sandwich,5.0,4.0,20.0,Other,In-store,31/12/2023
10,TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07/11/2023
11,TXN_3051279,Sandwich,2.0,4.0,8.0,Credit Card,Takeaway,Not Time


In [137]:
df_cafe_dirty_item_invalid_values = df_cafe_dirty[df_cafe_dirty['Item'].isin(invalid_values)]
df_cafe_dirty_item_invalid_values

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [138]:
#Eliminamos las filas con Unknown y Error (invalid_values previamente definido) ya que, el price per unit nos indica
# que podría ser más de un Item.
df_cafe_dirty = df_cafe_dirty[~df_cafe_dirty['Item'].isin(invalid_values)]
df_cafe_dirty['Item'].isin(invalid_values).sum()


np.int64(0)

In [139]:
df_cafe_dirty.head(15)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,08/09/2023
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,16/05/2023
2,TXN_4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,19/07/2023
3,TXN_7034554,Salad,2.0,5.0,10.0,Other,Not Saved,27/04/2023
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,11/06/2023
5,TXN_2602893,Smoothie,5.0,4.0,20.0,Credit Card,Not Saved,31/03/2023
7,TXN_6699534,Sandwich,4.0,4.0,16.0,Cash,Not Saved,28/10/2023
9,TXN_2064365,Sandwich,5.0,4.0,20.0,Other,In-store,31/12/2023
10,TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07/11/2023
11,TXN_3051279,Sandwich,2.0,4.0,8.0,Credit Card,Takeaway,Not Time


In [140]:
df_cafe_dirty_Payment_method_invalid = df_cafe_dirty[df_cafe_dirty['Payment Method'].isin(invalid_values)]
df_cafe_dirty_Payment_method_invalid

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [141]:
df_cafe_dirty_Payment_method_nan = df_cafe_dirty[df_cafe_dirty['Payment Method'].isna()]
df_cafe_dirty_Payment_method_nan

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [142]:
#Como son muchas filas con valores nulos o invalidos, vamos a transformarlos por un valor que sea "Other"

In [143]:
def clean_payment_method (value):
    if pd.isna(value) or value in invalid_values:
        return "Other"
    return value

In [144]:
df_cafe_dirty['Payment Method'] = df_cafe_dirty['Payment Method'].apply(clean_payment_method) 

In [145]:
df_cafe_dirty.head(10)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,08/09/2023
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,16/05/2023
2,TXN_4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,19/07/2023
3,TXN_7034554,Salad,2.0,5.0,10.0,Other,Not Saved,27/04/2023
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,11/06/2023
5,TXN_2602893,Smoothie,5.0,4.0,20.0,Credit Card,Not Saved,31/03/2023
7,TXN_6699534,Sandwich,4.0,4.0,16.0,Cash,Not Saved,28/10/2023
9,TXN_2064365,Sandwich,5.0,4.0,20.0,Other,In-store,31/12/2023
10,TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07/11/2023
11,TXN_3051279,Sandwich,2.0,4.0,8.0,Credit Card,Takeaway,Not Time


In [146]:
df_cafe_dirty.Location.unique()

array(['Takeaway', 'In-store', 'Not Saved'], dtype=object)

In [147]:
df_cafe_dirty_location_invalid_values = df_cafe_dirty[df_cafe_dirty['Location'].isin(invalid_values)]
df_cafe_dirty_location_invalid_values

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [148]:
df_cafe_dirty_location_nan = df_cafe_dirty[df_cafe_dirty['Location'].isna()]
df_cafe_dirty_location_nan

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [149]:
#Demasiados valores nulos o invalid_values.
# Toma de decisión: ¿Cambiamos los valores nulos o invalid_values o eliminamos la columna al faltar 1/3 de los valores?
# Voy a escoger la opción de cambiar los valores nulos e invalidos y unificarlos a Location Not Saved

In [151]:
df_cafe_dirty['Transaction Date'] = df_cafe_dirty['Transaction Date'].dt.strftime('%d/%m/%Y')
#invertir el orden de la fecha para que aparezca primero el día después el mes y por último el año

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
df_cafe_dirty.head(6)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,08/09/2023
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,16/05/2023
2,TXN_4271903,Cookie,4,1.0,4.0,Credit Card,In-store,19/07/2023
3,TXN_7034554,Salad,2,5.0,10.0,Other,UNKNOWN,27/04/2023
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,11/06/2023
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,31/03/2023


In [None]:
#Defino la función para modificar valores en Location:
def clean_location (value):
    if pd.isna(value) or value in invalid_values:
        return "Not Saved"
    return value

In [None]:
df_cafe_dirty['Location'] = df_cafe_dirty['Location'].apply(clean_location)

In [None]:
df_cafe_dirty.sample(10)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
5567,TXN_5748083,Salad,2.0,5.0,10.0,Other,Takeaway,08/04/2023
9192,TXN_4554194,Smoothie,3.0,4.0,12.0,Credit Card,Takeaway,04/10/2023
5285,TXN_6995665,Tea,2.0,1.5,3.0,Other,Not Saved,
3942,TXN_1489746,Cookie,3.0,1.0,3.0,Digital Wallet,In-store,18/12/2023
7143,TXN_5641209,Salad,3.0,5.0,15.0,Other,Takeaway,03/12/2023
7809,TXN_9662827,Tea,1.0,1.5,1.5,Other,Not Saved,30/09/2023
3047,TXN_1853298,Cookie,5.0,1.0,5.0,Credit Card,Not Saved,02/01/2023
5353,TXN_9731164,Salad,1.0,5.0,5.0,Cash,Not Saved,07/12/2023
2974,TXN_8571836,Juice,3.0,3.0,9.0,Cash,Not Saved,27/12/2023
2050,TXN_8788437,Coffee,1.0,2.0,2.0,Other,Not Saved,19/11/2023


In [None]:
df_cafe_dirty_transaction_date_nat = df_cafe_dirty[df_cafe_dirty['Transaction Date'].isna()]
df_cafe_dirty_transaction_date_nat 

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
11,TXN_3051279,Sandwich,2,4.0,8.0,Credit Card,Takeaway,
29,TXN_7640952,Cake,4,3.0,12.0,Digital Wallet,Takeaway,
33,TXN_7710508,Cookie,5,1.0,5.0,Cash,Not Saved,
77,TXN_2091733,Salad,1,5.0,5.0,Other,In-store,
103,TXN_7028009,Cake,4,3.0,12.0,Other,Takeaway,
...,...,...,...,...,...,...,...,...
9933,TXN_9460419,Cake,1,3.0,3.0,Other,Takeaway,
9937,TXN_8253472,Cake,1,3.0,3.0,Other,Not Saved,
9949,TXN_3130865,Juice,3,3.0,9.0,Other,In-store,
9983,TXN_9226047,Smoothie,3,4.0,12.0,Cash,Not Saved,


In [None]:
def clean_time (value):
    if pd.isna(value) or value in invalid_values:
        return "Not Time"
    return value

In [None]:
df_cafe_dirty['Transaction Date'] = df_cafe_dirty['Transaction Date'].apply(clean_time)

In [None]:
df_cafe_dirty.sample(30)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
9106,TXN_4638979,Salad,1.0,5.0,5.0,Other,Not Saved,28/06/2023
6411,TXN_9213505,Sandwich,2.0,4.0,8.0,Other,Not Saved,01/11/2023
1958,TXN_8707598,Juice,4.0,3.0,12.0,Credit Card,Not Saved,22/10/2023
6179,TXN_2453194,Cake,4.0,3.0,12.0,Digital Wallet,Not Saved,18/11/2023
4162,TXN_4262142,Cake,2.0,3.0,6.0,Cash,Takeaway,02/01/2023
2296,TXN_4205369,Salad,5.0,5.0,25.0,Other,Takeaway,16/07/2023
6266,TXN_9436342,Juice,4.0,3.0,12.0,Digital Wallet,In-store,26/11/2023
8687,TXN_5992166,Juice,5.0,3.0,15.0,Cash,Takeaway,01/10/2023
1831,TXN_2624121,Tea,5.0,1.5,7.5,Digital Wallet,In-store,28/01/2023
4489,TXN_6949949,Salad,3.0,5.0,15.0,Other,Not Saved,14/05/2023


In [None]:
df_cafe_dirty.to_csv('dataset cafe limip', index = False) 