In [735]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [780]:
df_cafe_dirty = pd.read_csv("/Users/maitanelopezsanchez/mi_proyecto-1/Datasets/dirty_cafe_sales.csv")

In [781]:
df_cafe_dirty

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
...,...,...,...,...,...,...,...,...
9995,TXN_7672686,Coffee,2,2.0,4.0,,UNKNOWN,2023-08-30
9996,TXN_9659401,,3,,3.0,Digital Wallet,,2023-06-02
9997,TXN_5255387,Coffee,4,2.0,8.0,Digital Wallet,,2023-03-02
9998,TXN_7695629,Cookie,3,,3.0,Digital Wallet,,2023-12-02


## Comenzando la limpieza

In [782]:
#Transformar los tipos de datos a int y str.
df_cafe_dirty['Quantity'] = pd.to_numeric(df_cafe_dirty['Quantity'], errors='coerce')
df_cafe_dirty['Quantity'] = df_cafe_dirty['Quantity'].astype('Int64') 

In [783]:
df_cafe_dirty['Price Per Unit'] = pd.to_numeric(df_cafe_dirty['Price Per Unit'], errors='coerce')


In [784]:
df_cafe_dirty['Transaction Date'] = pd.to_datetime(df_cafe_dirty['Transaction Date'], errors='coerce')

In [785]:
df_cafe_dirty['Total Spent'] = df_cafe_dirty['Total Spent'].astype(str)

df_cafe_dirty['Total Spent'] = df_cafe_dirty['Total Spent'].str.replace(r'[^0-9.]', '', regex=True)

df_cafe_dirty['Total Spent'] = pd.to_numeric(df_cafe_dirty['Total Spent'], errors='coerce')


In [786]:
cols = ['Transaction ID', 'Item', 'Payment Method', 'Location']
df_cafe_dirty[cols] = df_cafe_dirty[cols].astype('string')


In [787]:
df_cafe_dirty.dtypes

Transaction ID      string[python]
Item                string[python]
Quantity                     Int64
Price Per Unit             float64
Total Spent                float64
Payment Method      string[python]
Location            string[python]
Transaction Date    datetime64[ns]
dtype: object

In [788]:
df_cafe_dirty.head(10)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,2023-03-31
6,TXN_4433211,UNKNOWN,3,3.0,9.0,ERROR,Takeaway,2023-10-06
7,TXN_6699534,Sandwich,4,4.0,16.0,Cash,UNKNOWN,2023-10-28
8,TXN_4717867,,5,3.0,15.0,,Takeaway,2023-07-28
9,TXN_2064365,Sandwich,5,4.0,20.0,,In-store,2023-12-31


In [789]:
df_cafe_dirty.groupby("Item")["Price Per Unit"].first()


Item
Cake        3.0
Coffee      2.0
Cookie      1.0
ERROR       1.5
Juice       3.0
Salad       5.0
Sandwich    4.0
Smoothie    4.0
Tea         1.5
UNKNOWN     3.0
Name: Price Per Unit, dtype: float64

Tras observar el valor de cada Item, podemos ver como hay una coincidencia con el Price Per Unit del Item "ERROR"

In [790]:
df_cafe_dirty["Price Per Unit"].value_counts()

Price Per Unit
3.0    2429
4.0    2331
2.0    1227
5.0    1204
1.0    1143
1.5    1133
Name: count, dtype: int64

In [791]:
df_cafe_dirty[df_cafe_dirty["Price Per Unit"].isin(["Unknown", "ERROR"])]


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [792]:
df_cafe_dirty[df_cafe_dirty["Price Per Unit"].astype(str).str.contains("[A-Za-z]", regex=True)]
#De esta forma podemos ver si alguno de los valores del Price Per Unit tenía algún valor como "$" u otros.


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
56,TXN_3578141,Cake,5,,15.0,,Takeaway,2023-06-27
65,TXN_4987129,Sandwich,3,,,,In-store,2023-10-20
68,TXN_8427104,Salad,2,,10.0,,In-store,2023-10-27
85,TXN_8035512,Tea,3,,4.5,Cash,UNKNOWN,2023-10-29
104,TXN_7447872,Juice,2,,6.0,,,NaT
...,...,...,...,...,...,...,...,...
9924,TXN_5981429,Juice,2,,6.0,Digital Wallet,,2023-12-24
9926,TXN_2464706,Cake,4,,12.0,Digital Wallet,Takeaway,2023-11-09
9961,TXN_2153100,Tea,2,,3.0,Cash,,2023-12-29
9996,TXN_9659401,,3,,3.0,Digital Wallet,,2023-06-02


In [None]:
#A continuación, vamos a eliminar las filas que contengan valores nulos como son "NaN", "ERROR" o "UNKNOWN"
# tanto en la la columna Item como en la columna Price Per Unit, ya que, no nos aportan ninguna información.

In [794]:
invalid_values = ["ERROR", "UNKNOWN"]


In [875]:
#Función que elimine las filas donde AMBOS valores son invalid_values
mask_invalid_both = (
    (df_cafe_dirty["Item"].isin(invalid_values) | df_cafe_dirty["Item"].isna()) &
    (df_cafe_dirty["Price Per Unit"].isin(invalid_values) | df_cafe_dirty["Price Per Unit"].isna())
)


In [796]:
df_cafe_dirty = df_cafe_dirty[~mask_invalid_both]


In [797]:
df_cafe_dirty[
    (df_cafe_dirty["Item"].isin(invalid_values) | df_cafe_dirty["Item"].isna()) &
    (df_cafe_dirty["Price Per Unit"].isin(invalid_values) | df_cafe_dirty["Price Per Unit"].isna())
]



Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


In [798]:
df_cafe_dirty.shape

(9946, 8)

Hemos eliminado y guardado la primera fase de la limpieza, podemos ver que el tamaño de la tabla de ha reducido de 10.000 filas a 9.946.

El siguiente paso que quiero dar es transormar en valores tangibles aquellos invalid_values donde sabemos el precio por unidad y no el Item o a la inversa. Por ejemplo, sabemos que un Coffee vale 2.0 y la Cookie 1.0. Y al revés, donde el precio sea 1.0 y el Item un invalid_value poder decir que es una Cookie. 
    Nota: En algunos casos se complica ya que el precio de la Cake y del Juice es el mismo.

In [799]:
df_cafe_dirty.groupby("Item")["Price Per Unit"].first()

Item
Cake        3.0
Coffee      2.0
Cookie      1.0
ERROR       1.5
Juice       3.0
Salad       5.0
Sandwich    4.0
Smoothie    4.0
Tea         1.5
UNKNOWN     3.0
Name: Price Per Unit, dtype: float64

Como podemos observar, los Item: "ERROR" tienen un Price Per Unit de 1.5. El único Item que tiene ese Precio es el Té, por ello podemos transformar los Item que sean ERROR a Té.

In [800]:
df_cafe_dirty.loc[
    (df_cafe_dirty["Item"] == "ERROR") & (df_cafe_dirty["Price Per Unit"] == 1.5),
    "Item"
] = "Tea"


In [801]:
df_cafe_dirty_tea = df_cafe_dirty[df_cafe_dirty["Price Per Unit"] == 1.5]


### Segundo paso, transformar aquellos Item con invalid_values en un valor cuando se cumpla lo siguiente: 
Coffee → 2.0
Cookie → 1.0
Salad → 5.0
Tea → 1.5
En el caso de Items con Price Per Unit iguales no se podrá realizar este cambio.

In [802]:
mapping = {
    2.0: "Coffee",
    1.0: "Cookie",
    5.0: "Salad",
    1.5: "Tea"
}

In [803]:
for price_, item_name in mapping.items():
    df_cafe_dirty.loc[
        (df_cafe_dirty["Item"].isin(invalid_values)) &
        (df_cafe_dirty["Price Per Unit"] == price_),
        "Item"
    ] = item_name

In [804]:
#Verificación
df_cafe_dirty[df_cafe_dirty["Item"].isin(mapping.values())]


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
2,TXN_4271903,Cookie,4,1.0,,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
10,TXN_2548360,Salad,5,5.0,25.0,Cash,Takeaway,2023-11-07
...,...,...,...,...,...,...,...,...
9982,TXN_8567525,Cookie,2,1.0,2.0,,Takeaway,2023-12-30
9990,TXN_1538510,Coffee,5,2.0,10.0,Digital Wallet,,2023-05-22
9995,TXN_7672686,Coffee,2,2.0,4.0,,UNKNOWN,2023-08-30
9997,TXN_5255387,Coffee,4,2.0,8.0,Digital Wallet,,2023-03-02


### Tercer paso: Transformar los Price Per Unit que contengan invalid_values.

In [805]:
#Generamos un mapping con los precios de cada Item. Cake - 3.0, etc.
price_mapping = {
    "Cake": 3.0,
    "Coffee": 2.0,
    "Cookie": 1.0,
    "Juice": 3.0,
    "Salad": 5.0,
    "Sandwich": 4.0,
    "Smoothie": 4.0,
    "Tea": 1.5
}

In [847]:
for item_name, price_ in price_mapping.items():
    df_cafe_dirty.loc[
        (df_cafe_dirty["Price Per Unit"].isin(invalid_values) | df_cafe_dirty["Price Per Unit"].isna()) &
        (df_cafe_dirty["Item"] == item_name),
        "Price Per Unit"
    ] = price_


In [848]:
df_cafe_dirty[df_cafe_dirty["Price Per Unit"].isin(invalid_values)]
#Devuelve una tabla vacía por lo que ya no hay valores nulos en la columna Price Per Unit.


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date


Debido a que los UNKNOWN que quedan tienen precio de 3.0 y de 4.0 y ese precio puede ser de 2 Items diferentes, considero que la mejor opción es eliminarlas ya que solo son 147 de 10.000 filas.

In [849]:
df_cafe_dirty[df_cafe_dirty["Item"] != "UNKNOWN"]

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,4.0,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
...,...,...,...,...,...,...,...,...
9993,TXN_4766549,Smoothie,2,4.0,8.0,Cash,,2023-10-20
9995,TXN_7672686,Coffee,2,2.0,4.0,,UNKNOWN,2023-08-30
9997,TXN_5255387,Coffee,4,2.0,8.0,Digital Wallet,,2023-03-02
9998,TXN_7695629,Cookie,3,1.0,3.0,Digital Wallet,,2023-12-02


In [850]:
df_cafe_dirty_unknown = df_cafe_dirty[df_cafe_dirty["Item"] == "UNKNOWN"]
#Ver el tamaño de filas de Unkonws.
df_cafe_dirty_unknown

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
6,TXN_4433211,UNKNOWN,3,3.0,9.0,ERROR,Takeaway,2023-10-06
36,TXN_6855453,UNKNOWN,4,3.0,12.0,,In-store,2023-07-17
91,TXN_5455936,UNKNOWN,5,3.0,15.0,,In-store,2023-10-28
153,TXN_6541415,UNKNOWN,4.0,3.0,12.0,Cash,In-store,2023-11-25
165,TXN_3226832,UNKNOWN,5,4.0,20.0,Cash,UNKNOWN,2023-09-04
...,...,...,...,...,...,...,...,...
9727,TXN_3562418,UNKNOWN,2,4.0,8.0,Digital Wallet,UNKNOWN,2023-08-13
9763,TXN_7652830,UNKNOWN,2,3.0,6.0,,,2023-08-15
9836,TXN_9162296,UNKNOWN,3,4.0,12.0,Cash,In-store,2023-05-10
9946,TXN_8807600,UNKNOWN,1,4.0,4.0,Cash,Takeaway,2023-09-24


In [851]:
df_cafe_dirty['Quantity'].isna().sum()


np.int64(35)

In [852]:
df_cafe_dirty_quantity_nan = df_cafe_dirty[df_cafe_dirty["Quantity"].isna()]
df_cafe_dirty_quantity_nan


Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
236,TXN_8562645,Salad,,5.0,,,In-store,2023-05-18
278,TXN_3229409,Juice,,3.0,,Cash,Takeaway,2023-04-15
629,TXN_9289174,Cake,,3.0,12.0,Digital Wallet,In-store,2023-12-30
641,TXN_2962976,Juice,,3.0,,,,2023-03-17
738,TXN_8696094,Sandwich,,4.0,,,Takeaway,2023-05-14
912,TXN_1575608,Sandwich,,4.0,20.0,ERROR,Takeaway,2023-01-05
1008,TXN_7225428,Tea,,1.5,3.0,Credit Card,Takeaway,2023-03-07
1436,TXN_7590801,Tea,,1.5,6.0,Cash,Takeaway,NaT
1482,TXN_3593060,Smoothie,,4.0,16.0,Cash,,2023-03-05
2330,TXN_3849488,Salad,,5.0,5.0,,In-store,2023-03-01


Por la tabla que nos devuelve nos muestra que para las 476 filas donde Quantity es un valor NA sabemos el precio y el total, por lo que podemos deducir el valor de Quantity ya que "Quantity * Price Per Unit = Total Spent"

Tomando en base que esa igualdad de Quantity * Price Per Unit = Total Spent, vamos a crear una función que en el caso de que falte 1 de los 3 valores lo calcule, que si faltan 2 no lo calcule y si alguno no encajase no lo modifique.

In [853]:
import numpy as np

In [854]:
def fix_row(row):
    q = row["Quantity"]
    p = row["Price Per Unit"]
    t = row["Total Spent"]
    
    # 1. Si solo falta Total Spent → Quantity * Price
    if pd.isna(t) and pd.notna(q) and pd.notna(p):
        row["Total Spent"] = q * p
    
    # 2. Si solo falta Quantity → Total / Price
    elif pd.isna(q) and pd.notna(t) and pd.notna(p) and p != 0:
        row["Quantity"] = t / p
    
    # 3. Si solo falta Price → Total / Quantity
    elif pd.isna(p) and pd.notna(t) and pd.notna(q) and q != 0:
        row["Price Per Unit"] = t / q
    
    return row

df_cafe_dirty = df_cafe_dirty.apply(fix_row, axis=1)


In [855]:
df_cafe_dirty.head(15)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,4.0,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,2023-03-31
6,TXN_4433211,UNKNOWN,3,3.0,9.0,ERROR,Takeaway,2023-10-06
7,TXN_6699534,Sandwich,4,4.0,16.0,Cash,UNKNOWN,2023-10-28
8,TXN_4717867,,5,3.0,15.0,,Takeaway,2023-07-28
9,TXN_2064365,Sandwich,5,4.0,20.0,,In-store,2023-12-31


In [856]:
df_cafe_dirty['Quantity'].isna().sum()

np.int64(20)

In [857]:
df_cafe_dirty_quantity_na = df_cafe_dirty[df_cafe_dirty["Quantity"].isna()]
df_cafe_dirty_quantity_na

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
236,TXN_8562645,Salad,,5.0,,,In-store,2023-05-18
278,TXN_3229409,Juice,,3.0,,Cash,Takeaway,2023-04-15
641,TXN_2962976,Juice,,3.0,,,,2023-03-17
738,TXN_8696094,Sandwich,,4.0,,,Takeaway,2023-05-14
2796,TXN_9188692,Cake,,3.0,,Credit Card,,2023-12-01
3203,TXN_4565754,Smoothie,,4.0,,Digital Wallet,Takeaway,2023-10-06
3224,TXN_6297232,Coffee,,2.0,,,,2023-04-07
3401,TXN_3251829,Tea,,1.5,,Digital Wallet,In-store,2023-07-25
4257,TXN_6470865,Coffee,,2.0,,Digital Wallet,Takeaway,2023-09-18
5841,TXN_5884081,Cookie,,1.0,,Digital Wallet,In-store,2023-07-05


Eliminaremos las 20 filas que contienen valores nulos en Quantity y en Total, ya que no nos aportan valor

In [859]:
df_cafe_dirty = df_cafe_dirty.dropna(subset=["Quantity"])
df_cafe_dirty[["Quantity"]].isna().sum()

Quantity    0
dtype: int64

In [861]:
df_cafe_dirty.shape

(9926, 8)

In [863]:
df_cafe_dirty['Price Per Unit'].isna().sum()

np.int64(0)

In [864]:
df_cafe_dirty['Total Spent'].isna().sum()

np.int64(0)

Finalmente, las columnas de Quantity, Price Per Unit y Total Spent están libres de nulos

In [865]:
#¿Item sigue teniendo algún valor nulo?
df_cafe_dirty['Item'].isna().sum()

np.int64(310)

In [870]:
#Como Item sigue teniendo valores nulos cuyo precio unitario puede coincidir con 2 items diferentes la mejor opción
#será eliminar esas filas, ya que, no son demasiadas.
df_cafe_dirty_item_nan = df_cafe_dirty[df_cafe_dirty['Item'].isna()]
df_cafe_dirty_item_nan

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
8,TXN_4717867,,5,3.0,15.0,,Takeaway,2023-07-28
30,TXN_1736287,,5,2.0,10.0,Digital Wallet,,2023-06-02
61,TXN_8051289,,1,3.0,3.0,,In-store,2023-10-09
72,TXN_6044979,,1,1.0,1.0,Cash,In-store,2023-12-08
89,TXN_4132730,,5,1.0,5.0,,In-store,2023-03-12
...,...,...,...,...,...,...,...,...
9771,TXN_9582818,,1,4.0,4.0,Cash,In-store,2023-10-27
9788,TXN_4600894,,5,5.0,25.0,Digital Wallet,,2023-09-17
9855,TXN_3740505,,2,1.5,3.0,,,2023-11-21
9876,TXN_3105633,,1,2.0,2.0,,In-store,2023-03-30


In [872]:
df_cafe_dirty = df_cafe_dirty.dropna(subset=["Item"])
df_cafe_dirty['Item'].isna().sum()

np.int64(0)

In [873]:
df_cafe_dirty.shape

(9616, 8)

In [874]:
df_cafe_dirty.head(15)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,4.0,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,2023-03-31
6,TXN_4433211,UNKNOWN,3,3.0,9.0,ERROR,Takeaway,2023-10-06
7,TXN_6699534,Sandwich,4,4.0,16.0,Cash,UNKNOWN,2023-10-28
9,TXN_2064365,Sandwich,5,4.0,20.0,,In-store,2023-12-31
10,TXN_2548360,Salad,5,5.0,25.0,Cash,Takeaway,2023-11-07


In [882]:
df_cafe_dirty_item_invalid_values = df_cafe_dirty[df_cafe_dirty['Item'].isin(invalid_values)]
df_cafe_dirty_item_invalid_values

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
6,TXN_4433211,UNKNOWN,3,3.0,9.0,ERROR,Takeaway,2023-10-06
36,TXN_6855453,UNKNOWN,4,3.0,12.0,,In-store,2023-07-17
69,TXN_8471743,ERROR,5,3.0,15.0,Digital Wallet,In-store,2023-04-06
91,TXN_5455936,UNKNOWN,5,3.0,15.0,,In-store,2023-10-28
153,TXN_6541415,UNKNOWN,4.0,3.0,12.0,Cash,In-store,2023-11-25
...,...,...,...,...,...,...,...,...
9910,TXN_2338617,ERROR,2,3.0,6.0,Digital Wallet,UNKNOWN,2023-01-12
9918,TXN_2292088,ERROR,1,4.0,4.0,Digital Wallet,Takeaway,2023-03-04
9946,TXN_8807600,UNKNOWN,1,4.0,4.0,Cash,Takeaway,2023-09-24
9981,TXN_4583012,ERROR,5,4.0,20.0,Digital Wallet,,2023-02-27


In [885]:
#Eliminamos las filas con Unknown y Error (invalid_values previamente definido) ya que, el price per unit nos indica
# que podría ser más de un Item.
df_cafe_dirty = df_cafe_dirty[~df_cafe_dirty['Item'].isin(invalid_values)]
df_cafe_dirty['Item'].isin(invalid_values).sum()


np.int64(0)

In [886]:
df_cafe_dirty.head(15)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,4.0,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,2023-03-31
7,TXN_6699534,Sandwich,4,4.0,16.0,Cash,UNKNOWN,2023-10-28
9,TXN_2064365,Sandwich,5,4.0,20.0,,In-store,2023-12-31
10,TXN_2548360,Salad,5,5.0,25.0,Cash,Takeaway,2023-11-07
11,TXN_3051279,Sandwich,2,4.0,8.0,Credit Card,Takeaway,NaT


In [890]:
df_cafe_dirty_Payment_method_invalid = df_cafe_dirty[df_cafe_dirty['Payment Method'].isin(invalid_values)]
df_cafe_dirty_Payment_method_invalid

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
25,TXN_7958992,Smoothie,3,4.0,12.0,UNKNOWN,UNKNOWN,2023-12-13
39,TXN_6688524,Coffee,4,2.0,8.0,ERROR,,2023-06-29
51,TXN_6342161,Salad,5,5.0,25.0,ERROR,Takeaway,2023-01-08
95,TXN_8268061,Salad,3,5.0,15.0,ERROR,Takeaway,2023-08-20
...,...,...,...,...,...,...,...,...
9955,TXN_9187008,Tea,4,1.5,6.0,ERROR,,2023-09-16
9972,TXN_3124078,Cake,4,3.0,12.0,UNKNOWN,In-store,2023-08-06
9980,TXN_6796890,Tea,4,1.5,6.0,UNKNOWN,,2023-08-24
9988,TXN_9594133,Cake,5,3.0,15.0,ERROR,,NaT


In [891]:
df_cafe_dirty_Payment_method_nan = df_cafe_dirty[df_cafe_dirty['Payment Method'].isna()]
df_cafe_dirty_Payment_method_nan

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
9,TXN_2064365,Sandwich,5,4.0,20.0,,In-store,2023-12-31
13,TXN_9437049,Cookie,5,1.0,5.0,,Takeaway,2023-06-01
14,TXN_8915701,Tea,2,1.5,3.0,,In-store,2023-03-21
16,TXN_3765707,Sandwich,1,4.0,4.0,,,2023-06-10
23,TXN_2616390,Sandwich,2,4.0,8.0,,,2023-09-18
...,...,...,...,...,...,...,...,...
9976,TXN_3528020,Cookie,1,1.0,1.0,,Takeaway,2023-08-26
9978,TXN_4302199,Tea,3,1.5,4.5,,,2023-02-16
9982,TXN_8567525,Cookie,2,1.0,2.0,,Takeaway,2023-12-30
9985,TXN_3297457,Cake,2,3.0,6.0,,UNKNOWN,2023-01-03


In [892]:
#Como son muchas filas con valores nulos o invalidos, vamos a transformarlos por un valor que sea "Other"

In [895]:
def clean_payment_method (value):
    if pd.isna(value) or value in invalid_values:
        return "Other"
    return value

In [897]:
df_cafe_dirty['Payment Method'] = df_cafe_dirty['Payment Method'].apply(clean_payment_method) 

In [898]:
df_cafe_dirty.head(10)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,4.0,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,Other,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,2023-03-31
7,TXN_6699534,Sandwich,4,4.0,16.0,Cash,UNKNOWN,2023-10-28
9,TXN_2064365,Sandwich,5,4.0,20.0,Other,In-store,2023-12-31
10,TXN_2548360,Salad,5,5.0,25.0,Cash,Takeaway,2023-11-07
11,TXN_3051279,Sandwich,2,4.0,8.0,Credit Card,Takeaway,NaT


In [902]:
df_cafe_dirty.Location.unique()

array(['Takeaway', 'In-store', 'UNKNOWN', <NA>, 'ERROR'], dtype=object)

In [904]:
df_cafe_dirty_location_invalid_values = df_cafe_dirty[df_cafe_dirty['Location'].isin(invalid_values)]
df_cafe_dirty_location_invalid_values

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
3,TXN_7034554,Salad,2,5.0,10.0,Other,UNKNOWN,2023-04-27
7,TXN_6699534,Sandwich,4,4.0,16.0,Cash,UNKNOWN,2023-10-28
18,TXN_8876618,Cake,5,3.0,15.0,Cash,ERROR,2023-03-25
25,TXN_7958992,Smoothie,3,4.0,12.0,Other,UNKNOWN,2023-12-13
31,TXN_8927252,Cookie,2,1.0,2.0,Credit Card,ERROR,2023-11-06
...,...,...,...,...,...,...,...,...
9913,TXN_8408353,Tea,1,1.5,1.5,Credit Card,UNKNOWN,2023-12-13
9931,TXN_8344810,Smoothie,2,4.0,8.0,Other,UNKNOWN,NaT
9939,TXN_9026468,Sandwich,2,4.0,8.0,Credit Card,ERROR,2023-08-23
9985,TXN_3297457,Cake,2,3.0,6.0,Other,UNKNOWN,2023-01-03


In [905]:
df_cafe_dirty_location_nan = df_cafe_dirty[df_cafe_dirty['Location'].isna()]
df_cafe_dirty_location_nan

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,2023-03-31
16,TXN_3765707,Sandwich,1,4.0,4.0,Other,,2023-06-10
23,TXN_2616390,Sandwich,2,4.0,8.0,Other,,2023-09-18
28,TXN_8467949,Smoothie,5,4.0,20.0,Credit Card,,2023-03-11
33,TXN_7710508,Cookie,5,1.0,5.0,Cash,,NaT
...,...,...,...,...,...,...,...,...
9989,TXN_1741685,Juice,5,3.0,15.0,Cash,,2023-08-18
9990,TXN_1538510,Coffee,5,2.0,10.0,Digital Wallet,,2023-05-22
9993,TXN_4766549,Smoothie,2,4.0,8.0,Cash,,2023-10-20
9997,TXN_5255387,Coffee,4,2.0,8.0,Digital Wallet,,2023-03-02


In [912]:
#Demasiados valores nulos o invalid_values.
# Toma de decisión: ¿Cambiamos los valores nulos o invalid_values o eliminamos la columna al faltar 1/3 de los valores?
# Voy a escoger la opción de cambiar los valores nulos e invalidos y unificarlos a Location Not Saved

In [None]:
df_cafe_dirty['Transaction Date'] = df_cafe_dirty['Transaction Date'].dt.strftime('%d/%m/%Y')
#invertir el orden de la fecha para que aparezca primero el día después el mes y por último el año

In [915]:
df_cafe_dirty.head(6)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,08/09/2023
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,16/05/2023
2,TXN_4271903,Cookie,4,1.0,4.0,Credit Card,In-store,19/07/2023
3,TXN_7034554,Salad,2,5.0,10.0,Other,UNKNOWN,27/04/2023
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,11/06/2023
5,TXN_2602893,Smoothie,5,4.0,20.0,Credit Card,,31/03/2023


In [919]:
#Defino la función para modificar valores en Location:
def clean_location (value):
    if pd.isna(value) or value in invalid_values:
        return "Not Saved"
    return value

In [920]:
df_cafe_dirty['Location'] = df_cafe_dirty['Location'].apply(clean_location)

In [922]:
df_cafe_dirty.sample(10)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
9504,TXN_8532735,Coffee,2.0,2.0,4.0,Cash,Takeaway,28/07/2023
8565,TXN_9061106,Cake,4.0,3.0,12.0,Digital Wallet,Takeaway,04/11/2023
3120,TXN_2052609,Cake,4.0,3.0,12.0,Credit Card,Takeaway,23/06/2023
8735,TXN_6744096,Coffee,1.0,2.0,2.0,Digital Wallet,In-store,15/11/2023
4698,TXN_2273573,Cookie,5.0,1.0,5.0,Cash,In-store,12/11/2023
6225,TXN_6859249,Cookie,2.0,1.0,2.0,Other,Not Saved,
488,TXN_4769307,Tea,5.0,1.5,7.5,Digital Wallet,Not Saved,24/12/2023
917,TXN_2826603,Salad,2.0,5.0,10.0,Digital Wallet,Takeaway,22/06/2023
2614,TXN_4269933,Smoothie,2.0,4.0,8.0,Other,Takeaway,28/03/2023
3081,TXN_3421456,Cake,1.0,3.0,3.0,Other,Not Saved,05/09/2023


In [925]:
df_cafe_dirty_transaction_date_nat = df_cafe_dirty[df_cafe_dirty['Transaction Date'].isna()]
df_cafe_dirty_transaction_date_nat 

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
11,TXN_3051279,Sandwich,2,4.0,8.0,Credit Card,Takeaway,
29,TXN_7640952,Cake,4,3.0,12.0,Digital Wallet,Takeaway,
33,TXN_7710508,Cookie,5,1.0,5.0,Cash,Not Saved,
77,TXN_2091733,Salad,1,5.0,5.0,Other,In-store,
103,TXN_7028009,Cake,4,3.0,12.0,Other,Takeaway,
...,...,...,...,...,...,...,...,...
9933,TXN_9460419,Cake,1,3.0,3.0,Other,Takeaway,
9937,TXN_8253472,Cake,1,3.0,3.0,Other,Not Saved,
9949,TXN_3130865,Juice,3,3.0,9.0,Other,In-store,
9983,TXN_9226047,Smoothie,3,4.0,12.0,Cash,Not Saved,


In [926]:
def clean_time (value):
    if pd.isna(value) or value in invalid_values:
        return "Not Time"
    return value

In [927]:
df_cafe_dirty['Transaction Date'] = df_cafe_dirty['Transaction Date'].apply(clean_time)

In [928]:
df_cafe_dirty.sample(30)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
2918,TXN_9365558,Salad,4.0,5.0,20.0,Credit Card,Takeaway,12/02/2023
8003,TXN_8992377,Cake,4.0,3.0,12.0,Credit Card,Takeaway,12/07/2023
2948,TXN_5234368,Coffee,3.0,2.0,6.0,Digital Wallet,Not Saved,10/06/2023
7833,TXN_8107005,Juice,3.0,3.0,9.0,Other,Not Saved,15/09/2023
2096,TXN_7415686,Sandwich,3.0,4.0,12.0,Credit Card,In-store,13/04/2023
2732,TXN_7092761,Salad,4.0,5.0,20.0,Other,Not Saved,11/05/2023
3698,TXN_6083650,Coffee,2.0,2.0,4.0,Digital Wallet,Takeaway,01/07/2023
4156,TXN_9269683,Sandwich,5.0,4.0,20.0,Digital Wallet,Not Saved,26/09/2023
5022,TXN_1495371,Sandwich,2.0,4.0,8.0,Credit Card,Takeaway,21/07/2023
5473,TXN_5826333,Cookie,1.0,1.0,1.0,Cash,Not Saved,14/05/2023


In [929]:
df_cafe_dirty.to_csv('dataset cafe limip', index = False) 