In [1]:
import pandas as pd

# Load every column as pandas' 'string' dtype so nothing is type-inferred
# while reading; the real type conversions happen in a later cell.
raw_columns = [
    'review_id',
    'customer_id',
    'product_id',
    'rating',
    'title',
    'comment',
    'is_verified_purchase',
    'helpful_votes',
    'created_at',
]
dtype_map = dict.fromkeys(raw_columns, 'string')

df_reviews = pd.read_csv('../data/raw/reviews.csv', dtype=dtype_map)
df_reviews.head()

Unnamed: 0.1,Unnamed: 0,review_id,customer_id,product_id,rating,title,comment,is_verified_purchase,helpful_votes,created_at
0,0,1,473261,933144,5,Reveal threat.,Hospital them face left space. Business answer...,True,,2024-06-03T18:27:44.440867
1,1,2,469672,969674,4,Voice event soon simple.,Tv owner happen your. Half particularly sort w...,True,61.0,2024-05-28T06:23:34.398247
2,2,3,416180,914709,5,Team organization again.,Father door true industry floor.\nOffer off pr...,True,35.0,2023-07-31T18:03:13.899745
3,3,4,466834,906830,4,Travel and support financial may.,For near response young newspaper order. Progr...,True,48.0,2024-03-05T10:01:02.678492
4,4,5,490557,956457,5,NOTE THESE.,Home reason value sell. Per minute late happen...,False,67.0,2024-05-31T18:08:44.030890


Descartamos la columna `Unnamed: 0`, que parece ser un índice residual guardado en el CSV y no aporta nada a nuestro análisis.

In [2]:
# Drop the leftover 'Unnamed: 0' column that came in with the CSV.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone is a no-op
# rather than a KeyError.
df_reviews = df_reviews.drop(columns=['Unnamed: 0'], errors='ignore')

In [3]:
# Build the cleaned frame keyed by review_id. set_index returns a new object,
# so the column assignments below do not modify df_reviews.
df_reviews_clean = df_reviews.set_index('review_id')

# Timestamps: trim surrounding whitespace, then parse. Values that cannot be
# parsed become NaT (errors='coerce').
created_at_text = df_reviews_clean['created_at'].str.strip()
df_reviews_clean['created_at'] = pd.to_datetime(created_at_text, errors='coerce')

# Verified-purchase flag: normalise to trimmed lowercase text and translate the
# accepted spellings; anything outside the lookup ends up missing in the
# nullable "boolean" column.
verified_lookup = {"true": True, "false": False, "1": True, "0": False}
verified_text = df_reviews_clean['is_verified_purchase'].str.strip().str.lower()
df_reviews_clean['is_verified_purchase'] = verified_text.map(verified_lookup).astype("boolean")

# Integer-like columns: trim, convert to numbers (non-numeric text becomes
# missing), then store as nullable Int64 so missing values are preserved.
for col in ['customer_id', 'product_id', 'rating', 'helpful_votes']:
    trimmed_values = df_reviews_clean[col].str.strip()
    numeric_values = pd.to_numeric(trimmed_values, errors='coerce')
    df_reviews_clean[col] = numeric_values.astype('Int64')

# Free-text columns: only surrounding whitespace is removed.
for col in ['title', 'comment']:
    trimmed_text = df_reviews_clean[col].str.strip()
    df_reviews_clean[col] = trimmed_text

df_reviews_clean.head()

Unnamed: 0_level_0,customer_id,product_id,rating,title,comment,is_verified_purchase,helpful_votes,created_at
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,473261,933144,5,Reveal threat.,Hospital them face left space. Business answer...,True,,2024-06-03 18:27:44.440867
2,469672,969674,4,Voice event soon simple.,Tv owner happen your. Half particularly sort w...,True,61.0,2024-05-28 06:23:34.398247
3,416180,914709,5,Team organization again.,Father door true industry floor.\nOffer off pr...,True,35.0,2023-07-31 18:03:13.899745
4,466834,906830,4,Travel and support financial may.,For near response young newspaper order. Progr...,True,48.0,2024-03-05 10:01:02.678492
5,490557,956457,5,NOTE THESE.,Home reason value sell. Per minute late happen...,False,67.0,2024-05-31 18:08:44.030890


Exportamos el dataset limpio para reutilizarlo.

In [4]:
# Persist the cleaned frame as a pickle for reuse elsewhere.
df_reviews_clean.to_pickle(path='../data/clean/reviews.pkl')