In [1]:
# Load the `reviews` dataset; the log output of this cell reports it as a kedro CSVLocalDataSet.
# NOTE(review): `io` is not defined in the cells shown here — presumably injected by the kedro notebook session; confirm.
reviews = io.load('reviews')

2019-06-14 19:42:16,245 - kedro.io.data_catalog - INFO - Loading data from `reviews` (CSVLocalDataSet)...


In [2]:
# Preview the first rows of the raw data to inspect the original column names and values.
reviews.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [3]:
import pandas as pd

def preprocess_reviews(reviews: pd.DataFrame) -> pd.DataFrame:
    """Clean up the raw review table.

    The source columns are renamed to snake_case names, the ``id`` column
    (originally ``Unnamed: 0``) becomes the index, the three product
    category columns are lower-cased, and the misspelt division
    ``'initmates'`` is corrected to ``'intimates'``.

    Args:
        reviews: source data. The rename and re-index steps return new
            frames, so the frame passed in is not modified.

    Returns:
        Preprocessed data, indexed by ``id``.

    Raises:
        AssertionError: if the ``id`` values are not unique.
    """

    def _lowercase(column):
        # Column-wise helper: lower-case every string in one column.
        return column.str.lower()

    # Mapping from the raw CSV headers to the names used downstream.
    new_column_names = {
        'Unnamed: 0': 'id',
        'Clothing ID': 'product_id',
        'Age': 'author_age',
        'Title': 'review_title',
        'Review Text': 'review_text',
        'Rating': 'star_rating',
        'Recommended IND': 'recommend_flag',
        'Positive Feedback Count': 'upvotes',
        'Division Name': 'product_category_division',
        'Department Name': 'product_category_department',
        'Class Name': 'product_category_class',
    }
    renamed = reviews.rename(columns=new_column_names)

    # The identifier becomes the index, so it has to be unique first.
    assert renamed['id'].is_unique, 'Review identifier must be unique.'
    indexed = renamed.set_index('id')

    # Normalise the category hierarchy (division > department > class) to lower case.
    hierarchy_columns = [
        'product_category_division',
        'product_category_department',
        'product_category_class',
    ]
    indexed[hierarchy_columns] = indexed[hierarchy_columns].apply(_lowercase)

    # The raw data misspells the 'intimates' division; fix it after lower-casing.
    division = indexed['product_category_division']
    indexed['product_category_division'] = division.replace(
        to_replace='initmates', value='intimates'
    )

    return indexed

In [4]:
# Run the preprocessing on a freshly loaded copy of the `reviews` dataset and preview
# the result: `id` is now the index and the category columns are lower-cased.
preproc_reviews = preprocess_reviews(io.load('reviews'))
preproc_reviews.head()

2019-06-14 19:42:33,069 - kedro.io.data_catalog - INFO - Loading data from `reviews` (CSVLocalDataSet)...


Unnamed: 0_level_0,product_id,author_age,review_title,review_text,star_rating,recommend_flag,upvotes,product_category_division,product_category_department,product_category_class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,intimates,intimate,intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,general,dresses,dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,general,dresses,dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,general petite,bottoms,pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,general,tops,blouses
