In [1]:
# Load the raw `reviews` dataset. `io` is not defined in any visible cell; the log
# output shows the load going through kedro's data catalog, so it is presumably
# injected by the Kedro notebook session (TODO confirm).
reviews = io.load('reviews')

2019-07-04 17:17:26,702 - kedro.io.data_catalog - INFO - Loading data from `reviews` (CSVLocalDataSet)...


In [2]:
# Preview the first rows of the raw frame to inspect the source column names.
reviews.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [3]:
import pandas as pd

def preprocess_reviews(reviews: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw review table.

    The source columns are renamed to snake_case, the review identifier becomes
    the index, the three category columns are lower-cased, the misspelt
    division 'initmates' is corrected, and rows without a review text are
    dropped. The frame passed in is left untouched.

        Args:
            reviews: raw review data using the source column names.
        Returns:
            A new, cleaned data frame indexed by ``id``.
        Raises:
            AssertionError: if the review identifier contains duplicates.

    """
    source_to_target = {
        'Unnamed: 0': 'id',
        'Clothing ID': 'product_id',
        'Age': 'author_age',
        'Title': 'review_title',
        'Review Text': 'review_text',
        'Rating': 'star_rating',
        'Recommended IND': 'recommend_flag',
        'Positive Feedback Count': 'upvotes',
        'Division Name': 'product_category_division',
        'Department Name': 'product_category_department',
        'Class Name': 'product_category_class',
    }
    category_columns = (
        'product_category_division',
        'product_category_department',
        'product_category_class',
    )

    renamed = reviews.rename(columns=source_to_target)

    # The identifier is about to become the index, so it has to be unique.
    assert renamed['id'].is_unique, 'Review identifier must be unique.'
    cleaned = renamed.set_index('id')

    # Normalise the case of every level of the category hierarchy.
    for column in category_columns:
        cleaned[column] = cleaned[column].str.lower()

    # Only exact matches of the misspelt division name are corrected.
    division = cleaned['product_category_division']
    cleaned['product_category_division'] = division.replace('initmates', 'intimates')

    # Reviews without any text are discarded.
    return cleaned.dropna(subset=['review_text'])

In [4]:
# Reload the raw reviews, run them through the preprocessing step, and count
# the missing values that remain in each column.
reviews = io.load('reviews').pipe(preprocess_reviews)
reviews.isna().sum()

2019-07-04 17:19:07,115 - kedro.io.data_catalog - INFO - Loading data from `reviews` (CSVLocalDataSet)...


product_id                        0
author_age                        0
review_title                   2966
review_text                       0
star_rating                       0
recommend_flag                    0
upvotes                           0
product_category_division        13
product_category_department      13
product_category_class           13
dtype: int64

In [5]:
# Product IDs that have at least one review with a missing value in any of the
# `*category*` columns.
# `axis` is passed by keyword: the positional form `.any(1)` is deprecated in
# pandas 1.x and raises a TypeError in pandas 2.x.
products_without_category = (
    reviews[
        reviews
        .filter(like='category')
        .isnull()
        .any(axis=1)
    ]
    .product_id
    .unique()
)
products_without_category

array([ 72, 492, 152, 184, 772, 665, 136])

In [6]:
# Title and text of every review written for a product that lacks category
# information, selected in a single `.loc` step.
reviews.loc[
    reviews['product_id'].isin(products_without_category),
    ['product_id', 'review_title', 'review_text'],
]

Unnamed: 0_level_0,product_id,review_title,review_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9444,72,My favorite socks!!!,"I never write reviews, but these socks are so ..."
13767,492,So soft!,I just love this hoodie! it is so soft and com...
13768,492,Wardrobe staple,Love this hoodie. so soft and goes with everyt...
16216,152,Warm and cozy,"Just what i was looking for. soft, cozy and warm."
16221,152,Love!,I am loving these. they are quite long but are...
16223,152,"""long and warm""",These leg warmers are perfect for me. they are...
18626,184,Nubby footless tights,"These are amazing quality. i agree, size up to..."
18671,184,New workhorse,These tights are amazing! if i care for them w...
20088,772,Comfy sweatshirt!,This sweatshirt is really nice! it's oversize...
21532,665,So worth it!,Got these on sale...absolutely love eberjey! f...


In [7]:
# Hand-written mapping: product_id -> product type, covering the product IDs that
# were found to have no category information. The types appear to be read off the
# review titles/texts displayed above (e.g. "My favorite socks!!!" for product 72).
# NOTE(review): no review for product 136 is visible in the displayed rows, so the
# basis for 'socks' cannot be checked from this notebook output -- confirm.
imputed_product_types = {
    72: 'socks',
    492: 'hoodie',
    152: 'leg warmer',
    184: 'tights',
    772: 'sweatshirt',
    # No product type assigned; the displayed review for 665 does not name one.
    665: None,
    136: 'socks'
}

In [8]:
# Distinct category combinations among reviews whose title contains the product
# type imputed for product 772; reviews without a title never match (na=False).
(
    reviews
    .loc[reviews['review_title'].str.contains(imputed_product_types[772], na=False)]
    .filter(like='category')
    .drop_duplicates()
)

Unnamed: 0_level_0,product_category_division,product_category_department,product_category_class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
283,general,tops,knits
363,intimates,intimate,lounge
3689,general,jackets,jackets
7303,general,dresses,dresses
10997,intimates,intimate,sleep
14483,general petite,tops,knits
20088,,,


In [9]:
def print_full(x):
    """Print a pandas object without truncating rows, columns or cell contents.

    The display options are overridden only while printing. Whatever the caller
    had configured beforehand is restored afterwards, even if printing raises,
    instead of being reset to the pandas defaults.

    Args:
        x: object to print, typically a DataFrame or a Series.
    """
    with pd.option_context(
        'display.max_rows', None,        # None lifts the row limit
        'display.max_columns', None,
        'display.width', 2000,
        'display.float_format', '{:20,.2f}'.format,
        # None lifts the column-width limit; the former sentinel -1 is
        # deprecated in pandas 1.x and rejected by pandas 2.x.
        'display.max_colwidth', None,
    ):
        print(x)

In [10]:
# Every distinct (division, department, class) combination present in the
# reviews, ordered from the top of the hierarchy down.
category_hierarchy = [
    'product_category_division',
    'product_category_department',
    'product_category_class',
]
(
    reviews
    .filter(items=category_hierarchy)
    .drop_duplicates()
    .sort_values(by=category_hierarchy)
)

Unnamed: 0_level_0,product_category_division,product_category_department,product_category_class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12397,general,bottoms,casual bottoms
224,general,bottoms,jeans
15,general,bottoms,pants
638,general,bottoms,shorts
27,general,bottoms,skirts
1,general,dresses,dresses
121,general,jackets,jackets
18,general,jackets,outerwear
4,general,tops,blouses
42,general,tops,fine gauge
