# Amazon Fashion Apparel Recommendation With NLP and Deep Learning

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Loading the dataset

In [9]:
data = pd.read_json("shirts_data.json")

In [10]:
print("No of Data Points : ",data.shape[0])
print("No of Features : ",data.shape[1])

No of Data Points :  183138
No of Features :  19


This particular dataset has around 200,000 data points and 19 features

## Overview of the Dataset

In [12]:
data.head()

Unnamed: 0,sku,asin,product_type_name,formatted_price,author,color,brand,publisher,availability,reviews,large_image_url,availability_type,small_image_url,editorial_review,title,model,medium_image_url,manufacturer,editorial_reivew
0,,B016I2TS4W,SHIRT,,,,FNC7C,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Minions Como Superheroes Ironman Women's O Nec...,Minions Como Superheroes Ironman Long Sleeve R...,,https://images-na.ssl-images-amazon.com/images...,,
1,,B01N49AI08,SHIRT,,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Izo Tunic,,https://images-na.ssl-images-amazon.com/images...,,
2,,B01JDPCOHO,SHIRT,,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Won Top,,https://images-na.ssl-images-amazon.com/images...,,
3,,B01N19U5H5,SHIRT,,,,Focal18,,,"[True, https://www.amazon.com/reviews/iframe?a...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,100% Brand New & Fashion<br> Quantity: 1 Piece...,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,,https://images-na.ssl-images-amazon.com/images...,,
4,,B004GSI2OS,SHIRT,$26.26,,Onyx Black/ Stone,FeatherLite,,Usually ships in 6-10 business days,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,now,https://images-na.ssl-images-amazon.com/images...,,Featherlite Ladies' Long Sleeve Stain Resistan...,,https://images-na.ssl-images-amazon.com/images...,,


We can observe that there are 19 features present in the dataset

In [14]:
data.columns

Index(['sku', 'asin', 'product_type_name', 'formatted_price', 'author',
       'color', 'brand', 'publisher', 'availability', 'reviews',
       'large_image_url', 'availability_type', 'small_image_url',
       'editorial_review', 'title', 'model', 'medium_image_url',
       'manufacturer', 'editorial_reivew'],
      dtype='object')

### Which features are useful for our problem statement?

ASIN - Amazon Standard Identification Number

In [15]:
data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]

In [16]:
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Minions Como Superheroes Ironman Long Sleeve R...,
1,B01N49AI08,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Izo Tunic,
2,B01JDPCOHO,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Won Top,
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,Featherlite Ladies' Long Sleeve Stain Resistan...,$26.26


### How many product types are there in total?

In [17]:
data['product_type_name'].describe()

count     183138
unique        72
top        SHIRT
freq      167794
Name: product_type_name, dtype: object

There are 72 unique categories of products in the dataset with SHIRT having the top frequency

### The unique product types in the dataset are:

In [18]:
data['product_type_name'].unique()

array(['SHIRT', 'SWEATER', 'APPAREL', 'OUTDOOR_RECREATION_PRODUCT',
       'BOOKS_1973_AND_LATER', 'PANTS', 'HAT', 'SPORTING_GOODS', 'DRESS',
       'UNDERWEAR', 'SKIRT', 'OUTERWEAR', 'BRA', 'ACCESSORY',
       'ART_SUPPLIES', 'SLEEPWEAR', 'ORCA_SHIRT', 'HANDBAG',
       'PET_SUPPLIES', 'SHOES', 'KITCHEN', 'ADULT_COSTUME',
       'HOME_BED_AND_BATH', 'MISC_OTHER', 'BLAZER',
       'HEALTH_PERSONAL_CARE', 'TOYS_AND_GAMES', 'SWIMWEAR',
       'CONSUMER_ELECTRONICS', 'SHORTS', 'HOME', 'AUTO_PART',
       'OFFICE_PRODUCTS', 'ETHNIC_WEAR', 'BEAUTY',
       'INSTRUMENT_PARTS_AND_ACCESSORIES', 'POWERSPORTS_PROTECTIVE_GEAR',
       'SHIRTS', 'ABIS_APPAREL', 'AUTO_ACCESSORY', 'NONAPPARELMISC',
       'TOOLS', 'BABY_PRODUCT', 'SOCKSHOSIERY',
       'POWERSPORTS_RIDING_SHIRT', 'EYEWEAR', 'SUIT', 'OUTDOOR_LIVING',
       'POWERSPORTS_RIDING_JACKET', 'HARDWARE', 'SAFETY_SUPPLY',
       'ABIS_DVD', 'VIDEO_DVD', 'GOLF_CLUB', 'MUSIC_POPULAR_VINYL',
       'HOME_FURNITURE_AND_DECOR', 'TABLET_COMPUTER',

### What are the top 10 frequent product_types?

In [24]:
from collections import Counter

In [21]:
n = 10
dfFrequent = data['product_type_name'].value_counts()[:n].index.tolist()

In [22]:
dfFrequent

['SHIRT',
 'APPAREL',
 'BOOKS_1973_AND_LATER',
 'DRESS',
 'SPORTING_GOODS',
 'SWEATER',
 'OUTERWEAR',
 'OUTDOOR_RECREATION_PRODUCT',
 'ACCESSORY',
 'UNDERWEAR']

Using Counter function

In [25]:
product_count = Counter(list(data['product_type_name']))
product_count.most_common(10)

[('SHIRT', 167794),
 ('APPAREL', 3549),
 ('BOOKS_1973_AND_LATER', 3336),
 ('DRESS', 1584),
 ('SPORTING_GOODS', 1281),
 ('SWEATER', 837),
 ('OUTERWEAR', 796),
 ('OUTDOOR_RECREATION_PRODUCT', 729),
 ('ACCESSORY', 636),
 ('UNDERWEAR', 425)]

### What are the unique colors in the dataset?

In [26]:
data['color'].describe()

count     64956
unique     7380
top       Black
freq      13207
Name: color, dtype: object

- The most occuring color is Black.
- There are 13,207 records with the color black

### What are the top 5 colors from the dataset?

In [29]:
color_count = Counter(list(data['color']))
type(color_count)

collections.Counter

In [33]:
color_counter = color_count.most_common(5)

In [34]:
color_counter[1:]

[('Black', 13207), ('White', 8616), ('Blue', 3570), ('Red', 2289)]

### What percentage of data is missing from the colors feature?

There are 118,182 missing records in the dataset with no color

In [36]:
color_counter[0]

(None, 118182)

### What is the highest price of a record in the dataset?

### PDF of price feature in dataset
The mean price from the chart is : 

### How many unique brands are there in the dataset?

### Which brand has the highest number of products?

### What are the 10 most common brands in the dataset?