### Extracting product IDs from the weblinks

This step is necessary because the product IDs act as the reference for scraping the reviews.

In [18]:
#Import necessary libraries
import json 
import re
import os
import pandas as pd

In [19]:
# Read the initial scrape from disk and turn the parsed JSON into a DataFrame.
with open('../data/raw_data/initial.json') as json_file:
    raw_data = json.load(json_file)

df = pd.DataFrame.from_dict(raw_data)

In [20]:
# The product id is whatever follows the first 'grid:' in the weblink;
# partition() yields (before, separator, after), so column 2 is the part after it.
weblink_parts = df['weblink'].str.partition('grid:')
df['product_id'] = weblink_parts[2]

In [21]:
# Show the column labels of df to confirm that product_id has been added.
df.columns

Index(['brand', 'name', 'about the product', 'weblink', 'sub_category',
       'main_category', 'num_likes', 'img_link', 'price', 'size',
       'ingredients', 'rating', 'num_reviews', 'highlights', 'product_name',
       'sensitive', 'combination', 'oily', 'normal', 'dry', 'clean',
       'cruelty-free', 'vegan', 'skin_concerns', 'excl_ingr', 'best for',
       'acids', 'award', 'size_oz', 'size_ml', 'size_g', 'pricepervol',
       'highlighted_ingr', 'clinical_results', 'formulation',
       'formulation_type', 'richness', 'product_type', 'product_id'],
      dtype='object')

In [22]:
# Keep an independent copy of only the columns needed to link a product
# name to its weblink, product id and review count.
link_columns = ['product_name', 'weblink', 'product_id', 'num_reviews']
df_links_id = df.loc[:, link_columns].copy()

In [23]:
# Load the processed product table; note that raw_data is rebound here.
with open('../data/processed_data/combined_data.json') as json_file:
    raw_data = json.load(json_file)

selected = pd.DataFrame.from_dict(raw_data)

In [24]:
# Left join: every row of `selected` is kept and gains the link/id columns of the
# product with the same name. Columns present in both frames get _x/_y suffixes.
selected_df = selected.merge(df_links_id, how='left', on='product_name')

In [25]:
# Normalise the ids to upper case.
selected_df['product_id'] = selected_df['product_id'].str.upper()

In [30]:
# Show the column labels of the merged frame.
selected_df.columns

Index(['brand', 'product_name', 'product_type', 'num_likes', 'rating',
       'num_reviews_x', 'sensitive_type', 'combination_type', 'oily_type',
       'normal_type', 'dry_type', 'clean_sephora', 'cruelty_free', 'vegan',
       'best_for_skintype', 'award', 'pricepervol', 'highlighted_ingr',
       'clinical_results', 'formulation_type', 'richness', 'Acne/Blemishes',
       'Anti-Aging', 'Dark Circles', 'Dark spots', 'Dryness',
       'Dullness/Uneven Texture', 'Hair Dryness', 'Hydrating',
       'Loss of firmness', 'Pores ', 'Redness', 'num_excl_ingr',
       'AHA/Glycolic Acid', 'Hyaluronic Acid', 'Salicylic Acid', 'Vitamin C',
       'ingr_list', 'weblink', 'num_reviews_y'],
      dtype='object')

In [31]:
# Build the product-id directory: brand and product name, indexed by product_id.
# The index is set explicitly here so the directory (and the CSV written from it)
# actually carries the ids when the notebook is run top to bottom; selecting only
# ['brand', 'product_name'] from the merged frame would otherwise drop product_id.
directory = selected_df.set_index('product_id')[['brand', 'product_name']]
directory

Unnamed: 0_level_0,brand,product_name
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
P409800,Glow Recipe,Glow Recipe Watermelon Glow PHA +BHA Pore-Tigh...
P458219,Glow Recipe,Glow Recipe Watermelon Glow PHA +BHA Pore-Tigh...
P468351,Glow Recipe,Glow Recipe Watermelon Glow PHA +BHA Pore-Tigh...
P409800,Tatcha,Tatcha Pure One Step Camellia Oil Cleanser
P409800,goop,goop GOOPGLOW Microderm Instant Glow Exfoliator
...,...,...
P418301,KORA Organics,KORA Organics Noni Radiant Eye Oil
P418301,Peter Thomas Roth,Peter Thomas Roth Water Drench® Hyaluronic Clo...
P418301,Dr. Dennis Gross Skincare,Dr. Dennis Gross Skincare Hyaluronic Marine De...
P418301,Dr. Dennis Gross Skincare,Dr. Dennis Gross Skincare Stress SOS Eye Cream...


In [32]:
# Write the directory to CSV only if the file is not there yet; an existing
# file is left untouched, so it is not refreshed when `directory` changes.
datapath_data = os.path.join('../data/raw_data', 'productid_directory.csv')
directory_missing = not os.path.exists(datapath_data)
if directory_missing:
    directory.to_csv(datapath_data)

In [9]:
# Save the merged frame as JSON unless the target file already exists
# (an existing file is never overwritten).
datapath_data = os.path.join('../data/raw_data', 'pre_selection.json')
pre_selection_missing = not os.path.exists(datapath_data)
if pre_selection_missing:
    selected_df.to_json(datapath_data)

In [None]:
# Save the name/link/id lookup as JSON unless the target file already exists
# (an existing file is never overwritten).
datapath_data = os.path.join('../data/raw_data', 'data_links_id.json')
links_file_missing = not os.path.exists(datapath_data)
if links_file_missing:
    df_links_id.to_json(datapath_data)

______________________________________________________________________

# Creating the full list of scraped product_ids

In [None]:
#Storing each raw json file in a dataframe
def _load_json_frame(path):
    """Read the JSON file at `path` and return its contents as a DataFrame."""
    with open(path, 'r') as file:
        return pd.DataFrame.from_dict(json.load(file))


# One frame per scraped product category; all four files are read the same way.
cleansers_raw = _load_json_frame('../data/raw_data/cleansers_full.json')
eyeproducts_raw = _load_json_frame('../data/raw_data/eye_products.json')
moisturizers_raw = _load_json_frame('../data/raw_data/moisturizers_full.json')
treatments_raw = _load_json_frame('../data/raw_data/treatments_full.json')

In [None]:
# Stack the four category frames vertically. ignore_index is left at its
# default, so each frame's original index labels are carried over as they are.
category_frames = [cleansers_raw, eyeproducts_raw, moisturizers_raw, treatments_raw]
full_df = pd.concat(category_frames)

In [None]:
# The product id is whatever follows the first 'grid:' in the weblink;
# partition() yields (before, separator, after), so column 2 is the part after it.
full_weblink_parts = full_df['weblink'].str.partition('grid:')
full_df['product_id'] = full_weblink_parts[2]

In [None]:
#Also need to fix number of reviews
# Remove rows without a review count. Filtering on the column value (rather than
# dropping by index label) matters here: full_df is built with pd.concat without
# ignore_index, so index labels can repeat across the source frames, and a
# label-based drop would also delete rows whose num_reviews is present.
full_df.dropna(subset=['num_reviews'], inplace=True)

In [None]:
#Cleaning num_reviews column
# Counts written with a 'K' suffix (thousands) are expanded to plain numbers;
# the suffix mask is computed once and reused for both the read and the write.
has_k_suffix = full_df['num_reviews'].str.endswith('K')
# Round before the integer cast below: astype('int64') truncates, and a float
# product such as 32.3 * 1000 evaluates to 32299.999999999996.
full_df.loc[has_k_suffix, 'num_reviews'] = (
    full_df.loc[has_k_suffix, 'num_reviews'].str.strip('K').astype('float64') * 1000
).round()
full_df['num_reviews'] = full_df['num_reviews'].astype('int64')

In [None]:
# Replace the index with a fresh 0..n-1 range; since drop is not requested,
# the previous index labels are kept as a regular column.
full_df = full_df.reset_index()

In [None]:
# Print a summary of full_df (columns, non-null counts, dtypes) to check the cleaning above.
full_df.info()

In [None]:
# Save the cleaned frame as JSON unless the target file already exists
# (an existing file is never overwritten).
datapath3 = os.path.join('../data/raw_data', 'preprocessed_full.json')
preprocessed_missing = not os.path.exists(datapath3)
if preprocessed_missing:
    full_df.to_json(datapath3)

In [None]:
# Display the cleaned, re-indexed product frame for a visual check.
full_df

In [None]:
# Load the scraped reviews and turn the parsed JSON into a DataFrame.
with open('../data/raw_data/full_reviews.json') as json_file:
    raw_data = json.load(json_file)

reviews = pd.DataFrame.from_dict(raw_data)

In [None]:
# Display the loaded reviews frame for a visual check.
reviews