### Extracting product id from the weblinks 

This step extracts each product's id from its weblink; the id then serves as the reference key when scraping that product's reviews.

In [25]:
#Import necessary libraries
import json 
import re
import os
import pandas as pd

In [3]:
# Load the initial scraped product data into a DataFrame.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default (which differs on Windows).
with open('../data/raw_data/initial.json', 'r', encoding='utf-8') as file:
    raw_data = json.load(file)
df = pd.DataFrame.from_dict(raw_data)

In [4]:
# Split every weblink at the first 'grid:' marker; part 2 is the text that
# follows the marker (an empty string when the marker is absent).
weblink_parts = df['weblink'].str.partition(sep='grid:')
df['product_id'] = weblink_parts[2]

In [5]:
# List the column names to confirm that 'product_id' is now present.
df.columns

Index(['index', 'brand', 'name', 'about the product', 'weblink',
       'sub_category', 'main_category', 'num_likes', 'img_link', 'price',
       'size', 'ingredients', 'rating', 'num_reviews', 'highlights',
       'product_name', 'sensitive', 'combination', 'oily', 'normal', 'dry',
       'clean', 'cruelty-free', 'vegan', 'skin_concerns', 'excl_ingr',
       'best for', 'acids', 'award', 'size_oz', 'size_ml', 'size_g',
       'pricepervol', 'highlighted_ingr', 'clinical_results', 'formulation',
       'formulation_type', 'richness', 'product_type', 'product_id'],
      dtype='object')

In [6]:
# Remove the leftover 'index' column that came in with the JSON data.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=['index'], errors='ignore')

In [19]:
# Keep only the columns needed to look up a product's link and id; the copy
# makes the lookup table independent of later changes to `df`.
link_columns = ['product_name', 'weblink', 'product_id', 'num_reviews']
df_links_id = df.loc[:, link_columns].copy()

In [14]:
# Load the processed, combined product data.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default (which differs on Windows).
with open('../data/processed_data/combined_data.json', 'r', encoding='utf-8') as file:
    raw_data = json.load(file)
selected = pd.DataFrame.from_dict(raw_data)

In [10]:
# Attach weblink / product_id / num_reviews to every selected product.
# A left join keeps all rows of `selected`, matched on 'product_name'.
selected_df = selected.merge(df_links_id, how='left', on='product_name')

In [15]:
# Write the merged selection to disk once; an existing file is never overwritten.
datapath_data = os.path.join('../data/raw_data', 'pre_selection.json')
already_saved = os.path.exists(datapath_data)
if not already_saved:
    selected_df.to_json(datapath_data)

In [23]:
# Write the link/id lookup table to disk once; an existing file is never overwritten.
datapath_data = os.path.join('../data/raw_data', 'data_links_id.json')
already_saved = os.path.exists(datapath_data)
if not already_saved:
    df_links_id.to_json(datapath_data)

______________________________________________________________________

# Creating the full list of scraped product_ids

In [3]:
def _load_raw_frame(json_path):
    """Read one scraped JSON file and return its contents as a DataFrame.

    Parameters
    ----------
    json_path : str
        Path to a JSON file whose layout ``pd.DataFrame.from_dict`` accepts.

    Returns
    -------
    pd.DataFrame
        The parsed file contents.
    """
    # JSON text is UTF-8 by specification; do not depend on the platform default.
    with open(json_path, 'r', encoding='utf-8') as file:
        return pd.DataFrame.from_dict(json.load(file))


# Storing each raw json file in a dataframe (one frame per product category)
cleansers_raw = _load_raw_frame('../data/raw_data/cleansers_full.json')
eyeproducts_raw = _load_raw_frame('../data/raw_data/eye_products.json')
moisturizers_raw = _load_raw_frame('../data/raw_data/moisturizers_full.json')
treatments_raw = _load_raw_frame('../data/raw_data/treatments_full.json')

In [7]:
# Stack the four category frames into one table.
# ignore_index=True gives every row a unique label; without it the frames keep
# their own overlapping labels, and later label-based operations (drop by
# index, .loc assignment) would act on rows from several categories at once.
full_df = pd.concat(
    [cleansers_raw, eyeproducts_raw, moisturizers_raw, treatments_raw],
    ignore_index=True,
)

In [8]:
# The product id is whatever follows the first 'grid:' marker in the weblink
# (an empty string when the marker is absent).
id_parts = full_df['weblink'].str.partition(sep='grid:')
full_df['product_id'] = id_parts[2]

In [19]:
#Also need to fix number of reviews
# Discard products that have no review count.
# dropna selects rows by value, not by index label, so it stays correct even
# when row labels are duplicated (a label-based drop would also remove valid
# rows sharing a label with a null row). Rebinding keeps the cell re-runnable.
full_df = full_df.dropna(subset=['num_reviews'])

In [20]:
def _parse_review_counts(counts):
    """Convert scraped review counts such as '210' or '1.9K' to integers.

    Parameters
    ----------
    counts : pd.Series
        Review counts as text; a trailing 'K' means "thousands". Values that
        are already numeric are accepted too, so the cell can be re-run.

    Returns
    -------
    pd.Series
        The counts as int64, aligned with the input index.
    """
    # Going through str makes the function work on both raw text and
    # already-converted numbers.
    text = counts.astype(str).str.strip()
    in_thousands = text.str.endswith('K')
    values = text.str.rstrip('K').astype('float64')
    values.loc[in_thousands] *= 1000
    # Round before casting so floating-point noise cannot truncate a count.
    return values.round().astype('int64')


#Cleaning num_reviews column
full_df['num_reviews'] = _parse_review_counts(full_df['num_reviews'])

In [21]:
# Renumber the rows 0..n-1 after the drops above. The previous row labels are
# kept as a column; the recorded info() output shows it as 'level_0' because
# an 'index' column already exists in the data.
full_df = full_df.reset_index()

In [22]:
# Summarise column dtypes and non-null counts after cleaning
# (num_reviews should now be int64 with no missing values).
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1825 entries, 0 to 1824
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   level_0            1825 non-null   int64 
 1   index              1825 non-null   int64 
 2   brand              1825 non-null   object
 3   name               1825 non-null   object
 4   about the product  1825 non-null   object
 5   weblink            1825 non-null   object
 6   sub_category       1825 non-null   object
 7   main_category      1825 non-null   object
 8   num_likes          1825 non-null   object
 9   img_link           1825 non-null   object
 10  price              1809 non-null   object
 11  size               1599 non-null   object
 12  ingredients        1825 non-null   object
 13  rating             1825 non-null   object
 14  num_reviews        1825 non-null   int64 
 15  highlights         1825 non-null   object
 16  product_id         1825 non-null   object


In [24]:
# Write the cleaned full product table to disk once; an existing file is
# never overwritten.
datapath3 = os.path.join('../data/raw_data', 'preprocessed_full.json')
already_saved = os.path.exists(datapath3)
if not already_saved:
    full_df.to_json(datapath3)

In [30]:
# Preview the cleaned product table.
# NOTE(review): in the recorded output several different products share one
# product_id (rows 0-4 all show 'p409800') — verify the scraped weblinks.
full_df

Unnamed: 0,level_0,index,brand,name,about the product,weblink,sub_category,main_category,num_likes,img_link,price,size,ingredients,rating,num_reviews,highlights,product_id
0,0,0,Glow Recipe,Watermelon Glow PHA +BHA Pore-Tight Toner,"[What it is: , A bestselling gentle, PHA- and ...",https://www.sephora.com/product/cleansing-exfo...,Toners,cleansers,125.1K,https://www.sephora.com/productimages/sku/s234...,$34.00,5.07 oz/ 150 mL,"[ -Watermelon Extract: Hydrates, delivers esse...",4.5 stars,1900,"[Good for: Pores , Good for: Dullness/Uneven T...",p409800
1,1,1,Drunk Elephant,Slaai™ Makeup-Melting Butter Cleanser,"[What it is: , An innovative cleansing balm t...",https://www.sephora.com/product/cleansing-exfo...,Face Wash & Cleansers,cleansers,66.6K,https://www.sephora.com/productimages/sku/s217...,$34.00,,[ -Nourishing Fruit Salad Blend: A mix of non-...,4 stars,1300,[allure 2019 Best of Beauty Award Winner: Clea...,p409800
2,2,2,Tatcha,Pure One Step Camellia Oil Cleanser,"[Which skin type is it good for?, ✔ Normal, ✔ ...",https://www.sephora.com/product/cleansing-exfo...,Face Wash & Cleansers,cleansers,107.6K,https://www.sephora.com/productimages/sku/s167...,$48.00,5.1 oz/ 150 mL,[ -Japanese Camellia Oil (Tsubaki): Seals in m...,4.5 stars,1700,"[Clean at Sephora, Hydrating, Best for Dry, Co...",p409800
3,3,3,goop,GOOPGLOW Microderm Instant Glow Exfoliator,"[What it is: , A clinically tested, dual-acti...",https://www.sephora.com/product/cleansing-exfo...,Exfoliators,cleansers,12.9K,https://www.sephora.com/productimages/sku/s231...,$125.00,1.7 oz/ 50 mL,"[ -Micro-exfoliating Minerals (Quartz, Garnet,...",4.5 stars,1200,"[Clean at Sephora, Best for Dry, Combo, Normal...",p409800
4,4,4,Lancôme,Bi-Facil Double-Action Eye Makeup Remover,"[What it is: , An award-winning makeup remove...",https://www.sephora.com/product/cleansing-exfo...,Makeup Removers,cleansers,58.7K,https://www.sephora.com/productimages/sku/s534...,$32.00,4.2 oz/ 125 mL,"[Aqua/Water, Cyclopentasiloxane, Isohexadecane...",4.5 stars,3700,[],p409800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1820,1847,573,Farmacy,Very Cherry Bright 15% Clean Vitamin C Serum w...,"[What it is:, A potent 15 percent vitamin C s...",https://www.sephora.com/product/advanced-night...,Face Serums,treatments,22K,https://www.sephora.com/productimages/sku/s225...,$62.00,1.0 oz/ 30 mL,[ -Four Forms of Vitamin C (Natural Acerola Ch...,3.5 stars,210,"[Best for Dry, Combo, Normal Skin, Good for: A...",p446908
1821,1848,574,Kate Somerville,Breakout Fighters,"[Which skin type is it good for?, ✔ Normal, ✔ ...",https://www.sephora.com/product/advanced-night...,Value & Gift Sets,treatments,14.7K,https://www.sephora.com/productimages/sku/s211...,$55.00,,"[EradiKate® Acne Treatment:, -Sulfur 10%, Isop...",4 stars,50,[],p446908
1822,1849,575,Estée Lauder,Advanced Night Repair Intense Reset Concentrate,"[What it is: , An overnight treatment that so...",https://www.sephora.com/product/advanced-night...,Face Serums,treatments,8.7K,https://www.sephora.com/productimages/sku/s226...,$82.00,0.7 oz/ 20.7 mL,"[Water\Aqua\Eau, Dimethicone, Glycerin, Methyl...",4.5 stars,32,"[Good for: Dullness/Uneven Texture, Good for: ...",p446908
1823,1850,576,Dr. Brandt Skincare,Dark Spots No More® Triple Acid Spot Minimizin...,"[What it is: , A targeted, daily concentrate ...",https://www.sephora.com/product/bye-bye-lines-...,Face Serums,treatments,8.5K,https://www.sephora.com/productimages/sku/s230...,$68.00,0.5 oz/ 15 mL,[ -10% Azelaic Acid Derivative: Water soluble ...,4 stars,78,"[Best for Dry, Combo, Normal Skin, Good for: D...",p411398


In [26]:
# Load the scraped reviews into a DataFrame.
# The review text contains non-ASCII characters, and JSON text is UTF-8 by
# specification, so the encoding is stated explicitly instead of relying on
# the platform default (which differs on Windows).
with open('../data/raw_data/full_reviews.json', 'r', encoding='utf-8') as file:
    raw_data = json.load(file)
reviews = pd.DataFrame.from_dict(raw_data)

In [29]:
# Preview the scraped reviews.
# NOTE(review): the recorded output shows r_productid in upper case
# ('P409800') while full_df.product_id is lower case ('p409800') — normalise
# the case before joining the two tables.
reviews

Unnamed: 0,reviewer,r_productid,r_star,r_eyecolor,r_haircolor,r_skintone,r_skintype,r_skinconcerns,r_review
0,Becca07017,P409800,5,brown,auburn,fair,combination,acne,"like the cute size, easy to take with me. And ..."
1,angelbeautytalk,P426340,5,green,blonde,light,oily,acne,It’s a very nice gentle exfoliate. It’s doesn’...
2,Carrie314,P417113,4,brown,blonde,light,dry,acne,"I love the smell of this, it’s a very cooling ..."
3,rachelwhywu,P417113,5,brown,black,medium,combination,acne,i have been using this since my mom tossed me ...
4,rachelwhywu,P417113,5,brown,black,medium,combination,acne,i have been using this since my mom tossed me ...
...,...,...,...,...,...,...,...,...,...
1035,Hana55,P446908,5,hazel,blonde,porcelain,dry,aging,Le sérum anti-âge Advanced Night Repair d’Esté...
1036,Tcd38,P411398,5,green,blonde,light,combination,acne,Like a drink of water for your skin! Fab!
1037,Tcd38,P411398,5,green,blonde,light,combination,acne,Like a drink of water for your skin! Fab!
1038,Wewe95,P444984,5,,,,,,I am a big lover of this product!!! I wish I c...
