# Feature engineering

__________________________________________________________________________________

## 1.0 Loading file

In [1]:
#Import necessary libraries
import json 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
with open('../data/processed_data/combined_data.json', 'r') as file:
    data= json.load(file)
df=pd.DataFrame.from_dict(data)

In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
brand,Glow Recipe,Tatcha,goop,CLINIQUE,Tata Harper
product_name,Glow Recipe Watermelon Glow PHA +BHA Pore-Tigh...,Tatcha Pure One Step Camellia Oil Cleanser,goop GOOPGLOW Microderm Instant Glow Exfoliator,CLINIQUE Take The Day Off Makeup Remover For L...,Tata Harper Regenerating Exfoliating Cleanser
product_type,toners,face wash and cleansers,exfoliators and peels,face wash and cleansers,face wash and cleansers
num_likes,125100,107600,12900,76700,31000
rating,4.5,4.5,4.5,4.5,4.5
num_reviews,1900,1700,1200,3100,567
sensitive_type,0,1,0,0,0
combination_type,1,1,1,0,1
oily_type,1,1,1,0,0
normal_type,1,1,1,0,0


## 2.0 Feature engineering

**Encode categorical columns**

In [4]:
df= pd.get_dummies(df, columns=['formulation_type', 'richness', 'product_type', 'brand']
              #drop_first=True
              )

**Created alternative y variables**

In [5]:
#Create category based on quantiles
df['affordability']= pd.qcut(df.pricepervol, q=4, labels=['$', '$$', '$$$', '$$$$'], duplicates='raise')

In [6]:
df['affordability'].value_counts()

$$$     340
$       337
$$      334
$$$$    327
Name: affordability, dtype: int64

In [7]:
df.pricepervol.describe()

count    1338.000000
mean       54.721122
std        72.723239
min         1.087500
25%         9.990000
50%        32.941176
75%        72.000000
max       882.352941
Name: pricepervol, dtype: float64

In [8]:
df.groupby('affordability')['pricepervol'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
affordability,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
$,337.0,5.934009,2.168178,1.0875,4.470588,5.91716,7.6,9.99
$$,334.0,20.08492,6.84498,10.0,14.018519,19.5,25.882353,32.941176
$$$,340.0,49.698529,11.775125,33.0,39.411765,48.0,60.0,72.0
$$$$,327.0,145.600114,97.431564,73.0,89.0,115.0,150.0,882.352941


In [9]:
#Identify outliers
IQR = np.quantile(df.pricepervol, 0.75) - np.quantile(df.pricepervol, 0.25)
np.quantile(df.pricepervol, 0.25) -1.5*(IQR), np.quantile(df.pricepervol, 0.25) +1.5*(IQR) 

(-83.025, 103.005)

In [10]:
#Create categories based on bins
#df['affordability_bins']=pd.cut(df.pricepervol, bins=[0, 25, 50, 75, 100, 885], labels=['1st', '2nd', '3rd', '4th', '5th'], include_lowest=True)

In [11]:
#Create 3 price categories
df['price_category']= pd.qcut(df.pricepervol, 3, labels= ['cheap', 'average', 'expensive'])

## 3.0 Saving the cleaned dataset before splitting

In [12]:
datapath = '../data/processed_data'
datapath_df = os.path.join(datapath, 'pre_modelling_data.json')
if not os.path.exists(datapath_df):
    df.to_json(datapath_df)