# Feature Normalisation

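We will normalise the numerical features that we selected for clustering: each feature is scaled with either `StandardScaler` or `PowerTransformer` (`est_owners` is log10-transformed first), and the scaled features are then projected with `PCA`.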

**Setting up**

In [1]:
# Load the Kedro IPython extension (presumably what provides the `catalog`
# object used further down — TODO confirm).
%load_ext kedro.ipython
# Load autoreload; the `%autoreload 2` line below selects mode 2, which reloads
# modules before executing code so edits to project modules are picked up.
%load_ext autoreload
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
%autoreload 2

In [7]:
import pandas as pd
import polars as pl
import numpy as np

from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.decomposition import PCA

import logging

from usg.utils import *

# Logger for this notebook, set to the INFO level.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
# `sb` is not imported explicitly in this cell; presumably it is re-exported by
# `from usg.utils import *` and refers to seaborn — TODO confirm.
sb.set()

In [4]:
# Display the list of numerical feature names. `numerical` is not defined in
# this notebook; presumably it comes from `from usg.utils import *` — TODO confirm.
numerical

['year',
 'achievements',
 'price',
 'est_owners',
 'num_categories',
 'num_genres',
 'positive_ratings',
 'negative_ratings',
 'ratings_ratio',
 'median_playtime']

In [16]:
# Route each numerical feature to the scaler applied to it in the normalisation
# cell: 'pow' columns go through PowerTransformer, 'std' columns through
# StandardScaler.
mapper = {
    **dict.fromkeys(
        ['achievements', 'price', 'positive_ratings',
         'negative_ratings', 'ratings_ratio', 'median_playtime'],
        'pow',
    ),
    **dict.fromkeys(
        ['year', 'est_owners', 'num_categories', 'num_genres'],
        'std',
    ),
}

# Column lists per scaler, in mapper insertion order.
# NOTE: `pow` shadows the builtin of the same name for the rest of the notebook.
std = [column for column, scaler in mapper.items() if scaler == 'std']
pow = [column for column, scaler in mapper.items() if scaler == 'pow']

In [22]:
# Load the two engineered feature tables from the catalog, index both by appid,
# and left-join the second onto the first.
df1 = catalog.load('features_eng_1').set_index('appid')
df2 = catalog.load('features_eng_2').set_index('appid')
df = df1.join(df2, how='left')

# Replace est_owners with its base-10 logarithm in one vectorised call
# (instead of calling np.log10 once per row through Series.map).
df['est_owners'] = np.log10(df['est_owners'].astype(float))
# log10 yields -inf for 0 and NaN for negative or missing values; stop here with
# a readable message instead of a less specific error further down the pipeline.
assert np.isfinite(df['est_owners']).all(), \
    'est_owners must be strictly positive and non-null before taking log10'

# Standardise the columns routed to 'std'; output is a DataFrame on the appid index.
sscaler = StandardScaler().set_output(transform='pandas')
sdf = sscaler.fit_transform(df.loc[:, std]).set_index(df.index)

# Power-transform the columns routed to 'pow' (PowerTransformer defaults).
pscaler = PowerTransformer().set_output(transform='pandas')
pdf = pscaler.fit_transform(df.loc[:, pow]).set_index(df.index)

# Fit PCA on the scaled columns (std columns first, then pow columns), keeping
# len(numerical) components. The fitted PCA expects this column order on reuse.
pca = PCA(n_components=len(numerical)).set_output(transform='pandas')
ndf = pca.fit_transform(pd.concat([sdf, pdf], axis=1)).set_index(df.index)

# Training table: categorical columns plus the PCA components, with appid
# restored as a regular column. Note this rebinds `df` to the training table.
df = pd.concat([df.loc[:, categorical], ndf], axis=1).reset_index(names='appid')

# Persist the training table and the three fitted transformers to the catalog.
catalog.save('train', df)
catalog.save('model@sscaler', sscaler)
catalog.save('model@pscaler', pscaler)
catalog.save('model@pca', pca)
df

Unnamed: 0,appid,mac,linux,Multi-player,Steam Achievements,Steam Trading Cards,Indie,Action,Casual,Strategy,...,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9
0,10,1,1,1,0,0,0,1,0,0,...,7.911391,4.625905,-2.226985,0.110762,0.361707,3.788190,0.475330,1.387862,1.758100,-0.298926
1,20,1,1,1,0,0,0,1,0,0,...,6.792718,5.018095,-2.121070,-1.138672,-0.022708,4.420453,0.352367,1.456113,1.320859,-0.121106
2,30,1,1,1,0,0,0,1,0,0,...,6.057853,4.742084,-1.689384,-0.332834,-0.043636,2.771690,-0.000105,1.327511,1.657276,-0.189244
3,40,1,1,1,0,0,0,1,0,0,...,6.280748,4.686884,-1.893985,-1.181522,-0.117595,3.704068,0.519669,1.463307,1.496306,-0.197181
4,50,1,1,1,0,0,0,1,0,0,...,6.920246,5.063520,-2.229868,-0.165080,-0.024656,4.295356,0.287391,1.948846,1.420879,-0.186459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27070,1065230,0,0,0,1,0,1,0,1,0,...,-2.602721,-0.276893,0.362669,-1.034618,-0.533288,-0.461086,-0.341049,0.399237,0.443204,0.503655
27071,1065570,0,0,0,0,0,1,1,0,0,...,-1.789909,0.417985,0.365992,1.698407,-0.121252,-0.651820,0.487585,0.371350,0.296176,-0.112241
27072,1065650,0,0,1,1,0,1,1,1,0,...,-2.107001,-1.889595,0.162107,-2.099990,-0.605811,0.093895,0.713825,0.083974,0.263131,-0.706223
27073,1066700,1,0,0,0,0,1,0,1,0,...,-2.845094,0.330546,-0.022664,-0.977137,0.537219,-0.698513,0.505280,0.197078,0.449466,0.368410


In [24]:
# Summary statistics of the training table built above, rounded to 2 decimals.
df.describe().round(2)

Unnamed: 0,appid,mac,linux,Multi-player,Steam Achievements,Steam Trading Cards,Indie,Action,Casual,Strategy,...,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9
count,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,...,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0,27075.0
mean,596203.51,0.3,0.19,0.15,0.52,0.29,0.72,0.44,0.38,0.19,...,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0
std,250894.17,0.46,0.39,0.35,0.5,0.45,0.45,0.5,0.48,0.4,...,1.97,1.22,1.07,0.94,0.91,0.75,0.67,0.63,0.48,0.32
min,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-3.35,-5.26,-4.01,-2.75,-3.01,-3.01,-2.82,-3.07,-1.95,-1.4
25%,401230.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.46,-0.89,-0.73,-0.74,-0.62,-0.46,-0.42,-0.42,-0.25,-0.16
50%,599070.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,-0.38,-0.0,-0.09,-0.03,-0.02,-0.04,0.01,0.05,0.03,-0.01
75%,798760.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.1,0.87,0.62,0.64,0.56,0.43,0.37,0.43,0.28,0.11
max,1069460.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,8.52,5.36,8.44,3.62,5.74,6.04,4.01,3.08,3.67,1.89
