# Preprocessing of features for clustering

In [1]:
# import packages
import os

import pandas as pd
import numpy as np
import psycopg2
from psycopg2 import Error
import matplotlib.pyplot as plt
import seaborn as sns

Extract data from PostgreSQL

In [2]:
# Connection settings for the PostgreSQL database.
# Each value is read from the standard libpq environment variable when it is
# set, so real credentials never have to be saved inside the notebook; the
# literals below are only fallbacks used when the variable is unset.
USER = os.environ.get("PGUSER", "***")
PSWD = os.environ.get("PGPASSWORD", "***")
HOST = os.environ.get("PGHOST", "pg.analytics.northwestern.edu")
PORT = os.environ.get("PGPORT", "5432")
DB_NAME = os.environ.get("PGDATABASE", "everything2023")

In [3]:
# Connect to the PostgreSQL database and verify the connection with a
# trivial query. Leaves `connection`, `cursor` and `record` defined for the
# cells that follow.
try:
    # Open a connection using the settings from the configuration cell
    connection = psycopg2.connect(user = USER,
                                  password = PSWD,
                                  host = HOST,
                                  port = PORT,
                                  database = DB_NAME)

    # Create a cursor to perform database operations
    cursor = connection.cursor()

    # Ask the server for its version string as a connectivity check
    cursor.execute("SELECT version();")

    # Fetch the single result row
    record = cursor.fetchone()

    # Report the successful connection
    print("You are connected to - ", record, "\n")

except Error as error:
    # Report the database error, then re-raise it: every later cell needs
    # `connection`, so continuing would only fail further down with an
    # unrelated NameError.
    print("Error while connecting to PostgreSQL", error)
    raise

You are connected to -  ('PostgreSQL 10.12 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-39), 64-bit',) 



In [4]:
# Load the complete group_9.skuinfo table into a DataFrame and display it
skuinfo = pd.read_sql(sql="select * from group_9.skuinfo", con=connection)
skuinfo

  skuinfo = pd.read_sql("select * from group_9.skuinfo", connection)


Unnamed: 0,SKU,DEPT,CLASSID,UPC,STYLE,COLOR,SIZE,PACKSIZE,VENDOR,BRAND
0,3,6505,113,000400000003000,00 F55KT2,WHISPERWHITE,P8EA,1,5119207,TURNBURY
1,4,8101,002,000400000004000,22 615CZ4,SPEARMI,S,1,3311144,C A SPOR
2,5,7307,003,000400000005000,7LBS 245-01,34 SILVER,KING,1,5510554,BEAU IDE
3,8,3404,00B,000400000008000,622 F05H84,MORNING MI,2T,1,2912827,HARTSTRI
4,15,2301,004,000400000015000,126 MDU461,255CAMEL,12,1,0023272,JONES/LA
...,...,...,...,...,...,...,...,...,...,...
1564173,9999973,3103,009,000400009973999,702 S3JAYV,STONE,4,1,6813115,POLO JEA
1564174,9999974,9801,726,000400009974999,G50171,NAVY MULTI,10,1,9212766,GABAR IN
1564175,9999991,2301,004,000400009991999,026 MDU201,618RED ROSE,8,1,0023272,JONES/LA
1564176,9999992,1202,402,000400009992999,14 F52UN1,PALE JADE,L,1,1446212,CABERNET


1. Style (51 MERU08, 9 126NAO) - not used
2. Color (BLACK, KHAKI) - Daniel
3. Size (L) - Xinran
4. Brand (TOMMY HI, MARK ECK) - not used
5. Department (CLINIQUE, LESLIE) - Liang
6. Whether discount (sell price - original price <0?) - Liang
7. Most bought in which season/month (from transaction data) - Mahi
8. City/state of the store (one-to-many relationship) - Mahi
9. Return rate - Xinran

## Style

In [5]:
# Frequency of every STYLE value, followed by the number of distinct values
# (missing values, if any, count as one distinct value)
style_counts = skuinfo["STYLE"].value_counts()
print(style_counts)
skuinfo["STYLE"].nunique(dropna=False)

STYLE
N     HARDWI    541
PHOEBE          460
NIKA            387
70    S55KR9    340
E     ROMAYN    333
               ... 
74259             1
6     190120      1
PS102             1
3     370422      1
54    JWRUD2      1
Name: count, Length: 316782, dtype: int64


316782

This column contains too many distinct values that cannot be meaningfully combined, so we won't use it.

## Color

In [6]:
# Ten most frequent COLOR values (value_counts sorts by descending frequency)
color_counts = skuinfo["COLOR"].value_counts()
color_counts.iloc[:10]

COLOR
BLACK       99813
WHITE       43240
NAVY        30754
RED         23050
MULTI       20566
BLUE        20261
BLACK LE    17164
PINK        16755
BROWN       15126
GREY        14134
Name: count, dtype: int64

1. Categorical data: keep the 199 most frequent values and assign the rest to "Other".
2. Numerical data: convert each color name to a numeric color representation, such as RGB or HSL values, then calculate the similarity between colors.
3. Categorical data: check whether the name contains a base color (black, white, ...) and assign the rest to "Other".

In [7]:
# Convert a color name into its hex RGB string using seaborn's XKCD color
# table (seaborn is imported as `sns` in the import cell at the top).
# The lookup key used here is lower-case while the COLOR values in skuinfo are
# upper-case, so the name is stripped and lower-cased before the lookup.
# `.get` returns None for a name that is not in the table instead of raising
# a KeyError.
color_name = 'blue'
rgb_value = sns.xkcd_rgb.get(color_name.strip().lower())
print(rgb_value)

#0343df


## Size

In [8]:
# Twenty most frequent SIZE values (value_counts sorts by descending frequency)
size_counts = skuinfo["SIZE"].value_counts()
size_counts.iloc[:20]

SIZE
L       125606
M       125167
S       107737
ALL     103149
XL       90229
6        31179
12       26881
14       26463
XXL      26234
10       25382
8        25015
4        21271
16       20782
5        19703
100M     17217
090M     17102
085M     16670
080M     16610
095M     16259
060M     16007
Name: count, dtype: int64

Convert all into S/M/L

## Brand

In [9]:
# Frequency of each BRAND label, most common first
brand_counts = skuinfo["BRAND"].value_counts()
brand_counts

BRAND
POLO FAS    130994
BROWN SH     52178
ENZO ANG     49121
LIZ CLAI     43723
ROUNDTRE     39466
             ...  
SANDY MA         1
MINELLI          1
SOLERA/T         1
BLUE             1
INTERCON         1
Name: count, Length: 1952, dtype: int64

This column contains too many distinct values that cannot be meaningfully combined, so we won't use it.

## Department

In [10]:
# Number of distinct (non-missing) department codes
skuinfo["DEPT"].nunique()

60

Only 60 different values

## Whether discount 

Boolean variable
1. Use the transact data: check the most recent purchase to see whether it was discounted (0/1).
2. Use the transact data: check the entire purchase history to see whether the product has ever been discounted (0/1).

## Most bought in which season

Categorical variable

Use the transact data to find the season/month in which the product is bought most often.

## Location of the store

Categorical variable

Combine with SKSTINFO and STRINFO to find the cities/states in which the product is stored. This is a one-to-many relationship.

## Return rate

Numerical variable
Use the transact data to calculate the return rate.