# Data Preprocessing

author: Melih Kacaman
date: 05-10-2022

# Constants

In [3]:
import warnings
warnings.filterwarnings("ignore")

DATASET1_ODBC = 'PROTO.Melih.Dataset1_SimilarProducts'
DATASET1_TITLE = 'dataset1'

## Connect to the MSSQL Server

In [4]:
from common.sql_helper import custom_query
import pandas as pd

query = f'select * from {DATASET1_ODBC}'
result = custom_query(query=query)

result.head()

Unnamed: 0,WebSecenek,MarkaKodu,CinsiyetKodu,UrunGrubuKodu,Renk,cUrl,UrlNo
0,50202512-VR033,8,1,0TK,VR033,https://resim.aydinli.com.tr/08/1/0TK/S5020251...,2
1,50202513-DN0022,5,1,080,DN0022,https://resim.aydinli.com.tr/05/1/080/S5020251...,1
2,50202513-DN0022,5,1,080,DN0022,https://resim.aydinli.com.tr/05/1/080/S5020251...,2
3,50202518-DN0022,5,1,080,DN0022,https://resim.aydinli.com.tr/05/1/080/S5020251...,1
4,50202518-DN0022,5,1,080,DN0022,https://resim.aydinli.com.tr/05/1/080/S5020251...,2


# Download Images

In [5]:
class_names = result[['CinsiyetKodu', 'UrunGrubuKodu', 'Renk']].drop_duplicates()
class_names.head()

Unnamed: 0,CinsiyetKodu,UrunGrubuKodu,Renk
0,1,0TK,VR033
1,1,080,DN0022
5,2,011,VR013
9,2,011,VR033
11,3,082,VR033


In [6]:
import requests

def save_image(image_url, path):
    img_data = requests.get(image_url).content
    with open(path, 'wb') as handler:
        handler.write(img_data)

In [21]:
import requests
from PIL import Image
import io

def save_image_resize(image_url, path):
    img_data = requests.get(image_url).content

    with Image.open(io.BytesIO(img_data)) as im:
        im = im.resize((224, 224), Image.ANTIALIAS)
        im.save(path, optimize=True, quality=95)

In [19]:
import os

def make_dataset(bran_id, dataset_name, datasource, iteration = None):
    try:
        base = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '.'))
        path = os.path.join('datasets', dataset_name)
        path = os.path.join(base, path)
        if not os.path.isdir(path):
            os.mkdir(path)
        path = os.path.join(path, f'brand_{bran_id}')
        if not os.path.isdir(path):
            os.mkdir(path)
        # datasets/dataset/brand_id

        iterator = 0
        for index, row in class_names.iterrows():
            _class = row['CinsiyetKodu'] + '-'+ row['UrunGrubuKodu'] + '-' + row['Renk']
            folder = os.path.join(path, _class)
            res = datasource.query(f'MarkaKodu == "{bran_id}" and CinsiyetKodu == "{row.CinsiyetKodu}" and UrunGrubuKodu == "{row.UrunGrubuKodu}" and Renk == "{row.Renk}"')

            # if exists any data
            if res.shape[0] > 0:
                # check the folder exist
                if not os.path.isdir(folder):
                    os.mkdir(folder)

                # add the imgs to the folder
                for img_index, img_row in res.iterrows():
                    save_image_resize(img_row['cUrl'], os.path.join(folder, str(img_index) + '.jpg'))
                    if iteration is not None and iterator == iteration:
                        break
                    else:
                        iterator+=1
    except Exception as e:
        print('An exception occurred.', e)

In [14]:
%%time
res = make_dataset('08', dataset_name=DATASET1_TITLE, datasource=result, iteration=1000)

CPU times: total: 33.7 s
Wall time: 5min 17s


In [26]:
%%time
res = make_dataset('08', dataset_name='dataset_5', datasource=result, iteration=10)

CPU times: total: 16.9 s
Wall time: 41.8 s


In [25]:
result.shape

(69892, 7)

## For USPA

## For PC

## For Cacharel