[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/omkar-334/gogoML/blob/main/preprocessing.ipynb)

In [1]:
!pip install opencv-python numpy scipy pandas pillow requests



In [8]:
import cv2
import numpy as np
from scipy.ndimage import median_filter
import os
from io import BytesIO
import pandas as pd
from PIL import Image
import requests
import shutil
import os
from pathlib import Path

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 100)

In [3]:
# Unsharp Masking function for sharpening
def unsharp(image, sigma=1, strength=1.2):
    image_mf = median_filter(image, sigma)    # Median filtering
    lap = cv2.Laplacian(image_mf, cv2.CV_64F)      # Calculate the Laplacian (edges) of the median-filtered image
    sharp = image - strength * lap     # Calculate the sharpened image by subtracting the Laplacian (edges) from the original image
    sharp = np.clip(sharp, 0, 255)     # Clip the values to stay within the valid range [0, 255]
    return sharp.astype(np.uint8)

def preprocess(image):
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)     # 1. Grayscale conversion
    denoised_image = cv2.bilateralFilter(gray_image, d=9, sigmaColor=75, sigmaSpace=75)    # 2. Denoising with bilateral filter (preserves edges while smoothing)
    sharpened_image = unsharp(denoised_image, sigma= 2, strength=0.7)      # 3. Improved Sharpening using Unsharp Mask
    width, height = image.shape[1], image.shape[0]  # 4. Resize (if needed) Keep original size if possible
    resized_image = cv2.resize(sharpened_image, (width, height), interpolation=cv2.INTER_LINEAR)
    thresholded_image = cv2.adaptiveThreshold(resized_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)   # 6. Thresholding using Adaptive Thresholding (better for text)
    return thresholded_image

def make_zip(source: str, dest: str) -> None:
    source, dest = Path(source), Path(dest)
    base_name = dest.parent / dest.stem
    fmt = dest.suffix.replace(".", "")
    root_dir = source.parent
    base_dir = source.name
    shutil.make_archive(str(base_name), fmt, root_dir, base_dir)

In [4]:
df = pd.read_csv('train1.csv')
df.head()

Unnamed: 0,image_link,image_name,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61Am7goxaFL.jpg,61Am7goxaFL.jpg,178778,depth,3.9 centimetre
1,https://m.media-amazon.com/images/I/51R8AlSuVKL.jpg,51R8AlSuVKL.jpg,347320,depth,100.0 inch
2,https://m.media-amazon.com/images/I/51KghPIV6JL.jpg,51KghPIV6JL.jpg,826444,depth,90.0 millimetre
3,https://m.media-amazon.com/images/I/51gqMA7+K0L.jpg,51gqMA7+K0L.jpg,494658,depth,53.0 centimetre
4,https://m.media-amazon.com/images/I/51YDWb+0+mL.jpg,51YDWb+0+mL.jpg,861555,depth,217.0 millimetre


In [9]:
processed_dir = '/content/train_images_processed'
processed_zip = processed_dir + '.zip'
os.makedirs(processed_dir, exist_ok=True)

In [7]:
for index, row in df.iterrows():
    filelink, filename = row['image_link'], row['image_name']
    print(index, end=' - ')
    image = np.array(Image.open(BytesIO(requests.get(filelink).content)))
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    if image is not None:
        image = preprocess(image)
        processed_image_path = os.path.join(processed_dir, filename)
        cv2.imwrite(processed_image_path, image)

make_zip(processed_dir, processed_zip)

0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 80 - 81 - 82 - 83 - 84 - 85 - 86 - 87 - 88 - 89 - 90 - 91 - 92 - 93 - 94 - 95 - 96 - 97 - 98 - 99 - 100 - 101 - 102 - 103 - 104 - 105 - 106 - 107 - 108 - 109 - 110 - 111 - 112 - 113 - 114 - 115 - 116 - 117 - 118 - 119 - 120 - 121 - 122 - 123 - 124 - 125 - 126 - 127 - 128 - 129 - 130 - 131 - 132 - 133 - 134 - 135 - 136 - 137 - 138 - 139 - 140 - 141 - 142 - 143 - 144 - 145 - 146 - 147 - 148 - 149 - 150 - 151 - 152 - 153 - 154 - 155 - 156 - 157 - 158 - 159 - 160 - 161 - 162 - 163 - 164 - 165 - 166 - 167 - 168 - 169 - 170 - 171 - 172 - 173 - 174 - 175 - 176 - 177 - 178 - 179 - 180 - 181 - 182 - 183 - 184 - 

In [12]:
df = pd.read_csv('test1.csv')
df.head()

Unnamed: 0,image_link,image_name,group_id,entity_name
0,https://m.media-amazon.com/images/I/51L1yXhbZnL.jpg,51L1yXhbZnL.jpg,277199,depth
1,https://m.media-amazon.com/images/I/41M9DvRHG6L.jpg,41M9DvRHG6L.jpg,648011,depth
2,https://m.media-amazon.com/images/I/613lVUs+pIS.jpg,613lVUs+pIS.jpg,302672,depth
3,https://m.media-amazon.com/images/I/51vsGUC2+oL.jpg,51vsGUC2+oL.jpg,302672,depth
4,https://m.media-amazon.com/images/I/619ST4JsOyL.jpg,619ST4JsOyL.jpg,881883,depth


In [13]:
processed_dir = '/content/test_images_processed'
processed_zip = processed_dir + '.zip'
os.makedirs(processed_dir, exist_ok=True)

In [14]:
for index, row in df.iterrows():
    filelink, filename = row['image_link'], row['image_name']
    print(index, end=' - ')
    image = np.array(Image.open(BytesIO(requests.get(filelink).content)))
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    if image is not None:
        image = preprocess(image)
        processed_image_path = os.path.join(processed_dir, filename)
        cv2.imwrite(processed_image_path, image)

0 - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 80 - 81 - 82 - 83 - 84 - 85 - 86 - 87 - 88 - 89 - 90 - 91 - 92 - 93 - 94 - 95 - 96 - 97 - 98 - 99 - 100 - 101 - 102 - 103 - 104 - 105 - 106 - 107 - 108 - 109 - 110 - 111 - 112 - 113 - 114 - 115 - 116 - 117 - 118 - 119 - 120 - 121 - 122 - 123 - 124 - 125 - 126 - 127 - 128 - 129 - 130 - 131 - 132 - 133 - 134 - 135 - 136 - 137 - 138 - 139 - 140 - 141 - 142 - 143 - 144 - 145 - 146 - 147 - 148 - 149 - 150 - 151 - 152 - 153 - 154 - 155 - 156 - 157 - 158 - 159 - 160 - 161 - 162 - 163 - 164 - 165 - 166 - 167 - 168 - 169 - 170 - 171 - 172 - 173 - 174 - 175 - 176 - 177 - 178 - 179 - 180 - 181 - 182 - 183 - 184 - 

In [15]:
make_zip(processed_dir, processed_zip)

In [19]:
!du -sh /content/test_images_processed.zip

247M	/content/test_images_processed.zip
