In [None]:
# Enable autoreload
%load_ext autoreload
%autoreload 2

# IPython completer settings (disable Jedi-based autocompletion)
%config Completer.use_jedi = False

# Measure Runtime
# !pip install ipython-autotime
# %load_ext autotime

# Mute warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
!pwd

In [None]:
train_data = pd.read_csv('../raw_data/shopee-product-matching/train.csv') 
test_data = pd.read_csv('../raw_data/shopee-product-matching/test.csv')
ss_data = pd.read_csv('../raw_data/shopee-product-matching/sample_submission.csv', index_col = 0)

In [None]:
# Work on copies so the frames loaded from disk stay untouched.
# Each working copy must come from its own source frame.
train_df = train_data.copy()
test_df = test_data.copy()
ss_df = ss_data.copy()

In [None]:
train_df.size

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
train_df['image'].shape

In [None]:
train_df.nunique()

In [None]:
train_df.isna().sum()

In [None]:
# pip install scikit-image

In [None]:
from skimage import io
import os

In [None]:
def trans_img(file_name):
    """Load one training image from disk.

    Args:
        file_name: Name of the image file inside the ``train_images`` folder.

    Returns:
        The value returned by ``io.imread`` for the resolved absolute path.
    """
    relative_path = os.path.join('../raw_data/shopee-product-matching/train_images', file_name)
    absolute_path = os.path.abspath(relative_path)
    return io.imread(absolute_path)

In [None]:
img_1 = trans_img('00039780dfc94d01db8676fe789ecd05.jpg')

In [None]:
plt.imshow(img_1)

# Image preprocessing

## Resizing

We want to resize the images to a smaller size (state-of-the-art networks are typically trained on (224, 224, 3) images / tensors).

In [None]:
from skimage import io
import os

def get_img_size(start=11, stop=76):
    """Collect the array shapes of a sample of training images.

    Args:
        start: Position of the first entry of ``train_df['image']`` to inspect.
        stop: Position one past the last entry to inspect.

    Returns:
        pd.DataFrame with one row per inspected image; each column holds one
        entry of the ``shape`` tuple of the loaded image. The frame is empty
        when the slice selects no image.
    """
    sample_names = train_df['image'].iloc[start:stop]
    shapes = [trans_img(name).shape for name in sample_names]
    # Build the frame once, after all shapes are collected, so that an empty
    # slice still yields an (empty) DataFrame instead of an unbound name.
    return pd.DataFrame(shapes)

In [None]:
img_size_df = get_img_size()
img_size_df.sort_values(by=0, ascending=False)

In [None]:
max(img_size_df[1]), min(img_size_df[1])

In [None]:
# Distinct values of the first shape dimension among the sampled images.
sizes_set = set(img_size_df[0])


def sizes_ratio():
    """Return the share (in %) of each first-dimension size in the sample.

    Returns:
        pd.Series indexed by size; each value is the percentage of rows of
        ``img_size_df`` having that size in column 0.
    """
    # normalize=True divides by the number of sampled rows, which is the
    # population these counts were actually taken from.
    return img_size_df[0].value_counts(normalize=True) * 100

In [None]:
len(sizes_set)

In [None]:
x = sizes_ratio()
x.sort_values(ascending=False)


In [None]:
def get_unsquarred_ratio():
    """Report the percentage of sampled images that are not square.

    An image counts as non-square when columns 0 and 1 of ``img_size_df``
    differ. The percentage is relative to the number of sampled rows.

    Returns:
        str: Human-readable message with the percentage rounded to one decimal.
    """
    n_sampled = len(img_size_df)
    if n_sampled == 0:
        # Nothing sampled: avoid dividing by zero.
        ratio = 0.0
    else:
        n_unsquarred = int((img_size_df[0] != img_size_df[1]).sum())
        ratio = n_unsquarred / n_sampled
    # Round after scaling to avoid artefacts such as 32.300000000000004.
    return f"Ratio of unsquarred images : {round(ratio * 100, 1)} %"

In [None]:
get_unsquarred_ratio()

Turning images into arrays of size (224, 224, 3) 

In [None]:
from PIL import Image

def drop_unsquarred():
    """List the training images whose two ``size`` entries are equal.

    Returns:
        list: File names from ``train_df['image']`` for which the two values
        of the opened image's ``size`` attribute match.
    """
    square_images = []
    for image in train_df['image']:
        image_path = f"../raw_data/shopee-product-matching/train_images/{image}"
        # The context manager releases the file handle immediately; without it
        # every opened image stays open until garbage collection.
        with Image.open(image_path) as opened:
            image_size = opened.size
        if image_size[0] == image_size[1]:
            square_images.append(image)
    return square_images

In [None]:
df_squarred = drop_unsquarred()
df_squarred

In [None]:
def resize_save_unsquarred(size=(100, 100)):
    """Resize every image listed in ``df_squarred`` and save it to ``resized/``.

    Args:
        size: Target size passed to ``Image.resize``. Defaults to (100, 100).

    Returns:
        list: The resized image objects, in the order of ``df_squarred``.
    """
    source_dir = "../raw_data/shopee-product-matching/train_images"
    target_dir = f"{source_dir}/resized"
    # Saving fails if the output folder is missing, so create it up front.
    os.makedirs(target_dir, exist_ok=True)

    img_squarred = []
    for image in df_squarred:
        # Close the source file as soon as the resized copy exists.
        with Image.open(f"{source_dir}/{image}") as original:
            image_open = original.resize(size)
        image_open.save(f"{target_dir}/{image}")
        img_squarred.append(image_open)
    return img_squarred

In [None]:
img_squarred = resize_save_unsquarred()

In [None]:
plt.imshow(img_squarred[1100])

In [None]:
# train_df['resized_image'] = pd.read_csv('../raw_data/shopee-product-matching/train_images/resized/train.csv') 

In [None]:
from tensorflow.keras import models
from tensorflow.keras import layers

# Minimal Network + Common tricks + First hyperparameters tests

# Small convolutional network declared as one ordered list of layers.
# NOTE(review): input_shape=(28, 28, 1) does not match the colour images
# prepared earlier in this notebook — confirm the intended input size.
model = models.Sequential([
    # First convolution & max-pooling
    layers.Conv2D(100, (10, 4), strides=(2, 2), input_shape=(28, 28, 1), padding='same'),
    layers.MaxPool2D(pool_size=(3, 3)),
    # Second convolution & max-pooling
    layers.Conv2D(32, (3, 3), strides=(2, 2), padding='same'),
    layers.MaxPool2D(pool_size=(2, 2)),
    # Dense head: flatten, one hidden layer, 10-way softmax output
    layers.Flatten(),
    layers.Dense(100, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

# ––– This is where your Data Scientist skills begin

In [None]:
from skimage.transform import resize
from skimage import io
import os

def resize_images():
    """Read the first ten images of ``train_df`` from the ``resized`` folder.

    NOTE(review): despite its name, this function does not resize or store
    anything; it only loads each file and returns ``train_df`` untouched.
    Any error raised by ``io.imread`` propagates to the caller.

    Returns:
        The module-level ``train_df``, unchanged.
    """
    resized_dir = '../raw_data/shopee-product-matching/train_images/resized/'
    for image in train_df['image'].iloc[0:10]:
        filename = os.path.abspath(os.path.join(resized_dir, image))
        io.imread(filename)
    return train_df

In [None]:
resize_images()

## Intensity normalization

Neural networks converge faster if the inputs are somewhat normalized. Therefore we want to transform the image pixels, whose values lie between 0 and 255 (for each color channel), into values between -1 and 1 using Keras preprocessing utilities (or, more simply, into values between 0 and 1 by dividing all the data by 255).

## Data augmentation

Creating additional data to explore:
- Mirror
- Crop
- rotations
- slight transformation of the colors
- change of the textures
- "photoshop effects": blur, halo, ...
- deformations
- ...

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Reshape

# Tiny fully-connected network declared as one ordered list of layers.
model = Sequential([
    # Flattens the (5, 5, 1) image to a vector of size 25
    Reshape((5*5*1,), input_shape=(5, 5, 1)),
    Dense(100, activation='relu'),
    Dense(10, activation='softmax'),
])

# Preprocessing on titles 

In [None]:
# pip install nltk

- Lowercase
- remove numbers
- remove punctuation
- remove stop words
- select important words with stemmer

In [None]:
#labels
print(f"label_group unique values: {train_df['label_group'].nunique()}")

In [None]:
# Occurrences of each label_group value, as a two-column frame (group, count).
groups_df = (
    train_df["label_group"]
    .value_counts()
    .rename_axis("group")
    .reset_index(name="count")
)
# Two positional arguments keep the space before the line break in the output.
print(f"Max no. of apparitions in 1 group: {groups_df['count'].max()}",
      f"\nMin no. of apparitions in 1 group: {groups_df['count'].min()}")

In [None]:
# Plot labels distribution
sns.set_style("whitegrid")
plt.figure(figsize = (12, 6))
plt.title('Group Count Distribution', fontsize = '15')
sns.kdeplot(groups_df['count'], fill = True, 
            color = '#f15335', 
            edgecolor = 'black', alpha = 0.9)
plt.xlabel('Label count')
plt.show();

In [None]:
# Plot titles length distribution
sns.set_style("whitegrid")
plt.figure(figsize = (12, 6))
plt.title('Distribution of title length', fontsize = '15')
sns.kdeplot(train_df['title'].apply(lambda x: len(x)), fill = True, 
            color = '#f15335', 
            edgecolor = 'black', alpha = 0.9)
plt.xlabel('Title length')
plt.show();

In [None]:
# !pip install -U textblob
# !python -m textblob.download_corpora

In [None]:
import string 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob

# Step-by-step walkthrough of the title cleaning pipeline on one example title.
unpreproc_title = train_df["title"][1]
print(f"Before: {unpreproc_title}")

lower_title = unpreproc_title.lower()
print(f"Lower case: {lower_title}")

# Raw string: "\]" is not a valid escape sequence in a normal string literal.
# NOTE(review): `punct` is not used below — string.punctuation is what gets stripped.
punct = r"!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~]:"
rem_punct = lower_title.translate(str.maketrans('', '', string.punctuation))
print(f"Remove punctuation: {rem_punct}")

rem_whitespaces = rem_punct.strip()
print(f"Remove whitespaces: {rem_whitespaces}")

tokenize = word_tokenize(rem_whitespaces)
print(f"Tokenized: {tokenize}")

# Load the stop-word list once instead of once per token, and use a set for
# fast membership tests.
# NOTE(review): called without a language, stopwords.words() presumably
# returns the stop words of every available language — confirm this is intended.
all_stopwords = set(stopwords.words())
rem_stop_words = [word for word in tokenize if word not in all_stopwords]
print(f"Remove stopwords: {rem_stop_words}")

lemmatizer = WordNetLemmatizer()
lemmatized_text = [lemmatizer.lemmatize(word) for word in rem_stop_words]
print(f"Lemmatization: {lemmatized_text}")

pos_text = TextBlob(' '.join(lemmatized_text))
print(f"Part_Of_Speech: {pos_text.tags}")


In [None]:
# Lazily-built resources shared by every call to preproc_title.
_STOPWORD_CACHE = None
_LEMMATIZER_CACHE = None


def preproc_title(title):
    """Clean a product title for text matching.

    Steps: lowercase, strip ``string.punctuation`` characters, trim
    surrounding whitespace, tokenize, drop stop words, lemmatize each token.

    Args:
        title: Raw title string.

    Returns:
        str: The remaining lemmatized tokens joined by single spaces.
    """
    global _STOPWORD_CACHE, _LEMMATIZER_CACHE
    # Loading the stop-word list for every token dominates the runtime when
    # this is applied to a whole column, so load it once and keep it as a set.
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(stopwords.words())
    if _LEMMATIZER_CACHE is None:
        _LEMMATIZER_CACHE = WordNetLemmatizer()

    cleaned = title.lower()
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    cleaned = cleaned.strip()
    tokens = [word for word in word_tokenize(cleaned) if word not in _STOPWORD_CACHE]
    return ' '.join(_LEMMATIZER_CACHE.lemmatize(word) for word in tokens)

def get_part_of_speech(prepped_title):
    """Return the part-of-speech tags of a preprocessed title as one string.

    Args:
        prepped_title: Title text to tag.

    Returns:
        str: The tag of every ``(word, tag)`` pair in ``TextBlob(...).tags``,
        joined by single spaces.
    """
    # Tag the blob built from the argument, not a module-level blob, so each
    # title gets its own tags.
    blob = TextBlob(prepped_title)
    return ' '.join(tag for _, tag in blob.tags)

In [None]:
# Clean every title; the function is passed directly, no lambda wrapper needed.
train_df["preproc_title"] = train_df["title"].apply(preproc_title)

In [None]:
# Tag every cleaned title; the function is passed directly, no lambda wrapper needed.
train_df["part_of_speech"] = train_df["preproc_title"].apply(get_part_of_speech)

In [None]:
# Read in prepped data
# train_df_prep = pd.read_csv("../raw_data/shopee-preprocessed-data/train_title_prepped.csv")
# train_df_prep["label_group"] = train_df_prep["label_group"].astype(str)