In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import os
import cv2

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Join text and image data tables using product_id - this will be used to match up each image to its respective product_name and category

tabular_data_1 = pd.read_csv('cleaned_tabular_data.csv', index_col=0)
tabular_data_2 = pd.read_csv('Images.csv', index_col=0)
tabular_data_3 = tabular_data_1.join(tabular_data_2.set_index('product_id'), on='id', lsuffix='_left', rsuffix='_right')
list(tabular_data_3.columns)
tabular_data_3.rename({'id_left':'product_id', 'id_right':'image_id'}, axis=1, inplace=True)
tabular_data_3.drop({'id','category2','category3','category4','category5','price','location','url','page_id','bucket_link','create_time'}, axis=1, inplace=True)
# tabular_data_3.shape
tabular_data_3.drop_duplicates(subset='product_id', keep='first', inplace=True)
# tabular_data_3.head(15)
tabular_data_3.reset_index(inplace=True)
tabular_data_4 = tabular_data_3.drop({'index','product_id','product_name','description','date_posted','image_ref'}, axis=1)
# tabular_data_4.head()
# print(type(tabular_data_4))
print(tabular_data_4['category1'].value_counts())

Home & Garden                        714
Health & Beauty                      560
Office Furniture & Equipment         547
Music, Films, Books & Games          544
Computers & Software                 533
Appliances                           475
Other Goods                          468
DIY Tools & Materials                462
Video Games & Consoles               451
Sports, Leisure & Travel             420
Phones, Mobile Phones & Telecoms     402
Clothes, Footwear & Accessories      375
Baby & Kids Stuff                    360
Name: category1, dtype: int64


In [29]:
# Assign a number to each category and create a column for the image arrays of each product

tabular_data_4['code'] = pd.factorize(tabular_data_4['category1'])[0] + 1
tabular_data_4 = tabular_data_4[['category1','code','image_id']]
tabular_data_4['img_array'] = ''
tabular_data_4 = tabular_data_4.sample(frac=1).reset_index(drop=True)
tabular_data_4.dropna(subset=['category1','code','image_id','img_array'], inplace=True)
tabular_data_4.head()

Unnamed: 0,category1,code,image_id,img_array
0,Computers & Software,11,14189054-31aa-4d60-89fa-56412cf3cafb,
1,Other Goods,7,3048b94a-527b-4e9e-b883-63a2d4d487c8,
2,Health & Beauty,8,bdd6a467-8e48-4dda-a3a1-8c5b9ba6cbcd,
3,Video Games & Consoles,13,924d13f7-f582-4b34-a685-0a7c65e8402d,
4,"Music, Films, Books & Games",4,717a446d-c3c2-41e7-bd2c-bd71bd0c3895,


In [30]:
# Iterates over each product in table from cell 1 and uses its image_id to find the image in the local directory, before converting the image to an array and placing the array in the img_array column

path_to_images_cleaned_128x128 = "images_cleaned_128x128/"
for i in range(6228):
    x = tabular_data_4.iat[i,2]
    y = (f"128x128_cleaned_{x}")
    # print(y)
    img_array = mpimg.imread(f'{path_to_images_cleaned_128x128}{y}.jpg')
    # imgplot = plt.imshow(img_array)
    img_array_list = list(img_array)
    tabular_data_4.iat[i,3] = img_array
    i+=1
tabular_data_4.head(100)


Unnamed: 0,category1,code,image_id,img_array
0,Computers & Software,11,14189054-31aa-4d60-89fa-56412cf3cafb,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
1,Other Goods,7,3048b94a-527b-4e9e-b883-63a2d4d487c8,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
2,Health & Beauty,8,bdd6a467-8e48-4dda-a3a1-8c5b9ba6cbcd,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
3,Video Games & Consoles,13,924d13f7-f582-4b34-a685-0a7c65e8402d,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
4,"Music, Films, Books & Games",4,717a446d-c3c2-41e7-bd2c-bd71bd0c3895,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
...,...,...,...,...
95,Home & Garden,1,cc1e0ae4-3c70-4f92-9402-40ad63ef27a5,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
96,"Sports, Leisure & Travel",9,6f369bb4-1791-445f-b377-9c44adc5355e,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
97,Health & Beauty,8,ce92b965-86b1-424e-b2c4-a8ae7cea19ad,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
98,Health & Beauty,8,613e66f8-c306-4bd2-9973-b4f9025d260d,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."


In [31]:
# One hot encodings

category_list = list(tabular_data_4['category1'].unique())
print(category_list)
category_list_as_numbers = pd.Series(list(category_list))
one_hot_encodings_dataframe = pd.get_dummies(tabular_data_4['category1'])
one_hot_encodings_dataframe

['Computers & Software ', 'Other Goods ', 'Health & Beauty ', 'Video Games & Consoles ', 'Music, Films, Books & Games ', 'Office Furniture & Equipment ', 'Home & Garden ', 'Appliances ', 'Clothes, Footwear & Accessories ', 'DIY Tools & Materials ', 'Baby & Kids Stuff ', 'Sports, Leisure & Travel ', 'Phones, Mobile Phones & Telecoms ']


Unnamed: 0,Appliances,Baby & Kids Stuff,"Clothes, Footwear & Accessories",Computers & Software,DIY Tools & Materials,Health & Beauty,Home & Garden,"Music, Films, Books & Games",Office Furniture & Equipment,Other Goods,"Phones, Mobile Phones & Telecoms","Sports, Leisure & Travel",Video Games & Consoles
0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6306,0,0,0,0,0,1,0,0,0,0,0,0,0
6307,0,0,0,1,0,0,0,0,0,0,0,0,0
6308,0,0,0,0,1,0,0,0,0,0,0,0,0
6309,0,0,0,0,0,0,1,0,0,0,0,0,0


In [34]:
# Join two dataframes together

df_concat = pd.concat([tabular_data_4, one_hot_encodings_dataframe], axis=1)
df_concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6228 entries, 0 to 6310
Data columns (total 17 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   category1                          6228 non-null   object
 1   code                               6228 non-null   int64 
 2   image_id                           6228 non-null   object
 3   img_array                          6228 non-null   object
 4   Appliances                         6228 non-null   uint8 
 5   Baby & Kids Stuff                  6228 non-null   uint8 
 6   Clothes, Footwear & Accessories    6228 non-null   uint8 
 7   Computers & Software               6228 non-null   uint8 
 8   DIY Tools & Materials              6228 non-null   uint8 
 9   Health & Beauty                    6228 non-null   uint8 
 10  Home & Garden                      6228 non-null   uint8 
 11  Music, Films, Books & Games        6228 non-null   uint8 
 12  Office

In [35]:
z = df_concat.iat[1,3]
print(z)
print(type(z))

[[[  0   0   0]
  [  0   0   0]
  [  0   0   0]
  ...
  [ 30  43  52]
  [ 24  45  50]
  [ 18  46  49]]

 [[  0   0   0]
  [  0   0   0]
  [  0   0   0]
  ...
  [ 32  42  52]
  [ 28  42  51]
  [ 25  42  49]]

 [[  0   0   0]
  [  0   0   0]
  [  0   0   0]
  ...
  [ 34  40  54]
  [ 38  37  51]
  [ 39  37  51]]

 ...

 [[  0   0   0]
  [  0   0   0]
  [  0   0   0]
  ...
  [180 147 168]
  [162 137 159]
  [147 125 148]]

 [[  0   0   0]
  [  0   0   0]
  [  0   0   0]
  ...
  [176 147 167]
  [157 135 156]
  [147 128 148]]

 [[  0   0   0]
  [  0   0   0]
  [  0   0   0]
  ...
  [174 149 168]
  [171 150 169]
  [173 156 175]]]
<class 'numpy.ndarray'>


In [38]:
df_concat.head(5)
classification_model_df = df_concat.drop({'category1','code','image_id'}, axis=1)
classification_model_df.head(5)

Unnamed: 0,img_array,Appliances,Baby & Kids Stuff,"Clothes, Footwear & Accessories",Computers & Software,DIY Tools & Materials,Health & Beauty,Home & Garden,"Music, Films, Books & Games",Office Furniture & Equipment,Other Goods,"Phones, Mobile Phones & Telecoms","Sports, Leisure & Travel",Video Games & Consoles
0,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,1,0,0,0,0,0,0,0,0,0
1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,0,0,0,0,0,0,1,0,0,0
2,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,0,0,1,0,0,0,0,0,0,0
3,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",0,0,0,0,0,0,0,0,0,0,0,0,1
4,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,0,0,0,0,1,0,0,0,0,0


In [39]:
classification_model_df.to_pickle("./classification_model_df.pkl")

In [40]:
unpickled_df = pd.read_pickle("./classification_model_df.pkl")
unpickled_df.head(10)

Unnamed: 0,img_array,Appliances,Baby & Kids Stuff,"Clothes, Footwear & Accessories",Computers & Software,DIY Tools & Materials,Health & Beauty,Home & Garden,"Music, Films, Books & Games",Office Furniture & Equipment,Other Goods,"Phones, Mobile Phones & Telecoms","Sports, Leisure & Travel",Video Games & Consoles
0,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,1,0,0,0,0,0,0,0,0,0
1,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,0,0,0,0,0,0,1,0,0,0
2,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,0,0,1,0,0,0,0,0,0,0
3,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",0,0,0,0,0,0,0,0,0,0,0,0,1
4,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,0,0,0,0,1,0,0,0,0,0
5,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,0,0,0,0,0,1,0,0,0,0
6,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",0,0,0,0,0,0,1,0,0,0,0,0,0
7,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,0,0,0,1,0,0,0,0,0,0
8,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,1,0,0,0,0,0,0,0,0,0
9,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,0,0,0,0,0,1,0,0,0,0,0,0


In [8]:
# load pickle file into new dataframe
image_dataframe = pd.read_pickle('images_data.pkl')
image_dataframe['image_array'] = image_dataframe['image_array'].apply(lambda x: x.flatten())

# features
X = list(image_dataframe['image_array'])

# target
y = list(image_dataframe['category'])

# pipeline?

# parameters?

# split data in to train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# split test data into val/test
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5)

# instantiate and fit Logistic Regression
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

# predict
predictions = logistic_regression.predict(X_test)

# calculate accuracy score
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {accuracy}')

FileNotFoundError: [Errno 2] No such file or directory: 'images_data.pkl'