In [7]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from PIL import Image

In [2]:
# Load the cluster assignments written by the three- and four-cluster models.
# Each CSV carries an 'Unnamed: 0' column that is dropped right after reading,
# leaving vessel_ID, image_ID and labels.
df3 = pd.read_csv('three_cluster_model_labels.csv').drop(columns=['Unnamed: 0'])
df4 = pd.read_csv('four_cluster_model_labels.csv').drop(columns=['Unnamed: 0'])
display(df3.head(), df4.head())

Unnamed: 0,vessel_ID,image_ID,labels
0,18688,15819,1
1,17496,13961,2
2,17334,13780,1
3,19762,18235,1
4,28755,40646,2


Unnamed: 0,vessel_ID,image_ID,labels
0,18688,15819,2
1,17496,13961,0
2,17334,13780,2
3,19762,18235,2
4,28755,40646,1


In [3]:
# Create displays of random images by cluster for the three cluster model.
# Every cluster gets one contact sheet: a grid of n_cols x n_rows tiles, each
# tile pixels square, saved as images/three_cluster_visual_<label>.jpg.
# NOTE(review): drawings are read from ../images/ but sheets are written to
# images/ (relative to the notebook) -- confirm that output folder exists.
n_cols, n_rows, tile = 6, 5, 256
cluster_labels = df3['labels'].unique()
for label in cluster_labels:
    members = df3[df3['labels'] == label]
    # sample() raises ValueError when asked for more rows than exist, so cap
    # the draw at the cluster size; unused tiles simply stay black.
    picks = members.sample(min(n_cols * n_rows, len(members)))
    sheet = Image.new('RGB', (n_cols * tile, n_rows * tile))
    id_pairs = zip(picks['vessel_ID'], picks['image_ID'])
    for i, (vessel_id, image_id) in enumerate(id_pairs):
        filename = f'../images/drawing_{vessel_id}_{image_id}.jpg'
        with Image.open(filename) as img:
            # thumbnail() shrinks in place and keeps the aspect ratio, so the
            # drawing fits inside a single tile.
            img.thumbnail((tile, tile))
            thumb = img.convert(mode='RGB')
        # Fill the grid row by row. The row comes from integer division
        # rather than a second modulo, so the layout stays collision-free for
        # any grid size, not only when n_cols and n_rows are coprime.
        row, col = divmod(i, n_cols)
        sheet.paste(thumb, (col * tile, row * tile))
    sheet.save(f'images/three_cluster_visual_{label}.jpg')

In [5]:
# Create displays of random images by cluster for the four cluster model.
# Every cluster gets one contact sheet: a grid of n_cols x n_rows tiles, each
# tile pixels square, saved as images/four_cluster_visual_<label>.jpg.
# NOTE(review): drawings are read from ../images/ but sheets are written to
# images/ (relative to the notebook) -- confirm that output folder exists.
n_cols, n_rows, tile = 6, 5, 256
cluster_labels = df4['labels'].unique()
for label in cluster_labels:
    members = df4[df4['labels'] == label]
    # sample() raises ValueError when asked for more rows than exist, so cap
    # the draw at the cluster size; unused tiles simply stay black.
    picks = members.sample(min(n_cols * n_rows, len(members)))
    sheet = Image.new('RGB', (n_cols * tile, n_rows * tile))
    id_pairs = zip(picks['vessel_ID'], picks['image_ID'])
    for i, (vessel_id, image_id) in enumerate(id_pairs):
        filename = f'../images/drawing_{vessel_id}_{image_id}.jpg'
        with Image.open(filename) as img:
            # thumbnail() shrinks in place and keeps the aspect ratio, so the
            # drawing fits inside a single tile.
            img.thumbnail((tile, tile))
            thumb = img.convert(mode='RGB')
        # Fill the grid row by row. The row comes from integer division
        # rather than a second modulo, so the layout stays collision-free for
        # any grid size, not only when n_cols and n_rows are coprime.
        row, col = divmod(i, n_cols)
        sheet.paste(thumb, (col * tile, row * tile))
    sheet.save(f'images/four_cluster_visual_{label}.jpg')

In [18]:
# Get the data from the vessels.json file and turn the parsed JSON into a
# DataFrame.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode (or fail on) non-ASCII text
# in a UTF-8 JSON file.
with open('../lcp_data/vessels.json', encoding='utf-8') as f:
    vessels_df = pd.DataFrame(json.load(f))

In [20]:
# Explore the vessels data: info() prints the index range and, for every
# column, its non-null count and dtype.
vessels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16717 entries, 0 to 16716
Data columns (total 52 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       16717 non-null  int64  
 1   category                 560 non-null    object 
 2   chron                    13837 non-null  object 
 3   uid                      560 non-null    object 
 4   site                     16699 non-null  object 
 5   site_ref                 0 non-null      object 
 6   ware_id                  9303 non-null   float64
 7   created_at               16717 non-null  object 
 8   updated_at               16717 non-null  object 
 9   inventory_number         16717 non-null  object 
 10  shape_id                 15638 non-null  float64
 11  funct_cat                7631 non-null   object 
 12  shape_sub_group          13389 non-null  object 
 13  shape_rtype              13391 non-null  object 
 14  shape_sub_type        

In [27]:
# Try to figure out what chron is: show its 20 most frequent values.
# value_counts() sorts by descending frequency, so the first 20 rows are the
# most common entries.
display(
    vessels_df.loc[:, 'chron']
    .value_counts()
    .iloc[:20]
)

                                    2722
1015-960 BCE                         240
3rd - 2nd century BCE                238
650-700 CE                           156
2nd century BCE - 1st century CE     150
1st c. CE                            138
2nd century BCE                      135
c. 6th-5th BCE                       125
Mid 10th - 9th c. BCE                107
3rd-2nd century BCE                  104
3rd -1st century BCE                 102
1st century - 2nd century CE         101
2nd c. BCE                            97
c. 16th-15th c. BCE                   96
1600-1550 BCE                         89
2nd-1st century BCE                   89
mid-6th c. BCE                        87
7-6th century BCE                     84
37 BCE- 70 BC                         82
6400-6000 BCE                         82
Name: chron, dtype: int64

In [35]:
# Look at the value distribution of the measurement-related columns, one
# frequency table per column, in the order listed.
# NOTE(review): in the recorded output 'height' and 'diameter_rim' hold text
# labels (e.g. 'preserved', 'true', 'estimated') rather than numbers.
for column in (
    'thickness',
    'height',
    'preserved_height',
    'true_height',
    'diameter_rim',
    'estimated_diameter_rim',
):
    display(vessels_df[column].value_counts())

0.0     15241
0.5       209
0.4       205
0.6       148
0.7       136
        ...  
1.06        1
0.63        1
0.77        1
0.87        1
9.0         1
Name: thickness, Length: 93, dtype: int64

preserved    2777
true         1105
Name: height, dtype: int64

0.0      12875
4.0        124
3.0         83
5.0         78
4.5         70
         ...  
23.7         1
25.9         1
29.3         1
130.0        1
2.92         1
Name: preserved_height, Length: 299, dtype: int64

0.0     12855
5.0        31
4.0        29
4.5        27
3.0        26
        ...  
51.0        1
32.2        1
46.5        1
22.4        1
44.4        1
Name: true_height, Length: 274, dtype: int64

true         3476
estimated     912
Name: diameter_rim, dtype: int64

0.0     12366
10.0       81
11.0       42
12.0       42
9.0        41
        ...  
10.1        1
7.9         1
15.9        1
2.44        1
26.5        1
Name: estimated_diameter_rim, Length: 139, dtype: int64