# Imports

In [1]:
# Imports
import numpy as np
import pandas as pd
import json

# Load the Data

In [2]:
# Load the vessels data
with open('../lcp_data/vessels.json') as file:
    df = pd.DataFrame(json.load(file))
df.head(20)

Unnamed: 0,id,category,chron,uid,site,site_ref,ware_id,created_at,updated_at,inventory_number,...,diameter_base,estimated_diameter_base,true_diameter_base,slug,acknowledgement,site_id,functional_category_id,old_shape_name,has_petrography,discarded_at
0,7,Pot Pot Pot,5th-4th centuries BCE,LCP-KFNUW,"{'id': 1154, 'name': 'Mizpe Yammim', 'chron': ...",,21.0,2012-09-17T20:25:44.756Z,2022-06-06T06:48:23.688Z,Mizpe Yammim 19/165,...,estimated,11.0,,mizpe-yammim-19-165,,1154.0,8.0,Juglet,False,
1,8,Pot,5th-4th century BCE,LCP-VBTDB,"{'id': 1154, 'name': 'Mizpe Yammim', 'chron': ...",,21.0,2012-09-17T20:30:46.389Z,2018-05-22T07:42:14.334Z,Mizpe Yammim 22/452,...,,0.0,0.0,mizpe-yammim-22-452,,1154.0,8.0,Juglet,False,
2,9,Pot,5th-4th century BCE,LCP-SBBGA,"{'id': 1154, 'name': 'Mizpe Yammim', 'chron': ...",,21.0,2012-09-17T20:32:02.507Z,2018-03-17T14:10:18.616Z,Mizpe Yammim 8/398,...,,0.0,0.0,mizpe-yammim-8-398,,1154.0,8.0,Juglet,False,
3,10,Pot,3rd-mid-2nd c. BCE,LCP-EAWCN,"{'id': 1525, 'name': 'Qedesh', 'chron': None, ...",,28.0,2012-09-17T20:53:53.147Z,2021-12-06T11:10:42.903Z,Qedesh K00P168,...,,0.0,0.0,qedesh-k00p168,,1525.0,1.0,Cooking pot,False,
4,11,Pot,2nd c. BCE,LCP-SUNLQ,"{'id': 1525, 'name': 'Qedesh', 'chron': None, ...",,28.0,2012-09-17T20:54:45.445Z,2022-07-15T21:45:23.124Z,Qedesh K00P058,...,,0.0,0.0,qedesh-k00p058,,1525.0,1.0,Casserole/caccabé,False,
5,14,Pot,1200BCE - 1150BCE,LCP-USWLF,"{'id': 1243, 'name': 'Tel Miqne/Ekron', 'chron...",,29.0,2012-09-17T21:07:57.254Z,2016-03-15T17:46:30.643Z,Miqne INE.4.392/1,...,,0.0,0.0,miqne-ine-4-392-1,,1243.0,10.0,"Bowl, large",False,
6,16,Pot,300 BCE - 150 BCE,LCP-SPESO,"{'id': 1525, 'name': 'Qedesh', 'chron': None, ...",,494.0,2012-09-17T21:16:02.498Z,2020-03-15T18:05:52.499Z,Qedesh K09P046,...,true,,5.1,qedesh-k09p046,,1525.0,2.0,Saucer,False,
7,17,Pot,200 BCE - 140 BCE,LCP-PFGSF,"{'id': 1525, 'name': 'Qedesh', 'chron': None, ...",,127.0,2012-09-17T21:17:15.643Z,2020-03-14T20:17:34.857Z,Qedesh K00P157,...,true,,7.8,qedesh-k00p157,,1525.0,2.0,Plate,False,
8,18,Pot,5th-4th c. BCE,LCP-SQPVL,"{'id': 1178, 'name': 'Tel Anafa', 'chron': Non...",,155.0,2012-09-17T21:24:44.936Z,2021-07-25T13:20:02.782Z,Tel Anafa PW 49/TA79P49,...,,0.0,0.0,tel-anafa-pw-49-ta79p49,,1178.0,8.0,Juglet,False,
9,19,Pot Pot,5th-4th century BCE,LCP-BCHMS,"{'id': 1154, 'name': 'Mizpe Yammim', 'chron': ...",,74.0,2012-09-17T21:26:15.697Z,2018-03-17T13:52:38.793Z,Mizpe Yammim 15/362,...,,0.0,0.0,mizpe-yammim-15-362,,1154.0,8.0,Juglet,False,


# Data Cleaning

In [3]:
# Call info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16717 entries, 0 to 16716
Data columns (total 52 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       16717 non-null  int64  
 1   category                 560 non-null    object 
 2   chron                    13837 non-null  object 
 3   uid                      560 non-null    object 
 4   site                     16699 non-null  object 
 5   site_ref                 0 non-null      object 
 6   ware_id                  9303 non-null   float64
 7   created_at               16717 non-null  object 
 8   updated_at               16717 non-null  object 
 9   inventory_number         16717 non-null  object 
 10  shape_id                 15638 non-null  float64
 11  funct_cat                7631 non-null   object 
 12  shape_sub_group          13389 non-null  object 
 13  shape_rtype              13391 non-null  object 
 14  shape_sub_type        

In [4]:
# Look for nearly empty columns
filter = (df.isna().sum() / df.shape[0] > 0.95)
cols = df.loc[:, filter.values].columns
df.drop(columns = cols, inplace = True)

In [5]:
# We won't use any ids for modelling.
df.drop(columns = ['ware_id', 
                   'shape_id', 
                   'user_id',
                   'petrofabric_id',
                   'site_id',
                   'functional_category_id'],
        inplace = True)

In [6]:
# Remove some more columns unneccesary for modeling.
df.drop(columns = ['created_at',
                   'updated_at',
                   'inventory_number'],
        inplace = True)

## Label cleaning

In [7]:
# Make a list of columns we decide to drop after closer inspection
cols_to_drop = []

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16717 entries, 0 to 16716
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       16717 non-null  int64  
 1   chron                    13837 non-null  object 
 2   site                     16699 non-null  object 
 3   funct_cat                7631 non-null   object 
 4   shape_sub_group          13389 non-null  object 
 5   shape_rtype              13391 non-null  object 
 6   color                    13911 non-null  object 
 7   firing                   13548 non-null  object 
 8   surface_treatment        13919 non-null  object 
 9   manu_tech                16717 non-null  object 
 10  arch_context             13990 non-null  object 
 11  privacy_status           16717 non-null  int64  
 12  site_type                13999 non-null  object 
 13  period                   8390 non-null   object 
 14  petrofab_visual_desc  

### Clean the 'funct_cat' column

In [9]:
# Clean the 'funct_cat' column.
df['funct_cat'].value_counts()

                                    1655
Dining/Drinking/Serving             1630
Household/Utility                   1521
Transport/Storage                   1005
Cooking/Kitchen                      727
Household/Storage                    485
Table ware                            83
Lighting                              83
Food production                       80
Cosmetic/Toilette/Medicine            67
Smoking                               48
Cooking/Kitchen, Food production      46
Cooking                               39
Dining/Serving                        31
Ceramic Production                    23
Agriculture                           13
Transport, storage                     9
Food production/service                9
cooking wares                          8
food/drink service                     8
sugar production                       7
Transport/Stroage                      7
Burial                                 5
food service                           5
 Cooking        

In [10]:
df['funct_cat'].replace({'Household/Storage': 'Household/Utility',
                         'Table ware': 'Dining/Drinking/Serving',
                         'Food production': 'Cooking/Kitchen',
                         'Cooking/Kitchen, Food production': 'Cooking/Kitchen',
                         'Cooking': 'Cooking/Kitchen',
                         'Transport, storage': 'Transport/Storage',
                         'Dining/Serving': 'Dining/Drinking/Serving',
                         'Food production/service': 'Cooking/Kitchen',
                         'food/drink service': 'Dining/Drinking/Serving',
                         'cooking wares': 'Cooking/Kitchen',
                         'food service': 'Dining/Drinking/Service',
                         ' Cooking': 'Cooking/Kitchen',
                         'sugar production': 'Cooking/Kitchen',
                         'drinking vessel': 'Dining/Drinking/Serving',
                         ' food/drink service': 'Dining/Drinking/Serving',
                         ' cooking': 'Cooking/Kitchen',
                         'to drink': 'Dining/Drinking/Serving',
                         'cooking': 'Cooking/Kitchen',
                         'Water jug': 'Dining/Drinking/Serving',
                         'Waster': 'Dining/Drinking/Serving',
                         'Dining/Drinking/Service': 'Dining/Drinking/Serving',
                         'Table': 'Household/Utility',
                         'Tableware': 'Household/Utility',
                         'Fine ware': 'Household/Utility',
                         'table ware': 'Household/Utility',
                         'brick': 'Building material',
                         'Container': 'Storage',
                         'perfume vessels': 'Cosmetic/Toilette/Medicine',
                         'Transport/Stroage': 'Transport/Storage',
                         'Lighting': 'Household/Utility',
                         'lamp': 'Household/Utility',
                         'lighting': 'Household/Utility',
                         'Storage': 'Household/Utility'}).value_counts()

Household/Utility             2097
Dining/Drinking/Serving       1760
                              1655
Transport/Storage             1021
Cooking/Kitchen                923
Cosmetic/Toilette/Medicine      68
Smoking                         48
Ceramic Production              23
Agriculture                     13
Burial                           5
Building material                5
Dining/Drinking/Service          5
Ritual                           3
Ritual, Toy                      1
lighting                         1
Storage                          1
Figurine                         1
Miniature                        1
Name: funct_cat, dtype: int64

> There is still work to be done on this feature. I'm having issues dealing with some of the remaining labels. For example, I'm not able to change the label 'Dining/Drinking/Service.' Not sure why.

### Clean the shape_sub_group column

In [11]:
# Get the labels
df['shape_sub_group'].value_counts().head(30)

                                                             12501
Marine Transport Containers                                    187
Milkbowl                                                        18
Black polished bowl with graffito                               14
Bowl                                                            13
tableware                                                       12
Beirut 3                                                        11
Low-necked jar                                                  10
Gray wide-mouthed storage jar                                    9
Thickened rim saucer                                             8
Phoenician Amphora                                               8
cooking-pot                                                      6
Jar                                                              5
Base of black polished bowl with graffito                        5
Black polished bowl                                           

> Probably not worth using this feature for modeling.

In [12]:
# Add shape_sub_group to the list of columns to drop.
cols_to_drop.append('shape_sub_group')
cols_to_drop

['shape_sub_group']

### Clean the shape_rtype column

In [13]:
df['shape_rtype'].value_counts().head(30)

                                                                                      6943
Closed                                                                                 193
Closed Shape                                                                           192
Carinated Shoulder Amphorae                                                            188
Open                                                                                   175
Open Shape                                                                             168
LH Jiyeh Type 1                                                                         55
open                                                                                    51
Incurved rim                                                                            44
LH Jiyeh Type 2                                                                         35
Type M3a: thickened rim, bag-shaped body (Rohmer 2020, Type M3a, fig. 1.1, p. 43).      35

In [14]:
# We drop this column as well.
cols_to_drop.append('shape_rtype')
cols_to_drop

['shape_sub_group', 'shape_rtype']

### Clean the color column

In [15]:
df['color'].value_counts().head(30)

                                                                             5473
10R5/6                                                                        123
2.5YR5/6                                                                      121
2.5YR6/8                                                                      107
2.5YR5/8                                                                       82
Clay reddish yellow (7.5 YR 6/6).                                              76
Red clay                                                                       70
Clay reddish yellow (5 YR 6/6).                                                52
2.5YR6/6                                                                       51
10R4/6                                                                         50
10R5/8                                                                         43
smooth, bright whitish grey, flecked with small and medium lime fragments      40
10YR8/3         

> Not sure what the various codes mean. Will look into this and maybe consider cleaning this column.

### CLean the firing column

In [16]:
df['firing'].value_counts().head(30)

                     7900
Fully fired          1003
fully fired           320
Break: 2.5YR7/6       219
Break: 2.5YR6/6       205
Full fired            105
Break: 5YR7/6          95
Break: 2.5YR5/6        88
Break: 5YR6/6          81
Black core.            76
Break: 5YR5/6          73
Fully Fired            66
Thick black core.      61
Break: 2.5YR8/4        57
Break: 2.5YR7/4        53
Break: 10R7/6          52
Grey core.             47
Break: 5YR8/4          47
Break: 2.5YR6/4        40
Break: 5YR6/4          40
Break: 2.5YR6/8        39
Break: 5YR7/4          36
Oxidized               34
Break: 2.5YR5/8        34
Thick grey core.       32
Break: 10R5/6          29
Break: 10R6/6          29
Grey core              28
Break: 7.5YR5/4        28
Break: 7.5YR4/6        27
Name: firing, dtype: int64

In [17]:
df['firing'].replace({'Fully fired': 'Fully Fired',
                      'fully fired': 'Fully Fired',
                      'Full fired': 'Fully Fired',
                      'Black core.': 'Black Core',
                      'Thick black core.': 'Black Core',
                      'Black core': 'Black Core',
                      'Black core, up to 75*': 'Black Core',
                      'Grey core.': 'Grey Core',
                      'Thick grey core.': 'Grey Core',
                      'Grey core': 'Grey Core',
                      'wide gray core': 'Gray Core',
                      'Gray Core': 'Grey Core',
                      'wide light grey core': 'Grey Core',
                      'narrow light grey core': 'Grey Core'}).value_counts().head(50)

                                   7900
Fully Fired                        1494
Break: 2.5YR7/6                     219
Break: 2.5YR6/6                     205
Black Core                          155
Grey Core                           136
Break: 5YR7/6                        95
Break: 2.5YR5/6                      88
Break: 5YR6/6                        81
Break: 5YR5/6                        73
Break: 2.5YR8/4                      57
Break: 2.5YR7/4                      53
Break: 10R7/6                        52
Break: 5YR8/4                        47
Break: 5YR6/4                        40
Break: 2.5YR6/4                      40
Break: 2.5YR6/8                      39
Break: 5YR7/4                        36
Oxidized                             34
Break: 2.5YR5/8                      34
Break: 10R5/6                        29
Break: 10R6/6                        29
Break: 7.5YR5/4                      28
Break: 7.5YR4/6                      27
c. 600°C                             27


### Clean the surface treatment column


In [18]:
df['surface_treatment'].value_counts().head(30)

                                                                                        6023
dull slip on entire vessel                                                               459
dull slip on entire vessel, partially rubbed off                                         348
dull slip on entire vessel, rubbed off                                                   268
dull slip on exterior                                                                    223
Voids.                                                                                   141
dull slip on exterior, partially rubbed off                                              137
Red slipped                                                                               81
wet-smoothed                                                                              65
dull glossy slip on entire vessel                                                         58
self-slipped, slightly powdery surface                                

### Clean the manu_tech column 

In [19]:
df['manu_tech'] = df['manu_tech'].apply(','.join)

In [34]:
df['manu_tech'].replace({'Wheel-made': 'wheel_made',
                         'Wheelmade': 'wheel_made',
                         'wheelmade': 'wheel_made',
                         'Wheel made': 'wheel_made',
                         'Mold made': 'mold_made',
                         'other,wheel finished': 'wheel_made,other'}).value_counts().head(20)

wheel_made                                  8543
                                            7255
hand_made_coil                               360
hand_made_slab                               180
mold_made                                    171
wheel_made,hand_made_coil,hand_made_slab      25
wheel_made,hand_made_coil                     21
wheel_made,mold_made                          15
other,Handmade                                15
other,Hand made                               13
ware,wheel_made                               11
other,Hand-made                               10
other,Carved                                  10
hand_made_coil,hand_made_slab                  6
wheel_made,other                               4
wheel_made,hand_made_slab                      3
other                                          2
other,Fine ware                                2
mold_made,hand_made_coil                       2
ware,hand_made_coil                            2
Name: manu_tech, dty