# Imports

In [1]:
# Imports
import numpy as np
import pandas as pd
import json

# Load the Data

In [2]:
# Load the vessels data
with open('../lcp_data/vessels.json') as file:
    df = pd.DataFrame(json.load(file))
df.head(20)

Unnamed: 0,id,category,chron,uid,site,site_ref,ware_id,created_at,updated_at,inventory_number,...,diameter_base,estimated_diameter_base,true_diameter_base,slug,acknowledgement,site_id,functional_category_id,old_shape_name,has_petrography,discarded_at
0,7,Pot Pot Pot,5th-4th centuries BCE,LCP-KFNUW,"{'id': 1154, 'name': 'Mizpe Yammim', 'chron': ...",,21.0,2012-09-17T20:25:44.756Z,2022-06-06T06:48:23.688Z,Mizpe Yammim 19/165,...,estimated,11.0,,mizpe-yammim-19-165,,1154.0,8.0,Juglet,False,
1,8,Pot,5th-4th century BCE,LCP-VBTDB,"{'id': 1154, 'name': 'Mizpe Yammim', 'chron': ...",,21.0,2012-09-17T20:30:46.389Z,2018-05-22T07:42:14.334Z,Mizpe Yammim 22/452,...,,0.0,0.0,mizpe-yammim-22-452,,1154.0,8.0,Juglet,False,
2,9,Pot,5th-4th century BCE,LCP-SBBGA,"{'id': 1154, 'name': 'Mizpe Yammim', 'chron': ...",,21.0,2012-09-17T20:32:02.507Z,2018-03-17T14:10:18.616Z,Mizpe Yammim 8/398,...,,0.0,0.0,mizpe-yammim-8-398,,1154.0,8.0,Juglet,False,
3,10,Pot,3rd-mid-2nd c. BCE,LCP-EAWCN,"{'id': 1525, 'name': 'Qedesh', 'chron': None, ...",,28.0,2012-09-17T20:53:53.147Z,2021-12-06T11:10:42.903Z,Qedesh K00P168,...,,0.0,0.0,qedesh-k00p168,,1525.0,1.0,Cooking pot,False,
4,11,Pot,2nd c. BCE,LCP-SUNLQ,"{'id': 1525, 'name': 'Qedesh', 'chron': None, ...",,28.0,2012-09-17T20:54:45.445Z,2022-07-15T21:45:23.124Z,Qedesh K00P058,...,,0.0,0.0,qedesh-k00p058,,1525.0,1.0,Casserole/caccabé,False,
5,14,Pot,1200BCE - 1150BCE,LCP-USWLF,"{'id': 1243, 'name': 'Tel Miqne/Ekron', 'chron...",,29.0,2012-09-17T21:07:57.254Z,2016-03-15T17:46:30.643Z,Miqne INE.4.392/1,...,,0.0,0.0,miqne-ine-4-392-1,,1243.0,10.0,"Bowl, large",False,
6,16,Pot,300 BCE - 150 BCE,LCP-SPESO,"{'id': 1525, 'name': 'Qedesh', 'chron': None, ...",,494.0,2012-09-17T21:16:02.498Z,2020-03-15T18:05:52.499Z,Qedesh K09P046,...,true,,5.1,qedesh-k09p046,,1525.0,2.0,Saucer,False,
7,17,Pot,200 BCE - 140 BCE,LCP-PFGSF,"{'id': 1525, 'name': 'Qedesh', 'chron': None, ...",,127.0,2012-09-17T21:17:15.643Z,2020-03-14T20:17:34.857Z,Qedesh K00P157,...,true,,7.8,qedesh-k00p157,,1525.0,2.0,Plate,False,
8,18,Pot,5th-4th c. BCE,LCP-SQPVL,"{'id': 1178, 'name': 'Tel Anafa', 'chron': Non...",,155.0,2012-09-17T21:24:44.936Z,2021-07-25T13:20:02.782Z,Tel Anafa PW 49/TA79P49,...,,0.0,0.0,tel-anafa-pw-49-ta79p49,,1178.0,8.0,Juglet,False,
9,19,Pot Pot,5th-4th century BCE,LCP-BCHMS,"{'id': 1154, 'name': 'Mizpe Yammim', 'chron': ...",,74.0,2012-09-17T21:26:15.697Z,2018-03-17T13:52:38.793Z,Mizpe Yammim 15/362,...,,0.0,0.0,mizpe-yammim-15-362,,1154.0,8.0,Juglet,False,


# Data Cleaning

In [3]:
# Call info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16717 entries, 0 to 16716
Data columns (total 52 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       16717 non-null  int64  
 1   category                 560 non-null    object 
 2   chron                    13837 non-null  object 
 3   uid                      560 non-null    object 
 4   site                     16699 non-null  object 
 5   site_ref                 0 non-null      object 
 6   ware_id                  9303 non-null   float64
 7   created_at               16717 non-null  object 
 8   updated_at               16717 non-null  object 
 9   inventory_number         16717 non-null  object 
 10  shape_id                 15638 non-null  float64
 11  funct_cat                7631 non-null   object 
 12  shape_sub_group          13389 non-null  object 
 13  shape_rtype              13391 non-null  object 
 14  shape_sub_type        

In [4]:
# Look for nearly empty columns
filter = (df.isna().sum() / df.shape[0] > 0.95)
cols = df.loc[:, filter.values].columns
df.drop(columns = cols, inplace = True)

In [5]:
# We won't use any ids for modelling.
df.drop(columns = ['ware_id', 
                   'shape_id', 
                   'user_id',
                   'petrofabric_id',
                   'site_id',
                   'functional_category_id'],
        inplace = True)

In [6]:
# Remove some more columns unneccesary for modeling.
df.drop(columns = ['created_at',
                   'updated_at',
                   'inventory_number'],
        inplace = True)

## Label cleaning

In [7]:
# Make a list of columns we decide to drop after closer inspection
cols_to_drop = []

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16717 entries, 0 to 16716
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       16717 non-null  int64  
 1   chron                    13837 non-null  object 
 2   site                     16699 non-null  object 
 3   funct_cat                7631 non-null   object 
 4   shape_sub_group          13389 non-null  object 
 5   shape_rtype              13391 non-null  object 
 6   color                    13911 non-null  object 
 7   firing                   13548 non-null  object 
 8   surface_treatment        13919 non-null  object 
 9   manu_tech                16717 non-null  object 
 10  arch_context             13990 non-null  object 
 11  privacy_status           16717 non-null  int64  
 12  site_type                13999 non-null  object 
 13  period                   8390 non-null   object 
 14  petrofab_visual_desc  

### Clean the 'funct_cat' column

In [9]:
# Clean the 'funct_cat' column.
df['funct_cat'].value_counts()

                                    1655
Dining/Drinking/Serving             1630
Household/Utility                   1521
Transport/Storage                   1005
Cooking/Kitchen                      727
Household/Storage                    485
Table ware                            83
Lighting                              83
Food production                       80
Cosmetic/Toilette/Medicine            67
Smoking                               48
Cooking/Kitchen, Food production      46
Cooking                               39
Dining/Serving                        31
Ceramic Production                    23
Agriculture                           13
Transport, storage                     9
Food production/service                9
cooking wares                          8
food/drink service                     8
sugar production                       7
Transport/Stroage                      7
Burial                                 5
food service                           5
 Cooking        

In [10]:
df['funct_cat'].replace({'Household/Storage': 'Household/Utility',
                         'Table ware': 'Dining/Drinking/Serving',
                         'Food production': 'Cooking/Kitchen',
                         'Cooking/Kitchen, Food production': 'Cooking/Kitchen',
                         'Cooking': 'Cooking/Kitchen',
                         'Transport, storage': 'Transport/Storage',
                         'Dining/Serving': 'Dining/Drinking/Serving',
                         'Food production/service': 'Cooking/Kitchen',
                         'food/drink service': 'Dining/Drinking/Serving',
                         'cooking wares': 'Cooking/Kitchen',
                         'food service': 'Dining/Drinking/Service',
                         ' Cooking': 'Cooking/Kitchen',
                         'sugar production': 'Cooking/Kitchen',
                         'drinking vessel': 'Dining/Drinking/Serving',
                         ' food/drink service': 'Dining/Drinking/Serving',
                         ' cooking': 'Cooking/Kitchen',
                         'to drink': 'Dining/Drinking/Serving',
                         'cooking': 'Cooking/Kitchen',
                         'Water jug': 'Dining/Drinking/Serving',
                         'Waster': 'Dining/Drinking/Serving',
                         'Dining/Drinking/Service': 'Dining/Drinking/Serving',
                         'Table': 'Household/Utility',
                         'Tableware': 'Household/Utility',
                         'Fine ware': 'Household/Utility',
                         'table ware': 'Household/Utility',
                         'brick': 'Building material',
                         'Container': 'Storage',
                         'perfume vessels': 'Cosmetic/Toilette/Medicine',
                         'Transport/Stroage': 'Transport/Storage',
                         'Lighting': 'Household/Utility',
                         'lamp': 'Household/Utility',
                         'lighting': 'Household/Utility',
                         'Storage': 'Household/Utility'}).value_counts()

Household/Utility             2097
Dining/Drinking/Serving       1760
                              1655
Transport/Storage             1021
Cooking/Kitchen                923
Cosmetic/Toilette/Medicine      68
Smoking                         48
Ceramic Production              23
Agriculture                     13
Burial                           5
Building material                5
Dining/Drinking/Service          5
Ritual                           3
Ritual, Toy                      1
lighting                         1
Storage                          1
Figurine                         1
Miniature                        1
Name: funct_cat, dtype: int64

> There is still work to be done on this feature. I'm having issues dealing with some of the remaining labels. For example, I'm not able to change the label 'Dining/Drinking/Service.' Not sure why.

### Clean the shape_sub_group column

In [11]:
# Get the labels
df['shape_sub_group'].value_counts().head(30)

                                                             12501
Marine Transport Containers                                    187
Milkbowl                                                        18
Black polished bowl with graffito                               14
Bowl                                                            13
tableware                                                       12
Beirut 3                                                        11
Low-necked jar                                                  10
Gray wide-mouthed storage jar                                    9
Thickened rim saucer                                             8
Phoenician Amphora                                               8
cooking-pot                                                      6
Jar                                                              5
Base of black polished bowl with graffito                        5
Black polished bowl                                           

> Probably not worth using this feature for modeling.

In [12]:
# Add shape_sub_group to the list of columns to drop.
cols_to_drop.append('shape_sub_group')
cols_to_drop

['shape_sub_group']

### Clean the shape_rtype column

In [13]:
df['shape_rtype'].value_counts().head(30)

                                                                                      6943
Closed                                                                                 193
Closed Shape                                                                           192
Carinated Shoulder Amphorae                                                            188
Open                                                                                   175
Open Shape                                                                             168
LH Jiyeh Type 1                                                                         55
open                                                                                    51
Incurved rim                                                                            44
LH Jiyeh Type 2                                                                         35
Type M3a: thickened rim, bag-shaped body (Rohmer 2020, Type M3a, fig. 1.1, p. 43).      35

In [14]:
# We drop this column as well.
cols_to_drop.append('shape_rtype')
cols_to_drop

['shape_sub_group', 'shape_rtype']

### Clean the color column

In [15]:
df['color'].value_counts().head(30)

                                                                             5473
10R5/6                                                                        123
2.5YR5/6                                                                      121
2.5YR6/8                                                                      107
2.5YR5/8                                                                       82
Clay reddish yellow (7.5 YR 6/6).                                              76
Red clay                                                                       70
Clay reddish yellow (5 YR 6/6).                                                52
2.5YR6/6                                                                       51
10R4/6                                                                         50
10R5/8                                                                         43
smooth, bright whitish grey, flecked with small and medium lime fragments      40
10YR8/3         

> Not sure what the various codes mean. Will look into this and maybe consider cleaning this column.

### Clean the firing column

In [16]:
df['firing'].value_counts().head(30)

                     7900
Fully fired          1003
fully fired           320
Break: 2.5YR7/6       219
Break: 2.5YR6/6       205
Full fired            105
Break: 5YR7/6          95
Break: 2.5YR5/6        88
Break: 5YR6/6          81
Black core.            76
Break: 5YR5/6          73
Fully Fired            66
Thick black core.      61
Break: 2.5YR8/4        57
Break: 2.5YR7/4        53
Break: 10R7/6          52
Grey core.             47
Break: 5YR8/4          47
Break: 2.5YR6/4        40
Break: 5YR6/4          40
Break: 2.5YR6/8        39
Break: 5YR7/4          36
Oxidized               34
Break: 2.5YR5/8        34
Thick grey core.       32
Break: 10R5/6          29
Break: 10R6/6          29
Grey core              28
Break: 7.5YR5/4        28
Break: 7.5YR4/6        27
Name: firing, dtype: int64

In [17]:
df['firing'].replace({'Fully fired': 'Fully Fired',
                      'fully fired': 'Fully Fired',
                      'Full fired': 'Fully Fired',
                      'Black core.': 'Black Core',
                      'Thick black core.': 'Black Core',
                      'Black core': 'Black Core',
                      'Black core, up to 75*': 'Black Core',
                      'Grey core.': 'Grey Core',
                      'Thick grey core.': 'Grey Core',
                      'Grey core': 'Grey Core',
                      'wide gray core': 'Gray Core',
                      'Gray Core': 'Grey Core',
                      'wide light grey core': 'Grey Core',
                      'narrow light grey core': 'Grey Core'}).value_counts().head(50)

                                   7900
Fully Fired                        1494
Break: 2.5YR7/6                     219
Break: 2.5YR6/6                     205
Black Core                          155
Grey Core                           136
Break: 5YR7/6                        95
Break: 2.5YR5/6                      88
Break: 5YR6/6                        81
Break: 5YR5/6                        73
Break: 2.5YR8/4                      57
Break: 2.5YR7/4                      53
Break: 10R7/6                        52
Break: 5YR8/4                        47
Break: 5YR6/4                        40
Break: 2.5YR6/4                      40
Break: 2.5YR6/8                      39
Break: 5YR7/4                        36
Oxidized                             34
Break: 2.5YR5/8                      34
Break: 10R5/6                        29
Break: 10R6/6                        29
Break: 7.5YR5/4                      28
Break: 7.5YR4/6                      27
c. 600°C                             27


### Clean the surface treatment column


In [18]:
df['surface_treatment'].value_counts().head(30)

                                                                                        6023
dull slip on entire vessel                                                               459
dull slip on entire vessel, partially rubbed off                                         348
dull slip on entire vessel, rubbed off                                                   268
dull slip on exterior                                                                    223
Voids.                                                                                   141
dull slip on exterior, partially rubbed off                                              137
Red slipped                                                                               81
wet-smoothed                                                                              65
dull glossy slip on entire vessel                                                         58
self-slipped, slightly powdery surface                                

### Clean the manu_tech column 

In [19]:
# Convert the entries to strings
df['manu_tech'] = df['manu_tech'].apply(','.join)

In [20]:
# Consolidate the labels
df['manu_tech'].replace({'Wheel-made': 'wheel_made',
                         'Wheelmade': 'wheel_made',
                         'wheelmade': 'wheel_made',
                         'Wheel made': 'wheel_made',
                         'Mold made': 'mold_made',
                         'other,wheel finished': 'wheel_made,other',
                         'other,Handmade': 'other,hand_made',
                         'other,Hand made': 'other,hand_made',
                         'other,Handmade; Inc*': 'other,hand_made',
                         'other,hand_made; Inc*': 'other,hand_made'},
                         regex = True).value_counts().head(30)

wheel_made                                                                                                                                                                                                                                                             8543
                                                                                                                                                                                                                                                                       7255
hand_made_coil                                                                                                                                                                                                                                                          360
hand_made_slab                                                                                                                                                                                      

> Still need to figure out how to use regex to clean the remaining data.

### Clean the arch_context column

In [21]:
df['arch_context'].value_counts().head(30)

                                                              3145
Surface                                                        490
House K26                                                      304
Paphos Agora                                                   215
Cistern                                                        212
City of David, Y. Shiloh                                       145
Ophel, E. Mazar                                                128
Subterranean Complex 169                                       126
Citadel summit                                                 114
Communal tomb, Area A                                           99
surface survey                                                  98
House K20, cellar, phase 8, adobe brick rubble                  94
Fortress                                                        89
Citadel Mound, SW zone, Op. 17, “Square Pit”, dump deposit      87
Under a mosaic                                                

> This seems to be a description of where the sherd was found. We don't think this will be particularly useful for data modeling. We will drop this column.

In [22]:
# Add the arch_context column to the list of columns to drop.
cols_to_drop.append('arch_context')
cols_to_drop

['shape_sub_group', 'shape_rtype', 'arch_context']

### Clean the privacy_status column

In [23]:
df['privacy_status'].value_counts()

1    16350
2      256
3       83
0       28
Name: privacy_status, dtype: int64

> This probably isn't useful, but will confirm before dropping.

### Clean the site_type column

In [24]:
df['site_type'].replace({'city': 'City',
                         'Large city': 'City',
                         'settlement': 'Settlement',
                         'village': 'Village'}).value_counts().head(30)

City                                       2976
                                           2623
town, house, domestic                      2275
Settlement                                  848
Village                                     418
city and port                               267
Harbor                                      232
Port                                        226
town                                        220
Fortified settlement                        170
Fortified citadel                           135
cemetery                                    120
Cistern                                     108
Urban                                       106
sanctuary                                   106
fortification                               104
Big fortified citadel                        97
Tomb                                         97
khan/inn                                     95
Fortified little citadel and settlement      89
Cemetery                                

### Clean the period column

In [25]:
df['period'].replace({'': 'Missing'}).value_counts().head(30)

Hellenistic                      903
Byzantine                        704
Roman                            532
Missing                          463
Iron Age IIC                     364
Iron Age IIB                     350
Byzantine, Early Islamic         315
Iron Age IIA                     284
Middle Bronze Age II             251
Persian Period                   246
Hellenistic, Early Roman         215
Late Hellenistic                 203
Early Islamic                    185
Early Bronze Age II              160
Mamluk-Ottoman Period            148
Late Hellenistic, Early Roman    145
Iron Age I                       135
Early Roman                      129
Late Bronze Age II               128
Late Iron Age                    109
Iron Age                         106
Ptolemaic                        104
Roman Period                     102
Late Bronze Age                   81
Chalcolithic                      76
Iron Age IIB-C                    68
Late Period                       61
P

### Clean the petrofab_visual_desc column

In [26]:
df['petrofab_visual_desc'].value_counts().head(30)

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   9372
The matrix is ferruginous calcareous clay with small quantities of silt-size quartz and carbonate relicts. The clay minerals of the matrix had changed their opt

> This column is not useful for our model so we drop it.

In [27]:
# Drop the petrofab_visual_desc column
cols_to_drop.append('petrofab_visual_desc')
cols_to_drop

['shape_sub_group', 'shape_rtype', 'arch_context', 'petrofab_visual_desc']

### Clean the start_year column

In [28]:
df['start_year'].value_counts()

-300.0     1037
-200.0      897
 0.0        547
-100.0      380
-600.0      323
           ... 
 2200.0       1
-665.0        1
-680.0        1
-610.0        1
-2900.0       1
Name: start_year, Length: 254, dtype: int64

In [29]:
# Check for missing values in 'start year'
df['start_year'].isna().sum()

5989

> The labeling convention seems inconsistent for this column. There are years that should probably be negative. (2200 .e.g.) 

### Clean the end_year column

In [30]:
df['end_year'].value_counts()

-101.0     1047
-1.0        713
 99.0       688
 199.0      516
 700.0      415
           ... 
-275.0        1
-3100.0       1
 3100.0       1
-530.0        1
 730.0        1
Name: end_year, Length: 282, dtype: int64

> Same issue as above. Need to fix the labels that should be negative somehow.

### Clean the references_data column

In [31]:
df['references_data'].head(30)

0     {}
1     {}
2     {}
3     {}
4     {}
5     {}
6     {}
7     {}
8     {}
9     {}
10    {}
11    {}
12    {}
13    {}
14    {}
15    {}
16    {}
17    {}
18    {}
19    {}
20    {}
21    {}
22    {}
23    {}
24    {}
25    {}
26    {}
27    {}
28    {}
29    {}
Name: references_data, dtype: object

> Need to investigate this further.

### Clean the thickness column

In [32]:
df['thickness'].value_counts()

0.0     15241
0.5       209
0.4       205
0.6       148
0.7       136
        ...  
1.06        1
0.63        1
0.77        1
0.87        1
9.0         1
Name: thickness, Length: 93, dtype: int64

In [33]:
# Determine how many entries larger than 0.
(df['thickness'].astype('float64') > 0).sum()

1465

> Not many nonzero values. We may end up dropping this column.

### Clean the height column

In [34]:
df['height'].value_counts()

preserved    2777
true         1105
Name: height, dtype: int64

### Clean the preserved_height column

In [35]:
df['preserved_height'].value_counts()

0.0      12875
4.0        124
3.0         83
5.0         78
4.5         70
         ...  
23.7         1
25.9         1
29.3         1
130.0        1
2.92         1
Name: preserved_height, Length: 299, dtype: int64

In [36]:
(df['preserved_height'].astype('float64' ) > 0).sum()

2723

### Clean the true_height column

In [37]:
df['true_height'].value_counts()

0.0     12855
5.0        31
4.0        29
4.5        27
3.0        26
        ...  
51.0        1
32.2        1
46.5        1
22.4        1
44.4        1
Name: true_height, Length: 274, dtype: int64

In [38]:
# Get the number of nonzero entries
(df['true_height'].astype('float64') > 0).sum()

1078

### Clean the diameter_rim column

In [39]:
df['diameter_rim'].value_counts()

true         3476
estimated     912
Name: diameter_rim, dtype: int64

### Clean the estimated_diameter_rim

In [40]:
df['estimated_diameter_rim'].value_counts()

0.0     12366
10.0       81
11.0       42
12.0       42
9.0        41
        ...  
10.1        1
7.9         1
15.9        1
2.44        1
26.5        1
Name: estimated_diameter_rim, Length: 139, dtype: int64

In [41]:
# Get the number of nonzero entries
(df['estimated_diameter_rim'].astype('float64') > 0).sum()

867

### Clean the true_diameter_rim column

In [42]:
df['true_diameter_rim'].value_counts()

0.0     12401
10.0      179
12.0      175
14.0      170
18.0      155
        ...  
5.3         1
21.5        1
41.0        1
1.0         1
22.5        1
Name: true_diameter_rim, Length: 278, dtype: int64

In [43]:
# Get the number of nonzero entries
(df['true_diameter_rim'].astype('float64') > 0).sum()

3385

### Clean the max_diameter column

In [44]:
df['max_diameter'].value_counts()

                  8567
estimated          120
true                87
16                  20
15                  19
                  ... 
24.3 cm (body)       1
25.8 cm              1
35 cm (body)         1
4.2-4.22 cm          1
N/A                  1
Name: max_diameter, Length: 398, dtype: int64

> Not sure how to interpret this. The "estimated" and true "labels" seemed to reference other columns in this data set. 

### Clean the estimated_max_diameter column

In [45]:
df['estimated_max_diameter'].value_counts()

0.0     16510
13.0        9
20.0        7
12.0        6
19.0        5
21.0        5
17.0        5
10.0        5
28.0        5
32.0        4
16.0        4
9.0         3
30.0        3
24.0        3
26.0        3
22.0        3
5.0         2
12.5        2
35.0        2
15.0        2
34.0        2
29.0        2
9.5         2
40.0        2
11.0        2
6.0         1
13.2        1
18.5        1
13.5        1
19.6        1
64.0        1
51.2        1
18.6        1
22.4        1
35.4        1
12.8        1
18.0        1
44.0        1
23.0        1
36.0        1
50.0        1
42.0        1
8.0         1
14.0        1
17.5        1
38.0        1
7.0         1
6.2         1
8.5         1
31.0        1
55.0        1
29.5        1
47.5        1
Name: estimated_max_diameter, dtype: int64

In [46]:
# Get the percentage of entries equal to 0
(df['estimated_max_diameter'].value_counts()[0]) / len(df)

0.9876173954656936

> There's a lot of data missing. We should probably drop this column.

### Clean the true_max_diameter column

In [47]:
df['true_max_diameter'].value_counts()

0.0     16510
14.0        5
12.0        4
16.2        3
12.5        2
        ...  
13.0        1
26.0        1
8.5         1
37.0        1
5.7         1
Name: true_max_diameter, Length: 65, dtype: int64

> A lot of missing values in this column.

### Clean the diameter_base column

In [48]:
df['diameter_base'].value_counts()

true         1125
estimated     164
Name: diameter_base, dtype: int64

### Clean the estimated_diameter_base column

In [49]:
df['estimated_diameter_base'].value_counts()

0.0     15438
5.0        13
6.0        11
8.0        10
11.0        8
        ...  
8.7         1
6.9         1
6.3         1
5.9         1
32.0        1
Name: estimated_diameter_base, Length: 64, dtype: int64

> Large number of missing values

### Clean the true_diameter_base column

In [50]:
df['true_diameter_base'].value_counts()

0.0     15454
5.0        57
6.0        52
4.0        48
3.0        41
        ...  
0.5         1
9.3         1
65.3        1
43.5        1
14.6        1
Name: true_diameter_base, Length: 184, dtype: int64

> If the missing values aren't spread out too much, we may be able to construct a complete dataset using many if not most of the columns.

### Clean the slug column

In [51]:
df['slug'].value_counts()

mizpe-yammim-19-165      1
lachish-l-2610-b-7743    1
lachish-l-2610-b-7562    1
lachish-l-2610-b-7733    1
lachish-l-2611-b-7730    1
                        ..
o14-7                    1
t18-18-1                 1
m13-5                    1
6-4                      1
alasehir-3260            1
Name: slug, Length: 16717, dtype: int64

> Not sure what to do with this. Need a subject-matter-expert.

### Clean the acknowledgement column

In [52]:
df['acknowledgement'].value_counts()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     9144
<p>This material is a product of the Syro-Belgian Tell Tweini Excavation Project, headed by Dr. Al-Maqdissi and Dr. Badawi, of the Syrian Department of Antiquities, and Prof. Dr. Joachim Bre

In [53]:
# Can't do much with this column.
cols_to_drop.append('acknowledgement')
cols_to_drop

['shape_sub_group',
 'shape_rtype',
 'arch_context',
 'petrofab_visual_desc',
 'acknowledgement']

### Clean the old_shape_name column

In [54]:
df['old_shape_name'].value_counts().head(30)

Bowl                       1350
Jar                         944
Cooking pot                 622
Amphora                     533
Plate                       515
Bowl, small                 491
Jug                         389
Storage jar                 296
Krater                      247
Casserole                   184
Pitcher                     171
Lid                         145
Lamp                        139
Juglet                      125
Beaker                      113
Stand                       108
Cup                          97
Grinding bowl/mortarium      78
Casserole/caccabé            67
Pithos                       64
Basin                        55
Tobacco pipe                 48
Flask                        47
Saucer                       45
Platter                      44
Bottle                       43
Unguentarium                 40
Dish                         40
Amphoriskos                  36
Cooking  pot                 35
Name: old_shape_name, dtype: int64

> We'll come back and work on condensing the labels here.

### Clean the has_petrography column

In [55]:
df['has_petrography'].value_counts()

False    11224
True      5493
Name: has_petrography, dtype: int64

> Nothing to do in this column.