# Covertype Data Set Preprocessing

In [None]:
# gcsfs is needed so pandas can read the gs:// dataset path used when loading the data below.
%pip install gcsfs

In [1]:
import numpy as np
import pandas as pd
import tensorflow_data_validation as tfdv


from sklearn.model_selection import train_test_split

### Set the paths

In [2]:
# Input: raw Covertype data file in Cloud Storage (loaded with pandas further down).
DATASET_PATH = 'gs://workshop-datasets/covertype/orig/covtype.data'

# Outputs: the fully processed dataset plus the four splits derived from it.
FULL_DATASET = '../covertype.csv'
TRAIN_DATASET = '../covertype_train.csv'
VALIDATE_DATASET = '../covertype_validate.csv'
TESTING_DATASET = '../covertype_testing.csv'
SERVING_DATASET = '../covertype_serving.csv'

### Load the dataset

In [3]:
# The raw file is read without a header row (header=None), so the columns are
# labelled with integers; later cells slice them by those integer labels.
df = pd.read_csv(DATASET_PATH, header=None)
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(df.shape)
df.head()

(581012, 55)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


### Configure soil type and wilderness area domains

In [4]:
# Flat list of soil-type triples, three consecutive entries per soil type:
#   [ordinal "1".."40", four-digit code, textual description, ...]
# The mapping cell below reads only the four-digit codes via soil_type[1::3];
# the i-th triple corresponds to the i-th one-hot soil column (columns 14..53).
# The four-digit code is presumably the USFS ELU code — TODO confirm against
# the dataset documentation.
soil_type = [
"1", "2702", "Cathedral family - Rock outcrop complex, extremely stony.",
"2", "2703", "Vanet - Ratake families complex, very stony.",
"3", "2704", "Haploborolis - Rock outcrop complex, rubbly.",
"4", "2705", "Ratake family - Rock outcrop complex, rubbly.",
"5", "2706", "Vanet family - Rock outcrop complex complex, rubbly.",
"6", "2717", "Vanet - Wetmore families - Rock outcrop complex, stony.",
"7", "3501", "Gothic family.",
"8", "3502", "Supervisor - Limber families complex.",
"9", "4201", "Troutville family, very stony.",
"10", "4703", "Bullwark - Catamount families - Rock outcrop complex, rubbly.",
"11", "4704", "Bullwark - Catamount families - Rock land complex, rubbly.",
"12", "4744", "Legault family - Rock land complex, stony.",
"13", "4758", "Catamount family - Rock land - Bullwark family complex, rubbly.",
"14", "5101", "Pachic Argiborolis - Aquolis complex.",
"15", "5151", "unspecified in the USFS Soil and ELU Survey.",
"16", "6101", "Cryaquolis - Cryoborolis complex.",
"17", "6102", "Gateview family - Cryaquolis complex.",
"18", "6731", "Rogert family, very stony.",
"19", "7101", "Typic Cryaquolis - Borohemists complex.",
"20", "7102", "Typic Cryaquepts - Typic Cryaquolls complex.",
"21", "7103", "Typic Cryaquolls - Leighcan family, till substratum complex.",
"22", "7201", "Leighcan family, till substratum, extremely bouldery.",
"23", "7202", "Leighcan family, till substratum - Typic Cryaquolls complex.",
"24", "7700", "Leighcan family, extremely stony.",
"25", "7701", "Leighcan family, warm, extremely stony.",
"26", "7702", "Granile - Catamount families complex, very stony.",
"27", "7709", "Leighcan family, warm - Rock outcrop complex, extremely stony.",
"28", "7710", "Leighcan family - Rock outcrop complex, extremely stony.",
"29", "7745", "Como - Legault families complex, extremely stony.",
"30", "7746", "Como family - Rock land - Legault family complex, extremely stony.",
"31", "7755", "Leighcan - Catamount families complex, extremely stony.",
"32", "7756", "Catamount family - Rock outcrop - Leighcan family complex, extremely stony.",
"33", "7757", "Leighcan - Catamount families - Rock outcrop complex, extremely stony.",
"34", "7790", "Cryorthents - Rock land complex, extremely stony.",
"35", "8703", "Cryumbrepts - Rock outcrop - Cryaquepts complex.",
"36", "8707", "Bross family - Rock land - Cryumbrepts complex, extremely stony.",
"37", "8708", "Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony.",
"38", "8771", "Leighcan - Moran families - Cryaquolls complex, extremely stony.",
"39", "8772", "Moran family - Cryorthents - Leighcan family complex, extremely stony.",
"40", "8776", "Moran family - Cryorthents - Rock land complex, extremely stony.",
]

# Wilderness-area labels; list position i corresponds to the i-th one-hot
# wilderness column (columns 10..13) in the mapping cell below.
# NOTE(review): "Commanche" may be a misspelling of "Comanche" — confirm before
# changing it, because this string becomes a data value in the output CSVs and
# an entry in the inferred Wilderness_Area schema domain.
wilderness_area = [
"Rawah",
"Neota",
"Commanche Peak",
"Cache la Poudre",
]

### Map one-hot encoded values to categorical domains

In [5]:
# Decode the one-hot soil columns (labels 14..53) into four-digit soil codes.
# The codes are every third entry of soil_type starting at offset 1; the slice
# is taken once here instead of once per row.
soil_codes = np.array(soil_type[1::3], dtype=object)

# Boolean mask of the set flags; argmax along each row returns the position of
# the first True, i.e. the first non-zero column, exactly what the row-wise
# nonzero()[0][0] lookup produced, but computed for all rows at once.
soil_flags = df.loc[:, 14:53].to_numpy() != 0

# argmax would silently return 0 for a row with no flag set, so fail loudly
# instead of mislabelling such a row with the first soil code.
assert soil_flags.any(axis=1).all(), "every row must have a soil-type flag set"

soil = pd.Series(soil_codes[soil_flags.argmax(axis=1)], index=df.index)
soil

0         7745
1         7745
2         4744
3         7746
4         7745
          ... 
581007    2703
581008    2703
581009    2703
581010    2703
581011    2703
Length: 581012, dtype: object

In [6]:
# Decode the one-hot wilderness columns (labels 10..13) into area names.
wilderness_labels = np.array(wilderness_area, dtype=object)

# Boolean mask of the set flags; argmax along each row returns the position of
# the first True, i.e. the first non-zero column, for all rows at once rather
# than through a Python-level call per row.
wilderness_flags = df.loc[:, 10:13].to_numpy() != 0

# argmax would silently return 0 for a row with no flag set, so fail loudly
# instead of mislabelling such a row with the first wilderness area.
assert wilderness_flags.any(axis=1).all(), "every row must have a wilderness-area flag set"

wilderness = pd.Series(wilderness_labels[wilderness_flags.argmax(axis=1)], index=df.index)
wilderness

0                  Rawah
1                  Rawah
2                  Rawah
3                  Rawah
4                  Rawah
               ...      
581007    Commanche Peak
581008    Commanche Peak
581009    Commanche Peak
581010    Commanche Peak
581011    Commanche Peak
Length: 581012, dtype: object

### Create a dataset with column names and categorical values replacing one-hot encoded soil type and wilderness areas

In [7]:
# Final column order: the ten numeric features, the two decoded categorical
# features, then the label.
COLUMN_NAMES = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area',
    'Soil_Type',
    'Cover_Type',
]

# Pieces of the processed frame, listed in the same order as COLUMN_NAMES.
numeric_features = df.loc[:, 0:9]
cover_type = df.loc[:, 54]

# ignore_index=True discards the original integer column labels; the proper
# names are assigned on the next line.
df_processed = pd.concat(
    [numeric_features, wilderness, soil, cover_type],
    axis=1,
    ignore_index=True,
)
df_processed.columns = COLUMN_NAMES
df_processed

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,Rawah,7745,5
1,2590,56,2,212,-6,390,220,235,151,6225,Rawah,7745,5
2,2804,139,9,268,65,3180,234,238,135,6121,Rawah,4744,2
3,2785,155,18,242,118,3090,238,238,122,6211,Rawah,7746,2
4,2595,45,2,153,-1,391,220,234,150,6172,Rawah,7745,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,Commanche Peak,2703,3
581008,2391,152,19,67,12,95,240,237,119,845,Commanche Peak,2703,3
581009,2386,159,17,60,7,90,236,241,130,854,Commanche Peak,2703,3
581010,2384,170,15,60,5,90,230,245,143,864,Commanche Peak,2703,3


### Save the dataset to CSV file

In [8]:
# Write the processed frame with its header row (the to_csv default) and
# without the row index.
df_processed.to_csv(FULL_DATASET, index=False)

### Create training, validation, testing and serving splits.

In [9]:
# Fixed seed so the shuffle, and therefore every split written to disk, is
# the same on each Restart & Run All.
SPLIT_SEED = 42
# Validation, testing and serving each receive this many rows; training gets
# everything else.
HOLDOUT_ROWS = 50000

df_train, df_other = train_test_split(
    df_processed,
    train_size=len(df_processed) - 3 * HOLDOUT_ROWS,
    shuffle=True,
    random_state=SPLIT_SEED,
)

# df_other is already shuffled, so consecutive slices are random samples.
df_validate = df_other.iloc[:HOLDOUT_ROWS]
df_testing = df_other.iloc[HOLDOUT_ROWS:2 * HOLDOUT_ROWS]
df_serving = df_other.iloc[2 * HOLDOUT_ROWS:3 * HOLDOUT_ROWS]

# The four splits must partition the processed dataset with no rows dropped.
assert (
    len(df_train) + len(df_validate) + len(df_testing) + len(df_serving)
    == len(df_processed)
), "splits do not cover the full dataset"

print(df_train.shape)
print(df_validate.shape)
print(df_testing.shape)
print(df_serving.shape)

(431012, 13)
(50000, 13)
(50000, 13)
(50000, 13)


In [10]:
# Persist each split next to the full dataset, with a header row and no index.
for split_frame, split_path in (
    (df_train, TRAIN_DATASET),
    (df_validate, VALIDATE_DATASET),
    (df_testing, TESTING_DATASET),
    (df_serving, SERVING_DATASET),
):
    split_frame.to_csv(split_path, header=True, index=False)

#### Generate and visualize statistics for the training split

In [11]:
# First pass over the training CSV: no schema is supplied here, so TFDV works
# out the feature types on its own.
train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATASET)
tfdv.visualize_statistics(train_stats)



Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


#### Infer and display schema

In [16]:
# Derive an initial schema from the training statistics and render it.
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Aspect',INT,required,,-
'Cover_Type',INT,required,,-
'Elevation',INT,required,,-
'Hillshade_3pm',INT,required,,-
'Hillshade_9am',INT,required,,-
'Hillshade_Noon',INT,required,,-
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Wilderness_Area',"'Cache la Poudre', 'Commanche Peak', 'Neota', 'Rawah'"


#### Fine-tune the schema

In [27]:
from tensorflow_metadata.proto.v0 import schema_pb2

# The inferred schema gave Soil_Type no string domain (presumably because its
# four-digit codes look numeric in the CSV). Declare it as a BYTES feature and
# restrict it to the known soil codes, i.e. every third entry of soil_type
# starting at offset 1.
tfdv.get_feature(schema, 'Soil_Type').type = schema_pb2.FeatureType.BYTES
tfdv.set_domain(schema, 'Soil_Type', schema_pb2.StringDomain(name='Soil_Type', value=soil_type[1::3]))

# Cover_Type is the label: an integer in [1, 7] that is treated as categorical
# rather than as a numeric quantity.
tfdv.set_domain(schema, 'Cover_Type', schema_pb2.IntDomain(name='Cover_Type', min=1, max=7, is_categorical=True))



In [28]:
# Render the schema again to check the Soil_Type and Cover_Type domains.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Aspect',INT,required,,-
'Cover_Type',INT,required,,"[1,7]"
'Elevation',INT,required,,-
'Hillshade_3pm',INT,required,,-
'Hillshade_9am',INT,required,,-
'Hillshade_Noon',INT,required,,-
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Wilderness_Area',"'Cache la Poudre', 'Commanche Peak', 'Neota', 'Rawah'"
'Soil_Type',"'2702', '2703', '2704', '2705', '2706', '2717', '3501', '3502', '4201', '4703', '4704', '4744', '4758', '5101', '5151', '6101', '6102', '6731', '7101', '7102', '7103', '7201', '7202', '7700', '7701', '7702', '7709', '7710', '7745', '7746', '7755', '7756', '7757', '7790', '8703', '8707', '8708', '8771', '8772', '8776'"


In [29]:
# Recompute the training statistics, this time taking feature types from the
# tuned schema (infer_type_from_schema=True).
stats_options = tfdv.StatsOptions(
    schema=schema,
    infer_type_from_schema=True,
)
train_stats = tfdv.generate_statistics_from_csv(TRAIN_DATASET, stats_options=stats_options)
tfdv.visualize_statistics(train_stats)

In [30]:
# Statistics for the validation split, with feature types taken from the schema.
stats_options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)

# Stored under its own name so the training statistics in `train_stats` are
# not overwritten by a different split.
validate_stats = tfdv.generate_statistics_from_csv(
    data_location=VALIDATE_DATASET,
    stats_options=stats_options
)

tfdv.visualize_statistics(validate_stats)

In [31]:
# Statistics for the testing split, with feature types taken from the schema.
stats_options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)

# Stored under its own name so the training statistics in `train_stats` are
# not overwritten by a different split.
testing_stats = tfdv.generate_statistics_from_csv(
    data_location=TESTING_DATASET,
    stats_options=stats_options
)

tfdv.visualize_statistics(testing_stats)

In [32]:
# Statistics for the serving split, with feature types taken from the schema.
stats_options = tfdv.StatsOptions(schema=schema, infer_type_from_schema=True)

# Stored under its own name so the training statistics in `train_stats` are
# not overwritten by a different split.
serving_stats = tfdv.generate_statistics_from_csv(
    data_location=SERVING_DATASET,
    stats_options=stats_options
)

tfdv.visualize_statistics(serving_stats)