# Covertype Data Set Preprocessing

In [None]:
# gcsfs is installed so the gs:// dataset path configured below can be opened
# (presumably through pandas' filesystem integration — confirm for the pandas version in use).
%pip install gcsfs

In [2]:
import numpy as np
import pandas as pd
import tensorflow_data_validation as tfdv

from sklearn.model_selection import train_test_split

### Set the paths

In [3]:
# Local CSV files produced by this notebook (all written one directory up).
FULL_DATASET = '../covertype.csv'
TRAINING_DATASET = '../covertype_training.csv'
TRAINING_DATASET_WITH_MISSING = '../covertype_training_missing.csv'
EVALUATION_DATASET = '../covertype_evaluation.csv'
EVALUATION_DATASET_WITH_ANOMALIES = '../covertype_evaluation_anomalies.csv'
SERVING_DATASET = '../covertype_serving.csv'

# Cloud Storage location of the original, unprocessed dataset file.
ORIGINAL_DATASET_PATH = 'gs://workshop-datasets/covertype/orig/covtype.data'

## Preprocess the original dataset

### Load the dataset

In [None]:
# The original file is read without a header row, so its columns are addressed
# by integer position labels in the cells that follow.
df = pd.read_csv(
    ORIGINAL_DATASET_PATH,
    header=None,
)
print(df.shape)
df.head()

### Configure soil type and wilderness area domains

In [None]:
# Flat list of soil-type triples: (ordinal, soil code, description).
# Slicing with a step of 3 extracts one field, e.g. soil_type[1::3] is the list of codes.
soil_type = [
    "1", "2702", "Cathedral family - Rock outcrop complex, extremely stony.",
    "2", "2703", "Vanet - Ratake families complex, very stony.",
    "3", "2704", "Haploborolis - Rock outcrop complex, rubbly.",
    "4", "2705", "Ratake family - Rock outcrop complex, rubbly.",
    "5", "2706", "Vanet family - Rock outcrop complex complex, rubbly.",
    "6", "2717", "Vanet - Wetmore families - Rock outcrop complex, stony.",
    "7", "3501", "Gothic family.",
    "8", "3502", "Supervisor - Limber families complex.",
    "9", "4201", "Troutville family, very stony.",
    "10", "4703", "Bullwark - Catamount families - Rock outcrop complex, rubbly.",
    "11", "4704", "Bullwark - Catamount families - Rock land complex, rubbly.",
    "12", "4744", "Legault family - Rock land complex, stony.",
    "13", "4758", "Catamount family - Rock land - Bullwark family complex, rubbly.",
    "14", "5101", "Pachic Argiborolis - Aquolis complex.",
    "15", "5151", "unspecified in the USFS Soil and ELU Survey.",
    "16", "6101", "Cryaquolis - Cryoborolis complex.",
    "17", "6102", "Gateview family - Cryaquolis complex.",
    "18", "6731", "Rogert family, very stony.",
    "19", "7101", "Typic Cryaquolis - Borohemists complex.",
    "20", "7102", "Typic Cryaquepts - Typic Cryaquolls complex.",
    "21", "7103", "Typic Cryaquolls - Leighcan family, till substratum complex.",
    "22", "7201", "Leighcan family, till substratum, extremely bouldery.",
    "23", "7202", "Leighcan family, till substratum - Typic Cryaquolls complex.",
    "24", "7700", "Leighcan family, extremely stony.",
    "25", "7701", "Leighcan family, warm, extremely stony.",
    "26", "7702", "Granile - Catamount families complex, very stony.",
    "27", "7709", "Leighcan family, warm - Rock outcrop complex, extremely stony.",
    "28", "7710", "Leighcan family - Rock outcrop complex, extremely stony.",
    "29", "7745", "Como - Legault families complex, extremely stony.",
    "30", "7746", "Como family - Rock land - Legault family complex, extremely stony.",
    "31", "7755", "Leighcan - Catamount families complex, extremely stony.",
    "32", "7756", "Catamount family - Rock outcrop - Leighcan family complex, extremely stony.",
    "33", "7757", "Leighcan - Catamount families - Rock outcrop complex, extremely stony.",
    "34", "7790", "Cryorthents - Rock land complex, extremely stony.",
    "35", "8703", "Cryumbrepts - Rock outcrop - Cryaquepts complex.",
    "36", "8707", "Bross family - Rock land - Cryumbrepts complex, extremely stony.",
    "37", "8708", "Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony.",
    "38", "8771", "Leighcan - Moran families - Cryaquolls complex, extremely stony.",
    "39", "8772", "Moran family - Cryorthents - Leighcan family complex, extremely stony.",
    "40", "8776", "Moran family - Cryorthents - Rock land complex, extremely stony.",
]

# Flat list of wilderness-area pairs: (short name, full name).
# wilderness_area[0::2] yields the short names used as categorical values.
wilderness_area = [
    "Rawah", "Rawah Wilderness Area",
    "Neota", "Neota Wilderness Area",
    "Commanche", "Comanche Peak Wilderness Area",
    "Cache", "Cache la Poudre Wilderness Area",
]

### Map one-hot encoded values to categorical domains

In [None]:
# Decode the one-hot soil-type columns (labels 14..53) into soil codes.
# soil_type is a flat list of (ordinal, code, description) triples, so [1::3]
# yields the codes in the same order as the one-hot columns.
# This is done with array operations instead of a row-wise apply, which calls a
# Python lambda once per row and is very slow on the full dataset.
soil_codes = np.array(soil_type[1::3], dtype=object)
soil_flags = df.loc[:, 14:53].to_numpy() != 0

# argmax alone would silently map a row with no flag set to the first code,
# so fail loudly instead (the row-wise version raised IndexError in that case).
assert soil_flags.any(axis=1).all(), "found rows without any soil type flag set"

# argmax over booleans returns the position of the first True, i.e. the first
# non-zero column of each row.
soil = pd.Series(soil_codes[soil_flags.argmax(axis=1)], index=df.index)
soil

In [None]:
# Decode the one-hot wilderness-area columns (labels 10..13) into short names.
# wilderness_area is a flat list of (short name, full name) pairs, so [0::2]
# yields the short names in the same order as the one-hot columns.
# This is done with array operations instead of a row-wise apply, which calls a
# Python lambda once per row and is very slow on the full dataset.
wilderness_names = np.array(wilderness_area[0::2], dtype=object)
wilderness_flags = df.loc[:, 10:13].to_numpy() != 0

# argmax alone would silently map a row with no flag set to the first name,
# so fail loudly instead (the row-wise version raised IndexError in that case).
assert wilderness_flags.any(axis=1).all(), "found rows without any wilderness area flag set"

# argmax over booleans returns the position of the first True, i.e. the first
# non-zero column of each row.
wilderness = pd.Series(wilderness_names[wilderness_flags.argmax(axis=1)], index=df.index)
wilderness

### Create a dataset with column names and categorical values replacing one-hot encoded soil type and wilderness areas

In [None]:
# Column names of the processed dataset, in output order.
COLUMN_NAMES = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
    'Wilderness_Area',
    'Soil_Type',
    'Cover_Type',
]

# Assemble: the first ten raw columns, the two decoded categorical series,
# then raw column 54 (named Cover_Type below).
df_full = pd.concat(
    [
        df.loc[:, 0:9],
        wilderness,
        soil,
        df.loc[:, 54],
    ],
    axis=1,
    ignore_index=True,
)
df_full.columns = COLUMN_NAMES
df_full

### Save the dataset to CSV file

In [None]:
# Write the processed dataset with a header row and without the index column.
df_full.to_csv(
    FULL_DATASET,
    index=False,
    header=True,
)

## Create training, evaluation and serving splits

In [4]:
# Reload the processed dataset from the local CSV written earlier in this notebook.
df_full = pd.read_csv(FULL_DATASET)
# NOTE(review): `df` is rebound to the processed frame here, so it no longer
# refers to the raw one-hot frame loaded at the top of the notebook.
df = df_full
df_full

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,Rawah,7745,5
1,2590,56,2,212,-6,390,220,235,151,6225,Rawah,7745,5
2,2804,139,9,268,65,3180,234,238,135,6121,Rawah,4744,2
3,2785,155,18,242,118,3090,238,238,122,6211,Rawah,7746,2
4,2595,45,2,153,-1,391,220,234,150,6172,Rawah,7745,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,Commanche,2703,3
581008,2391,152,19,67,12,95,240,237,119,845,Commanche,2703,3
581009,2386,159,17,60,7,90,236,241,130,854,Commanche,2703,3
581010,2384,170,15,60,5,90,230,245,143,864,Commanche,2703,3


In [5]:
# Number of examples per soil code.
df_full['Soil_Type'].value_counts()

7745    115247
7202     57752
7756     52519
7757     45154
7201     33373
4703     32634
7746     30170
4744     29971
7755     25666
7700     21278
4758     17431
8771     15573
8772     13806
4704     12410
2705     12396
7102      9259
8776      8750
2703      7525
2717      6575
2704      4823
7101      4021
6102      3422
2702      3031
6101      2845
7702      2589
6731      1899
8703      1891
7790      1611
2706      1597
4201      1147
7709      1086
7710       946
7103       838
5101       599
7701       474
8708       298
3502       179
8707       119
3501       105
5151         3
Name: Soil_Type, dtype: int64

In [6]:
# Separate the examples with soil code 5151 from the rest of the dataset.
df_5151 = df_full.loc[df_full.Soil_Type.eq(5151)]
df_no_5151 = df_full.loc[df_full.Soil_Type.ne(5151)]

In [7]:
# Fixed seed so the stratified splits — and every file exported from them —
# are reproducible across kernel restarts. Without it each run shuffles differently.
SPLIT_SEED = 42

# Carve out the training split first, stratified on the Cover_Type label.
df_train, df_other = train_test_split(
    df_no_5151,
    train_size=431009,
    stratify=df_no_5151.Cover_Type,
    random_state=SPLIT_SEED,
)

# Divide the remainder into the evaluation and serving splits, again stratified on the label.
df_evaluate, df_serving = train_test_split(
    df_other,
    train_size=75000,
    stratify=df_other.Cover_Type,
    random_state=SPLIT_SEED,
)

# The serving split is stored without the label column.
df_serving = df_serving.drop(columns=['Cover_Type'])

# The three splits must partition the input exactly.
assert len(df_train) + len(df_evaluate) + len(df_serving) == len(df_no_5151)

print(df_train.shape)
print(df_evaluate.shape)
print(df_serving.shape)

(431009, 13)
(75000, 13)
(75000, 12)


Create a copy of the training split in which `Horizontal_Distance_To_Hydrology` is missing for the first 9,000 examples.

In [8]:
# Re-index a copy of the training split from 0 so the label slice below
# addresses its first rows.
df_train_missing = df_train.reset_index(drop=True)
# .loc slices are inclusive, so labels 0 through 8999 are blanked out.
df_train_missing.loc[:8999, 'Horizontal_Distance_To_Hydrology'] = None
df_train_missing

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,3168,60,16,,0,1415,231,205,102,371,Commanche,7700,1
1,2553,40,16,,-15,782,220,204,115,60,Rawah,7745,2
2,2936,57,11,,40,4422,228,216,120,6260,Rawah,4744,2
3,2759,10,10,,61,1969,209,220,148,2123,Rawah,4744,2
4,3294,338,5,,0,4184,209,233,162,3852,Rawah,8771,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
431004,3103,202,21,443.0,85,2533,205,253,174,2986,Commanche,7757,2
431005,2408,10,11,446.0,166,1556,208,218,146,691,Cache,2717,3
431006,2217,68,18,30.0,-11,930,236,200,89,967,Cache,2706,4
431007,3259,131,13,0.0,0,1209,241,232,118,1218,Rawah,7201,1


Create an evaluation split with anomalies: the first five examples have `Slope` set to 110 (more than 90 degrees), and the 3 examples with soil type code 5151, which is not present in the training split, are appended.

In [9]:
# Re-index a copy of the evaluation split from 0 so the label slice below
# addresses its first rows.
df_evaluate_anomalies = df_evaluate.reset_index(drop=True)
# .loc slices are inclusive: labels 0 through 4 get the anomalous slope value.
df_evaluate_anomalies.loc[:4, 'Slope'] = 110
# Append the soil-code-5151 examples; they keep their original index labels.
df_evaluate_anomalies = pd.concat(
    [df_evaluate_anomalies, df_5151],
    axis=0,
)
df_evaluate_anomalies

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2862,294,110,120,18,1273,154,229,213,1323,Commanche,7757,1
1,2988,210,110,90,0,5366,211,251,174,2024,Rawah,7202,1
2,3107,51,110,212,-13,2455,223,227,139,2322,Commanche,7756,1
3,3189,5,110,362,49,1765,188,194,139,1900,Commanche,8772,1
4,2623,162,110,30,1,1106,230,243,144,6332,Rawah,7745,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74998,2905,254,9,242,47,1914,200,246,185,1170,Commanche,7756,2
74999,2849,199,32,182,98,1405,190,247,169,1825,Commanche,4758,2
241543,2078,34,10,0,0,212,219,218,134,484,Cache,5151,6
241544,2080,13,19,30,0,192,198,197,132,499,Cache,5151,6


In [10]:
# Number of examples per soil code in the anomalous evaluation split.
df_evaluate_anomalies['Soil_Type'].value_counts()

7745    14850
7202     7497
7756     6884
7757     5831
7201     4326
4703     4240
7746     3949
4744     3794
7755     3212
7700     2828
4758     2285
8771     1979
8772     1752
2705     1631
4704     1564
7102     1172
8776     1100
2703      989
2717      866
2704      616
7101      521
6102      443
2702      397
6101      380
7702      299
6731      245
8703      239
2706      203
7790      186
4201      145
7709      135
7710      127
7103      106
5101       75
7701       46
8708       40
3502       23
3501       16
8707        9
5151        3
Name: Soil_Type, dtype: int64

### Save the splits to local files.

In [None]:
# Write every split to its local CSV path, with a header row and without the index column.
for split_frame, split_path in (
    (df_train, TRAINING_DATASET),
    (df_train_missing, TRAINING_DATASET_WITH_MISSING),
    (df_evaluate, EVALUATION_DATASET),
    (df_evaluate_anomalies, EVALUATION_DATASET_WITH_ANOMALIES),
    (df_serving, SERVING_DATASET),
):
    split_frame.to_csv(split_path, header=True, index=False)

### Copy the splits to GCS

In [None]:
# Upload each local split to its own folder under gs://workshop-datasets/covertype/.
# The $NAME arguments refer to the path variables defined in the configuration cell.
!gsutil cp $TRAINING_DATASET gs://workshop-datasets/covertype/training/
!gsutil cp $TRAINING_DATASET_WITH_MISSING gs://workshop-datasets/covertype/training_missing/
!gsutil cp $EVALUATION_DATASET gs://workshop-datasets/covertype/evaluation/
!gsutil cp $EVALUATION_DATASET_WITH_ANOMALIES gs://workshop-datasets/covertype/evaluation_anomalies/
!gsutil cp $SERVING_DATASET gs://workshop-datasets/covertype/serving/