In [1]:
# Import libraries
import os
import requests
import pandas as pd
import tensorflow as tf
import tensorflow_data_validation as tfdv
from sklearn.model_selection import train_test_split
print('TF version:', tf.__version__)
print('TFDV version:', tfdv.version.__version__)

2025-02-23 17:23:06.291161: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-23 17:23:06.291813: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-23 17:23:06.293763: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-23 17:23:06.299285: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-23 17:23:06.310796: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registe

TF version: 2.16.2
TFDV version: 1.16.1


In [2]:
from data_preparation import *
from model_creation import *
from sklearn.feature_selection import SelectKBest, chi2, f_classif

In [3]:
## download the dataset
# Directory of the raw data files
_data_root = './data/covertype'
# Path to the raw training data
_data_filepath = os.path.join(_data_root, 'covertype_train.csv')

# Download data
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath):
    #https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    url = 'https://docs.google.com/uc?export= \
    download&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9'
    r = requests.get(url, allow_redirects=True, stream=True)
    open(_data_filepath, 'wb').write(r.content)
    
df = cargar_datos(_data_filepath)

Elevation                              int64
Aspect                                 int64
Slope                                  int64
Horizontal_Distance_To_Hydrology       int64
Vertical_Distance_To_Hydrology         int64
Horizontal_Distance_To_Roadways        int64
Hillshade_9am                          int64
Hillshade_Noon                         int64
Hillshade_3pm                          int64
Horizontal_Distance_To_Fire_Points     int64
Wilderness_Area                       object
Soil_Type                             object
Cover_Type                             int64
dtype: object


In [4]:
# Se seleccionan solo las variables numéricas
df_numerico = df.select_dtypes(include=['number'])

In [5]:
df_numerico.dtypes

Elevation                             int64
Aspect                                int64
Slope                                 int64
Horizontal_Distance_To_Hydrology      int64
Vertical_Distance_To_Hydrology        int64
Horizontal_Distance_To_Roadways       int64
Hillshade_9am                         int64
Hillshade_Noon                        int64
Hillshade_3pm                         int64
Horizontal_Distance_To_Fire_Points    int64
Cover_Type                            int64
dtype: object

In [6]:
X = df_numerico.drop(['Cover_Type'], axis=1)
y = df_numerico['Cover_Type']

In [7]:
# Aplicar SelectKBest
selector = SelectKBest(f_classif, k=8)
X_new = selector.fit_transform(X, y)

# Obtener los nombres de las columnas seleccionadas
columnas_seleccionadas = X.columns[selector.get_support()]

# Convertir nuevamente en DataFrame
X_new = pd.DataFrame(X_new, columns=columnas_seleccionadas, index=X.index)

In [8]:
# Suponiendo que X_train es un DataFrame y y_train es una Serie o DataFrame
df_new = pd.concat([X_new, y], axis=1)

In [9]:
# Validate objective data distribution
df_new['Cover_Type'].value_counts(normalize=True)

1    0.488111
0    0.364078
2    0.062201
6    0.034810
5    0.029930
4    0.016282
3    0.004587
Name: Cover_Type, dtype: float64

In [10]:
X_train, X_test, y_train, y_test, preprocessor = preparar_datos(df_new)

In [11]:
# Load train frata from DataFrame
train_stats = tfdv.generate_statistics_from_dataframe(X_train)

In [12]:
# Compute stats for training data
# Analyze data distribution and check if transformation are required
tfdv.visualize_statistics(train_stats)

In [13]:
# Infer schema 
schema = tfdv.infer_schema(statistics=train_stats)
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Elevation',INT,required,,-
'Slope',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Vertical_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Hillshade_9am',INT,required,,-
'Hillshade_Noon',INT,required,,-
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'__index_level_0__',INT,required,,-


In [14]:
# Compute stats for evaluation data
eval_stats = tfdv.generate_statistics_from_dataframe(X_test)