In [21]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN


In [None]:
df = pd.read_csv('../data/raw/Plantaciones_TOTAL.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4484178 entries, 0 to 4484177
Data columns (total 19 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID PLANTACION          int64  
 1   PERIODO                float64
 2   DEPARTAMENTO           object 
 3   PROVINCIA              object 
 4   DISTRITO               object 
 5   ARFFS                  object 
 6   SEDE                   object 
 7   UBIGEO                 int64  
 8   FINALIDAD              object 
 9   NUMERO CERTIFICADO     object 
 10  TITULAR                object 
 11  TIPO PERSONA           object 
 12  TIPO DOCUMENTO         object 
 13  NUMERO DOCUMENTO       object 
 14  REGIMEN TENENCIA       object 
 15  TIPO PLANTACION        object 
 16  ESPECIE                object 
 17  SUPERFICIE PLANTACION  float64
 18  FECHA CORTE            int64  
dtypes: float64(2), int64(3), object(14)
memory usage: 650.0+ MB


In [4]:
df.drop(columns='ID PLANTACION', inplace=True)

In [None]:
df["PERIODO"] = df["PERIODO"].fillna(0).astype(int)


In [6]:
df["FECHA CORTE"] = pd.to_datetime(df["FECHA CORTE"].astype(str), format="%Y%m%d", errors='coerce')
df["FECHA CORTE"] = df["FECHA CORTE"].map(lambda x: x.toordinal() if pd.notnull(x) else 0)


In [7]:
df["NUMERO DOCUMENTO"] = df["NUMERO DOCUMENTO"].astype(str)

In [8]:
df.head()

Unnamed: 0,PERIODO,DEPARTAMENTO,PROVINCIA,DISTRITO,ARFFS,SEDE,UBIGEO,FINALIDAD,NUMERO CERTIFICADO,TITULAR,TIPO PERSONA,TIPO DOCUMENTO,NUMERO DOCUMENTO,REGIMEN TENENCIA,TIPO PLANTACION,ESPECIE,SUPERFICIE PLANTACION,FECHA CORTE
0,2016,ANCASH,YUNGAY,YUNGAY,ATFFS Áncash,HUARAZ,22001,PRODUCCIÓN,02-ANC/REG-PLT-2016-064,"CHAVEZ BARTOLOME, IGNACIO",PERSONA NATURAL,DNI,33323776,PREDIO PRIVADO,ARBOLES,Eucalyptus globulus,24.74,738216
1,2018,AMAZONAS,UTCUBAMBA,JAMALCA,GORE Amazonas,POMACOCHAS,10705,PRODUCCIÓN,01-AMA-BON/REG-PLT-2018-127,"ROJAS TARRILLO, ALEX",PERSONA NATURAL,DNI,33656772,PREDIO PRIVADO,ARBOLES,Pinus patula,0.61,738216
2,2018,AMAZONAS,UTCUBAMBA,JAMALCA,GORE Amazonas,POMACOCHAS,10705,PRODUCCIÓN,01-AMA-BON/REG-PLT-2018-129,"CAMACHO SANCHEZ, SANTIAGO",PERSONA NATURAL,DNI,33656754,PREDIO PRIVADO,ARBOLES,Eucalyptus saligna,0.58,738216
3,2018,AMAZONAS,UTCUBAMBA,JAMALCA,GORE Amazonas,POMACOCHAS,10705,PRODUCCIÓN,01-AMA-BON/REG-PLT-2018-135,"REGALADO TERRONES, JOSÉ PRIMITIVO",PERSONA NATURAL,DNI,45281628,PREDIO PRIVADO,ARBOLES,Pinus patula,0.96,738216
4,2018,AMAZONAS,UTCUBAMBA,JAMALCA,GORE Amazonas,POMACOCHAS,10705,PRODUCCIÓN,01-AMA-BON/REG-PLT-2018-139,"OBLITAS TARRILLO, RAMIRO",PERSONA NATURAL,DNI,45771363,PREDIO PRIVADO,ARBOLES,Eucalyptus saligna,0.5,738216


In [9]:
df.drop(columns=[
    "NUMERO CERTIFICADO",  # ID único
    "NUMERO DOCUMENTO",    # Identificador personal
    "TITULAR",             # Nombre no relevante para clustering
    "UBIGEO"               # Redundante si ya usamos DEPARTAMENTO, PROVINCIA, DISTRITO, y SEDE
], inplace=True)


In [10]:
categorical_cols = [
    "DEPARTAMENTO", "PROVINCIA", "DISTRITO",
    "ARFFS", "SEDE", "FINALIDAD", "TIPO PERSONA",
    "TIPO DOCUMENTO", "REGIMEN TENENCIA",
    "TIPO PLANTACION", "ESPECIE"
]


le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col].astype(str))


In [11]:
df.head()

Unnamed: 0,PERIODO,DEPARTAMENTO,PROVINCIA,DISTRITO,ARFFS,SEDE,FINALIDAD,TIPO PERSONA,TIPO DOCUMENTO,REGIMEN TENENCIA,TIPO PLANTACION,ESPECIE,SUPERFICIE PLANTACION,FECHA CORTE
0,2016,1,167,801,13,16,0,2,1,5,0,202,24.74,738216
1,2018,0,160,313,14,26,0,2,1,5,0,395,0.61,738216
2,2018,0,160,313,14,26,0,2,1,5,0,204,0.58,738216
3,2018,0,160,313,14,26,0,2,1,5,0,395,0.96,738216
4,2018,0,160,313,14,26,0,2,1,5,0,204,0.5,738216


In [12]:
df.drop(columns=[
    "ARFFS",
    "TIPO DOCUMENTO",
    "FECHA CORTE"
], inplace=True)


In [None]:
df.head()

Unnamed: 0,PERIODO,DEPARTAMENTO,PROVINCIA,DISTRITO,SEDE,FINALIDAD,TIPO PERSONA,REGIMEN TENENCIA,TIPO PLANTACION,ESPECIE,SUPERFICIE PLANTACION
0,2016,1,167,801,16,0,2,5,0,202,24.74
1,2018,0,160,313,26,0,2,5,0,395,0.61
2,2018,0,160,313,26,0,2,5,0,204,0.58
3,2018,0,160,313,26,0,2,5,0,395,0.96
4,2018,0,160,313,26,0,2,5,0,204,0.5


In [None]:
selected_features = [
    "PERIODO",
    "DEPARTAMENTO",
    "PROVINCIA",
    "DISTRITO",
    "SEDE",
    "FINALIDAD",
    "TIPO PERSONA",
    "REGIMEN TENENCIA",
    "TIPO PLANTACION",
    "ESPECIE",
    "SUPERFICIE PLANTACION"
]

df_filtered = df[selected_features]



In [19]:
df_filtered.head()

Unnamed: 0,PERIODO,DEPARTAMENTO,PROVINCIA,DISTRITO,SEDE,FINALIDAD,TIPO PERSONA,REGIMEN TENENCIA,TIPO PLANTACION,ESPECIE,SUPERFICIE PLANTACION
0,2016,1,167,801,16,0,2,5,0,202,24.74
1,2018,0,160,313,26,0,2,5,0,395,0.61
2,2018,0,160,313,26,0,2,5,0,204,0.58
3,2018,0,160,313,26,0,2,5,0,395,0.96
4,2018,0,160,313,26,0,2,5,0,204,0.5


In [None]:
numeric_cols = ["PERIODO", "SUPERFICIE PLANTACION"]

# --- 2) StandardScaler SOLO a esas columnas --------
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# --- 3) guarda la matriz para Go -------------------
df.to_csv("matrix.csv", index=False)

# --- 4) guarda medias y sigmas en JSON -------------
scaler_json = {
    "cols":  numeric_cols,
    "mean":  scaler.mean_.tolist(),
    "std":   scaler.scale_.tolist()
}


NameError: name 'json' is not defined

In [22]:
with open("scaler.json", "w") as f:
    json.dump(scaler_json, f, indent=2)

print("✅ matrix.csv  +  scaler.json listos")

✅ matrix.csv  +  scaler.json listos
