In [28]:
import pandas as pd

def load_data():
    csv_path = "train.csv"
    return pd.read_csv(csv_path)

database = load_data()
database.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0


In [35]:
database.columns

Index(['Id', 'Open Date', 'City', 'City Group', 'Type', 'P1', 'P2', 'P3', 'P4',
       'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15',
       'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25',
       'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35',
       'P36', 'P37', 'revenue'],
      dtype='object')

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attribs = database.columns[5:-1]
cat_attribs = ["City", "City Group", "Type"]

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Imputación de valores faltantes usando la mediana
    ('scaler', StandardScaler())  # Escalado de características para normalizar los datos
    ])

cat_pipeline = Pipeline([
    ("encoder", OneHotEncoder())
    ])

full_pipeline = ColumnTransformer([
    ("categorical", cat_pipeline, cat_attribs),
    ("numerical", num_pipeline, num_attribs)
    ])

data_prepared = full_pipeline.fit_transform(database)
data_labels = database["revenue"].copy()

In [51]:
print("Número de columnas en data_prepared:", data_prepared.shape[1])
print("Nombres de las columnas en data_prepared:")

Número de columnas en data_prepared: 76
Nombres de las columnas en data_prepared:


In [48]:
data_prepared

array([[ 0.        ,  0.        ,  0.        , ...,  0.57563408,
         0.18982104,  1.61595083],
       [ 0.        ,  0.        ,  0.        , ..., -0.59268991,
        -0.53255348, -0.62592526],
       [ 0.        ,  0.        ,  0.        , ..., -0.59268991,
        -0.53255348, -0.62592526],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.59268991,
        -0.53255348, -0.62592526],
       [ 0.        ,  0.        ,  0.        , ..., -0.59268991,
        -0.53255348, -0.62592526],
       [ 0.        ,  0.        ,  0.        , ..., -0.59268991,
        -0.53255348, -0.62592526]])

In [49]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(data_prepared, data_labels)

In [50]:
from sklearn.metrics import r2_score
predictions = lin_reg.predict(data_prepared)
r2 = r2_score(data_labels, predictions)
print(f"Coeficiente de determinación (R²): {r2}")

Coeficiente de determinación (R²): 0.5100986105191018
