# Carregando os dados

In [17]:
import pandas as pd

def load_data(data_path: str) -> pd.DataFrame:
    """Read the CSV file at ``data_path`` into a DataFrame.

    Args:
        data_path: Location of the CSV file; handed unchanged to
            ``pd.read_csv``.

    Returns:
        The file contents parsed with pandas' default CSV options.
    """
    return pd.read_csv(data_path)

# Read the Ames CSV (path is relative to the notebook) and preview the first rows.
df = load_data(data_path="../data/ames.csv")
df.head(n=5)

Unnamed: 0,Order,PID,MS.SubClass,MS.Zoning,Lot.Frontage,Lot.Area,Street,Alley,Lot.Shape,Land.Contour,...,Pool.Area,Pool.QC,Fence,Misc.Feature,Misc.Val,Mo.Sold,Yr.Sold,Sale.Type,Sale.Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


# Escolhendo features de interesse (15)

#### Usamos o site "American Statistical Association" para entender melhor o que cada coluna significa
##### link (https://jse.amstat.org/v19n3/decock/DataDocumentation.txt)


- As features escolhidas foram:
    - Numéricas:
        - Gr.Liv.Area (área habitável acima do nível do solo, em pés quadrados; todo mundo gosta de um lugar aconchegante e espaçoso pra receber os amigos, né?)

        - Garage.Cars 

        - Lot.Area (area total do lote)

        - TotRms.AbvGrd (nº de cômodos acima do solo, sem contar o porão; mais cômodos --> casa maior)

        - Year.Remod.Add (reformas recentes tendem a aumentar o preço; além disso, uma casa antiga pode ter sido reformada recentemente e ficar melhor que uma casa nova, por isso não é interessante olhar para o ano de construção da casa)

        - Total.Bsmt.SF (area do porao)

        - X1st.Flr.SF (area do 1o andar)

        - X2nd.Flr.SF (area do 2o andar)

        - Bsmt.Full.Bath (banheiros completos no porão)

        - Bsmt.Half.Bath (lavabos no porão)

        - Full.Bath

        - Half.Bath

        - Pool.Area

    - Categóricas:
        - Utilities

        - Neighborhood (sabemos que nos EUA eles valorizam muito o bairro, por conta da escola que os filhos vão frequentar, da vizinhança...)

        - Overall.Qual (qualidade geral da casa; esse é obviamente importante)

### Note que escolhemos mais de 15 features, mas vamos combinar algumas delas na seção "Feature Engineering"

In [18]:
def filter_data(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the columns chosen for modelling, followed by the target.

    Args:
        df: Frame that contains every selected column.

    Returns:
        A frame restricted to the numeric features, then the categorical
        features, then ``SalePrice``, in that order.

    Raises:
        KeyError: If any selected column is absent from ``df``.
    """
    numeric_columns = [
        "Gr.Liv.Area",
        "Garage.Cars",
        "Lot.Area",
        "TotRms.AbvGrd",
        "Year.Remod.Add",
        "Total.Bsmt.SF",
        "X1st.Flr.SF",
        "X2nd.Flr.SF",
        "Bsmt.Full.Bath",
        "Bsmt.Half.Bath",
        "Full.Bath",
        "Half.Bath",
        "Pool.Area",
    ]
    categorical_columns = ["Utilities", "Neighborhood", "Overall.Qual"]
    target_column = "SalePrice"

    # Column order of the result follows this concatenation.
    columns_to_keep = [*numeric_columns, *categorical_columns, target_column]
    return df[columns_to_keep]


# Narrow the frame to the selected features plus the target, then preview it.
df = df.pipe(filter_data)

df.head(n=5)

Unnamed: 0,Gr.Liv.Area,Garage.Cars,Lot.Area,TotRms.AbvGrd,Year.Remod.Add,Total.Bsmt.SF,X1st.Flr.SF,X2nd.Flr.SF,Bsmt.Full.Bath,Bsmt.Half.Bath,Full.Bath,Half.Bath,Pool.Area,Utilities,Neighborhood,Overall.Qual,SalePrice
0,1656,2.0,31770,7,1960,1080.0,1656,0,1.0,0.0,1,0,0,AllPub,NAmes,6,215000
1,896,1.0,11622,5,1961,882.0,896,0,0.0,0.0,1,0,0,AllPub,NAmes,5,105000
2,1329,1.0,14267,6,1958,1329.0,1329,0,0.0,0.0,1,1,0,AllPub,NAmes,6,172000
3,2110,2.0,11160,8,1968,2110.0,2110,0,1.0,0.0,2,1,0,AllPub,NAmes,7,244000
4,1629,2.0,13830,6,1998,928.0,928,701,0.0,0.0,2,1,0,AllPub,Gilbert,5,189900


# Feature engineering 

In [19]:
from datetime import datetime

def feature_engineering(df: pd.DataFrame, reference_year: "int | None" = None) -> pd.DataFrame:
    """Derive aggregate features and drop the raw columns they replace.

    New columns:
        - ``Years.Snc.Remod``: ``reference_year - Year.Remod.Add``.
        - ``Total.SF``: basement + 1st floor + 2nd floor area, with missing
          values counted as 0.
        - ``Total.Bath``: full baths plus half baths weighted 0.5 (basement
          and above), with missing values counted as 0.
        - ``Has.Pool``: 1 when ``Pool.Area > 0``, otherwise 0.

    Args:
        df: Frame containing the raw columns listed in ``columns_to_drop``.
            It is not modified.
        reference_year: Year used to compute ``Years.Snc.Remod``. When
            ``None`` (the default) the current calendar year is used, which
            means the result depends on when the code runs; pass a fixed
            year to get reproducible output.

    Returns:
        A new frame with the derived columns appended and the raw source
        columns removed.

    Raises:
        KeyError: If any of the raw source columns is absent from ``df``.
    """
    if reference_year is None:
        # Backward-compatible default: tie the feature to the wall clock.
        reference_year = datetime.now().year

    def filled(column: str) -> pd.Series:
        """Return ``df[column]`` with missing values replaced by 0."""
        return df[column].fillna(0)

    derived = {
        'Years.Snc.Remod': reference_year - df['Year.Remod.Add'],
        'Total.SF': (
            filled('Total.Bsmt.SF')
            + filled('X1st.Flr.SF')
            + filled('X2nd.Flr.SF')
        ),
        # A half bath counts as 0.5 of a bathroom.
        'Total.Bath': (
            filled('Bsmt.Full.Bath')
            + 0.5 * filled('Bsmt.Half.Bath')
            + filled('Full.Bath')
            + 0.5 * filled('Half.Bath')
        ),
        # A missing Pool.Area compares False against 0, so it maps to 0.
        'Has.Pool': (df['Pool.Area'] > 0).astype(int),
    }

    # Raw columns that are fully represented by the derived features.
    columns_to_drop = [
        'Year.Remod.Add',
        'Total.Bsmt.SF',
        'X1st.Flr.SF',
        'X2nd.Flr.SF',
        'Bsmt.Full.Bath',
        'Bsmt.Half.Bath',
        'Full.Bath',
        'Half.Bath',
        'Pool.Area'
    ]

    # assign() returns a new frame, so the caller's frame is left untouched.
    return df.assign(**derived).drop(columns=columns_to_drop)

# Swap the raw columns for the engineered features, then preview the result.
df = df.pipe(feature_engineering)
df.head(n=5)

Unnamed: 0,Gr.Liv.Area,Garage.Cars,Lot.Area,TotRms.AbvGrd,Utilities,Neighborhood,Overall.Qual,SalePrice,Years.Snc.Remod,Total.SF,Total.Bath,Has.Pool
0,1656,2.0,31770,7,AllPub,NAmes,6,215000,65,2736.0,2.0,0
1,896,1.0,11622,5,AllPub,NAmes,5,105000,64,1778.0,1.0,0
2,1329,1.0,14267,6,AllPub,NAmes,6,172000,67,2658.0,1.5,0
3,2110,2.0,11160,8,AllPub,NAmes,7,244000,57,4220.0,3.5,0
4,1629,2.0,13830,6,AllPub,Gilbert,5,189900,27,2557.0,2.5,0
