# 🏠 Ev Fiyat Tahmin Projesi - Feature Engineering ve Veri Ön İşleme

Bu notebook, eksik değer doldurma ve basit özellik türetme adımlarını içerir.


In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Show every column when a DataFrame is displayed instead of truncating the middle ones.
pd.options.display.max_columns = None
print("✅ Kütüphaneler yüklendi")


In [None]:
# Load both splits and stack them (train rows first, fresh 0..n-1 index) so
# every preprocessing step below is applied to train and test together.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/housePrice/house_price_train.csv',
        'datasets/housePrice/house_price_test.csv',
    )
)
df = pd.concat([train, test], ignore_index=True)
print(df.shape)
df.head(2)


In [None]:
# Missing-value imputation: numeric = median, categorical = mode.
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=['object']).columns

# SalePrice is the target; the target-encoding cell notes it only exists for
# the train rows. Median-filling it would fabricate labels for the test rows,
# so it is excluded from the imputation (num_cols itself is left unchanged
# because later cells iterate over it).
impute_num_cols = [c for c in num_cols if c != 'SalePrice']

# A later cell fills these columns with the category 'No' (a missing value
# there means the house lacks the feature). That fill only has an effect if
# the NaNs are still present, so the mode imputation must skip these columns.
absence_coded_cols = {
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
}
impute_cat_cols = [c for c in cat_cols if c not in absence_coded_cols]

if impute_num_cols:
    df[impute_num_cols] = SimpleImputer(strategy='median').fit_transform(df[impute_num_cols])
if impute_cat_cols:
    df[impute_cat_cols] = SimpleImputer(strategy='most_frequent').fit_transform(df[impute_cat_cols])

print("Eksik değer doldurma tamamlandı")
# Remaining NaN count: the untouched SalePrice values plus the absence-coded
# columns that are still waiting for their 'No' fill.
df.isnull().sum().sum()


In [None]:
# Basic features: total square footage of both floors plus the basement,
# created only when all three source columns are present.
if all(part in df.columns for part in ('1stFlrSF', '2ndFlrSF', 'TotalBsmtSF')):
    df['TotalSF'] = df['1stFlrSF'].add(df['2ndFlrSF']).add(df['TotalBsmtSF'])

print("Yeni sütunlar:", [name for name in df.columns if name.endswith('SF')][-5:])
# Cell output: preview the new column, or the first row if it was not created.
df.head(1) if 'TotalSF' not in df.columns else df[['TotalSF']].head()


In [None]:
# Only show the DataFrame shape as this cell's output (saving is optional).
print("Güncel şekil:", df.shape)


## 🧱 Aykırı Değer (Outlier) İşleme
Sayısal değişkenlerde, 0.10 ve 0.90 çeyreklikleri (quantile) arasındaki aralığa dayanan IQR benzeri alt/üst sınırları kullanarak aykırı değerleri baskılarız (winsorize).


In [None]:
def outlier_thresholds(dataframe, variable, low_quantile=0.10, up_quantile=0.90):
    """Return the ``(low_limit, up_limit)`` outlier fences for one column.

    The fences follow the IQR recipe, but on the requested quantiles:
    ``spread`` is the distance between the two quantiles of
    ``dataframe[variable]`` and the limits lie ``1.5 * spread`` outside them.

    Args:
        dataframe: Frame that contains the column.
        variable: Name of the column to analyse.
        low_quantile: Lower quantile (default 0.10).
        up_quantile: Upper quantile (default 0.90).

    Returns:
        Tuple ``(low_limit, up_limit)``.
    """
    column = dataframe[variable]
    lower_q = column.quantile(low_quantile)
    upper_q = column.quantile(up_quantile)
    spread = upper_q - lower_q
    return lower_q - 1.5 * spread, upper_q + 1.5 * spread


def replace_with_thresholds(dataframe, variable):
    """Clip one column of ``dataframe`` in place to its outlier fences.

    The fences come from ``outlier_thresholds`` with its default quantiles.
    Values below the lower fence are set to it and values above the upper
    fence are set to it. Nothing is returned.
    """
    lower, upper = outlier_thresholds(dataframe, variable)
    too_small = dataframe[variable] < lower
    dataframe.loc[too_small, variable] = lower
    too_large = dataframe[variable] > upper
    dataframe.loc[too_large, variable] = upper

# Winsorize every numeric column; the target is deliberately left untouched.
for col in num_cols:
    if col == 'SalePrice':
        continue
    replace_with_thresholds(df, col)

print('Aykırı değer baskılama tamamlandı (0.10-0.90 IQR yöntemi)')


In [None]:
# Columns in which a missing value is encoded as the category 'No'.
no_cols = [
    'Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
]
for col in no_cols:
    # Columns that are absent from df are skipped.
    if col in df.columns:
        df[col] = df[col].fillna('No')
print('"No" ile doldurulan sütunlar işlendi')


In [None]:
def quick_missing_imp(data, num_method='median', cat_length=17, target='SalePrice'):
    """Return a copy of ``data`` with missing values filled; the target is kept as-is.

    Object columns whose number of distinct values (NaN counts as one) is at
    most ``cat_length`` are filled with their mode. Every non-object column is
    filled with its mean (``num_method='mean'``) or its median (any other
    value). Columns without missing values are not touched.

    Args:
        data: Input frame; it is not modified.
        num_method: ``'mean'`` or ``'median'`` fill for non-object columns.
        cat_length: Maximum distinct-value count for an object column to be
            mode-filled.
        target: Column restored to its original values (including NaN) after
            filling; ignored when absent from ``data``.

    Returns:
        A new DataFrame with the filled values.
    """
    original_target = data[target].copy() if target in data.columns else None
    filled = data.copy()

    for col in filled.columns:
        series = filled[col]
        if not series.isnull().any():
            continue
        if series.dtype == 'O':
            if len(series.unique()) > cat_length:
                continue
            modes = series.mode()
            # An all-NaN column has no mode; leave it alone instead of
            # failing on an empty lookup.
            if not modes.empty:
                filled[col] = series.fillna(modes.iloc[0])
        else:
            fill_value = series.mean() if num_method == 'mean' else series.median()
            filled[col] = series.fillna(fill_value)

    if original_target is not None:
        # The target must never be imputed, so undo any fill applied above.
        filled[target] = original_target
    return filled

# Second imputation pass over the combined frame (median for numeric columns,
# mode for object columns with at most 17 distinct values); quick_missing_imp
# restores the SalePrice column to its pre-fill values.
df = quick_missing_imp(df, num_method='median', cat_length=17)
print('Eksik değer doldurma (quick_missing_imp) tamamlandı')


In [None]:
def rare_encoder(dataframe, rare_perc=0.01):
    """Return a copy of ``dataframe`` with infrequent categories merged into 'Rare'.

    For every object column, each label whose share of the rows is below
    ``rare_perc`` is replaced by the string ``'Rare'``. Other columns and the
    input frame itself are left unchanged.
    """
    encoded = dataframe.copy()
    n_rows = len(encoded)
    for name in encoded.columns:
        if encoded[name].dtype != 'O':
            continue
        shares = encoded[name].value_counts() / n_rows
        infrequent = shares[shares < rare_perc].index
        if len(infrequent) == 0:
            # Nothing rare in this column, so it is kept exactly as it is.
            continue
        encoded[name] = np.where(encoded[name].isin(infrequent), 'Rare', encoded[name])
    return encoded

# Merge labels covering less than 1% of the rows into 'Rare' for every object column.
df = rare_encoder(df, rare_perc=0.01)
print('Rare encoder uygulandı')


## 🧩 Hedefe Duyarlı Kategorik Kodlama (Target Mean Encoding)
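Az gözlemli kategorilerde ortalamanın aşırı oynak olmasını önlemek için, kategori ortalamasını genel ortalamaya doğru çeken basit bir yumuşatma (smoothing) ile hedef ortalaması kodlaması uygularız.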


In [None]:
# Extra features (aligned with the script version). Every feature is guarded
# so that a missing input column skips the feature instead of raising.
if {'1stFlrSF','GrLivArea'}.issubset(df.columns):
    df['NEW_1st*GrLiv'] = df['1stFlrSF'] * df['GrLivArea']
if {'GarageArea','GrLivArea'}.issubset(df.columns):
    df['NEW_Garage*GrLiv'] = df['GarageArea'] * df['GrLivArea']

# TotalQual sums only those quality columns that are currently numeric
# (int64/float64); it is 0 when none of them qualifies.
qual_cols = ["OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtCond", "BsmtFinType1",
             "BsmtFinType2", "HeatingQC", "KitchenQual", "Functional", "FireplaceQu", "GarageQual", "GarageCond", "Fence"]
numeric_qual_cols = [col for col in qual_cols if col in df.columns and df[col].dtype in ['int64','float64']]
df['TotalQual'] = df[numeric_qual_cols].sum(axis=1) if numeric_qual_cols else 0

if {'GrLivArea','TotalBsmtSF'}.issubset(df.columns):
    df['NEW_TotalSqFeet'] = df['GrLivArea'] + df['TotalBsmtSF']
if {'GarageArea','LotArea'}.issubset(df.columns):
    df['NEW_GarageLotRatio'] = df['GarageArea'] / df['LotArea']

# NEW_MasVnrRatio and NEW_DifArea need NEW_TotalHouseArea and NEW_PorchArea.
# Those two are otherwise only created by a later cell, so without the
# definitions below both guards would always fail and the features would be
# silently skipped. The formulas are the same as in the later cell.
if {'1stFlrSF','2ndFlrSF','TotalBsmtSF'}.issubset(df.columns):
    df['NEW_TotalHouseArea'] = df['1stFlrSF'] + df['2ndFlrSF'] + df['TotalBsmtSF']
if {'OpenPorchSF','EnclosedPorch','ScreenPorch','3SsnPorch','WoodDeckSF'}.issubset(df.columns):
    df['NEW_PorchArea'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['ScreenPorch'] + df['3SsnPorch'] + df['WoodDeckSF']

if {'MasVnrArea','NEW_TotalHouseArea'}.issubset(df.columns):
    df['NEW_MasVnrRatio'] = df['MasVnrArea'] / df['NEW_TotalHouseArea']
if {'LotArea','1stFlrSF','GarageArea','NEW_PorchArea','WoodDeckSF'}.issubset(df.columns):
    df['NEW_DifArea'] = (df['LotArea'] - df['1stFlrSF'] - df['GarageArea'] - df['NEW_PorchArea'] - df['WoodDeckSF'])
if {'OverallQual','OverallCond'}.issubset(df.columns):
    df['NEW_OverallGrade'] = df['OverallQual'] * df['OverallCond']
if {'YrSold','GarageYrBlt'}.issubset(df.columns):
    df['NEW_GarageSold'] = df['YrSold'] - df['GarageYrBlt']

print('Ek özellikler eklendi')


In [None]:
# Drop list (aligned with the script version); names that are not in df are ignored.
drop_list = [
    'Street', 'Alley', 'LandContour', 'Utilities', 'LandSlope',
    'Heating', 'PoolQC', 'MiscFeature', 'Neighborhood',
]
df = df.drop(columns=drop_list, errors='ignore')
print('Gereksiz sütunlar düşürüldü')


In [None]:
# Encoding (Label + One-Hot)

def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, cardinal and numeric.

    Args:
        dataframe: Frame whose columns are classified.
        cat_th: A non-object column with fewer distinct values than this is
            treated as categorical.
        car_th: An object column with more distinct values than this is
            treated as cardinal (too many levels to encode).

    Returns:
        ``(cat_cols, cat_but_car, num_cols)`` where ``cat_cols`` lists the
        non-cardinal object columns followed by the numeric-looking
        categorical ones, ``cat_but_car`` the cardinal object columns and
        ``num_cols`` the remaining non-object columns.
    """
    categorical, numeric_but_categorical, cardinal, numeric = [], [], [], []
    for name in dataframe.columns:
        is_object = dataframe[name].dtype == 'O'
        distinct = dataframe[name].nunique()
        if is_object:
            if distinct > car_th:
                cardinal.append(name)
            else:
                categorical.append(name)
        elif distinct < cat_th:
            numeric_but_categorical.append(name)
        else:
            numeric.append(name)
    return categorical + numeric_but_categorical, cardinal, numeric

# Classify the columns of the current frame with the default thresholds.
# This rebinds cat_cols and num_cols, which the first imputation cell also defined.
cat_cols, cat_but_car, num_cols = grab_col_names(df)

from sklearn.preprocessing import LabelEncoder

def label_encoder(dataframe, binary_col):
    """Replace ``binary_col`` with integer codes from a freshly fitted LabelEncoder.

    The column is overwritten in place and the same DataFrame is returned so
    that the call can be used in an assignment.
    """
    codes = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = codes
    return dataframe

# Object columns with exactly two distinct values are label-encoded.
binary_cols = [
    name
    for name, dtype in df.dtypes.items()
    if dtype == 'O' and df[name].nunique() == 2
]
for col in binary_cols:
    df = label_encoder(df, col)


def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    """Return ``dataframe`` with ``categorical_cols`` expanded into dummy columns.

    Thin wrapper around ``pd.get_dummies``; with ``drop_first=True`` the first
    level of every encoded column is omitted.
    """
    encoded = pd.get_dummies(
        dataframe,
        columns=categorical_cols,
        drop_first=drop_first,
    )
    return encoded

# One-hot encode every column in cat_cols, dropping the first level of each.
# cat_cols comes from grab_col_names, so it also contains the non-object
# columns with fewer than 10 distinct values; the encoded source columns are
# replaced by their dummy columns.
df = one_hot_encoder(df, cat_cols, drop_first=True)
print('Encoding tamamlandı. Son şekil:', df.shape)


In [None]:
# SalePrice is only available for the train part, so the encoding is fitted on
# the train rows only and then mapped onto df.
train_len = len(train)
# df was built as train followed by test, so the first train_len rows are the train rows.
train_part = df.iloc[:train_len].copy()

def target_mean_encode(frame, column, target='SalePrice', m=100):
    """Return the smoothed mean of ``target`` for every category of ``column``.

    Each category mean is pulled towards the overall mean of ``target``:
    ``(count * mean + m * overall_mean) / (count + m)``. A larger ``m``
    shrinks small categories harder; ``m=0`` gives the plain category mean.

    Returns:
        A Series indexed by category holding the smoothed means.
    """
    overall_mean = frame[target].mean()
    per_category = frame.groupby(column)[target].agg(['mean', 'count'])
    count = per_category['count']
    mean = per_category['mean']
    return (count * mean + m * overall_mean) / (count + m)

# By this point the categorical source columns are normally gone from df:
# 'Neighborhood' is in the drop list, and object columns such as MSZoning and
# HouseStyle are expected to have been replaced by dummies in the encoding
# cell. Looking the columns up in df alone would skip every one of them, so
# fall back to the raw values of the original train/test frames. Those have
# the same row order as df, which was built with
# pd.concat([train, test], ignore_index=True).
raw_all = pd.concat([train, test], ignore_index=True)
rows_aligned = len(raw_all) == len(df)

for col in ['MSZoning','Neighborhood','HouseStyle']:
    if 'SalePrice' not in train_part.columns:
        continue
    if col in df.columns:
        values = df[col]
    elif rows_aligned and col in raw_all.columns:
        values = raw_all[col]
    else:
        continue

    # Fit on the train rows only; positional arrays avoid any index alignment.
    fit_frame = pd.DataFrame({
        col: values.iloc[:train_len].to_numpy(),
        'SalePrice': train_part['SalePrice'].to_numpy(),
    })
    mapping = target_mean_encode(fit_frame, col)
    # Missing values and categories never seen in train have no entry in the
    # mapping; they fall back to the overall train mean instead of NaN.
    encoded = values.map(mapping).fillna(fit_frame['SalePrice'].mean())
    df[f'TME_{col}'] = encoded.to_numpy()

print("Target mean encoding (smoothing) uygulandı")


## 🔧 Gelişmiş Feature Engineering
`house_prediction.py`'daki mantığa paralel örnekler: toplam kat alanı, toplam bodrum alanı, oranlar ve yaş değişkenleri.


In [None]:
# Derived features, each created only when all of its input columns exist.
# NOTE(review): this cell runs after the one-hot encoding cell, which replaces
# every column listed in cat_cols (including numeric columns with fewer than
# 10 distinct values) by dummies. Any input consumed there makes its guard
# fail and the feature is silently skipped — verify e.g. YrSold.
def _available(*names):
    """Return True when df currently has every one of the named columns."""
    return set(names).issubset(df.columns)


# Areas
if _available('1stFlrSF', '2ndFlrSF'):
    df['NEW_TotalFlrSF'] = df['1stFlrSF'] + df['2ndFlrSF']
if _available('BsmtFinSF1', 'BsmtFinSF2'):
    df['NEW_TotalBsmtFin'] = df['BsmtFinSF1'] + df['BsmtFinSF2']
if _available('OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', '3SsnPorch', 'WoodDeckSF'):
    df['NEW_PorchArea'] = (
        df['OpenPorchSF'] + df['EnclosedPorch'] + df['ScreenPorch']
        + df['3SsnPorch'] + df['WoodDeckSF']
    )

# Ages and year differences
if _available('YearRemodAdd', 'YearBuilt'):
    df['NEW_Restoration'] = df['YearRemodAdd'] - df['YearBuilt']
if _available('YrSold', 'YearBuilt'):
    df['NEW_HouseAge'] = df['YrSold'] - df['YearBuilt']
if _available('YrSold', 'YearRemodAdd'):
    df['NEW_RestorationAge'] = df['YrSold'] - df['YearRemodAdd']
if _available('GarageYrBlt', 'YearBuilt'):
    df['NEW_GarageAge'] = df['GarageYrBlt'] - df['YearBuilt']
if _available('GarageYrBlt', 'YearRemodAdd'):
    df['NEW_GarageRestorationAge'] = (df['GarageYrBlt'] - df['YearRemodAdd']).abs()

# Ratios
if _available('GrLivArea', 'LotArea'):
    df['NEW_LotRatio'] = df['GrLivArea'] / df['LotArea']
if _available('NEW_TotalFlrSF', 'TotalBsmtSF', 'LotArea'):
    # NEW_TotalFlrSF is created above, so this guard depends on that step.
    df['NEW_TotalHouseArea'] = df['NEW_TotalFlrSF'] + df['TotalBsmtSF']
    df['NEW_RatioArea'] = df['NEW_TotalHouseArea'] / df['LotArea']


## 💾 İşlenmiş Veriyi Kaydet (Opsiyonel)
Modelleme notebook'u için CSV olarak kaydedebilirsiniz.


In [None]:
# Uncomment the next line to write the processed frame to a CSV file for the modelling notebook.
# df.to_csv('processed_data.csv', index=False)
print('Hazır veri şekli:', df.shape)
