In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the homework train/test CSV files from a hard-coded local Windows path.
# NOTE(review): these literals use four backslashes per separator, while the
# later read_csv cells in this notebook use two — looks like an escaping
# artifact; confirm the path resolves as intended.
df = pd.read_csv("C:\\\\Projects\\\\Maxims AAAI\\\\HW_1\\\\train_hw.csv")
test_df = pd.read_csv("C:\\\\Projects\\\\Maxims AAAI\\\\HW_1\\\\test_hw.csv")
# Preview the first rows of the training frame.
df.head()

In [None]:
# Summary statistics restricted to the object-dtype (string) columns.
df.describe(include=["object"])


In [None]:
# Print "<column> <count>" for every column that contains missing values.
na_counts = df.isna().sum()
for column_name, count_na in na_counts[na_counts > 0].items():
    print(column_name, count_na)

In [None]:
# Display the column labels of the training frame.
df.columns

In [None]:
# Normalise one category label in Exterior2nd ("Brk Cmn" -> "BrkComm").
df["Exterior2nd"] = df["Exterior2nd"].replace({"Brk Cmn": "BrkComm"})
# Some values of GarageYrBlt are corrupt, so we'll replace them
# with the year the house was built.
# `where` keeps a value only where the condition is True; a NaN compares as
# False against 2010, so missing GarageYrBlt entries also receive YearBuilt.
df["GarageYrBlt"] = df["GarageYrBlt"].where(df.GarageYrBlt <= 2010, df.YearBuilt)
# Names beginning with numbers are awkward to work with
# (they cannot be used with attribute access such as df.FirstFlrSF).
df.rename(columns={
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
        }, inplace=True,)

In [None]:
# Remove four feature columns from the frame in place.
# NOTE(review): GarageYrBlt is repaired in the previous cell and then dropped
# here; presumably these four are considered redundant with other features —
# confirm the selection criterion.
df.drop(
    ['GarageYrBlt','TotRmsAbvGrd','FirstFlrSF','GarageCars'],
    axis=1,
    inplace=True
)

In [None]:
# Impute missing LotFrontage / GarageArea with the mean of the same column
# within the row's Neighborhood group. A group whose values are all missing
# has a NaN mean, so its rows stay NaN after this step.
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.mean()))
df['GarageArea'] = df.groupby('Neighborhood')['GarageArea'].transform(lambda x: x.fillna(x.mean()))

In [None]:
# Work on the non-object (numeric) columns only.
num_df = df.select_dtypes(exclude=['object'])

# Collect the numeric columns that still contain at least one missing value.
# NOTE(review): the identifier below appears to contain a non-ASCII look-alike
# letter in "cols" — confirm and retype it in plain ASCII if so.
num_сols_with_missing = [col for col in num_df.columns if num_df[col].isnull().any()]
print(num_сols_with_missing)

# Replace the remaining gaps with each column's global mean.
df[num_сols_with_missing] = df[num_сols_with_missing].fillna(df[num_сols_with_missing].mean())

In [None]:
# Print "<column> <count>" for every column that still has missing values.
null_counts = df.isnull().sum()
for column_name, count_na in null_counts[null_counts > 0].items():
    print(column_name, count_na)

In [None]:
# Percentage of missing values per column; reset_index turns the Series into a
# two-column frame (column label, percentage) with default column names.
nan_df = (df.isnull().mean() * 100).reset_index()
nan_df

In [None]:
# Give the two columns readable names, sort by missing percentage (highest
# first, in place) and show the ten worst columns.
nan_df.columns = ["column_name", "percentage"]
nan_df.sort_values("percentage", ascending=False, inplace=True)
nan_df.head(10)

In [None]:
# Names of the columns in which more than 80% of the rows are missing.
nan_columns = list(nan_df[nan_df.percentage > 80]['column_name'])
nan_columns

In [None]:
# Drop the mostly-missing columns from the training frame in place.
df.drop(nan_columns, inplace=True, axis=1)

In [None]:
def get_almost_constant_columns(df, dropna=True, threshold=96):
    """Return the columns of ``df`` that are dominated by a single value.

    A column is reported when its most frequent value accounts for more than
    ``threshold`` percent of the rows. The denominator is always ``len(df)``,
    so missing rows count against the share even when ``dropna`` is True.

    Args:
        df: DataFrame to scan.
        dropna: If True, missing values are ignored when counting. If False,
            NaN is counted as a value in its own right, so a mostly-missing
            column can be reported as almost constant.
        threshold: Exclusive percentage cut-off. Defaults to 96.

    Returns:
        list: Labels of the almost-constant columns, in the frame's column order.
    """
    total_rows = len(df)
    cols = []
    for name in df:
        # value_counts() drops NaN unless told otherwise, so the flag has to be
        # forwarded explicitly for dropna=False to have any effect.
        counts = df[name].value_counts(dropna=dropna)
        if counts.empty:
            # All-missing column (or empty frame): there is no dominant value.
            continue
        # value_counts sorts descending, so the first entry is the most common.
        most_popular_value_count = counts.iloc[0]
        if (most_popular_value_count / total_rows) * 100 > threshold:
            cols.append(name)
    return cols

In [None]:
# Find the object-dtype columns that get_almost_constant_columns flags as
# dominated by one value, drop them from the frame, and display which ones
# were removed.
cat_df = df.select_dtypes(include=['object'])
overfit_cat = get_almost_constant_columns(cat_df)
df = df.drop(overfit_cat, axis=1)
overfit_cat

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Low-variance screening of the non-object columns with a 0.1 variance cut-off.
sel = VarianceThreshold(threshold=0.1)
num_col = df.select_dtypes(exclude=['object'])

sel.fit(num_col)  # fit finds the features with low variance
# get_support() is a boolean mask of retained features; summing it counts them.
sum(sel.get_support())

In [None]:
# Boolean mask over num_col's columns: True where the feature passed the threshold.
sel.get_support()

In [None]:
# Names of the numeric columns that did NOT pass the variance threshold.
num_col.columns[~sel.get_support()]

In [None]:
# Columns whose missing entries are replaced by the literal category "NA"
# (the ordinal maps applied later in the notebook encode 'NA' as 0).
cat = ['GarageType','GarageFinish','BsmtFinType2','BsmtExposure','BsmtFinType1', 
       'GarageCond','GarageQual','BsmtCond','BsmtQual','FireplaceQu',"KitchenQual",
       "HeatingQC",'ExterQual','ExterCond']
df[cat] = df[cat].fillna("NA")

In [None]:
# Outlier removal: for each (column, upper bound) pair, drop every row whose
# value in that column exceeds the bound. Rows are removed by index label.
for col, upper_bound in (
    ('LotFrontage', 200),
    ('LotArea', 100000),
    ('BsmtFinSF1', 4000),
    ('TotalBsmtSF', 5000),
    ('GrLivArea', 4000),
):
    df = df.drop(df[df[col] > upper_bound].index)

In [None]:
# NOTE(review): the result of this value_counts() call is discarded — it is
# neither assigned nor the last expression of the cell.
df['MSSubClass'].value_counts()
# Convert MSSubClass values to strings so the column becomes object dtype.
df['MSSubClass'] = df['MSSubClass'].apply(str)
# Label -> integer rank tables; 'NA' (the fill value used for missing
# categories) is always encoded as 0.
ordinal_map = {'Ex': 5,'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
fintype_map = {'GLQ': 6,'ALQ': 5,'BLQ': 4,'Rec': 3,'LwQ': 2,'Unf': 1, 'NA': 0}
expose_map = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0}
# NOTE(review): fence_map is defined but not applied anywhere in this cell.
fence_map = {'GdPrv': 4,'MnPrv': 3,'GdWo': 2, 'MnWw': 1,'NA': 0}
ord_col = ['ExterQual','ExterCond','BsmtQual', 'BsmtCond','HeatingQC','KitchenQual','GarageQual','GarageCond', 'FireplaceQu']
# Series.map yields NaN for any label that is absent from the mapping.
for col in ord_col:
    df[col] = df[col].map(ordinal_map)

fin_col = ['BsmtFinType1','BsmtFinType2']
for col in fin_col:
    df[col] = df[col].map(fintype_map)

df['BsmtExposure'] = df['BsmtExposure'].map(expose_map)

In [None]:
# Engineered aggregate features built as plain sums of existing columns.
df['TotalLot'] = df['LotFrontage'] + df['LotArea']
df['TotalBsmtFin'] = df['BsmtFinSF1'] + df['BsmtFinSF2']
# Basement plus second floor only (FirstFlrSF was dropped in an earlier cell).
df['TotalSF'] = df['TotalBsmtSF'] + df['SecondFlrSF']
# Full and half baths are summed with equal weight.
df['TotalBath'] = df['FullBath'] + df['HalfBath']
df['TotalPorch'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['ScreenPorch']

In [None]:
# Binary presence flags: "<col>_bin" is 1 where the size/area feature is
# strictly positive and 0 otherwise.
cols = ['MasVnrArea','TotalBsmtFin','TotalBsmtSF','SecondFlrSF','WoodDeckSF','TotalPorch']

for col in cols:
    col_name = col+'_bin'
    # Vectorised comparison instead of a per-row lambda (whose parameter name
    # shadowed the frame). NaN > 0 is False, so missing values map to 0.
    df[col_name] = (df[col] > 0).astype('int64')

In [None]:
# One-hot encode the remaining categorical columns (get_dummies with defaults).
df = pd.get_dummies(df)

In [None]:
# Inspect the first ten rows after one-hot encoding.
df.head(10)

In [None]:
from sklearn.preprocessing import RobustScaler

# Scale every numeric column with RobustScaler.
# NOTE(review): SalePrice is still in df here (it is split off as the target in
# a later cell), so if it is numeric the target is scaled together with the
# features; the same holds for Id while the drop below stays commented out —
# confirm this is intended.
cols = df.select_dtypes(np.number).columns
# df = df.drop(["Id"], axis=1)
transformer = RobustScaler().fit(df[cols])
df[cols] = transformer.transform(df[cols])

In [None]:
# Inspect the first ten rows after scaling.
df.head(10)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

In [None]:
# Polynomial feature expansion followed by ordinary least squares.
degree = 2  # can be changed to 3, 4, etc.
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())


In [None]:
# Split the prepared frame into the feature matrix and the SalePrice target.
X = df.drop(columns=["SalePrice"])
Y = df["SalePrice"]

In [None]:
# Fit the polynomial regression pipeline on the manually prepared features.
model.fit(X, Y)

In [422]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def preprocess_data(df, is_train=True, y_col='SalePrice', fitted_pipeline=None):
    """Turn a raw frame into a model-ready feature matrix.

    In training mode the target column is split off and log1p-transformed, and
    a ColumnTransformer is fitted on the remaining columns: int64/float64
    columns get median imputation plus standard scaling, object columns get
    most-frequent imputation plus one-hot encoding. Columns of any other dtype
    are not listed in either branch of the transformer. In inference mode an
    already fitted transformer is applied without refitting.

    Args:
        df: Input DataFrame. A copy is used, so the caller's frame is not modified.
        is_train: If True, fit a new transformer; if False, only transform.
        y_col: Name of the target column. Only read when ``is_train`` is True.
        fitted_pipeline: Transformer returned by a previous training call.
            Required when ``is_train`` is False.

    Returns:
        ``(X_prepared, y, full_pipeline)`` when ``is_train`` is True, where
        ``y`` is ``log1p`` of the target column; otherwise just ``X_prepared``.

    Raises:
        ValueError: If ``is_train`` is False and ``fitted_pipeline`` is None.
    """
    if not is_train:
        # Fail early with a clear message instead of an AttributeError on None.
        if fitted_pipeline is None:
            raise ValueError(
                "fitted_pipeline is required when is_train=False; pass the "
                "pipeline returned by the training call"
            )
        # Transform only: reuse the statistics learned on the training data.
        return fitted_pipeline.transform(df.copy())

    df = df.copy()
    # The model is trained on log1p(target); predictions need expm1 afterwards.
    y = np.log1p(df[y_col])
    X = df.drop(columns=[y_col])

    # Separate numeric and categorical features
    num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()

    # Pipeline for numeric features
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Pipeline for categorical features
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combined transformer
    full_pipeline = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ])

    # fit + transform
    X_prepared = full_pipeline.fit_transform(X)
    return X_prepared, y, full_pipeline


In [426]:
# Reload the raw train/test CSV files for the pipeline-based workflow.
train_df = pd.read_csv("C:\\Projects\\Maxims AAAI\\HW_1\\train_hw.csv")
test_df = pd.read_csv("C:\\Projects\\Maxims AAAI\\HW_1\\test_hw.csv")

In [None]:
# df = pd.read_csv("C:\\Projects\\Maxims AAAI\\HW_1\\train.csv")
# test_df = df.loc[~df.index.isin(train_df.index)]

In [427]:
# Preprocess train: fits the transformer and returns the log1p-transformed target
X_train, y_train, pipeline = preprocess_data(train_df, is_train=True)

# Preprocess test: reuses the transformer fitted on the training frame
X_test = preprocess_data(test_df, is_train=False, fitted_pipeline=pipeline)

In [428]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Polynomial feature expansion followed by ordinary least squares.
degree = 2  # can be changed to 3, 4, etc.
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())


In [429]:
# Fit on the preprocessed training matrix and its target.
model.fit(X_train, y_train)

In [430]:
# Predict for the test matrix, then apply expm1 (the inverse of log1p) to the
# predictions.
y_pred = model.predict(X_test)
y_pred_final = np.expm1(y_pred)

In [None]:
# Display the expm1-transformed predictions.
y_pred_final

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# y_test — true target values
# y_pred_final — model predictions after expm1, compared against y_test below
# NOTE(review): y_test is not defined in any cell above this one; it is only
# created in the hold-out cell near the end of the notebook.

mae = mean_absolute_error(y_test, y_pred_final)
mse = mean_squared_error(y_test, y_pred_final)
rmse = np.sqrt(mse)
# Use y_pred_final here as well: comparing y_test with the untransformed
# y_pred would mix two different scales.
r2 = r2_score(y_test, y_pred_final)

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R^2: {r2:.4f}")


In [431]:
# Build the submission table: one expm1-transformed prediction per test Id.
submission = pd.DataFrame({
    "Id": test_df["Id"],
    "SalePrice": y_pred_final
})

# 4. Save to file
submission.to_csv("submission.csv", index=False)

In [None]:
# Sanity check: number of raw model outputs (y_pred) that are negative.
print((y_pred < 0).sum())

In [None]:
import pandas as pd
from sklearn.preprocessing import RobustScaler

def preprocess_house_prices(df):
    """Impute, one-hot encode and robust-scale a house-prices frame.

    Steps: drop ``Id`` if present, split off ``SalePrice`` if present, fill
    numeric gaps with the column median and categorical gaps with the column
    mode, one-hot encode object columns (``drop_first=True``) and scale the
    numeric columns with ``RobustScaler``.

    Every statistic (medians, modes, dummy categories, scaler parameters) is
    computed from the frame passed to this call, so calling it separately on a
    train and a test frame can yield different columns and scales.

    Args:
        df: Raw DataFrame. A copy is processed; the caller's frame is untouched.

    Returns:
        tuple: ``(features, y)`` where ``features`` is the processed DataFrame
        and ``y`` is the ``SalePrice`` Series, or None when that column is absent.
    """
    df = df.copy()

    # Drop the 'Id' column if it is present
    if 'Id' in df.columns:
        df = df.drop(columns='Id')

    # Split off the target variable if it is present
    y = None
    if 'SalePrice' in df.columns:
        y = df['SalePrice']
        df = df.drop(columns='SalePrice')

    # Determine numeric and categorical features
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Fill gaps in numeric features with the median
    if numeric_cols:
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    # Fill gaps in categorical features with the most frequent value
    for col in categorical_cols:
        modes = df[col].mode()
        # An all-missing column has no mode; leave it as NaN instead of raising.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    # Numeric columns that are still entirely NaN after filling are removed
    numeric_cols = [col for col in numeric_cols if not df[col].isna().all()]
    df = df[numeric_cols + categorical_cols]

    # One-hot encode the categorical variables
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    # Scale the numeric features with RobustScaler; skipped when there are no
    # numeric columns, because the scaler cannot be fitted on zero features.
    if numeric_cols:
        scaler = RobustScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df, y

# Example usage:
train_df = pd.read_csv('C:\\Projects\\Maxims AAAI\\HW_1\\train_hw.csv')
test_df = pd.read_csv("C:\\Projects\\Maxims AAAI\\HW_1\\test_hw.csv")

# Each call computes its own imputation values, dummy columns and scaling.
X_train, y_train = preprocess_house_prices(train_df)
# y is None here unless test_hw.csv contains a SalePrice column.
X_test, y = preprocess_house_prices(test_df)

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Example: assume X and y are already available
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# NOTE(review): X_train / X_test are whatever the kernel currently holds; in
# the preceding cell they are the output of preprocess_house_prices, which is
# already one-hot encoded and robust-scaled — confirm that preprocessing them
# a second time here is intended.

# Determine the numeric and categorical features
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("scaler", RobustScaler())
        ]), numeric_cols),
        
        ("cat", Pipeline([
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_cols)
    ]
)

# Model with polynomial features
degree = 3
model = Pipeline([
    ("preprocessing", preprocessor),
    ("poly", PolynomialFeatures(degree=degree, include_bias=False)),
    ("reg", LinearRegression())
])

# Training
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)
y_pred

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Polynomial feature expansion followed by ordinary least squares.
degree = 2  # can be changed to 3, 4, etc.
model = make_pipeline(PolynomialFeatures(degree), LinearRegression())

# Fit and predict on whatever X_train / y_train / X_test the kernel holds.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# The expm1 back-transform is disabled below, so y_pred is used as predicted.
# y_pred_final = np.expm1(y_pred)
# len(y_pred_final)

In [None]:
# Build the submission table from this section's predictions.
# The expm1 step is commented out in the preceding cell, so y_pred_final would
# still hold values produced by an earlier model; y_pred is the output of the
# model fitted just above.
submission = pd.DataFrame({
    "Id": test_df["Id"],
    "SalePrice": y_pred
})

# 4. Save to file
submission.to_csv("submission.csv", index=False)

In [None]:
# Rebuild a labelled hold-out set from the full train.csv.
df = pd.read_csv("C:\\Projects\\Maxims AAAI\\HW_1\\train.csv")

# NOTE(review): rows are excluded by index label, which assumes train_hw.csv
# holds the leading rows of train.csv in the same order — confirm.
test_df = df.drop(train_df.index)

y_test = test_df["SalePrice"]
X_test = test_df.drop(columns=["SalePrice"])
# `pipeline` is the transformer returned by the preprocess_data training call;
# pass the feature-only frame so the target never reaches the transformer.
X_test = preprocess_data(X_test, is_train=False, fitted_pipeline=pipeline)

# NOTE(review): expm1 is only valid when `model` was fitted on the log1p
# target returned by preprocess_data — confirm which model the kernel holds.
y_pred = model.predict(X_test)
y_pred_final = np.expm1(y_pred)

In [None]:
from sklearn.metrics import mean_squared_log_error

# Mean squared logarithmic error between y_test and the expm1-transformed
# predictions.
msle = mean_squared_log_error(y_test, y_pred_final)
print(f"MSLE: {msle:.4f}")

In [None]:
from sklearn.metrics import mean_squared_error

# Compare on the price scale: y_test holds raw SalePrice values, so use the
# expm1-transformed predictions rather than the log-scale y_pred.
mse = mean_squared_error(y_test, y_pred_final)
print(f"Mean Squared Error (MSE): {mse:.4f}")

In [None]:
from sklearn.metrics import r2_score

# Compare on the price scale: y_test holds raw SalePrice values, so use the
# expm1-transformed predictions rather than the log-scale y_pred.
r2 = r2_score(y_test, y_pred_final)
print(f"R² Score: {r2:.4f}")