In [None]:
import pandas as pd
# Let pandas render up to 100 columns so wide frames are not truncated.
pd.options.display.max_columns = 100

In [None]:
# Load the data set from train.csv (path is relative to the working directory).
df = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

#### Описание данных  

Здесь описание некоторых полей датасета:  

- SalePrice - the property's sale price in dollars. This is the target variable that you're trying to predict.  

  
- MSSubClass: The building class
- MSZoning: The general zoning classification
- LotFrontage: Linear feet of street connected to property
- LotArea: Lot size in square feet
- Street: Type of road access
- Alley: Type of alley access
- LotShape: General shape of property
- LandContour: Flatness of the property
- Utilities: Type of utilities available
- LotConfig: Lot configuration
- LandSlope: Slope of property
- Neighborhood: Physical locations within Ames city limits
- Condition1: Proximity to main road or railroad
- Condition2: Proximity to main road or railroad (if a second is present)
- BldgType: Type of dwelling
- HouseStyle: Style of dwelling
- OverallQual: Overall material and finish quality
- OverallCond: Overall condition rating
- YearBuilt: Original construction date
- YearRemodAdd: Remodel date
- RoofStyle: Type of roof
- RoofMatl: Roof material
- Exterior1st: Exterior covering on house
- Exterior2nd: Exterior covering on house (if more than one material)
- MasVnrType: Masonry veneer type
- MasVnrArea: Masonry veneer area in square feet
- ExterQual: Exterior material quality
- ExterCond: Present condition of the material on the exterior
- Foundation: Type of foundation
- BsmtQual: Height of the basement
- BsmtCond: General condition of the basement
- BsmtExposure: Walkout or garden level basement walls
- BsmtFinType1: Quality of basement finished area
- BsmtFinSF1: Type 1 finished square feet
- BsmtFinType2: Quality of second finished area (if present)
- BsmtFinSF2: Type 2 finished square feet
- BsmtUnfSF: Unfinished square feet of basement area
- TotalBsmtSF: Total square feet of basement area
- Heating: Type of heating
- HeatingQC: Heating quality and condition
- CentralAir: Central air conditioning
- Electrical: Electrical system
- 1stFlrSF: First Floor square feet
- 2ndFlrSF: Second floor square feet
- LowQualFinSF: Low quality finished square feet (all floors)
- GrLivArea: Above grade (ground) living area square feet
- BsmtFullBath: Basement full bathrooms
- BsmtHalfBath: Basement half bathrooms
- FullBath: Full bathrooms above grade
- HalfBath: Half baths above grade
- BedroomAbvGr: Number of bedrooms above basement level
- KitchenAbvGr: Number of kitchens
- KitchenQual: Kitchen quality
- TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
- Functional: Home functionality rating
- Fireplaces: Number of fireplaces
- FireplaceQu: Fireplace quality
- GarageType: Garage location
- GarageYrBlt: Year garage was built
- GarageFinish: Interior finish of the garage
- GarageCars: Size of garage in car capacity
- GarageArea: Size of garage in square feet
- GarageQual: Garage quality
- GarageCond: Garage condition
- PavedDrive: Paved driveway
- WoodDeckSF: Wood deck area in square feet
- OpenPorchSF: Open porch area in square feet
- EnclosedPorch: Enclosed porch area in square feet
- 3SsnPorch: Three season porch area in square feet
- ScreenPorch: Screen porch area in square feet
- PoolArea: Pool area in square feet
- PoolQC: Pool quality
- Fence: Fence quality
- MiscFeature: Miscellaneous feature not covered in other categories
- MiscVal: Dollar value of miscellaneous feature
- MoSold: Month Sold
- YrSold: Year Sold
- SaleType: Type of sale
- SaleCondition: Condition of sale  


**Подробное описание значений можно найти в файле data_description.txt**

In [None]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

In [None]:
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

In [None]:
# Hold out 20% of the rows for evaluation; 'SalePrice' is the target and is
# removed from the feature frame. random_state is pinned so the split, and
# every metric computed from it, is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('SalePrice', axis=1),
    df['SalePrice'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: shapes of the train and test feature frames, in that order.
print(*(part.shape for part in (X_train, X_test)))

In [None]:
# Mean of the training target; serves as the constant prediction of the naive baseline.
mean = y_train.mean()

In [None]:
# Naive baseline: predict the training mean for every test row
# (single-column frame with one row per row of X_test).
y_pred = pd.DataFrame([mean] * len(X_test))

In [None]:
# Preview the baseline predictions.
y_pred.head()

In [None]:
# RMSE of the constant-mean baseline on the held-out rows.
baseline_mse = mean_squared_error(y_test, y_pred)
np.sqrt(baseline_mse)

In [None]:
# Impute missing LotFrontage with a mean computed from the training split only,
# so no statistic from the held-out rows leaks into preprocessing. The value is
# computed once and reused for every frame.
lot_frontage_mean = X_train['LotFrontage'].mean()
X_train['LotFrontage'] = X_train['LotFrontage'].fillna(lot_frontage_mean)
X_test['LotFrontage'] = X_test['LotFrontage'].fillna(lot_frontage_mean)
df['LotFrontage'] = df['LotFrontage'].fillna(lot_frontage_mean)

In [None]:
# Impute missing MasVnrArea with a mean computed from the training split only,
# so no statistic from the held-out rows leaks into preprocessing. The value is
# computed once and reused for every frame.
mas_vnr_area_mean = X_train['MasVnrArea'].mean()
X_train['MasVnrArea'] = X_train['MasVnrArea'].fillna(mas_vnr_area_mean)
X_test['MasVnrArea'] = X_test['MasVnrArea'].fillna(mas_vnr_area_mean)
df['MasVnrArea'] = df['MasVnrArea'].fillna(mas_vnr_area_mean)

In [None]:
# Impute missing GarageYrBlt with a mean computed from the training split only,
# so no statistic from the held-out rows leaks into preprocessing. The value is
# computed once and reused for every frame.
garage_yr_blt_mean = X_train['GarageYrBlt'].mean()
X_train['GarageYrBlt'] = X_train['GarageYrBlt'].fillna(garage_yr_blt_mean)
X_test['GarageYrBlt'] = X_test['GarageYrBlt'].fillna(garage_yr_blt_mean)
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(garage_yr_blt_mean)

In [None]:
# Numeric columns used as model inputs. The order is kept as-is because it
# fixes the column order of the frame selected with this list.
num_features = [
    'MSSubClass',
    'LotFrontage',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'MasVnrArea',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal',
    'MoSold',
    'YrSold',
]

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator constructed with its default settings.
lin_reg = LinearRegression()

In [None]:
# Fit the regressor on the numeric feature columns of the training split.
lin_reg.fit(X_train.loc[:, num_features], y_train)

In [None]:
# In-sample predictions, used to measure the training error.
train_inputs = X_train[num_features]
y_pred_train = lin_reg.predict(train_inputs)

In [None]:
# RMSE of the fitted model on the training split.
train_mse = mean_squared_error(y_train, y_pred_train)
np.sqrt(train_mse)

In [None]:
# Predictions for the held-out rows (rebinds y_pred, previously the baseline).
test_inputs = X_test[num_features]
y_pred = lin_reg.predict(test_inputs)

In [None]:
# RMSE of the fitted model on the held-out rows.
test_mse = mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

Примеры того, что можно сделать для улучшения результата итоговой модели:

- по-другому разделить датасет на тренировочный и тестовый;
- изменить признаки, на которых мы обучаем модель (`num_features` в примере).