# Анализ стоимости домов 

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import operator
plt.style.use('bmh')
import warnings
warnings.filterwarnings('ignore')

# Получение первичной информации о данных

In [None]:
# Read the raw CSV into a DataFrame and preview the first rows as a sanity check.
df = pd.read_csv(filepath_or_buffer='EDA_data/EDA_exsample_data.csv')
df.head(n=5)

In [None]:
# Print pandas' concise summary of the DataFrame.
df.info()

In [None]:
# Per-column number of non-NaN values; used to spot mostly-empty columns.
df.count() 

# Очистка dataset

In [None]:
# Drop `Id` and every mostly-empty column. A column is KEPT only when at least
# 30% of its values are non-NaN, i.e. columns with more than 70% NaN are removed.
# `count()` ignores NaN, so count / len is the non-null share of each column.
non_null_share = df.count() / len(df)
df2 = df.loc[:, non_null_share >= 0.3].drop(columns='Id')

# Report what was removed (joined so the list has no trailing separator).
dropped_columns = [name for name in df.columns if name not in df2.columns]
print("List of dropped columns:", ", ".join(dropped_columns))
print()

# Continue the analysis on an independent copy of the cleaned frame.
df = df2.copy()

In [None]:
# Summary statistics followed by a histogram of the target variable.
print(df['SalePrice'].describe())
# displot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure() would only leave an empty figure behind. The 9x8 inch
# size is therefore requested through height/aspect (width = height * aspect).
sns.displot(df['SalePrice'], color='#21BA72', bins=100, height=8, aspect=9 / 8);

#### Numerical data distribution


In [None]:
# Distinct column dtypes present in the cleaned frame.
list(set(df.dtypes.tolist()))

In [None]:
# Keep only the 64-bit integer and float columns for the numerical analysis.
df_num = df.select_dtypes(include=['int64', 'float64'])
df_num.head(n=5)

In [None]:
# Shared colour/transparency settings for the histograms.
col_al = {'color': '#21BA72', 'alpha': 0.4}

# One histogram per numeric column.
df_num.hist(
    figsize=(16, 20),
    bins=50,
    xlabelsize=8,
    ylabelsize=8,
    **col_al,
);

#### Correlation

In [None]:
# Correlation of every numeric feature with SalePrice. The SalePrice entry is
# removed by label (not by position) because its correlation with itself is
# not informative and the column is not guaranteed to be the last one.
df_num_corr = df_num.corr()['SalePrice'].drop('SalePrice')

# Keep features whose absolute correlation exceeds 0.5, strongest first.
golden_features_list = df_num_corr[df_num_corr.abs() > 0.5].sort_values(ascending=False)
print("There is {} strongly correlated values with SalePrice:\n{}".format(len(golden_features_list), golden_features_list))

In [None]:
# Plot SalePrice against the numeric columns, five columns per pairplot row.
numeric_columns = df_num.columns
for start in range(0, len(numeric_columns), 5):
    chunk = numeric_columns[start:start + 5]
    sns.pairplot(data=df_num, x_vars=chunk, y_vars=['SalePrice'])

#### Удаляем нулевые значения и повторяем вычисление корреляций

In [None]:
# All numeric columns except the last one (SalePrice is expected to be last — verify).
df_num.columns[:-1]

In [None]:
# For every numeric feature build a two-column frame (feature, SalePrice)
# restricted to the rows where the feature is not 0, so that "feature absent"
# zeros do not dilute the correlation.
feature_names = df_num.columns.drop('SalePrice')
individual_features_df = [
    df_num.loc[df_num[name] != 0, [name, 'SalePrice']]
    for name in feature_names
]

# corr()['SalePrice'] is indexed by column label; its first entry is the
# feature-vs-SalePrice value. `.iloc[0]` is used because integer keys on a
# label-indexed Series are deprecated as positional access in pandas 2.x.
all_correlations = {
    feature.columns[0]: feature.corr()['SalePrice'].iloc[0]
    for feature in individual_features_df
}

# Sorted list of (feature, correlation) pairs, weakest/most negative first.
all_correlations = sorted(all_correlations.items(), key=operator.itemgetter(1))
for key, value in all_correlations:
    print("{:>15}: {:>15}".format(key, value))

#### Уже интересно! После небольшой очистки данных найдены значимые значения корреляций. Теперь наша переменная `golden_features_list` выглядит так:

In [None]:
# Collect the names of the features whose absolute correlation is at least 0.5.
golden_features_list = []
for feature_name, correlation in all_correlations:
    if abs(correlation) >= 0.5:
        golden_features_list.append(feature_name)

print(
    "There is {} strongly correlated values with SalePrice:\n{}".format(
        len(golden_features_list),
        golden_features_list,
    )
)

#### Возможно, получили список колонок для обучения модели ;-) 

# Feature to feature relationship


In [None]:
# Feature-to-feature correlations; SalePrice is left out of this matrix.
corr = df_num.drop(columns='SalePrice').corr()

# Show only the stronger relationships: >= 0.5 or <= -0.4; other cells are masked.
strong_mask = (corr >= 0.5) | (corr <= -0.4)

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr[strong_mask],
    cmap='viridis',
    vmax=1.0,
    vmin=-1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
);

## Q -> Q (Quantitative to Quantitative relationship)

Давайте теперь рассмотрим количественные характеристики нашего фрейма данных и то, как они соотносятся с ценой продажи (SalePrice), которая также является количественной (отсюда отношение Q -> Q). 

Некоторые признаки нашего набора данных являются категориальными. Убираем столбцы категориальных признаков и получаем следующие столбцы:


In [None]:
# Hand-picked quantitative columns; SalePrice (the target) is deliberately kept
# as the LAST entry because later cells slice it off with [:-1].
quantitative_features_list = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']
# Restrict the frame to those columns and preview it.
df_quantitative_values = df[quantitative_features_list]
df_quantitative_values.head()

In [None]:
# Quantitative features that are also strongly correlated with SalePrice,
# in the order of quantitative_features_list, with the target appended last.
features_to_analyse = []
for feature_name in quantitative_features_list:
    if feature_name in golden_features_list:
        features_to_analyse.append(feature_name)
features_to_analyse += ['SalePrice']
features_to_analyse

In [None]:
# Draw a regression plot of SalePrice against each selected feature.
# The last entry of features_to_analyse is SalePrice itself (the target),
# so it is not plotted against itself.
predictors = features_to_analyse[:-1]
plot_data = df[features_to_analyse]

# Ceil-divide so there is always room for every predictor, and keep at least
# one row so an empty predictor list does not request a 0-row grid.
n_cols = 3
n_rows = max(1, (len(predictors) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 12), squeeze=False)
axes = axes.ravel()

for axis, feature_name in zip(axes, predictors):
    sns.regplot(x=feature_name, y='SalePrice', data=plot_data, ax=axis)

# Hide the grid cells that received no plot.
for axis in axes[len(predictors):]:
    axis.set_visible(False)

Мы видим, что такие признаки, как TotalBsmtSF, 1stFlrSF и GrLivArea, имеют большой разброс; что делать с этой информацией, требует дополнительного изучения.

## C -> Q (Categorical to Quantitative relationship)

In [None]:
# Every column that is not a quantitative predictor is treated as categorical.
# quantitative_features_list[:-1] leaves out its last entry, SalePrice, so the
# target stays in the resulting frame for the C -> Q plots below.
quantitative_predictors = set(quantitative_features_list[:-1])
categorical_features = [
    column for column in df.columns if column not in quantitative_predictors
]
df_categ = df[categorical_features]
df_categ.head()

In [None]:
# Keep only the object-dtype columns and list their names.
df_not_num = df_categ.select_dtypes(include=['O'])
non_numeric_names = df_not_num.columns.tolist()
print(
    'There is {} non numerical features including:\n{}'.format(
        len(non_numeric_names),
        non_numeric_names,
    )
)

In [None]:
# Distinct values found in the BsmtExposure column.
set(df_not_num['BsmtExposure'])

In [None]:
# SalePrice distribution per BsmtExposure category.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(x='BsmtExposure', y='SalePrice', data=df_categ)
# Since Matplotlib 3.5 the box patches are listed in ax.patches and ax.artists
# is empty for a boxplot, so styling ax.artists alone silently does nothing.
# Fall back to ax.patches so the boxes are restyled on old and new versions.
boxes = list(ax.artists) or list(ax.patches)
plt.setp(boxes, alpha=.5, linewidth=2, edgecolor="k")
plt.xticks(rotation=45);

In [None]:
# SalePrice distribution per SaleCondition category.
plt.figure(figsize=(12, 6))
ax = sns.boxplot(x='SaleCondition', y='SalePrice', data=df_categ)
# Since Matplotlib 3.5 the box patches are listed in ax.patches and ax.artists
# is empty for a boxplot, so styling ax.artists alone silently does nothing.
# Fall back to ax.patches so the boxes are restyled on old and new versions.
boxes = list(ax.artists) or list(ax.patches)
plt.setp(boxes, alpha=.5, linewidth=2, edgecolor="k")
plt.xticks(rotation=45);

In [None]:
# One count plot per non-numeric feature, laid out on a 3-column grid.
categorical_columns = df_not_num.columns

# Ceil-divide: round(n / 3) rounds DOWN when n = 3k + 1 and would leave the
# last feature without an axis. Keep at least one row for an empty selection.
n_cols = 3
n_rows = max(1, (len(categorical_columns) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 30), squeeze=False)
axes = axes.ravel()

for axis, column in zip(axes, categorical_columns):
    sns.countplot(x=column, alpha=0.7, data=df_not_num, ax=axis)
    # Rotate after plotting: labels set before the plot is drawn are replaced
    # by the category labels that countplot installs.
    axis.tick_params(axis='x', labelrotation=45)

# Hide the grid cells that received no plot.
for axis in axes[len(categorical_columns):]:
    axis.set_visible(False)

fig.tight_layout()

## EDA with pandas_profiling

In [None]:
# WARNING: keep this import even though the name is never referenced below.
# `profile_report` is not a built-in pandas method, so it is presumably attached
# to DataFrame as a side effect of importing pandas_profiling — TODO confirm.
import pandas_profiling  #### ACHTUNG!!!
# NOTE(review): the report is built from df2 (the cleaned frame created in the
# "Очистка dataset" section), not from df — confirm this is intended.
profile = df2.profile_report()
profile

In [None]:
# Save the profiling report to disk in the working directory.
# NOTE(review): the output format is presumably chosen from the ".json" extension — confirm.
profile.to_file("RE_report.json")