<h1><center> Predict House Prices </center></h1>

## About the Project

This project is from the Kaggle competition ["House Prices - Advanced Regression Techniques"](https://www.kaggle.com/c/house-prices-advanced-regression-techniques). The main goal is to predict the sale price of each house.

## Importing Libs

In [1]:
from pathlib import Path
import os
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

## Auxiliary Functions

In [2]:
def plot_multiples_graphs(groupbedBy, column, x, y):
    """Draw one normalized histogram of ``column`` per group on an ``x`` by ``y`` grid.

    Parameters
    ----------
    groupbedBy : iterable of (label, DataFrame) pairs
        For example a pandas ``GroupBy``. Each label is used as the subplot title.
    column : str
        Column whose distribution is plotted in every subplot.
    x : int
        Number of subplot rows.
    y : int
        Number of subplot columns.

    Groups fill the grid left to right, top to bottom, wrapping after ``y``
    subplots per row.

    Raises
    ------
    IndexError
        If there are more groups than the ``x * y`` available subplots.
    """
    # squeeze=False keeps `axes` two-dimensional even when x == 1 or y == 1,
    # so the [row, column] indexing below is always valid.
    fig, axes = plt.subplots(x, y, figsize=(30, 25), squeeze=False)
    sns.despine()
    for position, (group_label, group_df) in enumerate(groupbedBy):
        # Wrap on the requested number of columns instead of a hard-coded 2.
        row_grafic, column_grafic = divmod(position, y)
        current_ax = axes[row_grafic, column_grafic]
        sns.distplot(group_df[column], ax=current_ax,
                     rug=False, kde=False, bins=26, norm_hist=True).set_title(group_label)
        current_ax.grid(True, alpha=0.3)

In [3]:
def formatar_grafico_barra_horizontal(ax, title="", title_size=18, x_offset=0.03,
                                      anotar=True, anot_format='{:.0f}',
                                      xlabel=None, ylabel=None):
    """Apply common formatting to a horizontal bar chart and return the axes.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes whose ``patches`` are the bars to format.
    title : str
        Title text; drawn with font size ``title_size``.
    x_offset : float
        Gap between the end of a bar and its annotation, expressed as a
        fraction of the largest bar end.
    anotar : bool
        If True, write each bar's value next to it (formatted with
        ``anot_format``), hide the x tick labels and remove the bottom/left
        spines. If False, keep the ticks and lengthen the major tick marks.
    anot_format : str
        ``str.format`` template applied to each bar's value.
    xlabel, ylabel : str or None
        Axis labels; left untouched when None.

    Returns
    -------
    The same ``ax`` that was passed in.
    """
    ax.set_title(title, fontsize=title_size)
    ax.tick_params(labelsize=14)

    if anotar:
        # Target the given axes explicitly rather than whatever figure is current.
        sns.despine(ax=ax, bottom=True, left=True)
        # Largest x coordinate of every bar's bounding box, i.e. where the bar ends.
        bar_ends = [p.get_bbox().get_points()[:, 0].max() for p in ax.patches]
        # An axes without bars has nothing to annotate; avoid np.max on an empty list.
        offset = np.max(bar_ends) * x_offset if bar_ends else 0.0
        for p, bar_end in zip(ax.patches, bar_ends):
            y = p.get_bbox().get_points()[1, 1]
            ax.annotate(anot_format.format(float(bar_end)),
                        (bar_end + offset, y),
                        ha='left', va='bottom', size=14)
        ax.set_xticklabels([])
        ax.set_xlabel('')
    else:
        sns.despine(ax=ax)
        ax.tick_params(axis='both', which='major', length=20)

    if xlabel is not None:
        ax.set_xlabel(xlabel, size=14)

    if ylabel is not None:
        ax.set_ylabel(ylabel, size=14)

    return ax

In [4]:
def plot_matriz_correlacao(df, k_colunas=None, anotar=False,
                           titulo='Heatmap', metodo='pearson'):
    """Plot the lower triangle of the correlation matrix of ``df`` as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose pairwise column correlations are plotted.
    k_colunas : int or None
        If given, keep only the ``k_colunas`` rows selected by
        ``DataFrame.nlargest`` on the correlation matrix (ordered by its
        columns) and plot the correlations among those variables.
    anotar : bool
        If True, write the coefficient in every cell and label every tick.
    titulo : str
        Figure title.
    metodo : str
        Correlation method forwarded to ``DataFrame.corr``.
    """
    sns.set(style="white")

    corr = df.corr(method=metodo)
    fig, ax = plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(240, 10, as_cmap=True)
    # Take the labels from the matrix itself so they always match its shape.
    cols = corr.columns
    if k_colunas is not None:
        cols = corr.nlargest(k_colunas, list(corr.columns)).index
        # Slice the matrix already computed so `metodo` is still honoured.
        corr = corr.loc[cols, cols]

    # Hide the upper triangle and the diagonal; the builtin `bool` replaces the
    # `np.bool` alias that NumPy removed.
    mask = np.triu(np.ones(corr.shape, dtype=bool))

    heatmap_kwargs = dict(mask=mask, cmap=cmap, center=0, vmin=-1.0, vmax=1.0,
                          square=True, ax=ax)
    if anotar:
        heatmap_kwargs.update(annot=True, fmt='.2f',
                              yticklabels=cols.values, xticklabels=cols.values)
    sns.heatmap(corr, **heatmap_kwargs)

    ax.set_title(titulo, size=16)

## Loading Dataset

In [5]:
# Absolute path of the directory the notebook kernel is running in.
path_dir = Path.cwd()

In [6]:
# Join with pathlib so the separator is correct on every OS (the literal '\\'
# only worked on Windows). Kept as a str because later cells concatenate onto it.
path_data = str(Path(path_dir) / 'Data')

In [7]:
# Build the CSV paths with pathlib instead of a hard-coded Windows separator.
path_test = str(Path(path_data) / 'test.csv')
path_train = str(Path(path_data) / 'train.csv')
path_sample_submission = str(Path(path_data) / 'sample_submission.csv')

In [8]:
path_dir

WindowsPath('C:/Users/nathalia.bedor/Desktop/project_support/Kaggle_Projects/House-Prices-Advanced-Regression-Techniques')

In [9]:
df_test = pd.read_csv(path_test)
df_train = pd.read_csv(path_train)

## Exploring the Dataset

In [10]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [11]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

### Removing columns with null values 

#### Dropping columns with more than 70% of null values

In [25]:
print('% Null values')
threshold = 0.70
# Fraction of missing entries per column, computed once and reused below.
null_fraction = df_train.isnull().mean()
columns_to_drop = df_train.columns[null_fraction > threshold]

for column in columns_to_drop:
    current_percentage = null_fraction[column] * 100
    print(f'{column}: {current_percentage}%')

% Null values
Alley: 93.76712328767123%
PoolQC: 99.52054794520548%
Fence: 80.75342465753424%
MiscFeature: 96.30136986301369%


In [27]:
# Rebind instead of mutating in place, and ignore labels that are already gone,
# so re-running this cell does not raise a KeyError.
df_train = df_train.drop(columns=columns_to_drop, errors='ignore')

#### Dropping rows with more than 70% of null values

In [35]:
# Show the rows whose fraction of null cells exceeds `threshold`.
# This only displays them; nothing is removed from df_train here.
df_train.loc[df_train.isnull().mean(axis=1) > threshold]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


### Removing columns dominated by a single value (more than 90% of rows)

In [None]:
# A column is treated as near-constant when one value accounts for more than
# this share of its non-null entries (value_counts ignores NaN by default).
dominance_threshold = 0.90
print(f'Columns where one value covers more than {dominance_threshold:.0%} of the non-null rows')
columns_to_drop = []
for c in df_train.columns:
    value_shares = df_train[c].value_counts(normalize=True)
    if (value_shares > dominance_threshold).any():
        print(c)
        columns_to_drop.append(c)
# Rebind rather than drop in place so the cell has no hidden side effect on
# frames displayed by earlier cells.
df_train = df_train.drop(columns=columns_to_drop)

### Analyzing numerical columns

In [None]:
df_numerical_columns = df_train.select_dtypes(include=['float64', 'int64']).columns

In [None]:
df_numerical_columns

In [None]:
# Remove the row identifier by name. The positional slice [1:] stripped one
# more real feature every time this cell was re-run.
df_numerical_columns = df_numerical_columns.drop('Id', errors='ignore')

In [None]:
df_numerical_columns

In [None]:
# Number of distinct values in each numerical column.
for c, n_distinct in df_train[df_numerical_columns].nunique().items():
    print(f'{c}: {n_distinct}')

In [None]:
column_unique_value =  [c for c in df_numerical_columns if df_train[c].nunique() < 5]

In [None]:
for c in column_unique_value:
    print(f'{c} values:\n{df_train[c].value_counts()}')

In [None]:
ax = df_train.groupby('YrSold').size().plot(kind='line')
ax.tick_params(axis='x', colors='white')
ax.tick_params(axis='y', colors='white')

In [None]:
ax2 = df_train.groupby('MoSold').size().plot(kind='bar')
ax2.tick_params(axis='x', colors='white')
ax2.tick_params(axis='y', colors='white')

In [None]:
gYrSold = df_train.groupby('YrSold')

In [None]:
plot_multiples_graphs(gYrSold, 'MoSold',3 , 2)

### Analyzing categorical columns

In [None]:
df_categorical_columns = df_train.select_dtypes(include=['object']).columns

In [None]:
# Plain Python list of the categorical column names.
categorical_columns = list(df_categorical_columns)

In [None]:
categorical_columns

In [None]:

# One horizontal bar chart per categorical column, showing the relative
# frequency (in percent) of its most common values.
for c in categorical_columns:
    # Show at most 16 categories per chart.
    max_count = np.min([16, df_train[c].nunique()])
    # 1.5 units of height per category, capped at 7.
    fig_height = np.round(np.min([max_count * 1.5, 7]))
    # Percentage of non-null rows per value, most frequent first, truncated to max_count.
    freqs = (df_train[c].dropna().astype(str).value_counts(normalize=True, ascending=False) * 100)[:max_count]
    fig, ax = plt.subplots(figsize=(5, fig_height))
    sns.set_palette("rocket")
    sns.barplot(y=freqs.index, x=freqs, ax=ax, orient="h", order=freqs.index)
    # Title/labels are in Portuguese: "Most common values", "Value", "Frequency".
    ax = formatar_grafico_barra_horizontal(ax, anotar=True, anot_format='{:.2f} %',
                                           title=f"Valores mais comuns ({c})",
                                           ylabel="Valor",
                                           xlabel="Frequência")
    plt.show()

## Checking correlations between features

In [None]:
# Spearman is a rank correlation, not a linear one, so the title names the
# method instead of calling it "linear".
plot_matriz_correlacao(df_train[df_numerical_columns], anotar=True,
                       titulo='Correlação de Spearman das variáveis numéricas',
                       metodo='spearman')

In [None]:
# Restrict to numeric columns explicitly: DataFrame.corr() raises on object
# columns in pandas >= 2.0 and silently dropped them in older versions.
corr_df_train = df_train.select_dtypes(include='number').corr()

In [None]:
# Features whose correlation with SalePrice exceeds 0.60.
corr_with_price = corr_df_train["SalePrice"]
corr_with_price[corr_with_price > 0.60]