In [None]:
# Important libraries

import numpy as np # for linear algebra
import pandas as pd # for data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Show every column when a DataFrame is displayed (None removes the column cap).
# Call the documented top-level `pd.set_option` directly instead of going
# through the `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)

In [None]:
# Load the training split from the Kaggle input directory.
train_csv_path = '../input/house-prices-advanced-regression-techniques/train.csv'
d_train = pd.read_csv(train_csv_path)

# Report the size of the data as (rows, columns).
print(d_train.shape)

In [None]:
# Preview the first five records of the training data.
d_train.head(n=5)

# Data Analysis
   1. Find all **missing** values
   2. Find all **numerical** variables
   3. Find the **distribution** of the numerical variables
   4. Find all the **categorical** variables
   5. **Cardinality** of categorical variables
   6. Discover the **outliers**
   7. Relationship between independent and dependent features (in this case, 'SalePrice' is the dependent one)

# Missing Values

1) Listing all features (columns) with missing values:

In [None]:
# Names of the columns holding at least one missing value, in column order.
has_missing = d_train.isnull().any()
features_na = [column for column, flag in has_missing.items() if flag]

2) Print the feature name and the percentage of missing values

In [None]:
# For every feature with gaps, print the percentage of rows that are missing.
for column in features_na:
    missing_share = d_train[column].isnull().mean()
    missing_pct = np.round(missing_share * 100, 4)
    print(column, missing_pct, '% missing values.')

## Finding the relationship between missing values and target variable
For each feature with missing values, we create an indicator that is '1' where the observation is missing and '0' where it is present, and then calculate the median of the target variable ('SalePrice') for the rows where the information is missing and for the rows where it is present.

In [None]:
# Compare the median sale price of rows where a feature is missing (1)
# against rows where it is present (0), one bar chart per feature.
for column in features_na:
    # Build only the 0/1 missing-value indicator instead of deep-copying the
    # whole frame each iteration; the Series keeps the column's name, so the
    # grouped index (and the bar chart's x-axis) is still labelled with it.
    is_missing = d_train[column].isnull().astype(int)

    median_price = d_train.groupby(is_missing)['SalePrice'].median()
    median_price.plot.bar(color=['blue', 'orange'])
    plt.title(column)
    plt.show()

With this relation, we can see that in almost all of the features, missing values ('1') are related to high 'SalePrice' values. Considering this, we need to replace these missing values with meaningful data rather than simply deleting them.

# Numerical features
Listing and visualizing numerical features (notice that not all of the numerical features are quantitative features - e.g. one of them is the 'Id' of the houses and some of them are dates):

In [None]:
# A column is treated as numerical when its dtype is not 'O'
# (the letter O, i.e. pandas' object dtype).
num_features = [column for column, dtype in d_train.dtypes.items() if dtype != 'O']

print('Number of numerical variables: ', len(num_features))

d_train[num_features].head()

### Temporal variables (i.e.: datetime variables)
In this dataset, we have 4 year (datetime) variables - 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold'. We can use a simple name-based rule to find them:

In [None]:
# Temporal columns are recognised by a 'Yr' or 'Year' fragment in their name.
year_feature = [name for name in num_features if any(tag in name for tag in ('Yr', 'Year'))]

year_feature

In [None]:
# Show the sorted distinct values stored in each temporal feature.
for column in year_feature:
    distinct_years = d_train[column].sort_values().unique()
    print(column, distinct_years)

* Analysing temporal datetime variables (relation between year sold (grouped) and median house price):

In [None]:
# Median sale price for each year of sale, drawn as a line plot.
median_by_year = d_train.groupby('YrSold')['SalePrice'].median()
median_by_year.plot()
plt.xlabel('Year sold')
plt.ylabel('Sale price')
plt.title('Year sold x House prices')
# Render explicitly, like the other plotting cells, so the cell does not
# echo the Text(...) object returned by plt.title as its output.
plt.show()

We can see that the relation seems counter-intuitive: as the years go by, sale prices seem to decrease (maybe this shouldn't be happening). Therefore, we will compare 'SalePrice' with the difference between 'YrSold' and each of the other year features - **these differences represent the "age" of the building at the time it was sold, because they are the number of years between the two dates**:

In [None]:
# Scatter the sale price against the number of years between each temporal
# feature and the year of sale (i.e. the "age" at the time of sale).
for column in year_feature:
    if column == 'YrSold':
        # The sale year itself has no age relative to itself.
        continue

    # Compute the elapsed years directly from the source frame; no copy of
    # the whole DataFrame is needed because nothing is modified.
    years_before_sale = d_train['YrSold'] - d_train[column]

    plt.scatter(years_before_sale, d_train['SalePrice'])
    # The axis shows a difference in years, not the raw year column.
    plt.xlabel('YrSold - ' + column)
    plt.ylabel('SalePrice')
    plt.show()

**In the plots above, the x-axis shows the "age" of the buildings at the time of sale. We can see that** **newer buildings tend to have higher sale prices**.

### Discrete variables
We'll treat as discrete those numerical variables with fewer than 25 unique values (in this particular case). For that:

In [None]:
# A numerical feature is discrete when it has fewer than 25 distinct values
# (a missing value counts as one value); year columns and 'Id' are left out.
not_discrete = set(year_feature) | {'Id'}
discrete_feature = [
    column for column in num_features
    if column not in not_discrete and d_train[column].nunique(dropna=False) < 25
]
print('Discrete variables count: {}'.format(len(discrete_feature)))

In [None]:
# List the discrete feature names, then preview their first rows.
print(discrete_feature)
d_train.loc[:, discrete_feature].head()

* Now, we can find if there is any relation between them (grouped) and the target variable ('SalePrice'):

In [None]:
# Median sale price per level of each discrete feature, one bar chart each.
for column in discrete_feature:
    # Read straight from d_train: nothing is modified, so no per-iteration
    # copy of the whole frame is required.
    median_price = d_train.groupby(column)['SalePrice'].median()
    median_price.plot.bar()
    plt.xlabel(column)
    plt.ylabel('SalePrice')
    plt.title(column + ' x Sale price')
    plt.show()

### Continuous variables
Now that we have already defined the discrete variables, the logic we're going to use is the opposite one: a numerical feature is continuous when it is in neither `discrete_feature` nor `year_feature` (and is not 'Id'):

In [None]:
# Continuous features are the numerical ones that are neither discrete,
# nor temporal, nor the 'Id' key.
non_continuous = set(discrete_feature) | set(year_feature) | {'Id'}
continuous_feature = [column for column in num_features if column not in non_continuous]
print('Continuous feature count: {}'.format(len(continuous_feature)))

* We can analyze the continuous values by creating histograms to understand their distribution:

In [None]:
# Histogram (25 bins) of every continuous feature to inspect its distribution.
for column in continuous_feature:
    # Plot directly from d_train: the data is only read, so copying the
    # whole frame on every iteration is unnecessary.
    d_train[column].hist(bins=25)
    plt.xlabel(column)
    plt.ylabel('Count of ' + column)
    plt.title(column + ' distribution')
    plt.show()

### Normalization for continuous variables

Above, we can see that most of the continuous variables are not normally distributed. We can bring them closer to a normal distribution by applying a **logarithmic transformation**:

In [None]:
# Log-log scatter of each strictly positive continuous feature against the
# sale price (both x and y are log-transformed).
# The target's transform does not depend on the loop, so compute it once.
# Taking it from d_train also means 'SalePrice' is logged exactly once even
# when the looped feature is 'SalePrice' itself.
log_sale_price = np.log(d_train['SalePrice'])

for column in continuous_feature:
    values = d_train[column]
    # The logarithm is undefined at zero (and for negative values), so
    # features containing such values are skipped.
    if (values <= 0).any():
        continue

    plt.scatter(np.log(values), log_sale_price)
    plt.xlabel(column)
    plt.ylabel('Sale Price')
    plt.title(column)
    plt.show()

### Outliers check

To find the outliers in the **continuous variables**, we can use boxplots, which make them easier to visualize:

In [None]:
# Boxplot of the log-transformed values of every continuous feature that
# contains no zeros (the log of zero is undefined, so those are skipped).
for column in continuous_feature:
    data = d_train.copy()
    contains_zero = 0 in data[column].unique()
    if contains_zero:
        continue
    data[column] = np.log(data[column])
    data.boxplot(column=column)
    plt.ylabel(column)
    plt.title(column + ' boxplot (normalized)')
    plt.show()

# Categorical features
Now, it's time to analyze which variables are categorical and compare them:

In [None]:
# Categorical features are the columns stored with dtype 'O' (the letter O,
# i.e. pandas' object dtype).
# Read the dtypes from d_train itself so this cell does not depend on a
# scratch variable left behind by an earlier cell.
categorical_features = [feature for feature in d_train.columns if d_train[feature].dtypes == 'O']
categorical_features

In [None]:
# Preview the first rows of the categorical columns only.
d_train.loc[:, categorical_features].head()

### Cardinality
We should find out how many categories each feature has (the cardinality of each feature).

In [None]:
# Report the number of distinct categories of each categorical feature
# (a missing value counts as one category).
for column in categorical_features:
    category_count = d_train[column].nunique(dropna=False)
    print('The feature is "{}" and there are {} different categories.'.format(column, category_count))

## Finding the relationship between categorical features and the dependent feature ('SalePrice')
We can use **bar plot** and **median values** (considering we have lots of outliers, as seen above) to better visualize the relationship between categorical features and our target variable, 'SalePrice':

In [None]:
# Median sale price per category of each categorical feature, one bar chart each.
for column in categorical_features:
    # Group d_train directly: the data is only read, so there is no need to
    # copy the whole frame on every iteration.
    median_price = d_train.groupby(column)['SalePrice'].median()
    median_price.plot.bar()
    plt.xlabel(column)
    plt.ylabel('SalePrice')
    plt.title(column + ' x SalePrice')
    plt.show()