In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default styling to all subsequent matplotlib figures.
sns.set()

# Render figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 reloads imported modules before each cell runs.
%load_ext autoreload
%autoreload 2

# Show the installed seaborn version (the catplot cell further down mentions seaborn 0.9.0).
sns.__version__

In [None]:
# Read train and test data of 762 assignment.
# The files are downloaded from
# https://www.kaggle.com/c/iowa-house-prices-regression-techniques/data
# Adapt DATA_DIR to your local download folder; it is the only path to change.
DATA_DIR = '/Users/obi/04-Research/dsds/data/raw/DS19cn0_AmesHousing'

# index_col=0 uses the first CSV column as the row index of each frame.
df = pd.read_csv(DATA_DIR + '/train.csv', index_col=0)
test = pd.read_csv(DATA_DIR + '/test.csv', index_col=0)

In [None]:
# Preview the first five rows of the training set
df.head(n=5)

In [None]:
# Inspect column types and non-null counts of the training set.
# 'object' dtype indicates categorical variables, which need to be
# one-hot-encoded before they can be used as numeric features.
df.info()

# Check missing values

In [None]:
# Is there at least one missing value anywhere in the training set?
# .isna() yields a boolean frame; .any() over its underlying array answers directly.
df.isna().values.any()  # Yes, there are missing values

In [None]:
# Decision: drop every column that contains at least one missing value

df2 = df.dropna(axis='columns')
# Show the remaining shape and confirm that no missing values are left in df2
df2.shape, df2.isna().values.any()

In [None]:
# Re-check column types on df2, the frame without columns containing missing values.
# 'object' dtype indicates categorical variables.
df2.info()

# Visualize categorical data

## Use catplot for categorical data

In [None]:
# Show the installed seaborn version before using catplot in the next cell.
sns.__version__

In [None]:
# Bar plot of SalePrice per zoning class.
# If catplot is not available, update seaborn on the Anaconda console:
#   conda install seaborn==0.9.0
sns.catplot(data=df2, kind='bar', x='MSZoning', y='SalePrice')

In [None]:
# Alternative implementation: aggregate with .groupby, then plot with pandas
# Warning: different colour scheme
df2.groupby('MSZoning')['SalePrice'].mean().plot.bar()

# Use lmplot for visualizing continuous features

In [None]:
# Scatter plot with regression line on the raw data:
# LotArea seems to have some outliers
sns.lmplot(data=df, x='LotArea', y='SalePrice')

In [None]:
# (df['LotArea'] > 100000) returns a boolean Series, which contains 4 True values;
# summing it counts them
(df['LotArea'] > 100000).sum()

In [None]:
# Create new dataframe, which excludes the LotArea outliers (> 100000).
# The mask is built from df2 itself rather than from df, so the filter does
# not rely on the two frames sharing the same row index.
df3 = df2.loc[df2['LotArea'] <= 100000]

In [None]:
# Check if outliers have been excluded.
# The plot indicates that a log-transformation of LotArea might be reasonable
sns.lmplot(data=df3, x='LotArea', y='SalePrice')

In [None]:
# Work on an independent copy so that new columns can be added
# without touching df3
df4 = df3.copy(deep=True)

In [None]:
# Add the natural logarithm of LotArea as a new feature column
df4['log_LotArea'] = np.log(df4['LotArea'])

In [None]:
# Visualize the new column.
# Looks fine, but perhaps polynomial features would be useful, too?
# e.g. df4['log_LotArea2'] = np.log(df4.LotArea)**2
sns.lmplot(data=df4, x='log_LotArea', y='SalePrice')

## Use violin plot for comparing categories

In [None]:
# Boolean flag column: True where the zoning class (MSZoning) equals 'FV'
df4['MSZoning_FV'] = (df4['MSZoning'] == 'FV')

In [None]:
# Compare the sale price distributions across land contours, with each
# violin split by the MSZoning_FV flag.
# FV zone seems to coincide with land contour 'Lvl'
# We have to check the data description at http://jse.amstat.org/v19n3/decock/DataDocumentation.txt
sns.violinplot(
    data=df4,
    x='LandContour',
    y='SalePrice',
    hue='MSZoning_FV',
    inner="quart",
    split=True,
)