## Kaggle House Prices Competition

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Folder that holds the competition CSV files. File names are appended with
# plain string concatenation (`path + 'test.csv'`), so the trailing separator
# must stay.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
path = r'C:\Users\ndalessandro\Desktop\House_Prices/'

In [3]:
def k_read_data(path, dataset):
    '''
    This function loads a CSV dataset into a DataFrame and confirms the load:
        >>> Parameter 1: "path" [str] = prefix prepended to "dataset" (plain string
                         concatenation, so include the trailing separator).
        >>> Parameter 2: "dataset" [str] = file name of the dataset we want to fetch.
        ----------------------------------------------------------------------------------
        <<< Return: [DataFrame] the loaded dataset. A load confirmation with the
                    dataset shape (rows, columns) is printed as a side effect.
    '''
    # read_csv already yields a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
    df = pd.read_csv(path + dataset)
    # shape[0] is the number of rows and shape[1] the number of columns.
    print('>>> Data successfully loaded. The DataFrame contains', df.shape[0], 'rows, and', df.shape[1], 'columns.')
    # Hand the data back to the caller; without this the loaded frame is lost.
    return df

In [4]:
def k_cleanUp_columns(df):
    '''
    This function renames the columns of the given dataset in place: every column
    name is lowercased and its spaces are replaced by underscores.
        >>> Parameter 1: [DataFrame] Pandas DataFrame (modified in place).
        ------------------------------------------------
        <<< Return: None
    '''
    # Build the old-name -> new-name mapping first, then apply it in one rename.
    new_names = {}
    for label in df.columns.tolist():
        new_names[label] = label.lower().replace(' ', '_')
    df.rename(columns=new_names, inplace=True)

In [5]:
def k_info(df):
    '''
    This function prints a header followed by the DataFrame.info() summary of the
    given dataset.
        >>> Parameter 1: [DataFrame] Pandas DataFrame.
        ------------------------------------------------
        <<< Return: Whatever DataFrame.info() returns (it prints the summary and
                    returns None).
    '''
    header = 'Column data types and description: \n'
    print(header)
    summary = df.info()
    return summary

In [6]:
def k_describe(df):
    '''
    This function prints a header and returns the DataFrame.describe() statistics
    of the given dataset, rounded to two decimals:
        >>> Parameter 1: [DataFrame] Pandas DataFrame.
        ------------------------------------------------
        <<< Return: [DataFrame] describe() output rounded to 2 decimal places.
    '''
    print('Numerical values description:')
    statistics = df.describe()
    return statistics.round(2)

In [7]:
def k_numericalf_range(df):
    '''
    This function prints a header and returns the range (max - min) of every
    numerical column in the given dataset:
        >>> Parameter 1: [DataFrame] Pandas DataFrame.
        ------------------------------------------------
        <<< Return: [Series] numerical feature names mapped to their range.
    '''
    def _spread(column):
        # Range of one column: largest value minus smallest value.
        return column.max() - column.min()

    print('Numerical features range:')
    numeric_columns = df.select_dtypes(include=[np.number])
    return numeric_columns.apply(_spread)

In [8]:
def k_categoricalf_info(df):
    '''
    This function prints a header and returns, for every object-dtype column in the
    given dataset, the number of non-null values it holds (Series.count()).
        >>> Parameter 1: [DataFrame] Pandas DataFrame.
        ------------------------------------------------
        <<< Return: [Series] object-dtype feature names mapped to their non-null count.
    '''
    def _filled(column):
        # Series.count() counts the non-null entries, not the distinct values.
        return column.count()

    print('Categorical features counts:')
    object_columns = df.select_dtypes(include='object')
    return object_columns.apply(_filled)

In [91]:
def k_nullvaluesf_count(df):
    '''
    This function prints a header and returns the columns of the given dataset that
    contain null values together with how many nulls each one has:
        >>> Parameter 1: [DataFrame] Pandas DataFrame.
        ------------------------------------------------
        <<< Return: [dict] feature name -> number of null values, only for
                    features that have at least one null.
    '''
    print('Features with null values:')
    total_rows = df.shape[0]
    has_nulls = df.isnull().any()
    # Nulls per column = total rows minus the non-null count of that column.
    return {
        feature: total_rows - df.loc[:, feature].count()
        for feature in df.columns[has_nulls]
    }

In [92]:
# Note the order: data1 is read from test.csv and data2 from train.csv.
data1 = pd.read_csv(path + 'test.csv')
data2 = pd.read_csv(path + 'train.csv')

In [93]:
# Rename data1's columns in place (lowercase, spaces -> underscores); returns None.
k_cleanUp_columns(data1)

In [94]:
# Print the DataFrame.info() summary of data1 (the frame read from test.csv).
k_info(data1)

Column data types and description: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
id               1459 non-null int64
mssubclass       1459 non-null int64
mszoning         1455 non-null object
lotfrontage      1232 non-null float64
lotarea          1459 non-null int64
street           1459 non-null object
alley            107 non-null object
lotshape         1459 non-null object
landcontour      1459 non-null object
utilities        1457 non-null object
lotconfig        1459 non-null object
landslope        1459 non-null object
neighborhood     1459 non-null object
condition1       1459 non-null object
condition2       1459 non-null object
bldgtype         1459 non-null object
housestyle       1459 non-null object
overallqual      1459 non-null int64
overallcond      1459 non-null int64
yearbuilt        1459 non-null int64
yearremodadd     1459 non-null int64
roofstyle        1459 non-null object
roofmatl         1459 non-n

In [95]:
# describe() statistics of data1, rounded to two decimals.
k_describe(data1)   

Numerical values description:


Unnamed: 0,id,mssubclass,lotfrontage,lotarea,overallqual,overallcond,yearbuilt,yearremodadd,masvnrarea,bsmtfinsf1,...,garagearea,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,miscval,mosold,yrsold
count,1459.0,1459.0,1232.0,1459.0,1459.0,1459.0,1459.0,1459.0,1444.0,1458.0,...,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,57.38,68.58,9819.16,6.08,5.55,1971.36,1983.66,100.71,439.2,...,472.77,93.17,48.31,24.24,1.79,17.06,1.74,58.17,6.1,2007.77
std,421.32,42.75,22.38,4955.52,1.44,1.11,30.39,21.13,177.63,455.27,...,217.05,127.74,68.88,67.23,20.21,56.61,30.49,630.81,2.72,1.3
min,1461.0,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,1825.5,20.0,58.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,...,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,2190.0,50.0,67.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.5,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2554.5,70.0,80.0,11517.5,7.0,6.0,2001.0,2004.0,164.0,753.5,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [96]:
# max - min for every numeric column of data1.
k_numericalf_range(data1)

Numerical features range:


id                1458.0
mssubclass         170.0
lotfrontage        179.0
lotarea          55130.0
overallqual          9.0
overallcond          8.0
yearbuilt          131.0
yearremodadd        60.0
masvnrarea        1290.0
bsmtfinsf1        4010.0
bsmtfinsf2        1526.0
bsmtunfsf         2140.0
totalbsmtsf       5095.0
1stflrsf          4688.0
2ndflrsf          1862.0
lowqualfinsf      1064.0
grlivarea         4688.0
bsmtfullbath         3.0
bsmthalfbath         2.0
fullbath             4.0
halfbath             2.0
bedroomabvgr         6.0
kitchenabvgr         2.0
totrmsabvgrd        12.0
fireplaces           4.0
garageyrblt        312.0
garagecars           5.0
garagearea        1488.0
wooddecksf        1424.0
openporchsf        742.0
enclosedporch     1012.0
3ssnporch          360.0
screenporch        576.0
poolarea           800.0
miscval          17000.0
mosold              11.0
yrsold               4.0
dtype: float64

In [97]:
# Non-null value count for every object-dtype column of data1.
k_categoricalf_info(data1)

Categorical features counts:


mszoning         1455
street           1459
alley             107
lotshape         1459
landcontour      1459
utilities        1457
lotconfig        1459
landslope        1459
neighborhood     1459
condition1       1459
condition2       1459
bldgtype         1459
housestyle       1459
roofstyle        1459
roofmatl         1459
exterior1st      1458
exterior2nd      1458
masvnrtype       1443
exterqual        1459
extercond        1459
foundation       1459
bsmtqual         1415
bsmtcond         1414
bsmtexposure     1415
bsmtfintype1     1417
bsmtfintype2     1417
heating          1459
heatingqc        1459
centralair       1459
electrical       1459
kitchenqual      1458
functional       1457
fireplacequ       729
garagetype       1383
garagefinish     1381
garagequal       1381
garagecond       1381
paveddrive       1459
poolqc              3
fence             290
miscfeature        51
saletype         1458
salecondition    1459
dtype: int64

In [98]:
# Labels of the data1 columns that contain at least one null value.
nulls = data1.columns[data1.isnull().any()]
# Bare expression so the notebook displays the resulting Index.
nulls

Index(['mszoning', 'lotfrontage', 'alley', 'utilities', 'exterior1st',
       'exterior2nd', 'masvnrtype', 'masvnrarea', 'bsmtqual', 'bsmtcond',
       'bsmtexposure', 'bsmtfintype1', 'bsmtfinsf1', 'bsmtfintype2',
       'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'bsmtfullbath',
       'bsmthalfbath', 'kitchenqual', 'functional', 'fireplacequ',
       'garagetype', 'garageyrblt', 'garagefinish', 'garagecars', 'garagearea',
       'garagequal', 'garagecond', 'poolqc', 'fence', 'miscfeature',
       'saletype'],
      dtype='object')

In [103]:
# Null count per column of data1, as a dict limited to columns that have nulls.
k_nullvaluesf_count(data1)

Features with null values:


{'mszoning': 4,
 'lotfrontage': 227,
 'alley': 1352,
 'utilities': 2,
 'exterior1st': 1,
 'exterior2nd': 1,
 'masvnrtype': 16,
 'masvnrarea': 15,
 'bsmtqual': 44,
 'bsmtcond': 45,
 'bsmtexposure': 44,
 'bsmtfintype1': 42,
 'bsmtfinsf1': 1,
 'bsmtfintype2': 42,
 'bsmtfinsf2': 1,
 'bsmtunfsf': 1,
 'totalbsmtsf': 1,
 'bsmtfullbath': 2,
 'bsmthalfbath': 2,
 'kitchenqual': 1,
 'functional': 2,
 'fireplacequ': 730,
 'garagetype': 76,
 'garageyrblt': 78,
 'garagefinish': 78,
 'garagecars': 1,
 'garagearea': 1,
 'garagequal': 78,
 'garagecond': 78,
 'poolqc': 1456,
 'fence': 1169,
 'miscfeature': 1408,
 'saletype': 1}