In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [40]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train.shape, df_test.shape

((1460, 81), (1459, 80))

In [41]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## EDA 

### Corr HeatMap

plt.figure(figsize=(12,9))
sns.heatmap(df_train.corr())
plt.show()

### builtYear

df_train.groupby('YearBuilt')['SalePrice'].describe()

plt.figure(figsize=(18,8))

sns.boxplot(x='YearBuilt', y='SalePrice', data=df_train)
plt.xticks(rotation='vertical', fontsize=8)
plt.show()

### SaleType 

plt.figure(figsize=(18,8))

sns.boxplot(x='SaleType', y='SalePrice', data=df_train)
#plt.xticks(rotation='vertical', fontsize=8)
plt.show()

### YrSold

plt.figure(figsize=(18,8))

sns.boxplot(x='YrSold', y='SalePrice', data=df_train)
#plt.xticks(rotation='vertical', fontsize=8)
plt.show()

### TotalFlrSF, LowQualRatio

df_train['TotalFlrSF'] = df_train['1stFlrSF'] + df_train['2ndFlrSF']
df_train['LowQualRatio'] = round(df_train['LowQualFinSF'] / df_train['TotalFlrSF'], 2)

plt.figure(figsize=(18,8))

sns.boxplot(x='TotalFlrSF', y='SalePrice', data=df_train)
#plt.xticks(rotation='vertical', fontsize=8)
plt.show()

plt.figure(figsize=(15,6))

df_train.groupby('TotalFlrSF')['SalePrice'].describe()['mean'].plot()
plt.show()

plt.figure(figsize=(18,8))

sns.boxplot(x='LowQualRatio', y='SalePrice', data=df_train)
plt.xticks(rotation='vertical', fontsize=8)
plt.show()

## 질적 데이터 인코딩

In [25]:
qualitative = [f for f in df_train.columns if df_train.dtypes[f] == 'object']
qualitative

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [42]:
def qual_encoding_dict(df, column) :
    encoding = pd.DataFrame()
    encoding[column] = df[column].unique()
    encoding['SalePrice'] = df[[column, 'SalePrice']].groupby(column).mean()['SalePrice']
    encoding = encoding.sort_values('SalePrice').reset_index(drop=True)
    encoding = encoding.reset_index().set_index(column)
    encoding = encoding['index'].to_dict()
    
    return encoding

In [43]:
def qual_preprocess() :
    qualitative = [f for f in df_train.columns if df_train.dtypes[f] == 'object']
    
    enc_dicts = {}
    enc_labels = []
    
    for col in qualitative :
        enc_dicts[col] = qual_encoding_dict(df_train, col)
        
    for col in qualitative :
        encoded_label = col + '_E'
        df_train[encoded_label] = df_train[col].map(enc_dicts[col])
        df_test[encoded_label] = df_test[col].map(enc_dicts[col])
        enc_labels.append(encoded_label)
        
    return enc_labels, qualitative

In [44]:
enc_labels, origin_labels = qual_preprocess()
enc_labels, origin_labels

(['MSZoning_E',
  'Street_E',
  'Alley_E',
  'LotShape_E',
  'LandContour_E',
  'Utilities_E',
  'LotConfig_E',
  'LandSlope_E',
  'Neighborhood_E',
  'Condition1_E',
  'Condition2_E',
  'BldgType_E',
  'HouseStyle_E',
  'RoofStyle_E',
  'RoofMatl_E',
  'Exterior1st_E',
  'Exterior2nd_E',
  'MasVnrType_E',
  'ExterQual_E',
  'ExterCond_E',
  'Foundation_E',
  'BsmtQual_E',
  'BsmtCond_E',
  'BsmtExposure_E',
  'BsmtFinType1_E',
  'BsmtFinType2_E',
  'Heating_E',
  'HeatingQC_E',
  'CentralAir_E',
  'Electrical_E',
  'KitchenQual_E',
  'Functional_E',
  'FireplaceQu_E',
  'GarageType_E',
  'GarageFinish_E',
  'GarageQual_E',
  'GarageCond_E',
  'PavedDrive_E',
  'PoolQC_E',
  'Fence_E',
  'MiscFeature_E',
  'SaleType_E',
  'SaleCondition_E'],
 ['MSZoning',
  'Street',
  'Alley',
  'LotShape',
  'LandContour',
  'Utilities',
  'LotConfig',
  'LandSlope',
  'Neighborhood',
  'Condition1',
  'Condition2',
  'BldgType',
  'HouseStyle',
  'RoofStyle',
  'RoofMatl',
  'Exterior1st',
  'Exteri

In [45]:
df_train[enc_labels]

Unnamed: 0,MSZoning_E,Street_E,Alley_E,LotShape_E,LandContour_E,Utilities_E,LotConfig_E,LandSlope_E,Neighborhood_E,Condition1_E,...,GarageType_E,GarageFinish_E,GarageQual_E,GarageCond_E,PavedDrive_E,PoolQC_E,Fence_E,MiscFeature_E,SaleType_E,SaleCondition_E
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,2,0,2,0,...,1,1,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,1,0,3,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0,0,0,0,0,0,0,0,17,0,...,0,0,0,0,0,0,0,0,0,0
1456,0,0,0,0,0,0,0,0,6,0,...,0,1,0,0,0,0,1,0,0,0
1457,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,3,1,0,0
1458,0,0,0,0,0,0,0,0,11,0,...,0,1,0,0,0,0,0,0,0,0


In [46]:
df_test[enc_labels]

Unnamed: 0,MSZoning_E,Street_E,Alley_E,LotShape_E,LandContour_E,Utilities_E,LotConfig_E,LandSlope_E,Neighborhood_E,Condition1_E,...,GarageType_E,GarageFinish_E,GarageQual_E,GarageCond_E,PavedDrive_E,PoolQC_E,Fence_E,MiscFeature_E,SaleType_E,SaleCondition_E
0,4.0,0,0,0,0,0.0,0,0,11,1,...,0,1,0,0,0,0,1,0,0.0,0
1,0.0,0,0,1,0,0.0,2,0,11,0,...,0,1,0,0,0,0,0,2,0.0,0
2,0.0,0,0,1,0,0.0,0,0,17,0,...,0,2,0,0,0,0,1,0,0.0,0
3,0.0,0,0,1,0,0.0,0,0,17,0,...,0,2,0,0,0,0,0,0,0.0,0
4,0.0,0,0,1,3,0.0,0,0,18,0,...,0,0,0,0,0,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1.0,0,0,0,0,0.0,0,0,14,0,...,4,3,3,2,0,0,0,0,0.0,0
1455,1.0,0,0,0,0,0.0,0,0,14,0,...,3,1,0,0,0,0,0,0,0.0,1
1456,0.0,0,0,0,0,0.0,0,0,4,0,...,1,1,0,0,0,0,0,0,0.0,1
1457,0.0,0,0,0,0,0.0,0,0,4,0,...,4,3,3,2,0,0,1,1,0.0,0


## 결측치

In [47]:
cols_fillna = ['PoolQC','MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageYrBlt', 'GarageType', 'Electrical',
               'KitchenQual', 'SaleType', 'Functional', 'Exterior2nd', 'Exterior1st',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2',
               'Utilities', 'LotFrontage', 'MasVnrArea']

for col in cols_fillna:
    df_train[col].fillna('None',inplace=True)
    df_test[col].fillna('None',inplace=True)

## 추가 column 생성 

In [48]:
for features in [df_train, df_test] :
    features = features.drop(['Utilities', 'Street', 'PoolQC',], axis=1)
    features['haspool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
    features['has2ndfloor'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
    features['hasgarage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    features['hasbsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    features['hasfireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [49]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 124 entries, Id to SaleCondition_E
dtypes: int64(78), object(46)
memory usage: 1.4+ MB


In [20]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               1459 non-null   int64  
 1   MSSubClass       1459 non-null   int64  
 2   LotFrontage      1459 non-null   object 
 3   LotArea          1459 non-null   int64  
 4   OverallQual      1459 non-null   int64  
 5   OverallCond      1459 non-null   int64  
 6   YearBuilt        1459 non-null   int64  
 7   YearRemodAdd     1459 non-null   int64  
 8   MasVnrArea       1459 non-null   object 
 9   BsmtFinSF1       1458 non-null   float64
 10  BsmtFinSF2       1458 non-null   float64
 11  BsmtUnfSF        1458 non-null   float64
 12  TotalBsmtSF      1458 non-null   float64
 13  1stFlrSF         1459 non-null   int64  
 14  2ndFlrSF         1459 non-null   int64  
 15  LowQualFinSF     1459 non-null   int64  
 16  GrLivArea        1459 non-null   int64  
 17  BsmtFullBath  

## 모델 테스트

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

In [50]:
df_train = df_train.drop(origin_labels, axis=1)
df_test = df_test.drop(origin_labels, axis=1)

In [51]:
y_train = df_train['SalePrice'].values
X_train = df_train.drop('SalePrice', axis=1)
X_train.shape, y_train.shape

((1460, 80), (1460,))

In [36]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Id               1460 non-null   int64 
 1   MSSubClass       1460 non-null   int64 
 2   LotFrontage      1460 non-null   object
 3   LotArea          1460 non-null   int64 
 4   OverallQual      1460 non-null   int64 
 5   OverallCond      1460 non-null   int64 
 6   YearBuilt        1460 non-null   int64 
 7   YearRemodAdd     1460 non-null   int64 
 8   MasVnrArea       1460 non-null   object
 9   BsmtFinSF1       1460 non-null   int64 
 10  BsmtFinSF2       1460 non-null   int64 
 11  BsmtUnfSF        1460 non-null   int64 
 12  TotalBsmtSF      1460 non-null   int64 
 13  1stFlrSF         1460 non-null   int64 
 14  2ndFlrSF         1460 non-null   int64 
 15  LowQualFinSF     1460 non-null   int64 
 16  GrLivArea        1460 non-null   int64 
 17  BsmtFullBath     1460 non-null   

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, shuffle=True, random_state=34)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((1168, 80), (292, 80), (1168,), (292,))

In [53]:
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
mean_squared_error(y_valid, logistic.predict(X_valid))

ValueError: could not convert string to float: 'None'