# CLEANING

In [136]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [137]:
data = pd.read_csv('../kc_house_data_train.csv')

### Unnamed and ID (DROP)

In [138]:
data.drop(columns=['Unnamed: 0', 'id'], inplace=True)

### Date (DROP)

Justification: Only two representative years. Don't want to overfit the model to these two years.

In [139]:
data.drop(columns='date', inplace=True)

In [140]:
data.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,365000.0,4,2.25,2070,8893,2.0,0,0,4,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,865000.0,5,3.0,2900,6730,1.0,0,0,5,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283


# TRAIN TEST SPLIT

In [141]:
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
print(len(train_set), "train +", len(test_set), "test")

13832 train + 3458 test


### Bedrooms

In [142]:
train_set.bedrooms = train_set.bedrooms.apply(lambda x: 1 if x <= 3 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [143]:
train_set.bedrooms.value_counts()

1     8172
4     4388
5     1036
6      190
7       28
8        9
9        4
10       3
11       1
33       1
Name: bedrooms, dtype: int64

In [144]:
train_set.bedrooms = train_set.bedrooms.apply(lambda x: 2 if (x > 1 and x <= 4) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [145]:
train_set.bedrooms.value_counts()

1     8172
2     4388
5     1036
6      190
7       28
8        9
9        4
10       3
11       1
33       1
Name: bedrooms, dtype: int64

In [146]:
train_set.bedrooms = train_set.bedrooms.apply(lambda x: 3 if x > 4 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [147]:
train_set.bedrooms.value_counts()

1    8172
2    4388
3    1272
Name: bedrooms, dtype: int64

In [148]:
train_set.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
2498,429900.0,1,2.5,2370,5353,2.0,0,0,3,8,2370,0,2009,0,98019,47.7333,-121.975,2130,6850
10932,285000.0,2,1.75,2080,13629,1.0,0,0,4,7,1040,1040,1955,0,98178,47.4866,-122.232,1780,14659


### Bathrooms

In [149]:
train_set.bathrooms = train_set.bathrooms.apply(lambda x: 1 if x <= 1.5 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [150]:
train_set.bathrooms.unique()

array([2.5 , 1.75, 1.  , 2.25, 3.5 , 2.  , 3.  , 3.75, 3.25, 2.75, 4.75,
       4.5 , 4.  , 5.  , 6.  , 5.25, 4.25, 8.  , 5.5 , 6.25, 5.75, 6.75,
       6.5 , 7.75])

In [151]:
train_set.bathrooms = train_set.bathrooms.apply(lambda x: 2 if (x > 1 and x<= 2.5) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [152]:
train_set.bathrooms.unique()

array([2.  , 1.  , 3.5 , 3.  , 3.75, 3.25, 2.75, 4.75, 4.5 , 4.  , 5.  ,
       6.  , 5.25, 4.25, 8.  , 5.5 , 6.25, 5.75, 6.75, 6.5 , 7.75])

In [153]:
train_set.bathrooms = train_set.bathrooms.apply(lambda x: 3 if x > 2.5 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [154]:
train_set.bathrooms = train_set.bathrooms.apply(lambda x: int(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [155]:
train_set.bathrooms.unique()

array([2, 1, 3])

In [156]:
train_set.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
2498,429900.0,1,2,2370,5353,2.0,0,0,3,8,2370,0,2009,0,98019,47.7333,-121.975,2130,6850
10932,285000.0,2,2,2080,13629,1.0,0,0,4,7,1040,1040,1955,0,98178,47.4866,-122.232,1780,14659


### Sqft_living

In [126]:
data.sqft_living = data.sqft_living.apply(lambda x: 10000 if x >= 10000 else x)

In [129]:
data.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,365000.0,2,2,2070,8893,2.0,0,0,4,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,865000.0,3,3,2900,6730,1.0,0,0,5,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283


### Sqft_lot

In [128]:
data.sqft_lot = data.sqft_lot.apply(lambda x: 50000 if x >= 50000 else x)

In [130]:
data.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,365000.0,2,2,2070,8893,2.0,0,0,4,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,865000.0,3,3,2900,6730,1.0,0,0,5,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283


### Floors

Keep as is

### Waterfront

Keep as is

### View

Keep as is

### Condition

Keep as is

### Grade

Keep as is

### Sqft_above

Keep as is

### Sqft_basement

Keep as is

### Yr_built

Keep as is

### Zipcode

### Lat & Long

### Sqft_living15

### Sqft_lot15