Let's go.

In [None]:
import pandas as pd

In [None]:
def openData():
    df=pd.read_csv("../data.csv")
    df.drop_duplicates()
    return df

house=openData()
house.info()
house['age'] = (1403-house['buildYear']) # adding age for better corr
house['price']/=1000000000 # easier control
house=house.dropna(subset=['price'])

just working a bit with the dataset to understand what's going on. you can pass this.

In [None]:
print(house.describe())

In [None]:
import matplotlib.pyplot as plt

**making the test set**

In [None]:
from zlib import crc32
import numpy as np
def is_id_in_test_set(identifier, test_ratio):
    return crc32(np.int64(identifier)) < test_ratio * 2**32
def split_data_with_id_hash(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: is_id_in_test_set(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]


adding an index to data as it's needed.

In [None]:
house_with_id = house.reset_index() # adds an `index` column
train_set, test_set = split_data_with_id_hash(house_with_id, 0.2, "index")

stratified sampling:

In [None]:
house["meter_cat"] = pd.cut(house["meter"],
bins=[0., 40, 80, 120, 160, 200., np.inf],
labels=[1, 2, 3, 4, 5, 6])
house["meter_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Income category")
plt.ylabel("Number of districts")
# plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
stratSplits = []
for trainIndex, testIndex in splitter.split(house, house["meter_cat"]):
    stratTrainSetN = house.iloc[trainIndex]
    stratTestSetN = house.iloc[testIndex]
    stratSplits.append([stratTrainSetN, stratTestSetN])
    
stratTrainSet, stratTestSet = stratSplits[0]

In [None]:
for set_ in ( stratTrainSet ,stratTestSet):
    set_.drop("meter_cat", axis=1 , inplace=True)

_**START of the MODEL pre-process**_

In [None]:
house = stratTrainSet.copy()
corr_matrix = house.corr()
print(corr_matrix['price'].sort_values(ascending=False))

_note 1_ : wow it seems that meter is really important!

In [None]:

print(stratTrainSet.describe())
house = stratTrainSet.drop('price',axis=1)
labels = stratTrainSet['price'].copy()
print(labels.describe())

imputation

In [None]:
def imputeFrequent(cols,h):
    for col in cols:
        frequent = house[col].mode()
        h[[col]]=h[[col]].fillna(float(frequent),inplace=False)
    return h
house = imputeFrequent(['floor','rooms','meter','buildYear'], house)
house.describe()


*Handling district*

In [None]:
from sklearn.preprocessing import OneHotEncoder
houseCategory=house[['district']]
housingOneHot = OneHotEncoder().fit_transform(houseCategory)
housingOneHot.toarray()

number attributes:

In [None]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
houseNum=house.select_dtypes(np.number)
housingNumStd = std_scaler.fit_transform(houseNum)

from sklearn.preprocessing import FunctionTransformer
log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)
log_pop = log_transformer.transform(house[["meter"]])

**And finally The PIPELINE**

In [None]:
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
("impute", SimpleImputer(strategy='most_frequent')),
("standardize", StandardScaler()),
])
houseNumPrepared=num_pipeline.fit_transform(houseNum)
cat_pipeline = make_pipeline(
SimpleImputer(strategy="most_frequent"),
OneHotEncoder(handle_unknown="ignore"))

preprocessing = ColumnTransformer([
    ('cat',cat_pipeline,['district'])
],remainder=num_pipeline)
housing_prepared = preprocessing.fit_transform(house)


**Train**

In [None]:
from sklearn.tree import DecisionTreeRegressor
reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=7))
reg.fit(house,labels)

**Result**
Linear regression: 5.641 rmse (wait, what? underFitting!)
Decision tree: 0.01 rmse (wait, what? overFitting!)

In [None]:
housing_predictions = reg.predict(house)
print(housing_predictions[:5]/1000000000)
print(labels.iloc[:5].values/1000000000)

In [None]:
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(labels, housing_predictions,squared=False)
print(rmse/1000000000)# in billion

**Really, Really Testing for overfitting**

In [None]:
from sklearn.model_selection import cross_val_score
realRMSE = -cross_val_score(reg, house, labels,
scoring="neg_root_mean_squared_error", cv=10)
pd.Series(realRMSE).describe() 

In [52]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = make_pipeline(preprocessing,
RandomForestRegressor(random_state=4))
forest_rmses = -cross_val_score(forest_reg, house, labels,
scoring="neg_root_mean_squared_error", cv=5, random_state=2)

TypeError: cross_val_score() got an unexpected keyword argument 'random_state'

In [51]:
print(pd.Series(forest_rmses).describe())
forest_reg.fit(house,labels)
housing_predictions = forest_reg.predict(house)
print(housing_predictions[:5])
print(labels.iloc[:5].values)


count     5.000000
mean      8.869477
std       2.478523
min       5.508141
25%       7.479745
50%       8.907142
75%      10.884948
max      11.567409
dtype: float64
[11.7146     12.72709643  2.5918      1.6134     23.625     ]
[13.   12.65  2.45  1.2  24.  ]
