Let's go.

In [None]:
import pandas as pd

In [None]:
def openData():
    df=pd.read_csv("../data.csv")
    df.drop_duplicates()
    return df
house=openData()
house.info()
house=house.drop(['district','test'],axis=1)
house=house.dropna(subset=['price'])

just working a bit with the dataset to understand what's going on. you can pass this.

In [None]:
print(house.describe())

In [None]:
import matplotlib.pyplot as plt
house.hist(bins=50, figsize=(12, 8))
plt.show()

**Deleting the outliers**

In [None]:
import numpy as np
from scipy import stats
outlierHigh = house["price"].quantile(0.99)
outlierLow = house["price"].quantile(0.01)
house.reset_index()
house=house[(house['price'] > outlierLow) & (house['price'] < outlierHigh)]
house.describe()

**making the test set**

In [None]:
from zlib import crc32
import numpy as np
def is_id_in_test_set(identifier, test_ratio):
    return crc32(np.int64(identifier)) < test_ratio * 2**32
def split_data_with_id_hash(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: is_id_in_test_set(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]


adding an index to data as it's needed.

In [None]:
house_with_id = house.reset_index() # adds an `index` column
train_set, test_set = split_data_with_id_hash(house_with_id, 0.2, "index")

stratified sampling:

In [None]:
print(house['meter'].describe())
house["meter_cat"] = pd.cut(house["meter"].astype(float),
bins=[0., 40, 80, 120, 160, 200., np.inf],
labels=[1, 2, 3, 4, 5, 6])
house["meter_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Income category")
plt.ylabel("Number of districts")
# plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
stratSplits = []
for trainIndex, testIndex in splitter.split(house, house["meter_cat"]):
    stratTrainSetN = house.iloc[trainIndex]
    stratTestSetN = house.iloc[testIndex]
    stratSplits.append([stratTrainSetN, stratTestSetN])
    
stratTrainSet, stratTestSet = stratSplits[0]

In [None]:
for set_ in ( stratTrainSet ,stratTestSet):
    set_.drop("meter_cat", axis=1 , inplace=True)

_**START of the MODEL pre-process**_

In [None]:
house = stratTrainSet.copy()
corr_matrix = house.corr()
print(corr_matrix['price'].sort_values(ascending=False))

_note 1_ : wow it seems that meter is really important!

In [None]:

print(stratTrainSet.describe())
house = stratTrainSet.drop('price',axis=1)
labels = stratTrainSet[['price']].copy()
print(labels.describe())

imputation

In [None]:
def imputeFrequent(cols,h):
    for col in cols:
        frequent = house[col].mode()
        h[[col]]=h[[col]].fillna(float(frequent),inplace=False)
    return h
house = imputeFrequent(['floor','rooms','meter','buildYear'], house)
house.describe()


*Handling district*

number attributes:

In [None]:
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
houseNum=house.select_dtypes(np.number)
housingNumStd = std_scaler.fit_transform(houseNum)

from sklearn.preprocessing import FunctionTransformer
log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)
log_pop = log_transformer.transform(house[["meter"]])

**And finally The PIPELINE**

In [None]:
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

num_pipeline = Pipeline([
("impute", SimpleImputer(strategy='most_frequent')),
("standardize", StandardScaler()),
])
houseNumPrepared=num_pipeline.fit_transform(houseNum)
cat_pipeline = make_pipeline(
SimpleImputer(strategy="most_frequent"))

preprocessing = ColumnTransformer([
],remainder=num_pipeline)
housing_prepared = preprocessing.fit_transform(house)


**Train**

**Really, Really Testing for overfitting**

In [199]:
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
forest_reg = make_pipeline(preprocessing,
RandomForestRegressor(random_state=4))
print(sklearn.metrics.get_scorer_names())
forest_mae = -cross_val_score(forest_reg, house, labels['price'],cv=7,scoring="neg_mean_absolute_error")
forest_rmse = -cross_val_score(forest_reg, house, labels['price'],cv=7,scoring="neg_mean_squared_error")

['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'matthews_corrcoef', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_negative_likelihood_ratio', 'neg_root_mean_squared_error', 'normalized_mutual_info_score', 'positive_likelihood_ratio', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weight

In [200]:
from math import sqrt
print(sqrt(forest_rmse.mean())/1e9,forest_rmse.std())
print(forest_mae.mean()/1e9,forest_mae.std())
print(type(labels['price']))
forest_reg.fit(house,labels['price'])
housing_predictions = forest_reg.predict(house)
print(housing_predictions[:5]/1e9)
print(labels['price'][:5]/1e9)


3.0297352090086807 2.0832913193623483e+18
1.5269846301908618 98308056.39135148
<class 'pandas.core.series.Series'>
[ 6.62326 28.811    4.9133   3.91633  3.76355]
2582     6.0000
1231    22.1000
2565     4.8000
1010     3.9375
2904     4.1000
Name: price, dtype: float64
