Let's go.

In [None]:
import pandas as pd

In [None]:
def openData(path="../data.csv"):
    """Load the dataset CSV and return it without duplicate rows.

    Args:
        path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing calls keep working.

    Returns:
        A DataFrame with the file's contents, with exact duplicate rows
        removed (the first occurrence of each row is kept).
    """
    raw = pd.read_csv(path)
    # drop_duplicates() returns a new frame instead of modifying `raw`;
    # its result has to be returned or the de-duplication is silently lost.
    return raw.drop_duplicates()
# Load the table, discard the columns that are not used further on, and
# keep only the rows that have a price.
df = (
    openData()
    .drop(columns=['district', 'test', 'rooms', 'floor'])
    .dropna(subset=['price'])
)
# Extra column: the square of `averageDistrictValue`.
df['random'] = df['averageDistrictValue'] ** 2

df.info()

Just exploring the dataset a bit to understand what's going on. You can skip this section.

In [None]:
# Summary statistics of the cleaned frame, printed as plain text.
summary_stats = df.describe()
print(summary_stats)

In [None]:
import matplotlib.pyplot as plt
# Histogram grid of the frame's columns, to eyeball their distributions.
df.hist(figsize=(12, 8), bins=25)
plt.show()

**Deleting the outliers**

In [None]:
import numpy as np
from scipy import stats
# 2% / 98% price quantiles of the unfiltered data. They are only printed
# below as a reference; the actual cut-offs are the hand-picked bounds.
outlierHigh = df["price"].quantile(0.98)
outlierLow = df["price"].quantile(0.02)

# Exclusive (low, high) bounds a row must satisfy to be kept.
PRICE_BOUNDS = (2e9, 65e9)
METER_BOUNDS = (35, 2000)
BUILD_YEAR_BOUNDS = (1355, 1404)

# NOTE: the former bare `df.reset_index()` call was removed. It returned a
# new frame that was thrown away, so it never had any effect on `df`.
keep_rows = (
    (df['price'] > PRICE_BOUNDS[0]) & (df['price'] < PRICE_BOUNDS[1])
    & (df['meter'] > METER_BOUNDS[0]) & (df['meter'] < METER_BOUNDS[1])
    & (df['buildYear'] > BUILD_YEAR_BOUNDS[0]) & (df['buildYear'] < BUILD_YEAR_BOUNDS[1])
)
df = df[keep_rows]

# Quantiles scaled by 1e9 for readability.
print(outlierLow/1e9, outlierHigh/1e9)
df.describe()

**making the test set**

In [None]:
from zlib import crc32
import numpy as np
def is_id_in_test_set(identifier, test_ratio):
    """Decide from a row identifier alone whether the row goes to the test set.

    The identifier is converted to a 64-bit integer and hashed with CRC32;
    the row is a test row when the checksum is below ``test_ratio * 2**32``.
    The same identifier therefore always gets the same answer.
    """
    checksum = crc32(np.int64(identifier))
    threshold = test_ratio * 2**32
    return checksum < threshold
def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into (train, test) using a hash of ``id_column``.

    Each value of ``id_column`` is passed to ``is_id_in_test_set``; rows for
    which it returns True form the test part, all other rows the train part.

    Returns:
        A ``(train, test)`` tuple of row subsets of ``data``.
    """
    def _goes_to_test(row_id):
        return is_id_in_test_set(row_id, test_ratio)

    test_mask = data[id_column].apply(_goes_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part


Adding an index column to the data, since the hash-based split needs an identifier for each row.

In [None]:
# reset_index() turns the current row labels into a regular `index` column,
# which then serves as the identifier for the hash-based split.
house_with_id = df.reset_index()
train_set, test_set = split_data_with_id_hash(
    data=house_with_id,
    test_ratio=0.2,
    id_column="index",
)

stratified sampling:

In [None]:
print(df['meter'].describe())

# Bucket `meter` into six size bands; the category is only used as the
# stratification key for the train/test split.
df["meter_cat"] = pd.cut(
    df["meter"].astype(float),
    bins=[0., 40, 80, 120, 160, 200., np.inf],
    labels=[1, 2, 3, 4, 5, 6],
)

# Bar chart of how many rows fall into each band. The axis labels now
# describe this plot (they previously read "Income category" /
# "Number of districts", which does not match the data shown).
df["meter_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Meter category")
plt.ylabel("Number of houses")
plt.show()

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split (20% test) keyed on `meter_cat`, with a fixed
# seed so the split is reproducible.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

stratSplits = []
for trainIndex, testIndex in splitter.split(X=df, y=df["meter_cat"]):
    # The splitter yields positional indices, hence iloc.
    stratTrainSetN, stratTestSetN = df.iloc[trainIndex], df.iloc[testIndex]
    stratSplits.append([stratTrainSetN, stratTestSetN])

# n_splits=1, so the first entry is the only (train, test) pair.
stratTrainSet, stratTestSet = stratSplits[0]

In [None]:
# The helper column was only needed for stratification. Rebinding to a new
# frame (instead of an in-place drop on slices taken from `df`) avoids
# SettingWithCopyWarning, and errors="ignore" keeps the cell re-runnable
# once the column is already gone.
stratTrainSet = stratTrainSet.drop(columns="meter_cat", errors="ignore")
stratTestSet = stratTestSet.drop(columns="meter_cat", errors="ignore")

_**START of the MODEL pre-process**_

In [None]:
# Work on a copy of the training set so the exploration cannot alter it.
house = stratTrainSet.copy()
corr_matrix = house.corr()

# Correlation of every column with the price, strongest positive first.
price_correlations = corr_matrix['price'].sort_values(ascending=False)
print(price_correlations)

_note 1_ : wow it seems that meter is really important!

In [None]:
# Features: every training column except the target.
house = stratTrainSet.drop(columns='price')
# Target, kept as a one-column DataFrame.
labels = stratTrainSet[['price']].copy()

**And finally The PIPELINE**

In [None]:
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# Numeric-only view of the training features.
houseNum = house.select_dtypes(include=np.number)

# Both pipelines consist of a single step that fills missing values with
# each column's most frequent value.
num_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy='most_frequent')),
])
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"))

houseNumPrepared = num_pipeline.fit_transform(houseNum)

# No explicit column groups are listed, so every column is handled by the
# `remainder` transformer, i.e. by num_pipeline.
preprocessing = ColumnTransformer(transformers=[], remainder=num_pipeline)
preprocessing.fit(house)
preprocessing

**Train**

**Really, Really Testing for overfitting**

In [None]:
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Preprocessing followed by a random forest with a fixed seed.
forest_reg = make_pipeline(
    preprocessing,
    RandomForestRegressor(random_state=4),
)

# Evaluate both metrics in a single 7-fold run. Calling cross_val_score once
# per metric refits the same seven forests twice for identical folds.
cv_scores = cross_validate(
    forest_reg,
    house,
    labels['price'],
    cv=7,
    scoring=["neg_mean_absolute_percentage_error", "neg_mean_absolute_error"],
)

# The scorers return negated errors; flip the sign to get positive errors.
forest_mape = -cv_scores["test_neg_mean_absolute_percentage_error"]
forest_mae = -cv_scores["test_neg_mean_absolute_error"]

In [None]:

# Cross-validated MAE: mean and standard deviation, both scaled by 1e9 so
# the two numbers are on the same scale (previously only the mean was scaled).
print(forest_mae.mean()/1e9, forest_mae.std()/1e9)
# Cross-validated MAPE: mean and standard deviation across the folds.
print(forest_mape.mean(), forest_mape.std())

# Fit on the full training set and predict on that same data; comparing
# these in-sample predictions with the CV errors shows how much the model
# overfits.
forest_reg.fit(house, labels['price'])
housing_predictions = forest_reg.predict(house)


**Fine tune**

In [None]:
from sklearn.model_selection import GridSearchCV
# Same preprocessing + random forest as before, but with named steps so the
# grid can address the forest's hyper-parameters as `random_forest__<param>`.
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(random_state=4)),
])

# Each hyper-parameter currently has a single candidate value, so the
# "search" evaluates exactly one configuration.
param_grid = [
    dict(
        random_forest__max_features=[4],
        random_forest__n_estimators=[100],
        random_forest__max_depth=[20],
        random_forest__min_samples_leaf=[1],
    )
]

grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_absolute_error',
)
# ravel() flattens the one-column label frame into the 1-D target array.
grid_search.fit(house, labels.values.ravel())

In [None]:
print(grid_search.best_params_)

# Tabulate the cross-validation results, best mean test score first.
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)
cv_res.head()
