In [5]:
import pandas as pd
import scipy as sc
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

from sklearn.externals.joblib import dump, load
import pickle
# create a folder for the web-app files
import os
# Create the folder that will hold the web-app artifacts. exist_ok=True makes
# the cell safe to re-run and removes the check-then-create race.
os.makedirs("forsalewebapp", exist_ok=True)

### import forsale df

In [22]:
# Load the raw for-sale listings; low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
forsale_df = pd.read_csv(
    'PATH TO FILE HERE',
    sep=',',
    low_memory=False,
)

In [23]:
## drop the columns the models should not see
unwanted_cols = ["Unnamed: 0", "adress", "sub_area", 'date_update', 'is_sold']
forsale_df_ml = forsale_df.drop(columns=unwanted_cols)

# pop + re-assign moves the target column to the last position
repc = forsale_df_ml.pop('price')
forsale_df_ml['price'] = repc

## Linear Regression model

In [24]:
# target vector and feature matrix for the linear model
y = forsale_df_ml['price']
X = forsale_df_ml.drop(columns=['price'])

### Label encoding and feature scaling

In [25]:
# integer-encode every column independently (the encoder is refit per column)
X = X.apply(lambda column: LabelEncoder().fit_transform(column))

In [26]:
# standardise each encoded column on its own
cols = X.columns.tolist()
for col_name in cols:
    X[col_name] = scale(X[col_name])

### Modeling and scores

In [27]:
# seeded train/test split (default split proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [28]:
# fit() returns the estimator itself, so construction and fitting chain
linear_model = LinearRegression().fit(X_train, y_train)

# predictions on both splits, used for the scores reported next
y_train_pred = linear_model.predict(X_train)
y_pred = linear_model.predict(X_test)

In [29]:
# R^2 on each split (the printed labels call it "Accuracy")
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_pred)

print("r2 score with LinearRegression model")
print("Accuracy on training data: ", train_r2)
print("Accuracy on test data: ", test_r2)

r2 score with LinearRegression model
Accuracy on training data:  0.3570794918768261
Accuracy on test data:  0.34603899631907


## Lasso model

### One-hot encoding and scaling of categorical features

In [30]:
# every column except the target is a candidate feature
featurs = forsale_df_ml.drop(columns=['price'])

In [31]:
## one-hot encode the object/category-typed columns, then take the raw array
combine_dummies = pd.get_dummies(data=featurs)
result = combine_dummies.values

In [32]:
# learn per-column mean/std first, then apply them to the same matrix
scaler = StandardScaler().fit(result)
result = scaler.transform(result)

In [33]:
# feature matrix (one row per listing) and target array for the Lasso model
n_rows = forsale_df_ml.shape[0]
X = result[:n_rows]
y = forsale_df_ml['price'].values

### Modeling and scores

In [34]:
# seeded train/test split for the Lasso model
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=15,
)

In [35]:
# Lasso with default settings; fit() returns the fitted estimator
lasso_model = Lasso().fit(X_train, y_train)

# predictions on both splits, used for the scores reported next
y_train_pred = lasso_model.predict(X_train)
y_pred = lasso_model.predict(X_test)

In [36]:
# R^2 on each split (the printed labels call it "Accuracy")
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_pred)

print("r2 score with Lasso model")
print("Accuracy on training data: ", train_r2)
print("Accuracy on test data: ", test_r2)

r2 score with Lasso model
Accuracy on training data:  0.6021581500789062
Accuracy on test data:  0.632429764672518


## Random Forest Regressor

### Encoding categorical values

In [37]:
# features = everything but the target
featurs = forsale_df_ml.drop(columns=['price'])

# one-hot encode only the explicitly listed categorical columns
categ_col = ["aprtment_type", "city", "state_of_asset"]
featurs_dum = pd.get_dummies(data=featurs, columns=categ_col)
result = featurs_dum.values

### scaling

In [38]:
# Fit a single scaler and reuse it. The original fitted two separate scalers
# on the same matrix; with one object, `origin_scale_rf` (the scaler that is
# persisted later) is the very one that transformed the training data.
origin_scale_rf = StandardScaler().fit(result)
scaler = origin_scale_rf  # keep the `scaler` name bound, as before
result = origin_scale_rf.transform(result)

In [39]:
# feature matrix (one row per listing) and target array for the random forest
n_rows = forsale_df_ml.shape[0]
X = result[:n_rows]
y = forsale_df_ml['price'].values

### Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Split the random-forest feature matrix before searching. Without this,
# X_train / y_train still hold the split made for the Lasso model, which was
# built from a different feature matrix. The seed matches the split in the
# modelling cell further down, so both cells work on the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

param_grid = {
    'bootstrap': [True],
    'max_depth': [60, 70, 80, 90],
    # NOTE: 'auto' (all features, for regressors) was removed in
    # scikit-learn 1.3; use 1.0 instead on newer versions.
    'max_features': ['auto'],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [2, 4, 6],
    'n_estimators': [100, 144, 300, 600]
    }

# Seed the forest so the search result is reproducible between runs.
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

##grid_search.best_params_ result
#   {'bootstrap': True,
#    'max_depth': 70,
#    'max_features': 'auto',
#    'min_samples_leaf': 2,
#    'min_samples_split': 4,
#    'n_estimators': 144}

### Modeling and scores

In [40]:
# seeded train/test split of the random-forest feature matrix
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [41]:
#rf_model=grid_search.best_estimator_
# Hand-picked hyperparameters. random_state pins bootstrap sampling and
# feature sub-sampling, so the scores below are reproducible across runs.
rf_model = RandomForestRegressor(
    bootstrap=True,
    max_depth=15,
    max_features=16,
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=144,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# predictions on both splits, used for the R^2 scores printed below
y_pred = rf_model.predict(X_test)
y_train_pred = rf_model.predict(X_train)

print("r2 score with RendomForestReg model")
print("Accuracy on training data: " , r2_score(y_train, y_train_pred))
print("Accuracy on test data: ", r2_score(y_test, y_pred))

r2 score with RendomForestReg model
Accuracy on training data:  0.7668810226203149
Accuracy on test data:  0.6923211575366653


### Add BEST (randomforestreg) predicted prices to original df

In [42]:
# predict a price for every listing (train and test rows alike)
final_labels = rf_model.predict(X)
final_result = pd.DataFrame(data={'salePrice': final_labels})

In [43]:
# attach the model's estimate as a new column (pandas aligns on row index)
predicted = final_result['salePrice']
forsale_df_ml["predicted price"] = predicted

In [44]:
# compare the mean listed price with the mean predicted price
price_avg, pred_price_avg = (
    forsale_df_ml[column].mean() for column in ("price", "predicted price")
)

print("True prices average:", round(price_avg, 2))
print("Predicted prices average:", round(pred_price_avg, 2))

True prices average: 2312463.95
Predicted prices average: 2314699.45


### web app

In [45]:
# saving necessary files for the app

# Context manager so the pickle file handle is flushed and closed.
with open("forsalewebapp/model_new_forsale.pickle", 'wb') as model_file:
    pickle.dump(rf_model, model_file)

dump(origin_scale_rf, 'forsalewebapp/std_scaler_forsale.bin', compress=True)

# One row is enough: the file is only used for its dummy-column layout.
tmpdummies = featurs_dum[:1]
# Forward slash, like the two paths above. The original backslash was an
# invalid '\d' escape and is not a path separator outside Windows.
tmpdummies.to_csv('forsalewebapp/dum_col_forsale.csv', index=False)

In [4]:
import os

import anvil.server

# Read the uplink key from the environment so the secret does not have to be
# stored in the notebook; the literal placeholder remains as a fallback.
anvil.server.connect(os.environ.get('ANVIL_UPLINK_KEY', 'KEY CODE TO SERVER'))

In [6]:
@anvil.server.callable
def predict_forsale(aprtment_type, city, rooms, floor, size, balcony,
       building_floors, state_of_asset, parkings, is_aircon,
       is_borderwin, is_elevator, is_renovated, is_storeroom,
       is_furneter):
    """Predict the sale price of a single listing with the saved model.

    The arguments are assembled into a one-row DataFrame, one-hot encoded,
    re-indexed to the column layout stored in
    ``forsalewebapp/dum_col_forsale.csv`` (columns absent from this row are
    filled with 0), scaled with the saved scaler and passed to the pickled
    model.

    Parameters
    ----------
    aprtment_type, city, state_of_asset
        Passed through unchanged and then one-hot encoded.
        (presumably strings matching the training categories - TODO confirm)
    rooms, floor, size, balcony, building_floors, parkings
        Passed through unchanged as feature values.
    is_aircon, is_borderwin, is_elevator, is_renovated, is_storeroom, is_furneter
        Flags; each is converted with ``int()`` before use.

    Returns
    -------
    int
        The model's prediction for the row, converted with ``int()``.
    """
    tmpdf = {"aprtment_type": aprtment_type,
             "city": city,
             "rooms": rooms,
             "floor": floor,
             "size": size,
             "balcony": balcony,
             "building_floors": building_floors,
             "state_of_asset": state_of_asset,
             "parkings": parkings,
             "is_aircon": int(is_aircon),
             "is_borderwin": int(is_borderwin),
             "is_elevator": int(is_elevator),
             "is_renovated": int(is_renovated),
             "is_storeroom": int(is_storeroom),
             "is_furneter": int(is_furneter)
             }
    tmpdf = pd.DataFrame(tmpdf, index=[0])

    # Encode this row, then align it to the saved column layout so the
    # feature order matches what the scaler and model expect.
    dummy_new = pd.get_dummies(tmpdf)
    # Forward slashes work on every OS; the original backslash paths were
    # invalid escapes ('\d', '\s') and are not separators outside Windows.
    dum_col = pd.read_csv('forsalewebapp/dum_col_forsale.csv', sep=',')
    dummy_new = dummy_new.reindex(columns=dum_col.columns, fill_value=0)
    res = dummy_new.values

    # Named `scaler` rather than `sc` to avoid shadowing the scipy alias.
    scaler = load('forsalewebapp/std_scaler_forsale.bin')
    new_res = scaler.transform(res)

    # NOTE: pickle.load executes arbitrary code from the file; only load
    # artifacts produced by this notebook.
    with open('forsalewebapp/model_new_forsale.pickle', 'rb') as model_file:
        model = pickle.load(model_file)

    # predict() returns a length-1 array for the single row; take the element
    # explicitly, since int() on a 1-d array is deprecated in NumPy >= 1.25.
    pred_price = model.predict(new_res)
    return int(pred_price[0])