In [10]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn import model_selection

In [11]:
df=pickle.load(open("df.data","rb"))
df.head()

Unnamed: 0,location,size,total_sqft,bath,balcony,price,Carpet Area,Plot Area,Super built-up Area,ready_to_move
0,56,2.0,1056.0,2.0,1.0,39.07,0,0,1,0
2,171,3.0,1440.0,2.0,3.0,62.0,0,0,0,1
3,4552,3.0,1521.0,3.0,1.0,95.0,0,0,1,1
4,56,2.0,1200.0,2.0,1.0,51.0,0,0,1,1
5,409,2.0,1170.0,2.0,1.0,38.0,0,0,1,1


In [12]:
X=df.drop('price',axis=1)
y=df['price']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Hypertunning parameters

In [14]:
# use tunning parameter to select the best model with best parameters

def find_best_model(X,y):
    model = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'Random_Forest': {
            'model': RandomForestRegressor(),
            'params': {
                'bootstrap': [True,False],   # sampling with replacement
                'max_features': ['auto', 'sqrt']   #auto take all features/sqrt tke root  
            }
        },
        'Xgboost': {
            'model': xgboost.XGBRegressor(),
            'params': {
                'max_depth' : [20,50,10],
                'learning_rate': [0.09,0.01,0.05],
                'n_estimators':[30,10,20]   # no of trees
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for model, config in model.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': model,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.707608,{'normalize': True}
1,Random_Forest,0.753952,"{'bootstrap': True, 'max_features': 'sqrt'}"
2,Xgboost,0.73567,"{'learning_rate': 0.09, 'max_depth': 10, 'n_es..."


In [15]:
# Know we know that random forest is best fit to the real estate data, we can use this model for further prediction
RF = RandomForestRegressor()
RF.fit(X_train,y_train)
RF.score(X_test,y_test)

0.732267432572881

In [16]:
# data now is ready to be used
import pickle
pickle.dump(find_best_model(X,y),open("model.data","wb"))

In [17]:
df=pickle.load(open("model.data","rb"))