In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as mp
import numpy as np
%matplotlib inline
matplotlib.rcParams["figure.figsize"] =(20,10)

In [None]:
# Load the raw listings; the relative path means the CSV must sit in the working directory.
data = pd.read_csv("Mumbai1.csv")

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# (rows, columns) of the raw frame.
data.shape

In [None]:
# Number of rows for each distinct Area value.
data.groupby("Area")["Area"].count()

In [None]:
# Number of rows for each distinct Car Parking value.
data.groupby("Car Parking")["Car Parking"].count()

In [None]:
# Number of rows for each distinct Landscaped Gardens value.
data.groupby("Landscaped Gardens")["Landscaped Gardens"].count()

In [None]:
# Number of rows for each distinct Swimming Pool value.
data.groupby("Swimming Pool")["Swimming Pool"].count()

In [None]:
# Drop the columns that are not used in the rest of the notebook.
new_data_1 = data.drop(
    columns=[
        "Gymnasium", "Intercom", "Indoor Games", "Jogging Track", "Clubhouse",
        "Gas Connection", "Unnamed: 0", "24x7 Security", "Maintenance Staff",
        "Landscaped Gardens", "Swimming Pool", "New/Resale",
    ]
)
# Random sample of five rows to eyeball the result.
new_data_1.sample(5)

In [None]:
# Remove the last column that is not used downstream.
new_data = new_data_1.drop(columns=["Children's Play Area"])
# Random sample of five rows to eyeball the result.
new_data.sample(5)

In [None]:
# Data cleaning: count the missing values in each column.
new_data.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row (displayed only, not removed).
new_data.duplicated().sum()

In [None]:
# Column names that remain after the drops.
new_data.columns

In [None]:
# Shorten the bedroom-count column name to 'bhk' (modifies new_data in place).
new_data.rename(columns = {'No. of Bedrooms':'bhk'}, inplace= True)

In [None]:
# Shorten 'Lift Available' to 'Lift' (modifies new_data in place).
new_data.rename(columns = {'Lift Available':'Lift'}, inplace= True)

In [None]:
# Shorten 'Car Parking' to 'Parking' (modifies new_data in place).
new_data.rename(columns = {'Car Parking':'Parking'}, inplace= True)

In [None]:
# NOTE(review): "Children's Play Area" was already dropped when new_data was
# created from new_data_1, so this rename matches no column and changes nothing.
new_data.rename(columns = {"Children's Play Area":'PlayArea'}, inplace= True)

In [None]:
# Inspect listings that report more than 10 bedrooms.
new_data[new_data.bhk>10]

In [None]:
# Distinct values in the Area column.
new_data.Area.unique()

In [None]:
def is_float(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Only the errors a failed conversion can raise are treated as "not a
    float". A bare ``except`` would also swallow KeyboardInterrupt and
    SystemExit, hiding a user interrupt during a long ``apply``.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable
        # string; OverflowError: an int too large to represent as a float.
        return False
    return True

In [None]:
# Derived column: Price divided by Area.
# The name implies Area is measured in square feet — TODO confirm units.
new_data['price_per_sqft'] = new_data['Price']/new_data['Area']

In [None]:
# Show up to 10 rows whose price_per_sqft is rejected by is_float.
new_data[~new_data['price_per_sqft'].apply(is_float)].head(10)

In [None]:
# Preview the first 10 rows, now including price_per_sqft.
new_data.head(10)

In [None]:
# Number of distinct Location values.
len(new_data.Location.unique())

In [None]:
# Inspect listings with less than 300 units of Area per bedroom.
# These rows are only displayed here; nothing is removed.
new_data[new_data.Area/new_data.bhk<300].head()

In [None]:
# Summary statistics of price_per_sqft before outlier removal.
new_data.price_per_sqft.describe()

In [None]:
def remove_outliers(data):
  """Keep, per Location, only rows within one standard deviation of the mean price_per_sqft.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'Location' and 'price_per_sqft'.

  Returns
  -------
  pandas.DataFrame
      Rows with ``mean - std < price_per_sqft <= mean + std`` for their own
      location, concatenated in group order with a fresh 0..n-1 index.
      An empty DataFrame is returned when ``data`` has no groups.
  """
  kept = []
  for _, subdf in data.groupby('Location'):
    m = np.mean(subdf.price_per_sqft)
    # np.std uses ddof=0 (population standard deviation).
    st = np.std(subdf.price_per_sqft)
    kept.append(subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))])
  if not kept:
    # pd.concat raises on an empty list; keep the "no groups" result an empty frame.
    return pd.DataFrame()
  # Concatenate once instead of growing a frame inside the loop: this avoids
  # quadratic copying and does not depend on how pandas treats an empty seed frame.
  return pd.concat(kept,ignore_index=True)

In [None]:
# Apply the per-location price_per_sqft filter; data_1 is the working frame from here on.
data_1 = remove_outliers(new_data)

In [None]:
# Save the filtered frame without the index column.
# NOTE(review): this snapshot is written before the later cells modify data_1.
data_1.to_csv("bhp.csv",index=False)

In [None]:
# (rows, columns) after outlier removal.
data_1.shape

In [None]:
# Preview the filtered frame.
data_1.head()

In [None]:
# Count the missing values in each column of the filtered frame.
data_1.isnull().sum()

In [None]:
# Trim surrounding whitespace from every location name.
data_1['Location'] = data_1['Location'].map(lambda name: name.strip())
# Listings per location, most frequent first.
location_stats = data_1['Location'].value_counts()
location_stats

In [None]:
# Total number of listings across all locations.
location_stats.values.sum()

In [None]:
# Number of locations with more than 10 listings.
len(location_stats[location_stats>10])

In [None]:
# Locations with 10 or fewer listings (the filter is <=10, despite the "less_than_10" name).
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

In [None]:
# Relabel every location with 10 or fewer listings as 'other'.
is_rare_location = data_1['Location'].isin(location_stats_less_than_10.index)
data_1['Location'] = data_1['Location'].mask(is_rare_location, 'other')
# Number of distinct locations left after bucketing.
len(data_1['Location'].unique())

In [None]:
# Preview the frame after rare locations were relabelled.
data_1.head()

In [None]:
def plot_scatter_chart(df,Location):
  """Scatter Price against Area for the 1, 2 and 3 BHK listings of one location.

  Draws on the current pyplot figure and sets the global default figure size
  to (15, 10). Returns None.
  """
  in_location = df.Location==Location
  subsets = {rooms: df[in_location & (df.bhk==rooms)] for rooms in (1, 2, 3)}
  matplotlib.rcParams['figure.figsize'] = (15,10)
  # One series per bedroom count; the 2 BHK series uses the default marker.
  series_styles = (
      (1, {'marker': '*', 'color': 'green', 'label': '1 BHK'}),
      (2, {'color': 'blue', 'label': '2 BHK'}),
      (3, {'marker': '+', 'color': 'red', 'label': '3 BHK'}),
  )
  for rooms, style in series_styles:
    subset = subsets[rooms]
    mp.scatter(subset.Area, subset.Price, s=100, **style)
  mp.xlabel("Total square Feet Area")
  mp.ylabel("Price")
  mp.title(Location)
  mp.legend()

In [None]:
# Draw the Airoli scatter before the bhk-based filtering.
# plot_scatter_chart has no return statement, so `plot` is None.
plot = plot_scatter_chart(data_1,"Airoli")

In [None]:
def remove_bhk_outliers(data):
  """Drop listings priced below the mean of the next-smaller bhk in the same location.

  For every Location, the mean price_per_sqft of each bhk group is computed
  from that location's rows only. A row with ``bhk = n`` is removed when the
  same location has more than 5 listings with ``bhk = n - 1`` and the row's
  price_per_sqft is below the mean of that ``n - 1`` group.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns 'Location', 'bhk' and 'price_per_sqft'.

  Returns
  -------
  pandas.DataFrame
      A copy of ``data`` without the excluded rows; the index is not reset.
  """
  exclude_indices = []
  for Location, Location_data in data.groupby('Location'):
    # Statistics are per location: group the location's rows, not the whole frame.
    bhk_stats = {}
    for bhk, bhk_data in Location_data.groupby('bhk'):
      bhk_stats[bhk] = {
          'mean': np.mean(bhk_data.price_per_sqft),
          'count': bhk_data.shape[0]
      }
    # Second pass runs only after all stats of this location are known.
    for bhk, bhk_data in Location_data.groupby('bhk'):
      stats = bhk_stats.get(bhk-1)
      if stats and stats['count']>5:
        exclude_indices.extend(bhk_data[bhk_data.price_per_sqft<(stats['mean'])].index)
  # Drop once, after every location has been processed.
  return data.drop(exclude_indices,axis='index')

In [None]:
# Apply the bhk-based filter; data_1 is rebound, so re-running this cell filters again.
data_1 = remove_bhk_outliers(data_1)

In [None]:
# Redraw the Airoli scatter after the bhk-based filtering for comparison.
# plot_scatter_chart has no return statement, so `plot` is None.
plot = plot_scatter_chart(data_1,"Airoli")

In [None]:
# Histogram of price_per_sqft after both filters.
# NOTE(review): matplotlib is already imported in the first cell; this import is redundant.
import matplotlib
# Restore the larger default figure size (plot_scatter_chart changed it to (15, 10)).
matplotlib.rcParams['figure.figsize'] = (20,10)
mp.hist(data_1.price_per_sqft,rwidth=0.5)
mp.xlabel("Price per Square feet")
mp.ylabel("Count")

In [None]:
# One-hot encode Location: one indicator column per distinct location value.
dummies = pd.get_dummies(data_1.Location)
dummies.head(3)

In [None]:
# Append the indicator columns to the working frame, side by side.
data_2 = pd.concat([data_1,dummies],axis='columns')
data_2.head()

In [None]:
# Location is now represented by the indicator columns, and price_per_sqft is
# computed from Price (the target), so neither is kept.
data_2 = data_2.drop(['Location','price_per_sqft'],axis='columns')


In [None]:
# Preview the modelling frame.
data_2.head()

In [None]:
# (rows, columns) of the modelling frame.
data_2.shape

In [None]:
# Feature matrix: every column except the target Price.
X = data_2.drop('Price',axis='columns')

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Target vector: the Price column.
y = data_2.Price
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=3,shuffle = True)

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline linear model (a regressor, despite the "clf" suffix in the name).
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
# Score on the held-out split (R^2 for scikit-learn regressors).
lr_clf.score(X_test,y_test)

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Five random 80/20 splits with a fixed seed; one score per split for a fresh LinearRegression.
cv = ShuffleSplit(n_splits=5, test_size=0.2,random_state=3)
cross_val_score(LinearRegression(),X,y,cv=cv)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Scaling + linear regression pipeline (with_mean=False scales without centring).
# NOTE(review): `model` is never fitted or referenced again in the visible cells.
model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model(X,y):
  """Grid-search three regressors and report the best cross-validated result of each.

  Parameters
  ----------
  X : feature matrix passed to ``GridSearchCV.fit``.
  y : target values passed to ``GridSearchCV.fit``.

  Returns
  -------
  pandas.DataFrame
      One row per algorithm with the columns 'model', 'best_score' and
      'best_params'.

  Notes
  -----
  The grids use only options that exist in scikit-learn >= 1.0. The former
  ``normalize`` option of LinearRegression and the ``'mse'`` criterion of
  DecisionTreeRegressor were removed in scikit-learn 1.2 and made the grid
  search fail there.
  """
  algos = {
      'linear_regression':{
          'model': LinearRegression(),
          'params':{
              # Replaces the removed `normalize` option.
              'fit_intercept':[True,False]
          }
      },

      'lasso':{
          'model': Lasso(),
          'params':{
              'alpha':[1,2],
              'selection':['random','cyclic']
          }
      },

      'decision_tree':{
          'model':DecisionTreeRegressor(),
          'params':{
              # 'squared_error' is the current name of the former 'mse' criterion.
              'criterion': ['squared_error','friedman_mse'],
              'splitter' : ['best','random']
          }
      }
  }
  scores= []
  # The same five seeded 80/20 splits are used for every algorithm so scores are comparable.
  cv = ShuffleSplit(n_splits=5, test_size = 0.2, random_state= 0)
  for algo_name,config in algos.items():
    gs = GridSearchCV(config['model'],config['params'],cv = cv, return_train_score=False)
    gs.fit(X,y)
    scores.append({
        'model': algo_name,
        'best_score': gs.best_score_,
        'best_params': gs.best_params_
    })
  return pd.DataFrame(scores,columns=['model','best_score','best_params'])

# Compare the candidate models on the full feature matrix and target.
find_best_model(X,y)

In [None]:
# Feature order expected by the fitted model; predict() relies on it.
X.columns

In [None]:
def predict(Location,Area,bhk,Lift,Parking):
  """Predict the price of one listing with the fitted ``lr_clf`` model.

  Builds a single feature row in the column order of the global ``X`` and
  sets the indicator column whose name equals ``Location``.

  A location that has no indicator column of its own falls back to the
  'other' column when one exists, because rare locations were relabelled
  'other' before one-hot encoding. If neither column exists, all location
  indicators stay 0. Previously an unknown location raised IndexError, and
  the ``loc_index >= 0`` guard could never be False.

  Returns the first (only) element of ``lr_clf.predict``.
  """
  x = np.zeros(len(X.columns))
  # Assumes the first four columns of X are Area, bhk, Lift, Parking in this
  # order — TODO confirm against X.columns.
  x[0] = Area
  x[1] = bhk
  x[2] = Lift
  x[3] = Parking

  matches = np.flatnonzero(X.columns==Location)
  if matches.size == 0:
    matches = np.flatnonzero(X.columns=='other')
  if matches.size:
    x[matches[0]] = 1
  return lr_clf.predict([x])[0]

In [None]:
# Example: Area 720, 1 bedroom, Lift=1, Parking=1 in Kharghar.
predict('Kharghar',720,1,1,1)

In [None]:
# Example: Area 700, 2 bedrooms, Lift=1, Parking=0 in Andheri East.
predict('Andheri East',700,2,1,0)

In [None]:
import pickle
# Persist the fitted linear model with pickle.
# Only unpickle this file from a trusted source: loading a pickle can execute code.
with open("Price_prediction_model.pickle","wb") as f:
    pickle.dump(lr_clf,f)

In [None]:
import json
# Store the lower-cased feature names, in model order, next to the saved model.
columns = {'data_columns': [name.lower() for name in X.columns]}
with open("columns.json","w") as f:
    json.dump(columns, f)

In [None]:
import joblib
# Save the same fitted model a second time, using joblib.
joblib.dump(lr_clf,'price_predict_model.pkl')