In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [85]:
# Load the dataset once; the exploratory cells further down read from `data`.
data = pd.read_csv('realest.csv')

In [164]:
class AnalysisDataAndFitLinearRegression:
    """Summarise a housing dataset and fit a linear regression for ``Price``.

    The single public entry point, :meth:`analyse_and_fit_lrm`, reads a CSV
    file, computes a few descriptive summaries and fits a scikit-learn
    ``LinearRegression`` of ``Price`` on every other column.
    """

    def __init__(self):
        # Plain version tag; no other method of the class reads it.
        self.version = 1

    def analyse_and_fit_lrm(self, path):
        """Load the dataset at ``path``, summarise it and fit the regression.

        Parameters
        ----------
        path : str or path-like
            Location of the CSV file, e.g. ``"./data/realest.csv"``. It is
            passed unchanged to ``pandas.read_csv``.

        Returns
        -------
        dict
            ``'summary_dict'``
                * ``'statistics'`` - mean, std, median (``50%``), min and max
                  of the ``Tax`` column.
                * ``'dataframe'`` - rows with ``Space < 800`` sorted by
                  ``Price`` in descending order, with rows containing any
                  missing value removed.
                * ``'number_of_observations'`` - number of rows whose ``Lot``
                  is at or above the 80th percentile of the non-missing
                  ``Lot`` values.
            ``'regression_dict'``
                * ``'model_parameters'`` - the intercept plus one coefficient
                  per feature column, keyed by column name.
                * ``'price_prediction'`` - predicted price for one fixed
                  example house.
        """
        data = pd.read_csv(path)

        # NOTE(review): an exploratory cell elsewhere in this notebook uses the
        # 60th percentile for the same count - confirm 80 is the intended cut-off.
        lot_threshold = np.percentile(data.Lot.dropna(), 80)
        summary_dict = {
            'statistics': data.Tax.describe()[['mean', 'std', '50%', 'min', 'max']],
            'dataframe': data[data['Space'] < 800].sort_values('Price', ascending=False).dropna(),
            'number_of_observations': len(data[data['Lot'] >= lot_threshold]),
        }

        # The regression cannot handle missing values, so fit on complete rows only.
        clean_data = self.__listwise_deletion(data)
        y = clean_data.Price
        X = clean_data.drop(['Price'], axis=1)
        linreg = LinearRegression()
        linreg.fit(X, y)

        # Pair every coefficient with the column it was estimated for instead of
        # relying on hard-coded positions, so labels stay correct if the column
        # order of the file changes.
        model_parameters = {'Intercept': linreg.intercept_}
        model_parameters.update(zip(X.columns, linreg.coef_))

        pred_data = pd.DataFrame([{'Bedroom': 3, 'Space': 1500, 'Room': 8, 'Lot': 40,
                                   'Tax': 1000, 'Bathroom': 2, 'Garage': 1, 'Condition': 0}])
        # Present the features in the order the model was fitted on.
        pred_data = pred_data[list(X.columns)]
        price_prediction = linreg.predict(pred_data)[0]

        regression_dict = {"model_parameters": model_parameters,
                           "price_prediction": price_prediction}

        return {'summary_dict': summary_dict,
                'regression_dict': regression_dict}

    def __listwise_deletion(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``data`` without the rows that contain missing values.

        The input frame is left unmodified.
        """
        return data.dropna()
    


In [167]:
# Instantiate the analysis class and run the full pipeline on the dataset;
# the returned dictionary is displayed as the cell output.
model = AnalysisDataAndFitLinearRegression()

model.analyse_and_fit_lrm('realest.csv')

{'summary_dict': {'statistics': mean     911.707483
  std      443.263430
  50%      821.000000
  min      418.000000
  max     2752.000000
  Name: Tax, dtype: float64,
  'dataframe':      Price  Bedroom  Space  Room   Lot     Tax  Bathroom  Garage  Condition
  119   63.0      1.0  722.0   5.0  25.0   836.0       1.0     0.0        0.0
  41    61.0      1.0  785.0   5.0  25.0   817.0       1.0     0.0        0.0
  67    61.0      3.0  715.0   5.0  25.0   815.0       1.0     0.0        0.0
  105   61.0      1.0  729.0   5.0  33.0  1054.0       1.0     2.0        1.0
  15    60.0      2.0  782.0   5.0  25.0   834.0       1.0     0.0        0.0
  145   59.0      3.0  710.0   5.0  25.0   848.0       1.0     0.0        0.0
  27    58.0      1.0  799.0   5.0  33.0  1025.0       1.0     2.0        1.0
  50    53.0      2.0  636.0   6.0  30.0   553.0       1.0     2.0        1.0
  76    53.0      3.0  673.0   6.0  30.0   589.0       1.0     2.0        1.0
  154   53.0      3.0  716.0   6.0  30

In [137]:
# Keep complete rows only, then regress Price on every remaining column.
clean_data = data.dropna()
X = clean_data.drop(columns=['Price'])
y = clean_data['Price']
linreg = LinearRegression().fit(X, y)
# Only the last expression of a cell is displayed, so the intercept is
# evaluated here but just the coefficient array shows up in the output.
linreg.intercept_
linreg.coef_

array([-3.16023874e+00,  9.66322780e-03,  1.68895111e+00,  2.36213231e-01,
        4.41271852e-03,  6.37578599e+00,  4.14475743e+00,  1.83579537e+00])

In [143]:
# Feature names, in the same order as the entries of linreg.coef_.
X.columns

Index(['Bedroom', 'Space', 'Room', 'Lot', 'Tax', 'Bathroom', 'Garage',
       'Condition'],
      dtype='object')

In [148]:
# Label each fitted coefficient with the column it was estimated for.
# Taking the names from X.columns (instead of hard-coded positions) keeps the
# labels correct even if the column order of the dataset changes.
params = {'Intercept': linreg.intercept_}
params.update(zip(X.columns, linreg.coef_))

params

{'Intercept': 19.993467728068104,
 'Bedroom': -3.1602387359607067,
 'Space': 0.00966322779633439,
 'Room': 1.6889511073197694,
 'Lot': 0.23621323117136808,
 'Tax': 0.004412718519150623,
 'Bathroom': 6.375785992212336,
 'Garage': 4.144757426848795,
 'Condition': 1.8357953743202877}

In [152]:
# One-row frame describing the house whose price is to be predicted;
# every column holds a single value.
pred_data = pd.DataFrame({
    'Bedroom': [3],
    'Space': [1500],
    'Room': [8],
    'Lot': [40],
    'Tax': [1000],
    'Bathroom': [2],
    'Garage': [1],
    'Condition': [0],
})

In [153]:
# Show the single-row frame that will be passed to the model.
pred_data

Unnamed: 0,Bedroom,Space,Room,Lot,Tax,Bathroom,Garage,Condition
0,3,1500,8,40,1000,2,1,0


In [157]:
# predict() returns one value per input row; take the only one.
linreg.predict(pred_data)[0]

69.27677925052453

In [60]:
# 60th percentile of Lot, taken from the describe() summary of the loaded
# dataset (`data`); no cell in this notebook defines a frame called `path`.
number = data.Lot.describe(percentiles=[.60])[['60%']]

In [72]:
# Lot values with the missing entries removed, read from the loaded dataset.
data.Lot.dropna()

0      39.0
1      33.0
2      35.0
3      24.0
4      50.0
       ... 
151    50.0
152    27.0
153    30.0
154    30.0
155    30.0
Name: Lot, Length: 146, dtype: float64

In [151]:
# Same single-row frame as the earlier `pred_data` cell; redefining it here is redundant.
pred_data = pd.DataFrame([{'Bedroom': 3, 'Space': 1500, 'Room':8, 'Lot':40, 'Tax':1000, 'Bathroom': 2,  'Garage': 1, 'Condition': 0 }])

In [75]:
# number_of_observations: rows whose Lot is at or above the 60th percentile
# of the non-missing Lot values in the loaded dataset.
len(data[data['Lot'] >= np.percentile(data.Lot.dropna(), 60)])

92

In [99]:
# Houses with Space below 800, most expensive first, keeping complete rows only.
data[data['Space'] < 800].sort_values('Price', ascending=False).dropna()


Unnamed: 0,Price,Bedroom,Space,Room,Lot,Tax,Bathroom,Garage,Condition
119,63.0,1.0,722.0,5.0,25.0,836.0,1.0,0.0,0.0
41,61.0,1.0,785.0,5.0,25.0,817.0,1.0,0.0,0.0
67,61.0,3.0,715.0,5.0,25.0,815.0,1.0,0.0,0.0
105,61.0,1.0,729.0,5.0,33.0,1054.0,1.0,2.0,1.0
15,60.0,2.0,782.0,5.0,25.0,834.0,1.0,0.0,0.0
145,59.0,3.0,710.0,5.0,25.0,848.0,1.0,0.0,0.0
27,58.0,1.0,799.0,5.0,33.0,1025.0,1.0,2.0,1.0
50,53.0,2.0,636.0,6.0,30.0,553.0,1.0,2.0,1.0
76,53.0,3.0,673.0,6.0,30.0,589.0,1.0,2.0,1.0
154,53.0,3.0,716.0,6.0,30.0,585.0,1.0,2.0,1.0


In [31]:
# Mean, standard deviation, median (50%), minimum and maximum of Tax.
data.Tax.describe()[['mean', 'std', '50%', 'min', 'max']]

mean     911.707483
std      443.263430
50%      821.000000
min      418.000000
max     2752.000000
Name: Tax, dtype: float64