In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model

### price = (m1 * area) + (m2 * bedrooms) + (m3 * age) + b

In [3]:
# Load the multi-feature home price dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../../data/homepricesmulti.csv')
df.head(n=5)

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000


In [8]:
# Impute missing bedroom counts with the column median so that the
# regression below is not fed NaN values.
median_bedrooms = df['bedrooms'].median()
df['bedrooms'] = df['bedrooms'].fillna(value=median_bedrooms)

In [10]:
# Fit a linear model of price on the three features; the column order given
# here (area, bedrooms, age) is the order of the learned coefficients.
reg = linear_model.LinearRegression()
reg.fit(X=df[['area', 'bedrooms', 'age']], y=df['price'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [11]:
# Fitted coefficients, one per feature, in the column order passed to fit().
reg.coef_ 

# [m1, m2, m3] -> coefficients for area, bedrooms, age

array([  112.06244194, 23388.88007794, -3231.71790863])

In [12]:
# Intercept term (b in the price formula) of the fitted model.
reg.intercept_

221323.00186540396

In [19]:
# Given these feature values (area=3000, bedrooms=3, age=40), what is the
# predicted price?
# The query is passed as a DataFrame carrying the same column names used in
# fit(): this makes the feature order explicit and avoids the
# "X does not have valid feature names" warning on newer scikit-learn.
reg.predict(pd.DataFrame([[3000, 3, 40]], columns=['area', 'bedrooms', 'age']))

array([498408.25158031])

## Save our model to a file and read it back in so that we don't have to continually retrain it. We can also send the model file to other people, specifying the features that they need to pass in.

## This is great for large training datasets!

In [28]:
import pickle # serialize python objects to a file

In [29]:
# Serialize the fitted regressor to the file 'model_pickle'.
# Mode 'wb' = write binary (the matching read mode is 'rb' = read binary).
with open('model_pickle', mode='wb') as pickle_out:
    pickle.dump(reg, pickle_out)

In [30]:
# Restore the regressor from 'model_pickle' (mode 'rb' = read binary).
# NOTE: only unpickle files from a trusted source -- pickle.load can execute
# arbitrary code embedded in the file.
with open('model_pickle', mode='rb') as pickle_in:
    model = pickle.load(pickle_in)

In [32]:
# Check that the unpickled model predicts without retraining.
# The query is a DataFrame with the column names used in fit(), so the feature
# order (area, bedrooms, age) is explicit and newer scikit-learn versions do
# not warn about missing feature names.
model.predict(pd.DataFrame([[3222, 4, 3]], columns=['area', 'bedrooms', 'age']))

array([666248.55638882])

## joblib (a library that scikit-learn itself relies on) accomplishes a similar task, BUT it is more efficient on objects that carry large NumPy arrays internally.

In [34]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so import the standalone joblib package first and fall back to the
# vendored copy only on old installations where it is all that is available.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [35]:
# Persist the fitted regressor with joblib; the call returns the list of
# file names it wrote, which is displayed as the cell output.
joblib.dump(value=reg, filename='model_joblib')

['model_joblib']

In [36]:
# Reload the regressor saved by joblib.dump.
# NOTE: joblib files are pickle-based -- only load files from a trusted source.
model_obj = joblib.load(filename='model_joblib')

In [37]:
# Check that the joblib-restored model predicts without retraining.
# The query is a DataFrame with the column names used in fit(), so the feature
# order (area, bedrooms, age) is explicit and newer scikit-learn versions do
# not warn about missing feature names.
model_obj.predict(pd.DataFrame([[2333, 2, 3]], columns=['area', 'bedrooms', 'age']))

array([519847.28534638])