In [29]:
# Import the necessary packages
import dask
import dask.dataframe as dd
import numpy as np
from dask.distributed import Client
from sklearn import linear_model

In [30]:
# Read in the data
def readData(file):
    """Build a dask dataframe from a CSV file.

    Parameters
    ----------
    file : path or glob passed straight to ``dd.read_csv``.

    Returns
    -------
    The dask dataframe produced by ``dd.read_csv``, with the ``sold_date``
    column requested as a parsed date column.
    """
    # NOTE(review): `infer_datetime_format` is reported as deprecated in
    # pandas 2.0+ -- confirm the pinned pandas version before removing it.
    return dd.read_csv(
        file,
        parse_dates=['sold_date'],
        infer_datetime_format=True,
    )

In [31]:
# Fit a linear regression on one group of rows (expects a single group from a groupby)
def groupbyRegr(data):
    """Fit a linear regression on one group and attach in-sample predictions.

    The model regresses ``price`` on ``bed``, ``bath``, ``acre_lot`` and
    ``house_size`` using only the rows where all five columns are present,
    then predicts on those same rows (so the predictions are in-sample).

    Parameters
    ----------
    data : pandas DataFrame holding one group (e.g. one zip code). It is
        never modified; a copy is returned when predictions are written.

    Returns
    -------
    DataFrame
        * If no row has all five columns present, ``data`` is returned
          unchanged (any existing 'Predicted Price' values are kept).
        * Otherwise a copy of ``data`` whose 'Predicted Price' column holds
          the rounded prediction for complete rows and NaN for the rest.
    """
    featureCols = ['bed', 'bath', 'acre_lot', 'house_size']

    # Positional boolean mask of rows usable for fitting. Using a plain
    # array (not an index-aligned Series) keeps the write-back below
    # independent of whether the group's index labels are unique.
    completeMask = data[featureCols + ['price']].notna().all(axis=1).to_numpy()
    if not completeMask.any():
        # Nothing to fit on: skip building the estimator entirely.
        return data

    completeRows = data.loc[completeMask]
    groupRegr = linear_model.LinearRegression()
    groupRegr.fit(completeRows[featureCols], completeRows['price'])

    # Work on a copy: mutating the frame handed in by groupby.apply is not
    # supported by pandas and can leak changes back to the caller.
    result = data.copy()
    result['Predicted Price'] = np.nan
    result.loc[completeMask, 'Predicted Price'] = np.round(
        groupRegr.predict(completeRows[featureCols])
    )
    return result

In [32]:
# Use a groupby function for a linear regression for each zip code
def zipGroupArray(data):
    """Lazily apply ``groupbyRegr`` to every ``zip_code`` group.

    Parameters
    ----------
    data : dask dataframe containing a ``zip_code`` column plus the columns
        ``groupbyRegr`` reads. It is not modified.

    Returns
    -------
    The (uncomputed) result of the grouped apply; it carries an extra
    'Predicted Price' column, initialised to 0. before the per-group fit.
    """
    # assign() builds a new dataframe, so the caller's `data` does not gain
    # a 'Predicted Price' column as a hidden side effect of calling this.
    withDefault = data.assign(**{'Predicted Price': 0.})
    # The frame with the default column doubles as the output schema (meta).
    return withDefault.groupby('zip_code').apply(groupbyRegr, meta=withDefault)

In [33]:
# Run all of the functions together as one unit
def runRegressor(data):
    """Build the per-zip-code regression graph and execute it.

    Returns whatever ``dask.compute`` returns for a single argument, i.e. a
    tuple; callers index ``[0]`` to get the computed dataframe.
    """
    lazyPredictions = zipGroupArray(data)
    return dask.compute(lazyPredictions)

In [34]:
# Set file where data can be found (path is relative to the notebook's working directory)
file = './datasets/realtor-data.csv'

# Set up a Dask Client
# n_workers=4 is passed to Client; with no scheduler address given this
# presumably starts a local cluster -- TODO confirm for your environment.
client = Client(n_workers=4)
    
# Set dataframe variable (the dask dataframe returned by readData)
datadf = readData(file)

In [35]:
# Predict prices by running regressor on data
# runRegressor returns the tuple produced by dask.compute; [0] picks out the
# single computed result.
predictedData = runRegressor(datadf)[0]

In [36]:
# Filters data based on a criteria expression string (e.g. 'bath == 2 & bed == 3')
def filterData(data, criteria):
    """Return the rows of ``data`` for which ``criteria`` evaluates true.

    Parameters
    ----------
    data : dataframe exposing ``eval`` and boolean-mask indexing.
    criteria : expression string evaluated against the dataframe's columns
        via ``data.eval``.
    """
    rowMask = data.eval(criteria)
    return data[rowMask]

In [37]:
# Display the predicted rows that have exactly 2 baths and 3 beds
filterData(predictedData, 'bath ==2 & bed == 3')

Unnamed: 0,status,price,bed,bath,acre_lot,full_address,street,city,state,zip_code,house_size,sold_date,Predicted Price
0,for_sale,105000.0,3.0,2.0,0.12,"Sector Yahuecas Titulo # V84, Adjuntas, PR, 00601",Sector Yahuecas Titulo # V84,Adjuntas,Puerto Rico,601.0,920.0,NaT,105000.0
7,for_sale,71600.0,3.0,2.0,0.08,"3467 St, Ponce, PR, 00731",3467 St,Ponce,Puerto Rico,731.0,1050.0,NaT,109187.0
10,for_sale,89000.0,3.0,2.0,13.39,"Km 3 4 Solar 457 Sr # 1, Isabela, PR, 00662",Km 3 4 Solar 457 Sr # 1,Isabela,Puerto Rico,662.0,1106.0,NaT,179892.0
11,for_sale,150000.0,3.0,2.0,0.08,"91 Del Rio, Juana Diaz, PR, 00795",91 Del Rio,Juana Diaz,Puerto Rico,795.0,1045.0,NaT,133014.0
12,for_sale,155000.0,3.0,2.0,0.10,"Pr, Lares, PR, 00669",Pr,Lares,Puerto Rico,669.0,4161.0,NaT,149496.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
923112,for_sale,524222.0,3.0,2.0,0.35,"721 Old Kensico Rd, White Plains, NY, 10603",721 Old Kensico Rd,White Plains,New York,10603.0,1657.0,NaT,577464.0
923113,for_sale,349000.0,3.0,2.0,0.06,"753 Elm St, Peekskill, NY, 10566",753 Elm St,Peekskill,New York,10566.0,2600.0,2004-09-02,499284.0
923118,for_sale,599999.0,3.0,2.0,0.81,"101 Smith Hill Rd, Airmont, NY, 10952",101 Smith Hill Rd,Airmont,New York,10952.0,1698.0,NaT,657322.0
923126,for_sale,690000.0,3.0,2.0,0.34,"25 Hungerford Rd, Briarcliff Manor, NY, 10510",25 Hungerford Rd,Briarcliff Manor,New York,10510.0,1560.0,1989-07-05,367397.0


In [38]:
# Close the Dask client created in the setup cell
client.close()