In [51]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import sklearn as skl
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict 
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing, metrics, svm, ensemble
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import numpy as np
%matplotlib inline

In [52]:
# import in csv and name columns

# CRIM - per capita crime rate by town
# ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
# INDUS - proportion of non-retail business acres per town.
# CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
# NOX - nitric oxides concentration (parts per 10 million)
# RM - average number of rooms per dwelling
# AGE - proportion of owner-occupied units built prior to 1940
# DIS - weighted distances to five Boston employment centres
# RAD - index of accessibility to radial highways
# TAX - full-value property-tax rate per $10,000
# PTRATIO - pupil-teacher ratio by town
# B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# LSTAT - percentage of lower status of the population
# MEDV - Median value of owner-occupied homes in $10000's

In [53]:
file = 'Resources/bost_housing_augmented.csv'
boston_data_df = pd.read_csv(file)
boston_data_df.head()

Unnamed: 0,OBS.,TOWN,TOWN#,TRACT,LON,LAT,MEDV,CMEDV,CRIM,ZN,...,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,1,Nahant,0,2011,-70.955,42.255,24.0,24.0,0.00632,18.0,...,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,2,Swampscott,1,2021,-70.95,42.2875,21.6,21.6,0.02731,0.0,...,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,3,Swampscott,1,2022,-70.936,42.283,34.7,34.7,0.02729,0.0,...,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,4,Marblehead,2,2031,-70.928,42.293,33.4,33.4,0.03237,0.0,...,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,5,Marblehead,2,2032,-70.922,42.298,36.2,36.2,0.06905,0.0,...,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [4]:
# Keep town nnumber for classification
# Lat and long is pretty similar
print(boston_data_df.keys())

Index(['OBS.', 'TOWN', 'TOWN#', 'TRACT', 'LON', 'LAT', 'MEDV', 'CMEDV', 'CRIM',
       'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT'],
      dtype='object')


In [54]:
boston_data_df= boston_data_df.drop(labels=['TOWN', 'TOWN#','OBS.', 'LON', 'LAT', 'B', 'MEDV', 'INDUS', 'AGE', 'TAX' ], axis=1)

In [56]:
boston_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   TRACT    506 non-null    int64  
 1   CMEDV    506 non-null    float64
 2   CRIM     506 non-null    float64
 3   ZN       506 non-null    float64
 4   CHAS     506 non-null    int64  
 5   NOX      506 non-null    float64
 6   RM       506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   PTRATIO  506 non-null    float64
 10  LSTAT    506 non-null    float64
dtypes: float64(8), int64(3)
memory usage: 43.6 KB


In [67]:
boston_data_df.to_csv('Resources/bost_data_for_R_20220425.csv')

In [68]:
X = boston_data_df.drop('CMEDV', axis=1)
X

Unnamed: 0,TRACT,CRIM,ZN,CHAS,NOX,RM,DIS,RAD,PTRATIO,LSTAT
0,2011,0.00632,18.0,0,0.538,6.575,4.0900,1,15.3,4.98
1,2021,0.02731,0.0,0,0.469,6.421,4.9671,2,17.8,9.14
2,2022,0.02729,0.0,0,0.469,7.185,4.9671,2,17.8,4.03
3,2031,0.03237,0.0,0,0.458,6.998,6.0622,3,18.7,2.94
4,2032,0.06905,0.0,0,0.458,7.147,6.0622,3,18.7,5.33
...,...,...,...,...,...,...,...,...,...,...
501,1801,0.06263,0.0,0,0.573,6.593,2.4786,1,21.0,9.67
502,1802,0.04527,0.0,0,0.573,6.120,2.2875,1,21.0,9.08
503,1803,0.06076,0.0,0,0.573,6.976,2.1675,1,21.0,5.64
504,1804,0.10959,0.0,0,0.573,6.794,2.3889,1,21.0,6.48


In [69]:
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

In [70]:
X

array([[-0.50001414, -0.41978194,  0.28482986, ..., -0.98284286,
        -1.45900038, -1.0755623 ],
       [-0.49276078, -0.41733926, -0.48772236, ..., -0.8678825 ,
        -0.30309415, -0.49243937],
       [-0.49203545, -0.41734159, -0.48772236, ..., -0.8678825 ,
        -0.30309415, -1.2087274 ],
       ...,
       [-0.65088391, -0.41344658, -0.48772236, ..., -0.98284286,
         1.17646583, -0.98304761],
       [-0.65015857, -0.40776407, -0.48772236, ..., -0.98284286,
         1.17646583, -0.86530163],
       [-0.64943324, -0.41500016, -0.48772236, ..., -0.98284286,
         1.17646583, -0.66905833]])

In [71]:
y= boston_data_df['CMEDV']
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    19.0
Name: CMEDV, Length: 506, dtype: float64

In [72]:
LR = LinearRegression()
LR


LinearRegression()

In [13]:
# importing train_test_split from sklearn
from sklearn.model_selection import train_test_split
# splitting the data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [62]:
y_test

173    23.6
274    32.4
491    13.6
72     22.8
452    16.1
       ... 
412    17.9
436     9.6
411    17.2
86     22.5
75     21.4
Name: CMEDV, Length: 102, dtype: float64

In [63]:
# fitting the training data
LR.fit(x_train,y_train)


LinearRegression()

In [64]:
x_test

array([[-4.09836682e-01, -4.87722365e-01, -2.72598567e-01,
        -3.86090674e-01,  1.87151160e-01, -5.46076822e-01,
        -5.23001446e-01, -8.57929140e-01, -5.06456744e-01],
       [-4.13949312e-01,  1.22906036e+00,  3.66839786e+00,
        -9.30305468e-01,  6.74384427e-01,  1.34319026e-01,
        -6.37961799e-01, -3.95566647e-01, -1.27881429e+00],
       [-4.08212112e-01, -4.87722365e-01, -2.72598567e-01,
         4.69104002e-01, -4.29726047e-01, -9.16009086e-01,
        -6.37961799e-01,  7.60339586e-01,  7.59312520e-01],
       [-4.09852974e-01, -4.87722365e-01, -2.72598567e-01,
        -1.22400869e+00, -3.12904036e-01,  7.09373073e-01,
        -6.37961799e-01,  3.44213342e-01, -9.99868462e-01],
       [ 1.71842120e-01, -4.87722365e-01, -2.72598567e-01,
         1.36749033e+00,  1.76167773e-02, -6.78276980e-01,
         1.66124525e+00,  8.06575835e-01,  6.47173493e-01],
       [-4.08702043e-01, -4.87722365e-01, -2.72598567e-01,
        -1.01668877e+00, -8.02707883e-03,  1.222446

In [65]:
LR.predict(x_test)

array([28.93203653, 35.83771823, 17.92734899, 25.15066216, 17.55782496,
       24.32263081, 17.30581348, 14.4538493 , 22.47095392, 20.70719436,
       24.30805173, 18.35479652, -4.70313922, 22.55728983, 19.02993296,
       25.45974698, 18.88852254,  4.26057528, 39.78208664, 16.21628244,
       26.38633498, 29.27293808, 11.41933215, 24.05267658, 17.50583641,
       15.46371189, 23.49674169, 17.90941741, 22.48169502, 19.2532584 ,
       22.21645957, 24.9227485 , 24.61478648, 16.75643905, 16.52849626,
       17.68114408, 31.29912693, 19.11258104, 23.85960614, 24.81335459,
       13.75810744, 30.870377  , 41.43929951, 17.83672728, 27.57276258,
       15.75637363, 14.3292134 , 26.0267275 , 19.3969402 , 30.52781744,
       21.00351569, 33.43669794, 15.76951698, 26.87660218, 38.7306429 ,
       21.82920441, 17.482066  , 31.79283211, 24.86688462, 12.64009442,
       21.71242938, 29.22117654, 30.95073454, 16.94801703, 21.84109203,
       16.57165458, 19.26231601, 25.81187079, 29.98185761, 14.95

NameError: name 'y_pred' is not defined

In [18]:
# Connect to TabPy server using the client library
from tabpy.tabpy_tools.client import Client
connection = Client('http://localhost:9004/')
connection

<Client object at 0x2d10771b788 connected to 'http://localhost:9004/'>

In [43]:
connection.remove('HomeValue')

In [48]:
# The scoring function that will use the Gradient Boosting Classifier to classify new data points
def SuggestHomeValue(CRIM, ZN, CHAS, NOX, RM, 
                     DIS, RAD, PTRATIO, LSTAT):
    X = np.column_stack([CRIM, ZN, CHAS, NOX, RM, 
                         DIS, RAD, PTRATIO, LSTAT])
    #X = scaler.transform(X)
    pred= LR.predict(X)
    pred=pred[0]
    
    return pred

In [49]:
SuggestHomeValue(0.09178, 0.0, 0, 0.5, 6.5, 2.6, 5.0, 16, 9.04)

-30.4127234251412

In [50]:
# Publish the SuggestDiagnosis function to TabPy server so it can be used from Tableau
# Using the name DiagnosticsDemo and a short description of what it does
connection.deploy('HomeValue',
                  SuggestHomeValue,
                  'Returns est median home value trained on the boston housing dataset', override = True)

Overwriting existing file "C:\Users\Will\anaconda3\envs\mlenv\lib\site-packages\tabpy\tabpy_server\staging\endpoints\HomeValue\1" when saving query object


In [24]:
connection

<Client object at 0x2d10771b788 connected to 'http://localhost:9004/'>

In [47]:
connection.remove('HomeValue')