Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,68 +2,72 @@
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from revoscalepy.computecontext.RxInSqlServer import RxInSqlServer
from revoscalepy.computecontext.RxInSqlServer import RxSqlServerData
from revoscalepy.etl.RxImport import rx_import_datasource

#If you are running SQL Server 2017 RC1 and above:
from revoscalepy import RxComputeContext, RxInSqlServer, RxSqlServerData
from revoscalepy import rx_import

def get_rental_predictions():
conn_str = 'Driver=SQL Server;Server=MYSQLSERVER;Database=TutorialDB;Trusted_Connection=True;'
column_info = {
"Year" : { "type" : "integer" },
"Month" : { "type" : "integer" },
"Day" : { "type" : "integer" },
"RentalCount" : { "type" : "integer" },
"WeekDay" : {
"type" : "factor",
"levels" : ["1", "2", "3", "4", "5", "6", "7"]
},
"Holiday" : {
"type" : "factor",
"levels" : ["1", "0"]
},
"Snow" : {
"type" : "factor",
"levels" : ["1", "0"]
}
}
#Connection string to connect to SQL Server named instance
conn_str = 'Driver=SQL Server;Server=MYSQLSERVER;Database=TutorialDB;Trusted_Connection=True;'

#Define the columns we wish to import
column_info = {
"Year" : { "type" : "integer" },
"Month" : { "type" : "integer" },
"Day" : { "type" : "integer" },
"RentalCount" : { "type" : "integer" },
"WeekDay" : {
"type" : "factor",
"levels" : ["1", "2", "3", "4", "5", "6", "7"]
},
"Holiday" : {
"type" : "factor",
"levels" : ["1", "0"]
},
"Snow" : {
"type" : "factor",
"levels" : ["1", "0"]
}
}

#Get the data from SQL Server Table
data_source = RxSqlServerData(table="dbo.rental_data",
connection_string=conn_str, column_info=column_info)
computeContext = RxInSqlServer(
connection_string = conn_str,
num_tasks = 1,
auto_cleanup = False
)


RxInSqlServer(connection_string=conn_str, num_tasks=1, auto_cleanup=False)

# import data source and convert to pandas dataframe
df = pd.DataFrame(rx_import(input_data = data_source))
print("Data frame:", df)
# Get all the columns from the dataframe.
columns = df.columns.tolist()
# Filter the columns to remove ones we don't want to use in the training
columns = [c for c in columns if c not in ["Year"]]
# Store the variable we'll be predicting on.
target = "RentalCount"
# Generate the training set. Set random_state to be able to replicate results.
train = df.sample(frac=0.8, random_state=1)
# Select anything not in the training set and put it in the testing set.
test = df.loc[~df.index.isin(train.index)]
# Print the shapes of both sets.
print("Training set shape:", train.shape)
print("Testing set shape:", test.shape)
# Initialize the model class.
lin_model = LinearRegression()
# Fit the model to the training data.
lin_model.fit(train[columns], train[target])

data_source = RxSqlServerData(table="dbo.rental_data",
connectionString=conn_str, colInfo=column_info)
computeContext = RxInSqlServer(
connectionString = conn_str,
numTasks = 1,
autoCleanup = False
)


RxInSqlServer(connectionString=conn_str, numTasks=1, autoCleanup=False)

# import data source and convert to pandas dataframe
df = pd.DataFrame(rx_import_datasource(data_source))
print("Data frame:", df)
# Get all the columns from the dataframe.
columns = df.columns.tolist()
# Filter the columns to remove ones we don't want.
columns = [c for c in columns if c not in ["Year"]]
# Store the variable we'll be predicting on.
target = "RentalCount"
# Generate the training set. Set random_state to be able to replicate results.
train = df.sample(frac=0.8, random_state=1)
# Select anything not in the training set and put it in the testing set.
test = df.loc[~df.index.isin(train.index)]
# Print the shapes of both sets.
print("Training set shape:", train.shape)
print("Testing set shape:", test.shape)
# Initialize the model class.
lin_model = LinearRegression()
# Fit the model to the training data.
lin_model.fit(train[columns], train[target])
# Generate our predictions for the test set.
lin_predictions = lin_model.predict(test[columns])
print("Predictions:", lin_predictions)
# Compute error between our test predictions and the actual values.
lin_mse = mean_squared_error(lin_predictions, test[target])
print("Computed error:", lin_mse)
# Generate our predictions for the test set.
lin_predictions = lin_model.predict(test[columns])
print("Predictions:", lin_predictions)
# Compute error between our test predictions and the actual values.
lin_mse = mean_squared_error(lin_predictions, test[target])
print("Computed error:", lin_mse)

get_rental_predictions()
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,24 @@ BEGIN
@language = N'Python'
, @script = N'

from sklearn import linear_model

import pickle


df = rental_train_data

# Get all the columns from the dataframe.
columns = df.columns.tolist()


# Store the variable well be predicting on.
target = "RentalCount"

from sklearn.linear_model import LinearRegression

# Initialize the model class.
lin_model = LinearRegression()
lin_model = linear_model.LinearRegression()
# Fit the model to the training data.
lin_model.fit(df[columns], df[target])

import pickle
#Before saving the model to the DB table, we need to convert it to a binary object
trained_model = pickle.dumps(lin_model)
'
Expand Down Expand Up @@ -75,15 +76,15 @@ AS
BEGIN
DECLARE @py_model varbinary(max) = (select model from rental_py_models where model_name = @model);

EXEC sp_execute_external_script
EXEC sp_execute_external_script
@language = N'Python'
, @script = N'


import pickle
rental_model = pickle.loads(py_model)


df = rental_score_data
#print(df)

Expand All @@ -106,15 +107,15 @@ lin_mse = mean_squared_error(linpredictions, df[target])
#print(lin_mse)

import pandas as pd
predictions_df = pd.DataFrame(lin_predictions)
predictions_df = pd.DataFrame(lin_predictions)
OutputDataSet = pd.concat([predictions_df, df["RentalCount"], df["Month"], df["Day"], df["WeekDay"], df["Snow"], df["Holiday"], df["Year"]], axis=1)
'
, @input_data_1 = N'Select "RentalCount", "Year" ,"Month", "Day", "WeekDay", "Snow", "Holiday" from rental_data where Year = 2015'
, @input_data_1_name = N'rental_score_data'
, @params = N'@py_model varbinary(max)'
, @py_model = @py_model
with result sets (("RentalCount_Predicted" float, "RentalCount" float, "Month" float,"Day" float,"WeekDay" float,"Snow" float,"Holiday" float, "Year" float));

END;
GO

Expand Down