## Download and prepare data

Import the necessary packages.

In [250]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from azureml.core import Dataset

In [236]:
# Requires azureml-core >= 1.0.72 and azureml-dataprep[pandas] >= 1.1.34.
from azureml.core import Workspace, Dataset

subscription_id = '*********'
resource_group = 'machine'
workspace_name = 'modelling'

# Connect to the Azure ML workspace identified by the three settings above.
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Look the dataset up by name, then load it into a pandas DataFrame.
# The intermediate object gets its own name so `dataset` only ever refers
# to the DataFrame used by the rest of the notebook.
registered = Dataset.get_by_name(workspace, name='E-commerce')
dataset = registered.to_pandas_dataframe()
dataset

Unnamed: 0,Order ID,Order Date,Ship Date,Aging,Ship Mode,Product Category,Product,Sales,Quantity,Discount,...,Shipping Cost,Order Priority,Customer ID,Customer Name,Segment,City,State,Country,Region,Months
0,AU-2015-1,2015-11-09,2015-11-17,8.0,First Class,Auto & Accessories,Car Media Players,140.0,2.0,0.05,...,4.6,Medium,LS-001,Lane Daniels,Consumer,Brisbane,Queensland,Australia,Oceania,Nov
1,AU-2015-2,2015-06-30,2015-07-02,2.0,First Class,Auto & Accessories,Car Speakers,211.0,3.0,0.03,...,11.2,Medium,IZ-002,Alvarado Kriz,Home Office,Berlin,Berlin,Germany,Central,Jun
2,AU-2015-3,2015-12-05,2015-12-13,8.0,First Class,Auto & Accessories,Car Body Covers,117.0,5.0,0.01,...,3.1,Critical,EN-003,Moon Weien,Consumer,Porirua,Wellington,New Zealand,Oceania,Dec
3,AU-2015-4,2015-05-09,2015-05-16,7.0,First Class,Auto & Accessories,Car & Bike Care,118.0,2.0,0.05,...,2.6,High,AN-004,Sanchez Bergman,Corporate,Kabul,Kabul,Afghanistan,Central Asia,May
4,AU-2015-5,2015-07-09,2015-07-18,9.0,First Class,Auto & Accessories,Tyre,250.0,1.0,0.04,...,16.0,Critical,ON-005,Rowe Jackson,Corporate,Townsville,Queensland,Australia,Oceania,Jul
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,FA-2015-30771,2015-01-21,2015-01-27,6.0,Standard Class,Fashion,Sports Wear,85.0,5.0,0.04,...,1.7,Medium,IN-0040977,Welch Fein,Corporate,Pasadena,Texas,United States,Central,Jan
51286,FA-2015-30772,2015-06-22,2015-06-24,2.0,Standard Class,Fashion,Sports Wear,85.0,1.0,0.03,...,0.2,Medium,TT-0040978,Martinez Arnett,Corporate,Harare,Harare,Zimbabwe,Africa,Jun
51287,FA-2015-30773,2015-01-01,2015-01-07,6.0,Standard Class,Fashion,Sports Wear,85.0,1.0,0.05,...,0.1,Medium,ON-0040979,Mccoy Duston,Home Office,Townsville,Queensland,Australia,Oceania,Jan
51288,FA-2015-30774,2015-12-07,2015-12-14,7.0,Standard Class,Fashion,Sports Wear,85.0,3.0,0.04,...,2.8,Medium,RN-0040980,Bentley Zypern,Consumer,Houston,Texas,United States,Central,Dec


Remove some of the columns that you won't need for training or additional feature building.

In [237]:
# Columns that are not needed for training or further feature building.
cols = [
    "Order ID", "Order Date", "Ship Date", "Order Priority", "Customer ID",
    "Customer Name", "Segment", "Ship Mode", "Product Category", "Product",
    "City", "State", "Country", "Region", "Months",
]

# Build a new frame instead of popping columns in place. With errors="ignore"
# columns that are already gone are skipped, so re-running this cell no longer
# raises KeyError.
dataset = dataset.drop(columns=cols, errors="ignore")

dataset.head()

Unnamed: 0,Aging,Sales,Quantity,Discount,Profit,Shipping Cost
0,8.0,140.0,2.0,0.05,46.0,4.6
1,2.0,211.0,3.0,0.03,112.0,11.2
2,8.0,117.0,5.0,0.01,31.2,3.1
3,7.0,118.0,2.0,0.05,26.2,2.6
4,9.0,250.0,1.0,0.04,160.0,16.0


### Cleanse data 

In [238]:
# Summary statistics per column; a `count` below the total number of rows
# indicates missing values in that column.
dataset.describe()

Unnamed: 0,Aging,Sales,Quantity,Discount,Profit,Shipping Cost
count,51288.0,51289.0,51288.0,51289.0,51290.0,51289.0
mean,5.254933,156.430619,2.997368,0.029971,72.725157,7.272932
std,2.959887,65.781574,1.413446,0.014114,49.09768,4.908497
min,1.0,33.0,1.0,0.01,0.5,0.1
25%,3.0,109.0,2.0,0.02,25.7,2.6
50%,5.0,159.0,3.0,0.03,67.0,6.7
75%,8.0,218.0,4.0,0.04,120.6,12.1
max,10.0,250.0,5.0,0.05,167.5,16.8


In [239]:
# Replace missing values with 0, rebinding the name rather than mutating the
# frame in place.
# NOTE(review): the describe() output shows only one or two missing values per
# column, and 0 lies below every column's observed minimum. Consider dropping
# those rows or imputing a median instead; confirm before changing results.
dataset = dataset.fillna(0)

dataset.dtypes

Aging            float64
Sales            float64
Quantity         float64
Discount         float64
Profit           float64
Shipping Cost    float64
dtype: object

Split the data into training and test sets by using the `train_test_split` function in the `scikit-learn` library. This function splits both the x (**features**) data and the y (**values to predict**) data into a training portion used to fit the model and a test portion held back for evaluation. The `test_size` parameter sets the fraction of the data to allocate to testing (here `0.2`, i.e. 20%). The `random_state` parameter seeds the random number generator so that your train-test splits are deterministic.

In [240]:
from sklearn.model_selection import train_test_split

# Separate the predictor columns from the target column before splitting.
features = dataset.drop('Profit', axis=1)
target = dataset['Profit']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [241]:
# (rows, feature columns) of the training split.
train_x.shape

(41032, 5)

In [242]:
# (rows, feature columns) of the test split.
test_x.shape

(10258, 5)

In [243]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training split.
# The fit call is kept as the cell's last expression so the estimator is displayed.
regressor = LinearRegression()
regressor.fit(X=train_x, y=train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [244]:
# Predict profit for the held-out rows and display the resulting array.
y_pred = regressor.predict(X=test_x)
y_pred

array([ 65.02769478, 139.97894884,  48.01483459, ...,  11.93473771,
       111.99601428,  20.91991442])

In [245]:
# Actual profit values of the test split, for comparison with the predictions.
test_y

49728     64.7
45547    139.5
15664     47.7
40561     28.3
49426     17.6
         ...  
10271     15.6
43411     17.0
49462     12.4
30658    112.1
34372     20.7
Name: Profit, Length: 10258, dtype: float64

In [246]:
# Predictions on the test split (the same call as the earlier `y_pred` cell).
predicted = regressor.predict(X=test_x)

# Score the fitted model on the held-out data.
# NOTE(review): a score this close to 1 is suspicious. In the rows and summary
# statistics shown earlier, Shipping Cost is roughly Profit / 10, so the target
# may be leaking through that feature. Confirm before trusting this result.
regressor.score(X=test_x, y=test_y)

0.9999639135818107