# Linear Regression Example 

In [2]:
# we need to import everything we need first
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# initialize a Dataframe 'df' with some dummy values
# Dummy values are in the  csv file, the string in the read_csv() represents where our csv file is located
# you can change the string path in the read_csv() to import the csv file from different folder
df = pd.read_csv('OmniPower.csv')
df.head() # show the first 5 pairs

Unnamed: 0,Sales,Price,Promotion
0,4141,59,200
1,3842,59,200
2,3056,59,200
3,3519,59,200
4,4226,59,400


In [4]:
df.tail() # show the last 5 pairs

Unnamed: 0,Sales,Price,Promotion
29,1882,99,400
30,2159,99,400
31,1602,99,400
32,3354,99,600
33,2927,99,600


In [5]:
print(df.shape)

(34, 3)


# Model Training
### Train/Test split

In [6]:
input_variable = df[['Price', 'Promotion']]
sales = df['Sales'] # This is the same as
sales = df.Sales

x_train = input_variable
y_train = sales


In [7]:
print(x_train.shape) # x_train and x_test shape should be 2D array

print(y_train.shape)


(34, 2)
(34,)


In [8]:
x_train.head()

Unnamed: 0,Price,Promotion
0,59,200
1,59,200
2,59,200
3,59,200
4,59,400


In [9]:
# import model
from sklearn.linear_model import LinearRegression

#instantiate
linReg = LinearRegression()

# fit out linear model to the train set data
linReg_model = linReg.fit(x_train, y_train)

In [10]:
# print the intercept and coefficients
print(linReg.intercept_)
print(linReg.coef_)

5837.52075892857
[-53.21733631   3.61305804]


## Predict data
We will use our model with x_test as input, to see how well our model fits

In [11]:
y_hat = linReg.predict(x_train)
print(x_train)
print(y_train)


    Price  Promotion
0      59        200
1      59        200
2      59        200
3      59        200
4      59        400
5      59        400
6      59        400
7      59        400
8      59        600
9      59        600
10     59        600
11     59        600
12     79        200
13     79        200
14     79        200
15     79        200
16     79        400
17     79        400
18     79        400
19     79        400
20     79        600
21     79        600
22     79        600
23     79        600
24     99        200
25     99        200
26     99        200
27     99        200
28     99        400
29     99        400
30     99        400
31     99        400
32     99        600
33     99        600
0     4141
1     3842
2     3056
3     3519
4     4226
5     4630
6     3507
7     3754
8     5000
9     5120
10    4011
11    5015
12    1916
13     675
14    3636
15    3224
16    2295
17    2730
18    2618
19    4421
20    4113
21    3746
22    3532
23    3825
2

In [14]:
y_predictions = linReg.predict([[50,200]])
print(linReg.predict([[50,200]]))  # predicting y value when x = 3.5, 
                                # note: we are using double square brackets because we need a 2D array as input

[3899.2655506]


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# the error is large due to the data in our set are large values, we should use r2-score to check
mean_squared_error(y_train, y_hat) 

## Model Validation

In [None]:
r2_score(y_train, y_hat)