Form a group among yourself (5 to 7)
Setup the environment

# Check the System requirements

## Check the python version

In [8]:
!python --version

Python 3.6.4


## Create a virtual enviroment (optional)

**Create virtual environment**

Syntax: *python3.6 -m venv {Virtual Env Name} (or) {Absolute Path for Virtual Env}*

Example: python3.6 -m venv noor (or) python3.6 -m venv Users/noor/python/project

**cd to venv directory**

*cd noor* (or) *Users/noor/python/project*

**Activate virtual environment**

*source bin/activate* (Inside the venv folder)

## Install necessary libraries

**pip install numpy scikit-learn notebook pandas matplotlib**

# Linear Regression from scratch

In [9]:
# Numpy is the only dependecy we need to run our own linear regression.
import numpy as np

In [10]:
# spendings = np.array([50, 60, 80, 90, 21])
# sales = np.array([100, 120, 160, 180, 42])

spendings = np.array([2, 4, 6, 8, 10])
sales = np.array([20, 40, 60, 80, 100])

# Now i want to predict the sales for the given spendings to optimize the expenses

In [15]:
# So our task is to find the best line that fits our data points : Linear Regression
# So we have to find the equation y=wx+b where w and b are unknown
# we should find the values of w and b such that error/cost function is minimum

# lets start with a random guess of w and b
w = 0.0
b = 0.0
w, b = train(spendings, sales, w, b, alpha = 0.01, epochs = 1)

epoch:  0 loss:  22.472


In [14]:
# Actual training happens here
def train(x, y, w, b, alpha, epochs):
    for e in range(epochs):
        w, b = update_w_and_b(x, y,  w, b, alpha)
        print("epoch: ", e, "loss: ", loss(x,  y,  w, b))
    return w, b

In [13]:
# Gradient descent equation implementation 
def update_w_and_b(x, y, w, b, alpha):
    
    dl_dw = 0.0
    dl_db = 0.0
    
    N = len(x)
    for i in range(N):
        dl_dw += -(x[i]*(y[i]-(w*x[i])))
        dl_db += -(y[i]-(w*x[i]))
        
    w = w-(1/float(N))*dl_dw*alpha
    b = b-(1/float(N))*dl_db*alpha
    
    return w,b

In [12]:
# Cost function implementation
def loss(x, y, w, b):
    N =len(x)
    total_error=0.0
    for i in range(N):
        total_error += (y[i]-(w*x[i] + b))**2
        return total_error/float(N)

In [11]:
# Finds the ynew given the xnew
def predict(x, w, b):
    return w*x + b

**Oh! Oh! Some thing is not right,** We are getting 203. We should get 230 or some value near to that.

Rewind today session and try to change paramaters by your own.
If you couldn't find, don't worry.

Ask us. We'll help you.

**Don't forget, Machine learning is all about hyper parameters**

In [16]:
x_new = 23
y_new = predict(23, w, b)
y_new

101.8

In [17]:
w = 0.0
b = 0.0
w, b = train(spendings, sales, w, b, alpha = 0.01, epochs = 10)

epoch:  0 loss:  22.472
epoch:  1 loss:  5.694579199999997
epoch:  2 loss:  1.1406616371199991
epoch:  3 loss:  0.10874278540083229
epoch:  4 loss:  0.006999263261700918
epoch:  5 loss:  0.0993375833844693
epoch:  6 loss:  0.19787213022770495
epoch:  7 loss:  0.26773582392327666
epoch:  8 loss:  0.31146450375730644
epoch:  9 loss:  0.3373966960282711


In [18]:
x_new = 23
y_new = predict(23, w, b)
y_new

230.66189775477307

# Linear Regressing using scikit-learn : hourly wages dataset

## Import necessary libraries

In [21]:
import numpy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Load and explore data

In [22]:
dataset=pd.read_csv("data/hourly_wages.csv")

In [23]:
dataset.head()

Unnamed: 0,wage_per_hour,union,education_yrs,experience_yrs,age,female,marr,south,manufacturing,construction
0,5.1,0,8,21,35,1,1,0,1,0
1,4.95,0,9,42,57,1,1,0,1,0
2,6.67,0,12,1,19,0,0,0,1,0
3,4.0,0,12,4,22,0,0,0,0,0
4,7.5,0,12,17,35,0,1,0,0,0


In [24]:
dataset.describe(include='all')

Unnamed: 0,wage_per_hour,union,education_yrs,experience_yrs,age,female,marr,south,manufacturing,construction
count,534.0,534.0,534.0,534.0,534.0,534.0,534.0,534.0,534.0,534.0
mean,9.024064,0.179775,13.018727,17.822097,36.833333,0.458801,0.655431,0.292135,0.185393,0.044944
std,5.139097,0.38436,2.615373,12.37971,11.726573,0.498767,0.475673,0.45517,0.388981,0.207375
min,1.0,0.0,2.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0
25%,5.25,0.0,12.0,8.0,28.0,0.0,0.0,0.0,0.0,0.0
50%,7.78,0.0,12.0,15.0,35.0,0.0,1.0,0.0,0.0,0.0
75%,11.25,0.0,15.0,26.0,44.0,1.0,1.0,1.0,0.0,0.0
max,44.5,1.0,18.0,55.0,64.0,1.0,1.0,1.0,1.0,1.0


## Prepare training and testing data

In [25]:
#create a dataframe with all training data except the target column
X = dataset.drop(columns=['wage_per_hour'])

#check that the target variable has been removed
X.head()

Unnamed: 0,union,education_yrs,experience_yrs,age,female,marr,south,manufacturing,construction
0,0,8,21,35,1,1,0,1,0
1,0,9,42,57,1,1,0,1,0
2,0,12,1,19,0,0,0,1,0
3,0,12,4,22,0,0,0,0,0
4,0,12,17,35,0,1,0,0,0


In [26]:
#create a dataframe with only the target column
Y = dataset[['wage_per_hour']]

#view dataframe
Y.head()

Unnamed: 0,wage_per_hour
0,5.1
1,4.95
2,6.67
3,4.0
4,7.5


In [27]:
(trainX, testX, trainY, testY) = train_test_split(X, Y, test_size=0.25, random_state=42)

In [28]:
print("Number of data in training set ",len(trainX), len(trainY))
print("Number of data in tesing set ",len(testX), len(testY))

Number of data in training set  400 400
Number of data in tesing set  134 134


## Model building

In [29]:
# There are three steps to model something with sklearn
# 1. Set up the model
model = LinearRegression()
# 2. Use fit
for i in range(0,100):
    model.fit(trainX, trainY)
# 3. Check the score
model.score(testX, testY)



0.3087885789792394

## Save and Load the model / Prediction¶

In [30]:
import pickle

In [31]:
Xnew = [[0, 12, 45, 63, 1, 1, 0, 0, 0]]
ynew = model.predict(Xnew)
ynew

array([[9.59925632]])

In [32]:
# save the model to disk
filename = 'linr_model'
pickle.dump(model, open(filename, 'wb'))

In [33]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [34]:
ynew = loaded_model.predict(Xnew)
ynew

array([[9.59925632]])