In [5]:
import numpy as np

import pandas as pd
from pandas import Series,DataFrame
from bokeh.layouts import gridplot, row, column

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import sklearn.model_selection

# For simple visualizations that need little interaction, output_notebook()
# is sufficient, so it is enabled here.
output_notebook()

In [92]:
# Read the data set from disk into a DataFrame
df = pd.read_csv('datagun.csv')

# Instantiate the estimator that the later regression cells fit
lreg = LinearRegression()

In [93]:
# Bin the injured counts into a density-normalised histogram
hist, edges = np.histogram(df["n_injured"], bins=400, density=True)

# Draw one rectangle per bin on a fresh figure
f = figure(plot_width=400, plot_height=250)
f.quad(
    left=edges[:-1],
    right=edges[1:],
    bottom=0,
    top=hist,
    fill_color="#036564",
    line_color="#033649",
)

show(f)

In [95]:

# A one-figure grid: scatter the killed counts (x) against the injured counts (y)
figures = [figure()]
for index, fig in enumerate(figures):
    fig.scatter(df['n_killed'], df['n_injured'])

    # Label both axes and turn off scientific notation on the left axis
    fig.xaxis.axis_label = "killed"
    fig.yaxis.axis_label = "injured"
    fig.left[0].formatter.use_scientific = False

show(
    gridplot(
        figures,
        ncols=2,
        plot_width=400,
        plot_height=250,
        toolbar_location='right',
    )
)

In [101]:
# Feature matrix: the injured counts as a single column. A Series is flat,
# shape (N,), so it is reshaped to the two-dimensional (N, 1) form in one
# vectorised step instead of stacking the values one by one.
X = np.asarray(df["n_injured"]).reshape(-1, 1)

# Target vector: the killed counts.
Y = df["n_killed"]

In [103]:
# Build the design matrix [x 1]: the injured counts plus a column of ones, so
# the least-squares solution yields an intercept as well as a slope.
# X is rebuilt from df (not from the previous X) so that re-running this cell
# never appends a second column of ones.
X = np.asarray(df["n_injured"]).reshape(-1, 1)
X = np.column_stack((X, np.ones(X.shape[0])))

# Solve for the slope a and intercept b of the best-fit line Y ~ a * x + b.
# rcond=None selects NumPy's current default cutoff explicitly; omitting it
# raises a FutureWarning on NumPy 1.14 through 1.x.
a, b = np.linalg.lstsq(X, Y, rcond=None)[0]
print(a, b)

-0.09193419899319907 0.2978724075854637


  


In [104]:
f = figure(plot_width=400, plot_height=250)

# Scatter the observations: injured on x, killed on y
f.scatter(df["n_injured"], df["n_killed"])

# Overlay the fitted line y = a * x + b.
# A straight line is fully determined by two points, so only the smallest and
# largest x are passed to the line glyph. Passing every (unsorted) sample would
# make the path retrace the same line once per row.
x = df["n_injured"]
x_ends = np.array([x.min(), x.max()])
f.line(x_ends, a * x_ends + b, color='red')

## Add some axis information
f.xaxis.axis_label = "injured"
f.yaxis.axis_label = "killed"

show(f)

In [106]:
# Observed targets (r) and the univariate model's prediction (y) for every row
r = np.array(df["n_killed"])
x = np.array(df["n_injured"])
y = a * x + b

# Mean squared error of the fit and its square root (RMSE), one per line
mse = mean_squared_error(r, y)
rmse = np.sqrt(mse)
print(mse, rmse, sep="\n")

0.26831826647748847
0.5179944656822971


In [None]:
# Feature columns: every column except the target. `columns=` is used because
# passing the axis positionally (df.drop('n_killed', 1)) was removed in
# pandas 2.0.
# NOTE(review): assumes the remaining columns are all numeric and free of
# missing values, as LinearRegression requires — confirm against datagun.csv.
X_multi = df.drop(columns='n_killed')

# Target: the column that was removed from the features above. The previous
# `df.Price` referred to a column that no other cell of this notebook uses.
Y_target = df['n_killed']

# Fit the multivariate linear regression
lreg.fit(X_multi, Y_target)

In [None]:
# Report the fitted intercept (the constant term; b in the univariate case)
intercept_report = ' The estimated intercept coefficient is {0:.2f}'
print(intercept_report.format(lreg.intercept_))

# Report how many coefficients the fitted model holds
n_coefficients = len(lreg.coef_)
print(' The number of coefficients used was {0:d}'.format(n_coefficients))

In [None]:
# One row per feature, named after the columns of X_multi (the frame built in
# the multivariate-fit cell). X_multi replaces `boston_df`, which is not
# defined in any visible cell and raised a NameError here.
coeff_df = DataFrame(X_multi.columns)
coeff_df.columns = ['Features']

# Line the regression coefficients up against the feature names
coeff_df["Coefficient Estimate"] = pd.Series(lreg.coef_)

# Show the features ordered from the largest to the smallest coefficient
coeff_df.sort_values(by='Coefficient Estimate', ascending=False)

In [None]:
# Split the features X and the target Y (the n_killed column) into train and
# test sets. Y replaces `boston_df.Price`: boston_df is not defined in any
# visible cell, so the original call raised a NameError.
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X, Y, random_state=0
)

# Print shapes of the training and testing data sets
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# Preview only the first few training rows rather than the whole array
print(X_train[:5])

In [None]:
# Refit the LinearRegression instance on the training split only, so the test
# split stays unseen by the model.
lreg.fit(X_train,Y_train)

In [None]:
# Run the fitted model over both splits: training first, then test
pred_train, pred_test = (
    lreg.predict(features) for features in (X_train, X_test)
)

In [None]:
# Training error: MSE between the training targets and the model's
# predictions on the training split
print("Fit a model X_train, and calculate MSE with Y_train: {0:.2f}".format(mean_squared_error(Y_train, pred_train)))

# Test error: MSE between the held-out targets and the predictions on the
# test split
print("Fit a model X_train, and calculate MSE with X_test and Y_test: {0:.2f}".format(mean_squared_error(Y_test, pred_test)))