In [None]:
# setup
from mlwpy import *
%matplotlib inline

In [None]:
# load the diabetes regression data and hold back 25% of it for testing
diabetes = datasets.load_diabetes()
tts = skms.train_test_split(diabetes.data, diabetes.target, test_size=.25)

# unpack the four pieces in the order they are returned:
# train features, test features, train targets, test targets
diabetes_train_ftrs, diabetes_test_ftrs = tts[0], tts[1]
diabetes_train_tgt, diabetes_test_tgt = tts[2], tts[3]

In [None]:
# one table holding every feature column plus the target as a last column
diabetes_df = (pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
                 .assign(target=diabetes.target))
diabetes_df.head()

In [None]:
# pairwise plots for a handful of the features, colored by the 'sex' column
pairplot_cols = ['age', 'sex', 'bmi', 'bp', 's1']
sns.pairplot(diabetes_df[pairplot_cols], hue='sex',
             height=1.5, plot_kws=dict(alpha=.2));

In [None]:
# the same data twice: as-is, and with the largest value replaced by 40
values = np.array([1, 3, 5, 8, 11, 13, 15])
values_with_outlier = np.array([1, 3, 5, 8, 11, 13, 40])

# report mean then median for each version
print("no outlier")
print(values.mean(), np.median(values))

print("with outlier")
print("%5.2f" % values_with_outlier.mean(), np.median(values_with_outlier))

In [None]:
# turn distances into weights: a smaller distance yields a larger weight
distances = np.array([4.0, 2.0, 2.0])
closeness = 1.0 / distances            # applied to each element separately
weights = closeness / closeness.sum()  # rescale so the weights total one
weights

In [None]:
values = np.array([4,6,8])

# plain average versus an average where each value is scaled by its weight
mean = values.mean()
wgt_mean = values.dot(weights)

print("Mean:", mean)
print("Weighted Mean:", wgt_mean)

In [None]:
# build a 3-nearest-neighbors regressor, train it on the training split,
# and predict for the testing features
knn = neighbors.KNeighborsRegressor(n_neighbors=3)
fit = knn.fit(diabetes_train_ftrs, diabetes_train_tgt)
preds = fit.predict(diabetes_test_ftrs)

# score the predictions against the held-back testing targets
metrics.mean_squared_error(y_true=diabetes_test_tgt, y_pred=preds)

In [None]:
# square root of 3500
# NOTE(review): 3500 is presumably a round-number stand-in for a mean squared
# error reported earlier, making this an error in the target's own units -- confirm
np.sqrt(3500)

In [None]:
# spread of the target column: its largest value minus its smallest value
target_col = diabetes_df['target']
target_col.max() - target_col.min()

In [None]:
def axis_helper(ax, lims):
    """Clean up a plotting axes.

    Applies the same limits `lims` to both the x and y axes, removes the
    tick marks from both, and sets the aspect ratio to 'equal'.
    """
    # x axis: limits, then no ticks
    ax.set_xlim(lims)
    ax.set_xticks([])

    # y axis: limits, then no ticks
    ax.set_ylim(lims)
    ax.set_yticks([])

    ax.set_aspect('equal')

In [None]:
# a deliberately tiny data set: each row is one (x,y) point
D = np.array([[3,5],
              [4,2]])

# first column is the "input" x, second column is the "output" y
x, y = D.T

In [None]:
horizontal_lines = np.array([1, 2, 3, 3.5, 4, 5])

# one panel per candidate constant prediction
results = []
fig, axes = plt.subplots(1, 6, figsize=(10, 5))
for h_line, ax in zip(horizontal_lines, axes.flat):
    # the height of the horizontal line *is* the prediction for every point
    predictions = h_line

    # signed errors and their sum-of-squares
    errors = y - predictions
    sse = errors.dot(errors)

    # tidy panel, titled with the candidate value
    axis_helper(ax, (0, 6))
    ax.set_title(str(h_line))

    # the data, the prediction line (axhline spans the full panel width by
    # default), and a vertical segment from each prediction to its actual value
    ax.plot(x, y, 'ro')
    ax.axhline(h_line, color='y')
    ax.vlines(x, predictions, y)

    # one record per panel: prediction, errors, their sum, SSE, sqrt(SSE)
    results.append((predictions, errors, errors.sum(), sse, np.sqrt(sse)))

In [None]:
# tabulate the per-panel records, one row per prediction value
col_labels = ("Prediction", "Errors", "Sum", "SSE", "Distance")
results_table = pd.DataFrame.from_records(results,
                                          columns=col_labels,
                                          index="Prediction")
display(results_table)

In [None]:
def process(D, model, ax):
    """Plot a line model against the data on `ax` and measure its errors.

    D     : 2D array; column 0 holds the inputs, column 1 the actual outputs
    model : pair (m, b) describing the line  predictions = m*x + b
    ax    : axes to draw on (data points, the line, and the error segments)

    Returns a tuple: (errors, sum of errors, sum of squared errors,
    square root of the sum of squared errors).
    """
    inputs, actuals = D[:,0], D[:,1]
    slope, intercept = model

    # styling, then the raw data as red dots
    axis_helper(ax, (0,8))
    ax.plot(inputs, actuals, 'ro')

    # draw the model's line across the whole plotted range (0 to 8)
    line_xs = np.array([0,8])
    ax.plot(line_xs, slope * line_xs + intercept, color='y')

    # vertical segments joining each prediction to its actual value
    predictions = slope * inputs + intercept
    ax.vlines(inputs, predictions, actuals)

    # error amounts and their summaries
    errors = actuals - predictions
    sse = errors.dot(errors)
    return (errors, errors.sum(), sse, np.sqrt(sse))

In [None]:
# a deliberately tiny data set: each row is one (x,y) point
D = np.array([[3,5],
              [4,2]])

# candidate lines, one per row: [m, b] --> predictions = mx + b
lines_mb = np.array([[ 1,  0],
                     [ 1,  1],
                     [ 1,  2],
                     [-1,  8],
                     [-3, 14]])

col_labels = ("Raw Errors", "Sum", "SSE", "TotDist")
results = []   # reset; not otherwise used in this cell

# one panel per candidate line; process() does the drawing and
# hands back that line's error summary
fig, axes = plt.subplots(1, 5, figsize=(12, 6))
records = [process(D, line, panel)
           for line, panel in zip(lines_mb, axes.flat)]

df = pd.DataFrame.from_records(records, columns=col_labels)
display(df)

In [None]:
# build a linear regression model, train it on the training split,
# and predict for the testing features
lr = linear_model.LinearRegression()
fit = lr.fit(diabetes_train_ftrs, diabetes_train_tgt)
preds = fit.predict(diabetes_test_ftrs)

# score the predictions against the unseen testing targets
metrics.mean_squared_error(y_true=diabetes_test_tgt, y_pred=preds)

In [None]:
# a small set of target values; the following cells look for the single
# guess with the smallest sum of squared differences from these values
tgt = np.array([3,5,8,10,12,15])

In [None]:
# random guesses with some constraints:
# every guess is drawn from between the smallest and largest target
num_guesses = 10
results = []
for g in range(num_guesses):
    guess = np.random.uniform(low=tgt.min(), high=tgt.max())
    # score the guess by its sum of squared differences from the targets
    total_dist = ((tgt - guess)**2).sum()
    results.append((total_dist, guess))

# keep the guess from the (distance, guess) pair with the smallest distance
best_guess = min(results)[1]
best_guess

In [None]:
# start from a random guess; at each step pick a direction (up or down)
# at random and move there only when it lowers the distance
num_steps = 100
step_size = .05

best_guess = np.random.uniform(low=tgt.min(), high=tgt.max())
best_dist  = ((tgt - best_guess)**2).sum()

for s in range(num_steps):
    # hypothetical step: +step_size or -step_size
    direction = np.random.choice([+1, -1])
    new_guess = best_guess + (direction * step_size)
    new_dist = ((tgt - new_guess)**2).sum()

    # only strictly better steps are followed
    if new_dist < best_dist:
        best_guess = new_guess
        best_dist = new_dist
print(best_guess)

In [None]:
# at each step, try both a step down and a step up and keep the better
# of the two; stop as soon as that better step is worse than the
# current position
num_steps = 1000
step_size = .02

best_guess = np.random.uniform(low=tgt.min(), high=tgt.max())
best_dist  = np.sum((tgt - best_guess)**2)
print("start:", best_guess)
for s in range(num_steps):
    # the two candidates: one step below and one step above
    guesses = best_guess + (np.array([-1, 1]) * step_size)

    # np.newaxis makes tgt a column so the subtraction pairs every target
    # with both guesses; summing over axis 0 gives one distance per guess
    dists = np.sum((tgt[:,np.newaxis] - guesses)**2, axis=0)

    better_idx = np.argmin(dists)
    if dists[better_idx] > best_dist:
        break

    best_guess, best_dist = guesses[better_idx], dists[better_idx]
print("  end:", best_guess)

In [None]:
# the arithmetic mean of the targets, for comparison with the searches above
print("mean:", tgt.mean())

In [None]:
# stand alone code
# (everything this cell needs is imported here, including numpy for np.sqrt,
#  so it can run without the notebook's setup cell)
import numpy as np
from sklearn import (datasets, neighbors, 
                     model_selection as skms,
                     linear_model, metrics)

# load the data and hold back 25% of it for testing
diabetes = datasets.load_diabetes()
tts =  skms.train_test_split(diabetes.data,
                             diabetes.target, 
                             test_size=.25)
(diabetes_train, diabetes_test, 
 diabetes_train_tgt, diabetes_test_tgt) = tts

# the two regressors to compare, keyed by a short display name
models = {'kNN': neighbors.KNeighborsRegressor(n_neighbors=3),
          'linreg' : linear_model.LinearRegression()}

for name, model in models.items():
    # train on the training split, predict on the testing split
    fit   = model.fit(diabetes_train, diabetes_train_tgt)
    preds = fit.predict(diabetes_test)

    # root of the mean squared error on the testing targets
    score = np.sqrt(metrics.mean_squared_error(diabetes_test_tgt, preds))
    print("{:>6s} : {:0.2f}".format(name,score))

In [None]:
# print the contents of the script that the following cells run
!cat scripts/perf_02.py

In [None]:
# run the script twice with the "lr" argument: once with "mem", once with "time"
# NOTE(review): presumably "lr" selects linear regression and mem/time select
# memory vs. run-time measurement -- confirm against scripts/perf_02.py
!python scripts/perf_02.py mem lr
!python scripts/perf_02.py time lr

In [None]:
# run the script twice with the "knn" argument: once with "mem", once with "time"
# NOTE(review): presumably "knn" selects k-nearest neighbors and mem/time select
# memory vs. run-time measurement -- confirm against scripts/perf_02.py
!python scripts/perf_02.py mem knn
!python scripts/perf_02.py time knn