In [1]:
import boto3
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

## Location of the data: the S3 bucket and the key of the csv file inside it
bucket_name = 'data-445'
file_key = 'Fall_2021/In_Class_Assignments/customer_offers.csv'

## Handles to the bucket and to the csv object stored in it
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

## Requesting the object and taking the 'Body' entry of the response,
## which is the stream handed to pandas below
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Parsing the streamed csv into a data-frame and previewing the first rows
offers = pd.read_csv(file_content_stream)
offers.head()

Unnamed: 0,customer_name,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,Adams,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,Allen,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,Anderson,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,Bailey,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Baker,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [2]:
## Summary statistics (count, mean, std, min, quartiles, max) of the offer columns.
## In the columns displayed below min is 0 and max is 1, so for those columns
## the mean reads as the share of rows that have a 1 for that offer.
offers.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,23,24,25,26,27,28,29,30,31,32
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.1,0.1,0.06,0.12,0.04,0.12,0.19,0.2,0.1,0.07,...,0.05,0.12,0.06,0.15,0.09,0.06,0.17,0.22,0.17,0.04
std,0.301511,0.301511,0.238683,0.326599,0.196946,0.326599,0.394277,0.402015,0.301511,0.256432,...,0.219043,0.326599,0.238683,0.35887,0.287623,0.238683,0.377525,0.416333,0.377525,0.196946
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
## Dropping customer_name: it holds text labels, so it cannot take part in the
## distance computations of the nearest-neighbor model fitted on this data-frame.
## errors = 'ignore' keeps the cell re-runnable: 'offers' is rebound to itself,
## so on a second run the column is already gone and a plain drop would raise
## a KeyError. (columns = ... already selects the axis, so axis = 1 is not needed.)
offers = offers.drop(columns = ['customer_name'], errors = 'ignore')
offers.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,23,24,25,26,27,28,29,30,31,32
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [7]:
## Fitting the 10-nearest neighbor model to the data 
knn_md = NearestNeighbors(n_neighbors = 10, algorithm = 'brute').fit(offers)

## Defining the observation of interest: the 15th row, i.e. position 14.
## iloc is used because kneighbors reports neighbors as row positions.
x_int = offers.iloc[14]

## Extracting the 10-nearest neighbors of observation 15.
## The query is passed as a one-row data-frame (not a list holding a Series) so
## it carries the same column names the model was fitted with.
## The query row is itself part of the fitted data, so it comes back as its own
## closest match (distance 0); only the remaining 9 are other rows.
distances, neighbor_idx = knn_md.kneighbors(x_int.to_frame().T)

## Displaying the (distances, positions) pair; both are kept in variables so
## they can be reused instead of being copied by hand from the output
distances, neighbor_idx

(array([[0.        , 1.41421356, 1.41421356, 1.41421356, 1.41421356,
         1.41421356, 1.41421356, 1.73205081, 1.73205081, 1.73205081]]),
 array([[14, 63, 96,  3, 18, 32, 58, 36, 42, 46]]))

In [10]:
## Taking a look at the neighbors: how many of the 10 nearest rows of
## observation 15 (position 14, which includes that row itself) have each offer.
## The neighbor positions are queried from the fitted model rather than typed in
## by hand, so they cannot go stale if the data or the model changes.
neighbor_rows = knn_md.kneighbors(offers.iloc[[14]], return_distance = False)[0]

## kneighbors returns row positions, hence iloc; the column-wise sum counts the
## 1s of each offer across the selected rows
offers.iloc[neighbor_rows].sum(axis = 0)

1     0
2     0
3     0
4     0
5     0
6     0
7     2
8     3
9     0
10    0
11    3
12    1
13    0
14    0
15    0
16    0
17    0
18    2
19    0
20    0
21    0
22    2
23    0
24    0
25    0
26    0
27    0
28    0
29    1
30    7
31    0
32    0
dtype: int64