In [3]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
import os
from sklearn.preprocessing import MinMaxScaler

import implicit

# Directory holding the input CSV files. Set the NYCONNECT_DATA_DIR environment
# variable to point at your own copy of the data; the original author's local
# path is kept only as the fallback so existing setups keep working unchanged.
DATA_DIR = os.environ.get(
    'NYCONNECT_DATA_DIR',
    'C:\\Users\\wesch\\OneDrive\\20 Spring Summer Urban Science Intensive\\nyconnect\\data',
)
os.chdir(DATA_DIR)

In [15]:
# FCC data cleaning
# 2-digit code indicating the Technology of Transmission used to offer broadband service. 
# Lookup from 2-digit technology code to the coarse category used in this notebook.
_TECH_CODE_LABELS = {
    '10': 'ADSL', '11': 'ADSL', '12': 'ADSL', '20': 'ADSL', '30': 'ADSL',
    '40': 'Cable', '41': 'Cable', '42': 'Cable', '43': 'Cable',
    '50': 'Fiber',
    '60': 'Satellite',
    '70': 'Fixed Wireless',
}


def tech_code_translator(x):
    """Translate a technology-of-transmission code into a coarse category label.

    Parameters
    ----------
    x : str or int
        The technology code. It is converted with ``str`` and stripped of
        surrounding whitespace before the lookup, so ``50``, ``'50'`` and
        ``' 50 '`` are all treated the same way.

    Returns
    -------
    str
        One of 'ADSL', 'Cable', 'Fiber', 'Satellite', 'Fixed Wireless', or
        'Other' for any code that is not in the lookup table.
    """
    # Normalise first: an integer-typed or padded code would otherwise never
    # match the string keys and would silently be reported as 'Other'.
    code = str(x).strip()
    return _TECH_CODE_LABELS.get(code, 'Other')
# Per-column parsers applied by read_csv while the raw file is loaded.
converters = {'tract': str,
              'block': str,
              # Parse the block FIPS code as text: letting pandas infer an
              # integer and casting back to str afterwards drops leading zeros.
              'Census Block FIPS Code': str,
              'Technology Code': tech_code_translator
             }

fcc477 = pd.read_csv("Fixed_Broadband_Deployment_Data__Jun__2019_Status_V1.csv", converters = converters, encoding = "ISO-8859-1")
# Assign the renamed frame instead of mutating in place so the cell can be re-run safely.
fcc477 = fcc477.rename(columns={'Census Block FIPS Code':'FIPS Code'})
# get FIPS 11 (state + county + tract): the first 11 characters of the block code
fcc477["FIPS Code"] = fcc477["FIPS Code"].astype(str)
fcc477["FIPS_11"] = fcc477["FIPS Code"].str[:11]
# keep only the rows advertising at least 25 mbps downstream and 3 mbps upstream;
# no de-duplication of providers happens at this step
fcc477_broadband = fcc477[(fcc477["Max Advertised Downstream Speed (mbps)"]>=25.0)&(fcc477["Max Advertised Upstream Speed (mbps)"]>=3.0)]

In [19]:
# Keep just the block code, provider id and downstream speed, give them short
# names, store the block code as a categorical and expose its integer category
# code as a dense block id.
fcc477_broadband_narrow = (
    fcc477_broadband[["FIPS Code", "Provider ID", "Max Advertised Downstream Speed (mbps)"]]
    .rename(columns={"FIPS Code": "fips",
                     "Provider ID": "isp",
                     "Max Advertised Downstream Speed (mbps)": "speed"})
    .assign(fips=lambda frame: frame["fips"].astype("category"))
    .assign(fips_id=lambda frame: frame["fips"].cat.codes)
)

In [20]:
# Two matrices are built from the same triplets: an item-user matrix
# (rows = block ids, columns = provider ids) used to fit the model, and its
# user-item counterpart (rows = provider ids, columns = block ids) used when
# producing recommendations.
# NOTE(review): scipy sums the values of repeated (row, column) pairs, so a
# provider listed several times for one block contributes the sum of its
# speeds — confirm that is intended.
speed_values = fcc477_broadband_narrow['speed'].astype(float)
block_ids = fcc477_broadband_narrow['fips_id']
provider_ids = fcc477_broadband_narrow['isp']

sparse_item_user = sparse.csr_matrix((speed_values, (block_ids, provider_ids)))
sparse_user_item = sparse.csr_matrix((speed_values, (provider_ids, block_ids)))

In [21]:
# Build the ALS model from an explicit settings mapping
als_settings = {'factors': 20, 'regularization': 0.1, 'iterations': 20}
model = implicit.als.AlternatingLeastSquares(**als_settings)

# Confidence matrix: the item-user values scaled by alpha, stored as doubles
alpha_val = 15
data_conf = sparse_item_user.multiply(alpha_val).astype('double')

# Train on the confidence matrix
model.fit(data_conf)



HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [24]:
#---------------------
# FIND SIMILAR ITEMS
#---------------------

# Find the 10 census blocks most similar to the block with id 27
# (picked by the author as a random block in the Bronx)
item_id = 27
n_similar = 10

# Get the user and item vectors from our trained model
user_vecs = model.user_factors
item_vecs = model.item_factors

# Calculate the vector norms
item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

# Cosine similarity of every block to the query block: the dot product is
# divided by each block's norm here and by the query block's norm below.
# Grab the top N blocks and sort them from most to least similar.
scores = item_vecs.dot(item_vecs[item_id]) / item_norms
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
similar = sorted(zip(top_idx, scores[top_idx] / item_norms[item_id]), key=lambda x: -x[1])

# Print the codes of our most similar census blocks.
# fips_id holds the category codes of the fips column, so a block id indexes
# straight into the categories; no scan of the whole frame is needed per result.
fips_labels = fcc477_broadband_narrow['fips'].cat.categories
for idx, score in similar:
    print (fips_labels[idx])

360050002000001
360471078001002
360470374024002
360050310001011
360050386001003
360471104002008
360050078001004
360470594012003
360050151001003
360050462022008


In [25]:
#------------------------------
# CREATE USER RECOMMENDATIONS
#------------------------------

def recommend(user_id, sparse_user_item, user_vecs, item_vecs, num_items=10):
    """Return the highest-scoring census blocks for one user (provider).

    Parameters
    ----------
    user_id : int
        Row index of the user in ``sparse_user_item`` and ``user_vecs``.
    sparse_user_item : scipy sparse matrix
        User-item interaction matrix; row ``user_id`` is densified to find the
        items the user already interacts with.
    user_vecs, item_vecs : scipy sparse matrix
        User and item factor matrices. They must be sparse because
        ``.toarray()`` is called on their product.
    num_items : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``census_blocks`` (block code) and ``score`` (min-max scaled
        score), ordered from highest to lowest score.

    Notes
    -----
    Reads the module-level ``fcc477_broadband_narrow`` frame to translate
    block ids back into block codes, and uses ``MinMaxScaler`` from the
    notebook's imports.
    """

    user_interactions = sparse_user_item[user_id,:].toarray()

    # Build a 0/1 mask: items with a positive stored value become 0 (already
    # served, so never recommended), items with no interaction become 1.
    user_interactions = user_interactions.reshape(-1) + 1
    user_interactions[user_interactions > 1] = 0

    # Raw score of every item for this user
    rec_vector = user_vecs[user_id,:].dot(item_vecs.T).toarray()

    # Scale the scores to [0, 1], then blank out the already-served items
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    recommend_vector = user_interactions * rec_vector_scaled

    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    # fips_id holds the category codes of the fips column, so each item index
    # maps directly to its block code without scanning the frame per item.
    fips_labels = fcc477_broadband_narrow['fips'].cat.categories
    census_blocks = fips_labels[item_idx].tolist()
    scores = recommend_vector[item_idx]

    recommendations = pd.DataFrame({'census_blocks': census_blocks, 'score': scores})

    return recommendations

# recommend() needs sparse factor matrices, so wrap both trained factor
# arrays as csr matrices in one pass.
user_vecs, item_vecs = (
    sparse.csr_matrix(factors)
    for factors in (model.user_factors, model.item_factors)
)

# Provider id to recommend for (52332, noted by the author as Verizon)
user_id = 52332

recommendations = recommend(
    user_id,
    sparse_user_item,
    user_vecs,
    item_vecs,
)

print(recommendations)

     census_blocks     score
0  360470013002000  1.000000
1  360810171001020  0.898521
2  360810199002028  0.817265
3  360610048005000  0.788376
4  360470059002005  0.776867
5  360610031001021  0.776058
6  360810039001007  0.771240
7  360470051003006  0.755547
8  360470531002001  0.745918
9  360610115002008  0.745808


In [None]:
# Count the distinct provider names per (tract, borough) pair.
# NOTE(review): the "boro" column and the `census_tracts` frame are not created
# in any cell visible here — confirm where they come from before running.
fcc477_broadband_grouped = (
    fcc477_broadband
    .groupby(["FIPS_11", "boro"])
    .agg(num_providers = pd.NamedAgg(column = "Provider Name",
                                     aggfunc = lambda names: len(names.unique())))
    .reset_index()
)

# Tract part of the 11-character code, plus borough names in title case so
# they line up with the boro_name values used in the merge below.
borough_names = {"bronx": "Bronx",
                 "brooklyn": "Brooklyn",
                 "manhattan": "Manhattan",
                 "staten island": "Staten Island",
                 "queens": "Queens"}
fcc477_broadband_grouped = fcc477_broadband_grouped.assign(
    ct2010 = fcc477_broadband_grouped["FIPS_11"].str[5:],
    boro = fcc477_broadband_grouped["boro"].replace(borough_names),
)

# Join on tract + borough, build the shared "BoroCTLbl" label and keep only
# the label and the provider count.
tract_labels = census_tracts.loc[:, ["boro_name", "ct2010", "ctlabel"]]
fcc477_broadband_grouped_merged = (
    fcc477_broadband_grouped
    .merge(tract_labels, left_on = ["ct2010", "boro"], right_on = ["ct2010", "boro_name"])
    .assign(BoroCTLbl = lambda merged: merged["boro_name"] + " " + merged["ctlabel"].astype(str))
    [["BoroCTLbl", "num_providers"]]
)