## Rental owner clustering

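Clusters rental license properties by owner name and owner address. The owner fields are ownname1, ownname2, ownadd1 and ownadd2; the code below clusters on ownname1 and ownadd1 only. Writes the result to the intermediate file `rental_clusters.csv` in the project's Google Drive folder.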

In [1]:
# Colab setup: install two third-party packages that are not imported directly by any
# cell visible in this notebook.
# NOTE(review): presumably they are dependencies of text_cluster.py, which is copied in
# and imported further down — confirm, and consider pinning versions for reproducibility.
!pip install leven
!pip install ngram



In [2]:
import pandas as pd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the files this notebook reads and writes are
# addressed under "My Drive" on this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Get the cleaned rental registrations and join them to SDAT, limited to Cambridge rather than all of Dorchester County

In [4]:
# Load the rental billing CSV from the mounted Drive.
# acctid is read as text instead of letting pandas infer a numeric dtype, so the ids
# are not reformatted on the way in (e.g. turned into floats if the column has blanks).
rentals = pd.read_csv(
    '/content/drive/My Drive/pita 2021/cleaned_rental_billing-2020-test.csv',
    dtype={'acctid': str},
)
# acctid is the join key against SDAT. Normalise it the same way the SDAT key is
# normalised further down (string, surrounding whitespace removed) so that the two
# sides compare equal in the merge.
rentals.acctid = rentals.acctid.astype(str).str.strip()

In [5]:
# Reference list of account ids; it is merged with SDAT on 'acctid' below to limit
# SDAT to Cambridge. The key is read as text so that it has the same dtype as the
# string keys it is joined against, rather than whatever pandas infers from the file.
cambridge_apns = pd.read_csv('/content/drive/My Drive/SDAT/CAN-ref.csv', dtype={'acctid': str})

In [6]:
# Load the SDAT extract. The absolute path under the Drive mount point is used (as for
# the other inputs) so the read does not depend on the current working directory.
# NOTE(review): the saved output of this cell contains the tail of a warning that looks
# like pandas' DtypeWarning about mixed-type columns — confirm. low_memory=False makes
# pandas infer each column's dtype from the whole file in a single pass.
sdat = pd.read_csv('/content/drive/My Drive/pita 2021/SDAT-CAN-ref-202105.csv', low_memory=False)

# make sure the acctid is in the right format, and limit the results to just Cambridge
sdat.acctid = sdat.acctid.apply(lambda x: str(x).strip())

# Outer merge with an indicator column, so that reference rows with no SDAT match can
# be reported before they are discarded.
sdat = sdat.merge(cambridge_apns, on='acctid', how='outer', indicator=True)
xtra_apns = sdat.query('_merge == "right_only"')
xtra_categories = list(xtra_apns.CAN_OWNCLASS.unique())
print("There are ",len(xtra_apns), "properties that don't match SDAT in the CAN ref.")
print("None of these types impact the analysys?",xtra_categories)

# Keep only the accounts present in both SDAT and the reference list.
sdat = sdat.query('_merge == "both"').drop(columns='_merge')

  interactivity=interactivity, compiler=compiler, result=result)


There are  67 properties that don't match SDAT in the CAN ref.
None of these types impact the analysys? ['UNDEVELOPED_HOUSING', 'COMMERCIAL', 'CITY_OF_CAMBRIDGE', 'OTHER', 'DORCHESTER_COUNTY', 'AGRICULTURE', 'HOUSING']


In [7]:
# The rentals' own 'address' column is dropped before joining with SDAT.
# NOTE(review): presumably because SDAT supplies its own address fields — confirm.
rentals_keyed = rentals.drop(columns='address')

# Count the licenses whose acctid has no SDAT match. These are exactly the rows an
# outer merge would flag as "left_only"; a membership test gives the same count
# without building a throw-away merged frame.
n_unresolved = int((~rentals_keyed.acctid.isin(sdat.acctid)).sum())
print("WARNING:",n_unresolved,"didn't resolve!")

# Inner join: licenses without an SDAT match are dropped from here on.
# NOTE: this rebinds `rentals`, so the cell is not idempotent — re-run the cell that
# loads the rental CSV before re-running this one.
rentals = rentals_keyed.merge(sdat,on='acctid',how='inner')



## Clustering rentals on owners requires joining with SDAT first

In [8]:
!cp "drive/My Drive/pita 2021/text_cluster.py" .
from text_cluster import assign_clusters
from text_cluster import text_similarity
from text_cluster import cluster_strings

#### Cluster the rental licenses
Assign clusters by owner name and owner address

In [9]:
%%time
# Cluster the owner names and the owner addresses with the project helper
# cluster_strings (jaccard metric, threshold 0.1). The saved run of this cell took
# about six minutes of wall time.
# The results are used further down as lookups via .get(value, -1), so they are
# presumably mappings from the input string to a cluster id — TODO confirm against
# text_cluster.py.
owner_names = cluster_strings(rentals.ownname1.array,'o_name',metric='jaccard',threshold=0.1)
# ownadd1 is cast to str before clustering (ownname1 is not).
# NOTE(review): presumably because ownadd1 contains non-string values such as NaN — confirm.
owner_addresses = cluster_strings(rentals.ownadd1.astype(str).array,'o_address',metric='jaccard',threshold=0.1)

rows: 1524 clusters: 171
rows: 1524 clusters: 157
CPU times: user 6min 28s, sys: 33 s, total: 7min 1s
Wall time: 5min 59s


In [10]:
def best_cluster(cnumbers):
  """Pick the representative cluster id from a value_counts() result.

  cnumbers is a Series indexed by cluster id, in the order produced by
  Series.value_counts(). The first id is returned, unless it is negative and a
  second id exists, in which case the second id is returned. (Negative ids are
  what the lookups in this notebook use for "not found": -1.)
  """
  leading = cnumbers.index[0]
  has_alternative = len(cnumbers) > 1
  if has_alternative and leading < 0:
    return cnumbers.index[1]
  return leading

# Look up the cluster id for every row's owner name and owner address; values that are
# not present in the lookup get the placeholder -1.
# Series.map visits only the one column that is needed, instead of building a row
# object for every record as DataFrame.apply(axis=1) does; the result is the same.
# NOTE(review): the address clusters were built from ownadd1 cast to str, but the
# lookup uses the raw value, so non-string addresses (e.g. NaN) fall back to -1 —
# confirm this is intended.
rentals['owner_cluster'] = rentals.ownname1.map(lambda name: owner_names.get(name, -1))
rentals['owneradd_cluster'] = rentals.ownadd1.map(lambda addr: owner_addresses.get(addr, -1))

# One row per acctid: the most common cluster id among that account's rows, with
# best_cluster preferring a non-negative id over the -1 placeholder when both occur.
ro_df = pd.DataFrame(rentals.groupby(['acctid'])['owneradd_cluster'].agg(lambda x: best_cluster(x.value_counts())))
roadd_c = rentals.groupby(['acctid'])['owner_cluster'].agg(lambda x: best_cluster(x.value_counts()))
ro_df = ro_df.merge(roadd_c, left_index=True, right_index=True)
ro_df

Unnamed: 0_level_0,owneradd_cluster,owner_cluster
acctid,Unnamed: 1_level_1,Unnamed: 2_level_1
1007104057,183,201
1007104073,1,4
1007104294,1,4
1007104324,89,96
1007104340,328,365
...,...,...
1007231407,147,157
1007231490,578,645
1007239653,542,597
1007286611,148,159


#### Save rental clusters

In [11]:
# Persist the per-account cluster assignments (index: acctid; columns: owneradd_cluster
# and owner_cluster) to the project folder on Drive. The absolute path under the Drive
# mount point is used so the write does not depend on the current working directory.
ro_df.to_csv('/content/drive/My Drive/pita 2021/rental_clusters.csv')