In [1]:
#standard imports
import pandas as pd
import numpy as np

#API tools
import requests
import json
from pandas.io.json import json_normalize
from urllib.request import Request, urlopen
import json

#visuals
import matplotlib as plt
import seaborn as sns

#text tools
from collections import Counter #word frequency counts
import re #regexs
from ast import literal_eval #turn strings that look like lists into actual lists
import itertools #use to return combinations of words in topic strings 

#Natural Language Processing
import nltk
import lda #Latent Dirichlet Allocation (create topics)
import gensim
from gensim import corpora, models #for constructing document term matrix
#from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from nltk import stem
from nltk.corpus import stopwords

#clustering tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import hdbscan

#magic
%matplotlib inline

In [2]:
pd.set_option('display.float_format', lambda x: '%.3f' % x) #otherwise we have scientific notation

## Access All Socrata Open Data Portal Meta-Data with Series of API Calls
Socrata's Discovery API allows a user to access ample metadata and usage data for datasets from Socrata portals. However, the API does have some limitations:
- The API itself is designed for queried searches, not returning all data about every Socrata open data portal. As such, you have to embrace some workarounds to truly access all available data.
- Without changing the 'limit' parameter, the API only returns 100 datasets for a given search parameter (the datasets returned *do seem to be sorted by total views*)

Additionally, trying to return more than 600 datasets at a time is very slow (on this machine).

** To work around these limitations, we will create a list of URLs for city/state open government Socrata open data portal. We will iterate through this list, returning the top 600 pages of data for each city's portal. Then we will convert each returned result to pandas-readable JSON, and convert that JSON into a dataframe. Then we will concatenate each dataframe. **

*Example API Call: http://api.us.socrata.com/api/catalog/v1?domains=data.seattle.gov&limit=600*

### First, get a list of all Socrata open data portals (for governments in the U.S.)
We will use an API call to get this list, which we will use for more API calls.

In [3]:
url_request = requests.get('http://api.us.socrata.com/api/catalog/v1/domains') 
#this is an API call of basically only domain URLs

url_json = url_request.json() #turn this requests.get object into a JSON-style dict
url_df = pd.io.json.json_normalize(url_json, record_path = 'results')

In [4]:
url_df.head(4)

Unnamed: 0,count,domain
0,4,2014bonds.cityofws.org
1,69,amopen.amo.on.ca
2,84,bchi.bigcitieshealth.org
3,3,bea.data.commerce.gov


In [5]:
urls = url_df.domain

** We have some .ca, .mx, etc. URLs that clearly aren't portals in the U.S. Let's remove them: **

In [6]:
city_urls = urls[urls.str.contains('^[a-zA-Z0-9\-\.]+\.(gov|org|com|us)$')] #use regex to match only certain URLs
city_urls = city_urls.reset_index(drop=True)
city_urls.tail()

  """Entry point for launching an IPython kernel.


214              www.forsythfutures.org
215    www.mdchildhungerpartnership.org
216            www.metrochicagodata.org
217                 www.opendatanyc.com
218                   www.sudandata.org
Name: domain, dtype: object

In [7]:
city_urls.head(10)

0          2014bonds.cityofws.org
1        bchi.bigcitieshealth.org
2           bea.data.commerce.gov
3           bis.data.commerce.gov
4    brigades.opendatanetwork.com
5         bythenumbers.sco.ca.gov
6        census.data.commerce.gov
7                chhs.data.ca.gov
8             chronicdata.cdc.gov
9         churned-data.awcnet.org
Name: domain, dtype: object

** As we can see, we have to manually datasets that the human eye can tell aren't related to U.S. cities or states. These include federal and non-profit open data. We could build a complex regex to do this, but there's only 220 or so datasets total. **

(We will KEEP data from state portals)

** CAUTION -- Must confirm these are the right index numbers, as the data returned by the API changes (obviously) **


In [8]:
city_urls[0:50]

0              2014bonds.cityofws.org
1            bchi.bigcitieshealth.org
2               bea.data.commerce.gov
3               bis.data.commerce.gov
4        brigades.opendatanetwork.com
5             bythenumbers.sco.ca.gov
6            census.data.commerce.gov
7                    chhs.data.ca.gov
8                 chronicdata.cdc.gov
9             churned-data.awcnet.org
10                 cip.cityofnovi.org
11          controllerdata.lacity.org
12         dashboard.alexandriava.gov
13               dashboard.hawaii.gov
14                dashboard.plano.gov
15                 dashboard.slco.org
16                     data.acgov.org
17                  data.albanyny.gov
18                  data.auburnwa.gov
19               data.austintexas.gov
20             data.baltimorecity.gov
21                      data.brla.gov
22              data.burlingtonvt.gov
23               data.cambridgema.gov
24                       data.cdc.gov
25              data.chattlibrary.org
26          

In [9]:
#went through df with human eye line by line
clean_urls = city_urls.drop(city_urls.index[[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 24, 25, 36, 40, 41, 47, 48, 52, 53, 54, 
                                             56, 57, 58, 66, 71, 79, 83, 84, 91, 94, 96, 97, 99, 103, 104, 108, 109, 
                                             110, 121, 128, 133, 140, 144, 146, 147, 148, 152, 153, 157, 159, 161, 
                                             162, 163, 164, 165, 171, 176, 199, 200, 201, 203, 204, 205, 209, 212, 
                                             214, 215, 218, 37, 39]]) #forgot two outlier datasets here

clean_urls = clean_urls[~clean_urls.str.contains('^[a-zA-Z0-9\-\.]+\.(cms.gov)$')] #make sure all CMS are gone

  


In [10]:
clean_urls[0:50]

0            2014bonds.cityofws.org
10               cip.cityofnovi.org
12       dashboard.alexandriava.gov
13             dashboard.hawaii.gov
14              dashboard.plano.gov
15               dashboard.slco.org
16                   data.acgov.org
17                data.albanyny.gov
18                data.auburnwa.gov
19             data.austintexas.gov
20           data.baltimorecity.gov
21                    data.brla.gov
22            data.burlingtonvt.gov
23             data.cambridgema.gov
26           data.cincinnati-oh.gov
27            data.cityofboston.gov
28           data.cityofchicago.org
29          data.cityofevanston.org
30       data.cityofgainesville.org
31                data.cityofgp.com
32           data.cityofmadison.com
33            data.cityofnewyork.us
34            data.cityoftacoma.org
35    data.cityofwestsacramento.org
38                data.colorado.gov
42        data.countyofriverside.us
43                    data.cstx.gov
44                      data

In [11]:
clean_urls = clean_urls.reset_index(drop=True)
clean_urls.index

RangeIndex(start=0, stop=149, step=1)

### Create a function to return JSON-formatted dicts of usage and meta-data for each URL's datasets.

In [12]:
#a function that takes a pd.Series of Socrata open data portal URLs and returns data for EACH in list of JSON-format dicts
def get_json(urls): #urls arg should be a Series of open data portal URLs
    json_dicts = []
    for url in urls:
        request = requests.get('http://api.us.socrata.com/api/catalog/v1?domains={}&limit=600'.format(url))
        raw_data = request.json()
        json_dicts.append(raw_data)
    return json_dicts

Get JSON Dicts: (** One Minute+ Run Time Alert **)

In [13]:
json_dicts = get_json(clean_urls)

In [14]:
len(json_dicts) #we have JSON data from 164 cities/states

149

This is a hulking list of JSON-style dictionaries. Let's see if pandas can handle making them into dfs:

In [15]:
def json_to_df(json_dicts):
    final_df = pd.DataFrame()
    for d in json_dicts:
        df = pd.io.json.json_normalize(d, record_path = 'results')
        final_df = final_df.append(df)
    return final_df

In [16]:
data = json_to_df(json_dicts)

In [17]:
data.head(5)

Unnamed: 0,classification,link,metadata,owner,permalink,preview_image_url,resource
0,"{'categories': [], 'tags': [], 'domain_categor...",https://2014bonds.cityofws.org/dataset/Bonds-P...,{'domain': '2014bonds.cityofws.org'},"{'id': '3xde-ipm7', 'display_name': 'mjmartin'}",https://2014bonds.cityofws.org/d/9csq-7i9t,,"{'name': 'Bonds Project Dataset (Official)', '..."
1,"{'categories': [], 'tags': [], 'domain_tags': ...",https://2014bonds.cityofws.org/dataset/Bonds-A...,{'domain': '2014bonds.cityofws.org'},"{'id': 'd2yk-enzf', 'display_name': 'Christian...",https://2014bonds.cityofws.org/d/xwab-9myw,,"{'nbe_fxf': None, 'description': '', 'type': '..."
2,"{'categories': [], 'tags': [], 'domain_tags': ...",https://2014bonds.cityofws.org/dataset/Wards-2...,{'domain': '2014bonds.cityofws.org'},"{'id': 'vkxb-3xjh', 'display_name': 'James Chu...",https://2014bonds.cityofws.org/d/963m-jiy8,https://2014bonds.cityofws.org/views/963m-jiy8...,"{'nbe_fxf': None, 'description': '', 'type': '..."
3,"{'categories': [], 'tags': [], 'domain_tags': ...",https://2014bonds.cityofws.org/dataset/Project...,{'domain': '2014bonds.cityofws.org'},"{'id': '3xde-ipm7', 'display_name': 'mjmartin'}",https://2014bonds.cityofws.org/d/hz5n-bkfx,,"{'nbe_fxf': None, 'description': '', 'type': '..."
0,"{'categories': [], 'tags': [], 'domain_categor...",https://cip.cityofnovi.org/dataset/Capital-Pro...,{'domain': 'cip.cityofnovi.org'},"{'id': 'nc6z-mbfi', 'display_name': 'Jessica'}",https://cip.cityofnovi.org/d/7nqp-ya57,,"{'name': 'Capital Project Dataset', 'id': '7nq..."


In [18]:
data.shape #we have data on 35,000+ datasets

(33452, 7)

**Ok, now we just need to unpack the dicts in these columns and clean up our df **.

In [19]:
data = data.reset_index(drop=True)
data.head(10)

Unnamed: 0,classification,link,metadata,owner,permalink,preview_image_url,resource
0,"{'categories': [], 'tags': [], 'domain_categor...",https://2014bonds.cityofws.org/dataset/Bonds-P...,{'domain': '2014bonds.cityofws.org'},"{'id': '3xde-ipm7', 'display_name': 'mjmartin'}",https://2014bonds.cityofws.org/d/9csq-7i9t,,"{'name': 'Bonds Project Dataset (Official)', '..."
1,"{'categories': [], 'tags': [], 'domain_tags': ...",https://2014bonds.cityofws.org/dataset/Bonds-A...,{'domain': '2014bonds.cityofws.org'},"{'id': 'd2yk-enzf', 'display_name': 'Christian...",https://2014bonds.cityofws.org/d/xwab-9myw,,"{'nbe_fxf': None, 'description': '', 'type': '..."
2,"{'categories': [], 'tags': [], 'domain_tags': ...",https://2014bonds.cityofws.org/dataset/Wards-2...,{'domain': '2014bonds.cityofws.org'},"{'id': 'vkxb-3xjh', 'display_name': 'James Chu...",https://2014bonds.cityofws.org/d/963m-jiy8,https://2014bonds.cityofws.org/views/963m-jiy8...,"{'nbe_fxf': None, 'description': '', 'type': '..."
3,"{'categories': [], 'tags': [], 'domain_tags': ...",https://2014bonds.cityofws.org/dataset/Project...,{'domain': '2014bonds.cityofws.org'},"{'id': '3xde-ipm7', 'display_name': 'mjmartin'}",https://2014bonds.cityofws.org/d/hz5n-bkfx,,"{'nbe_fxf': None, 'description': '', 'type': '..."
4,"{'categories': [], 'tags': [], 'domain_categor...",https://cip.cityofnovi.org/dataset/Capital-Pro...,{'domain': 'cip.cityofnovi.org'},"{'id': 'nc6z-mbfi', 'display_name': 'Jessica'}",https://cip.cityofnovi.org/d/7nqp-ya57,,"{'name': 'Capital Project Dataset', 'id': '7nq..."
5,"{'categories': [], 'tags': [], 'domain_tags': ...",https://cip.cityofnovi.org/dataset/Capital-Pro...,{'domain': 'cip.cityofnovi.org'},"{'id': 'vkxb-3xjh', 'display_name': 'James Chu...",https://cip.cityofnovi.org/d/kuue-gg3b,,"{'nbe_fxf': None, 'description': '', 'type': '..."
6,"{'categories': [], 'tags': [], 'domain_categor...",https://cip.cityofnovi.org/dataset/Funding-Sou...,{'domain': 'cip.cityofnovi.org'},"{'id': 'trij-xrnq', 'display_name': 'Meredith ...",https://cip.cityofnovi.org/d/5h3i-jdcp,,"{'name': 'Funding Sources (Pie Chart)', 'id': ..."
7,"{'categories': [], 'tags': [], 'domain_categor...",https://cip.cityofnovi.org/dataset/Funding-Sou...,{'domain': 'cip.cityofnovi.org'},"{'id': 'trij-xrnq', 'display_name': 'Meredith ...",https://cip.cityofnovi.org/d/uuui-htp8,,"{'name': 'Funding Sources', 'id': 'uuui-htp8',..."
8,"{'categories': [], 'tags': [], 'domain_tags': ...",https://cip.cityofnovi.org/dataset/City-Bounda...,{'domain': 'cip.cityofnovi.org'},"{'id': 'trij-xrnq', 'display_name': 'Meredith ...",https://cip.cityofnovi.org/d/9fp5-7dsy,https://cip.cityofnovi.org/views/9fp5-7dsy/fil...,"{'description': '', 'type': 'map', 'download_c..."
9,"{'categories': [], 'tags': [], 'domain_tags': ...",https://cip.cityofnovi.org/dataset/Projects/cs...,{'domain': 'cip.cityofnovi.org'},"{'id': 'trij-xrnq', 'display_name': 'Meredith ...",https://cip.cityofnovi.org/d/cspu-rwqv,https://cip.cityofnovi.org/views/cspu-rwqv/fil...,"{'description': '', 'type': 'map', 'download_c..."


## ++++ Computationally Intensive!! ++++

In [20]:
#get each column's dict as separate df
classification = data.classification.apply(pd.Series)
metadata = data.metadata.apply(pd.Series)
resource = data.resource.apply(pd.Series)

#mash_data = pd.concat([data, classification, metadata, resource])

In [21]:
resource.head(10)

Unnamed: 0,attribution,columns_datatype,columns_description,columns_field_name,columns_name,createdAt,description,download_count,id,name,nbe_fxf,obe_fxf,page_views,parent_fxf,provenance,type,updatedAt,view_count
0,,"[location, text, photo, text, text, text, text...","[, , , , , , , , , , , , , , , ]","[location, icon, project_image, category_id, l...","[Automated Geocoding, Icon, Project Image, Cat...",2015-02-12T19:14:36.000Z,,15.0,9csq-7i9t,Bonds Project Dataset (Official),,,"{'page_views_last_week': 8, 'page_views_last_m...",,official,dataset,2017-08-11T19:53:36.000Z,"{'page_views_last_week': 8, 'page_views_last_m..."
1,,"[text, text, text]","[, , ]","[icon, name, id]","[Icon, name, id]",2015-02-05T18:41:29.000Z,,7.0,xwab-9myw,Bonds Application - Reference Table,,,"{'page_views_total': 66, 'page_views_total_log...",,official,dataset,2015-04-17T15:30:47.000Z,"{'page_views_total': 66, 'page_views_total_log..."
2,,[],[],[],[],2015-02-09T20:48:39.000Z,,7.0,963m-jiy8,Wards 2011.shp,,,"{'page_views_total': 23, 'page_views_total_log...",,official,map,2016-08-30T22:00:21.000Z,"{'page_views_total': 23, 'page_views_total_log..."
3,,"[text, text, text, text, text, number, text, t...","[, , , , , , , , , , , , , ]","[city, location_id, project_id, project_name, ...","[City, Location Id, Project Id, Project Name, ...",2015-04-15T05:15:46.000Z,,3.0,hz5n-bkfx,Project Dataset (Staging),,,"{'page_views_total': 15, 'page_views_total_log...",,official,dataset,2015-04-15T05:20:19.000Z,"{'page_views_total': 15, 'page_views_total_log..."
4,,"[text, text, text, text, text, text, text, tex...","[, , , , , , , , , Please be sure to include ""...","[document_10, document_9, document_8, document...","[Document 10, Document 9, Document 8, Document...",2015-08-28T12:03:36.000Z,,25.0,7nqp-ya57,Capital Project Dataset,,,"{'page_views_last_week': 0, 'page_views_last_m...",,official,dataset,2017-07-27T14:24:01.000Z,"{'page_views_last_week': 0, 'page_views_last_m..."
5,,"[text, text, text]","[, , ]","[id, name, icon]","[id, name, Icon]",2015-08-28T12:01:45.000Z,,22.0,kuue-gg3b,Capital Project Application - Reference Table,,,"{'page_views_total': 63, 'page_views_total_log...",,official,dataset,2015-10-02T23:07:35.000Z,"{'page_views_total': 63, 'page_views_total_log..."
6,,"[text, money]","[, ]","[funding_source, budget]","[Funding Source, Budget]",2016-08-09T19:40:33.000Z,,32.0,5h3i-jdcp,Funding Sources (Pie Chart),,,"{'page_views_last_week': 2, 'page_views_last_m...","[7nqp-ya57, kf75-36tp]",official,chart,2017-07-27T14:23:39.000Z,"{'page_views_last_week': 2, 'page_views_last_m..."
7,,"[text, money]","[, ]","[funding_source, budget]","[Funding Source, Budget]",2016-07-13T16:52:30.000Z,,21.0,uuui-htp8,Funding Sources,,,"{'page_views_last_week': 4, 'page_views_last_m...","[7nqp-ya57, kf75-36tp]",official,filter,2017-07-27T14:23:39.000Z,"{'page_views_last_week': 4, 'page_views_last_m..."
8,,[],[],[],[],2016-10-07T23:19:08.000Z,,0.0,9fp5-7dsy,City Boundary,,,"{'page_views_total': 27, 'page_views_total_log...",,official,map,2016-10-07T23:20:31.000Z,"{'page_views_total': 27, 'page_views_total_log..."
9,,[],[],[],[],2016-10-07T23:22:05.000Z,,0.0,cspu-rwqv,Projects,,,"{'page_views_total': 20, 'page_views_total_log...",,official,map,2016-10-07T23:25:10.000Z,"{'page_views_total': 20, 'page_views_total_log..."


In [22]:
#unpack views:
views = resource.view_count.apply(pd.Series)
views.head(10)

Unnamed: 0,page_views_last_month,page_views_last_month_log,page_views_last_week,page_views_last_week_log,page_views_total,page_views_total_log
0,34.0,5.129,8.0,3.17,464.0,8.861
1,1.0,1.0,0.0,0.0,66.0,6.066
2,0.0,0.0,0.0,0.0,23.0,4.585
3,0.0,0.0,0.0,0.0,15.0,4.0
4,4.0,2.322,0.0,0.0,393.0,8.622
5,0.0,0.0,0.0,0.0,63.0,6.0
6,6.0,2.807,2.0,1.585,62.0,5.977
7,10.0,3.459,4.0,2.322,59.0,5.907
8,4.0,2.322,4.0,2.322,27.0,4.807
9,1.0,1.0,1.0,1.0,20.0,4.392


Concatenate all these dfs into one:

In [23]:
data_mash = pd.concat([resource, classification, views, metadata], axis=1)
data_mash[15:25]

Unnamed: 0,attribution,columns_datatype,columns_description,columns_field_name,columns_name,createdAt,description,download_count,id,name,...,domain_tags,tags,page_views_last_month,page_views_last_month_log,page_views_last_week,page_views_last_week_log,page_views_total,page_views_total_log,domain,license
15,,"[percent, number, number, calendar_date]","[, , , ]",[percent_of_valid_parking_meter_problem_servic...,[Percent of valid parking meter problem servic...,2016-02-22T20:36:45.000Z,,17.0,gnby-xhxz,Parking Complaints Bar Chart,...,[],[],17.0,4.17,4.0,2.322,274.0,8.103,dashboard.alexandriava.gov,
16,,"[calendar_date, number]","[, ]","[fiscal_year_originated, count]","[Fiscal Year Originated, Count]",2016-08-09T20:05:43.000Z,,10.0,mzr8-ks94,Home Buyer Loans Graph,...,[],[],18.0,4.248,6.0,2.807,272.0,8.093,dashboard.alexandriava.gov,
17,,"[number, number, percent, calendar_date]","[, , , ]","[potholes_filled, lane_miles_swept, percent_of...","[Potholes Filled, Lane Miles Repaved, Percent ...",2015-10-09T16:19:29.000Z,,7.0,5kha-4jsz,Potholes Filled by FY (Bar Chart),...,[],[],38.0,5.285,3.0,2.0,269.0,8.077,dashboard.alexandriava.gov,
18,,"[number, text]","[, ]","[number_of_units, fiscal_year]","[NUMBER OF UNITS, Fiscal Year]",2015-09-04T11:17:55.000Z,,5.0,ikzj-e4dt,Lane Miles Repaved,...,[],[],28.0,4.858,5.0,2.585,264.0,8.05,dashboard.alexandriava.gov,
19,,"[number, number, text]","[, , ]","[lipos_admissions_per_100k, nvmhi_admission_pe...","[LIPOS Admissions per 100K, NVMHI Admission pe...",2016-06-27T20:10:49.000Z,,5.0,e5v5-i25b,NVMHI Admissions,...,"[delete, dchs]",[],38.0,5.285,5.0,2.585,263.0,8.044,dashboard.alexandriava.gov,
20,,"[percent, calendar_date, number, percent, number]","[, , , , ]","[diversion_rate, calendar_year, missed_collect...","[Diversion rate, Calendar Year, Missed Collect...",2015-09-04T13:38:06.000Z,,6.0,ayk9-7672,Tons Recycling Dropoff Centers,...,[],[],15.0,4.0,2.0,1.585,246.0,7.948,dashboard.alexandriava.gov,
21,,"[number, percent, calendar_date, number, number]","[, , , , ]","[number_of_property_owners_trained, percent_of...","[Number of property owners trained, Percent of...",2015-10-29T19:42:42.000Z,Office of Housing Data,11.0,9vuk-i6y2,Property Owners Trainined,...,[housing],[],30.0,4.954,4.0,2.322,246.0,7.948,dashboard.alexandriava.gov,
22,,"[percent, text, text, percent, text, calendar_...","[, , , , , , , , , , , , ]","[problem, protected_class_group, action, prob_...","[Percent Problem, Protected Class Group, Actio...",2016-01-13T17:58:05.000Z,,17.0,mfwu-24pf,Fair Housing Sites Tested Per Year,...,"[fair housing, housing]",[],20.0,4.392,3.0,2.0,237.0,7.895,dashboard.alexandriava.gov,
23,,"[number, calendar_date]","[, ]","[count, fyear]","[Count, FYEAR]",2016-07-12T20:20:28.000Z,DCHS Rent Relief Data. Well Being and Safety f...,7.0,gr5b-bvxa,Number of Residents Assisted Through Rent Relief,...,"[well, rent relief, well-being and safety for ...",[],26.0,4.755,3.0,2.0,233.0,7.87,dashboard.alexandriava.gov,
24,,"[calendar_date, number, text]","[, , ]","[season, pts, outcome]","[Season, PTS, Outcome]",2015-07-24T18:52:32.000Z,"Lakers game outcomes and Kobe box scores, 1996...",3.0,22w3-jbsg,"Kobe average points in wins and losses, by season",...,[],[],31.0,5.0,0.0,0.0,226.0,7.827,dashboard.alexandriava.gov,


In [24]:
data_mash.columns

Index(['attribution', 'columns_datatype', 'columns_description',
       'columns_field_name', 'columns_name', 'createdAt', 'description',
       'download_count', 'id', 'name', 'nbe_fxf', 'obe_fxf', 'page_views',
       'parent_fxf', 'provenance', 'type', 'updatedAt', 'view_count',
       'categories', 'domain_category', 'domain_metadata', 'domain_tags',
       'tags', 'page_views_last_month', 'page_views_last_month_log',
       'page_views_last_week', 'page_views_last_week_log', 'page_views_total',
       'page_views_total_log', 'domain', 'license'],
      dtype='object')

In [25]:
cols_to_keep = ['name', 'description', 'attribution', 'columns_field_name', 'columns_name', 'type', 'categories', 
                'domain_category','domain_tags', 'provenance', 'download_count','page_views_last_month',
                'page_views_last_week','page_views_total', 
                'page_views_total_log', 'domain']
cities = data_mash[cols_to_keep]
cities[25:35]

Unnamed: 0,name,description,attribution,columns_field_name,columns_name,type,categories,domain_category,domain_tags,provenance,download_count,page_views_last_month,page_views_last_week,page_views_total,page_views_total_log,domain
25,Resident Survey: Air Quality,For the relevant summary reports and technical...,,"[of_total_weight, answergroupstep1_group, comp...","[Response Percent (weighted), Answer, Comparis...",chart,[finance],,"[resident survey, city strategic plan]",official,4.0,60.0,8.0,220.0,7.788,dashboard.alexandriava.gov
26,Rental Unit Gain and Loss Graph,,,"[fiscal_year, units_lost, new_units_added]","[Fiscal Year, Rental Units Lost, New Rental Un...",chart,[],,"[assisted rental, housing]",official,14.0,6.0,1.0,217.0,7.768,dashboard.alexandriava.gov
27,Education Level- Filter,Worfkforce Development Center Education Ranges,,"[fystartdate, count, geom, fiscalyear, fyendda...","[FYStartDate, count, geom, FiscalYear, FYEndDa...",chart,[],,"[education level, increase employment, workfor...",official,5.0,22.0,1.0,216.0,7.762,dashboard.alexandriava.gov
28,"Issue types, as a percent of all types",,,"[fiscal_year, percent_unsubstantiated, percent...","[Fiscal Year, Percent Unsubstantiated, Percent...",chart,[],,[],official,5.0,31.0,1.0,214.0,7.748,dashboard.alexandriava.gov
29,Resident Survey: Ease of Travel by Bike,For the relevant summary reports and technical...,,"[of_total_weight, answergroupstep1_group, comp...","[Response Percent (weighted), Answer, Comparis...",chart,[finance],,"[resident survey, city strategic plan]",official,2.0,61.0,12.0,214.0,7.748,dashboard.alexandriava.gov
30,Pothole Service Requests by Fiscal Year,,,"[fiscal_year, count]","[Fiscal Year, Count]",chart,[],,[],official,2.0,34.0,10.0,209.0,7.714,dashboard.alexandriava.gov
31,Signal Services Requests Column Graph,,,[percent_of_signals_receiving_preventative_mai...,[Percent of signals receiving preventative mai...,chart,[],,[],official,1.0,30.0,11.0,207.0,7.7,dashboard.alexandriava.gov
32,Local GDP 2016,,,"[number_of_accounts, business_license_class_na...","[Number of Accounts, Business License Class Na...",chart,[finance],,"[local gross domestic product, city strategic ...",official,4.0,39.0,6.0,207.0,7.7,dashboard.alexandriava.gov
33,Number of Youth Counseled through the Workforc...,"Work Force Development Center, Youth Participants",,"[fytarget, fyenddate, geom, count, fystartdate...","[FYTarget, FYEndDate, geom, count, FYStartDate...",chart,[politics],,"[youth work experience, work experience, workf...",official,4.0,44.0,4.0,206.0,7.693,dashboard.alexandriava.gov
34,DCHS_BH_Opioid Use,Drug use report from Alaina,,"[primary_opiates_category_combined, fiscal_yea...","[Opiates Category Combined, Fiscal Year, Perce...",chart,[],,"[behavioral health, detox, dchs]",official,4.0,9.0,0.0,204.0,7.679,dashboard.alexandriava.gov


In [26]:
cities.shape

(33452, 16)

## EDA/Data Cleaning
We are going to cluster these datasets by content and use those clusters for quantitative analysis. However, we first need to see what we're dealing with and drop and NaN data for views. 

** See how many NaN view counts there are - we'll want to drop these **

In [27]:
NAs = cities[cities.page_views_total.isnull()] 
len(NAs.index)

0

In [28]:
type(cities.iloc[0, 10])

numpy.float64

Our page views column are floats, not lists (which could potentially be empty, as in other columns), so we're hopefully getting the true number of NaNs (0). This would square with what we expect - unless something went seriously wrong, Socrata's API will return view data for every dataset. However, not every city/state tags their datasets with categories/tags -- hence, some are missing.

In [29]:
type(cities.attribution[0])

NoneType

Looks like the "attribution" attribute has many NoneTypes. That's ok - we don't want to use attribution for clustering, as it's too specific - we want to be able to cluster similar datasets across cities, whereas including attribution - e.g. "Dallas Police Department" -- will just skew our clusters to clustering within a given city.

### Now we need to choose what attributes we'll use to create a text 'mash' from which we will use natural language processing and clustering tools. 

We need to choose carefully - as noted with attribution, the model we will build to do this is not a genius. It basically comes down to word appearance frequency is giant vectors, and how close these vectors are to each other in mathematical space. So, we want to include text attributes that won't skew how a dataset is classified.

## Create "Mash" from categories and tags:

** This actually takes a fair amount of cleaning **

In [30]:
mash_df = cities.copy()

#domain category is tricky because it contains both NaNs and NoneTypes. We'll replace both with a nothing str
mash_df['domain_category'] = mash_df['domain_category'].replace([None], '') 

#combine domain_tags and categories lists of strings columns - easy enough
mash_df['mash'] = mash_df.domain_tags + mash_df.categories 

#CONVERT lists of strings into actual string
mash_df['mash'] = mash_df['mash'].apply(lambda x: ','.join(map(str, x)))

#now add domain_category string to our mash column
mash_df['mash'] = mash_df.mash.str.cat(mash_df.domain_category, sep=' ') #this adds blank space if domain_category blank

#NOW our mash column is real-life words separated by commas and spaces. we need to split them
mash_df.mash = mash_df.mash.str.strip() #remove trailing whitespace
mash_df.mash = mash_df.mash.str.replace(' ', ",") #now replace all whitespaces with a comma to make splitting easier
mash_df.mash = mash_df.mash.str.lower() #we need strings to all be lowercase for vectorizing purposes later
mash_df.mash = mash_df.mash.str.split(',') #split big string on comma into actual words

In [31]:
mash_df.iloc[433].mash

['environmental', 'protection']

In [32]:
mash_df[430:435]

Unnamed: 0,name,description,attribution,columns_field_name,columns_name,type,categories,domain_category,domain_tags,provenance,download_count,page_views_last_month,page_views_last_week,page_views_total,page_views_total_log,domain,mash
430,Children Who Are Confirmed By Child Protective...,,http://datacenter.kidscount.org/,"[location, timeframe, dataformat, date_time, g...","[Location, TimeFrame, DataFormat, Date Time, G...",dataset,[],,[],official,23.0,0.0,0.0,304.0,8.253,dashboard.hawaii.gov,[]
431,AABD Client Age - June 2013,,,"[pctn, n, characteristics]","[PctN, N, Characteristics]",chart,[],,[],official,14.0,33.0,0.0,304.0,8.253,dashboard.hawaii.gov,[]
432,Reading Proficiency Changes over Time Line Chart,,Socrata,"[year_text, year_date, reading_proficiency_mee...","[Year, Date Time, Meets, Exceeds, Approaches, ...",chart,[education],,[],official,21.0,25.0,1.0,302.0,8.243,dashboard.hawaii.gov,[education]
433,Wastewater Percentage Reused,,,"[percentage_reused, wastewater_reused_mgd, tot...","[Percentage Reused, Wastewater Reused (MGD), T...",chart,[],Environmental Protection,[],official,26.0,31.0,7.0,302.0,8.243,dashboard.hawaii.gov,"[environmental, protection]"
434,TImeLIne of Class Size,This data reflects average class sizes from 20...,,"[year_string, class_size, year]","[Year String, Class Size, Year]",chart,[],,[class size],official,22.0,4.0,0.0,301.0,8.238,dashboard.hawaii.gov,"[class, size]"


## Data Cleaning
- Drop type "filter", which essentially counts datasets twice
- Drop any blank "mash" value - can't analyze data we don't have
- Get counts of the data we deleted

In [33]:
#before drop
len(mash_df.index)

33452

In [34]:
cleaning_df = mash_df.copy()

#drop filtered views
cleaning_df = cleaning_df[cleaning_df.type != 'filter']

print("Filtered Views Dropped: {}".format(len(mash_df.index) - len(cleaning_df.index)))

Filtered Views Dropped: 6732


In [35]:
cleaning_df.provenance.value_counts() #check what's here; we want only official

official     25615
community     1105
Name: provenance, dtype: int64

In [36]:
cleaning_df.provenance.isnull().sum() #no NaNs

0

In [37]:
cleaning_df = cleaning_df[cleaning_df.provenance == 'official'] #make sure we have only gov datasets here

In [38]:
len(cleaning_df.index)

25615

In [39]:
cleaning_df.mash

0                                                       []
1                                                       []
2                                                       []
3                                                       []
4                                                       []
5                                                       []
6                                                       []
8                                                       []
9                                                       []
10                                                      []
11                                                      []
12                                                      []
13               [point, in, time, homelessness, ss, dchs]
14                                               [housing]
15                                        [transportation]
16                                                      []
17                                                      

In [40]:
cleaning_df.head()

Unnamed: 0,name,description,attribution,columns_field_name,columns_name,type,categories,domain_category,domain_tags,provenance,download_count,page_views_last_month,page_views_last_week,page_views_total,page_views_total_log,domain,mash
0,Bonds Project Dataset (Official),,,"[location, icon, project_image, category_id, l...","[Automated Geocoding, Icon, Project Image, Cat...",dataset,[],,[],official,15.0,34.0,8.0,464.0,8.861,2014bonds.cityofws.org,[]
1,Bonds Application - Reference Table,,,"[icon, name, id]","[Icon, name, id]",dataset,[],,[],official,7.0,1.0,0.0,66.0,6.066,2014bonds.cityofws.org,[]
2,Wards 2011.shp,,,[],[],map,[],,[],official,7.0,0.0,0.0,23.0,4.585,2014bonds.cityofws.org,[]
3,Project Dataset (Staging),,,"[city, location_id, project_id, project_name, ...","[City, Location Id, Project Id, Project Name, ...",dataset,[],,[],official,3.0,0.0,0.0,15.0,4.0,2014bonds.cityofws.org,[]
4,Capital Project Dataset,,,"[document_10, document_9, document_8, document...","[Document 10, Document 9, Document 8, Document...",dataset,[],,[],official,25.0,4.0,0.0,393.0,8.622,cip.cityofnovi.org,[]


### Remove all blank lists:

In [41]:
df = cleaning_df.copy()
df = df[df.astype(str).mash != "['']"] #drops all blank lists
df = df.reset_index(drop=True)

In [42]:
big_mash = df.copy()
print("Number of Records in big_mash df: {}".format(len(big_mash.index)))

Number of Records in big_mash df: 21793


** Clean up description string and get it into a tokenized list that we can easily add to the existing mash list of tokens: **

In [43]:
big_mash.description = big_mash.description.str.replace(',', ' ') #replace natural commas with white space
big_mash.description = big_mash.description.str.replace('-', ' ') 
big_mash.description = big_mash.description.str.replace('.', ' ')
big_mash.description = big_mash.description.str.replace('&', ' ') 
big_mash.description = big_mash.description.str.replace(':', ' ')
big_mash.description = big_mash.description.str.split()

In [44]:
big_mash['big_mash'] = big_mash.mash + big_mash.description 

** Now we must reset our index or our later topic percentage comparisons won't match! **

In [45]:
big_mash = big_mash.reset_index(drop=True)
big_mash.head()

Unnamed: 0,name,description,attribution,columns_field_name,columns_name,type,categories,domain_category,domain_tags,provenance,download_count,page_views_last_month,page_views_last_week,page_views_total,page_views_total_log,domain,mash,big_mash
0,Homelessness PIT Transitional Age Youth,[],,[location_on_the_night_of_the_count_total_pers...,"[Location on the night of the count, Total Per...",chart,[],,"[point in time, homelessness, ss, dchs]",official,4.0,46.0,5.0,319.0,8.322,dashboard.alexandriava.gov,"[point, in, time, homelessness, ss, dchs]","[point, in, time, homelessness, ss, dchs]"
1,Fair Housing Complaints,[],,"[violations, percent_found_to_be_compliant, si...","[Number of complaints, Percent of sites found ...",chart,[],,[housing],official,21.0,31.0,1.0,278.0,8.124,dashboard.alexandriava.gov,[housing],[housing]
2,Parking Complaints Bar Chart,[],,[percent_of_valid_parking_meter_problem_servic...,[Percent of valid parking meter problem servic...,chart,[transportation],,[],official,17.0,17.0,4.0,274.0,8.103,dashboard.alexandriava.gov,[transportation],[transportation]
3,NVMHI Admissions,[],,"[lipos_admissions_per_100k, nvmhi_admission_pe...","[LIPOS Admissions per 100K, NVMHI Admission pe...",chart,[],,"[delete, dchs]",official,5.0,38.0,5.0,263.0,8.044,dashboard.alexandriava.gov,"[delete, dchs]","[delete, dchs]"
4,Property Owners Trainined,"[Office, of, Housing, Data]",,"[number_of_property_owners_trained, percent_of...","[Number of property owners trained, Percent of...",chart,[],,[housing],official,11.0,30.0,4.0,246.0,7.948,dashboard.alexandriava.gov,[housing],"[housing, Office, of, Housing, Data]"


In [46]:
big_mash.big_mash[12] #this looks great; do have to remember to make it all lowercase

['youth',
 'work',
 'experience',
 'work',
 'experience',
 'workforce',
 'development',
 'center',
 'dchs',
 'politics',
 'Work',
 'Force',
 'Development',
 'Center',
 'Youth',
 'Participants']

## EDA on full dataset (minus filtered views)

In [47]:
print("Number of Missing domain_category tags: {}".format(big_mash.domain_category.isnull().sum()))

Number of Missing domain_category tags: 0


In [48]:
print("Empty Mash Rows Dropped: {}".format(len(cleaning_df.index) - len(big_mash.index)))

Empty Mash Rows Dropped: 3822


In [49]:
print("Clean DataFrame Length: {}".format(len(big_mash.index)))

Clean DataFrame Length: 21793


In [50]:
def median_mash_len(df):
    lengths = []
    for ls in df.big_mash:
        lengths.append(len(ls))
    return np.median(lengths)

In [51]:
print("Median Mash Length: {}".format(median_mash_len(big_mash)))

Median Mash Length: 23.0


# NTLK Topic Analysis to find latent topics in "mash" categories
- Goal: use topic word "umbrellas" to count views/downloads by umbrella.
- Drawbacks: LDA assumes a document has multiple topics. This may be true of our mash - e.g. government and finance - but sometimes it might be just a single topic being arbitrarily split.

Resources: 
- https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html
- https://algobeans.com/2015/06/21/laymans-explanation-of-topic-modeling-with-lda-2/

## Fit an LDA Model
- This is very computationally intensive.
- We pick the number of topics that we want the algorithm to find. This is a very important parameter.
- LDA Model fitting is an iterative process. The algorithm starts out by assigning every word to a temporary topic. Then, for *every* word, it updates the topics by calculating:
    - How prevalent is that word across topics? Topics with a high prevalance of the word in question get a higher weight for that word's assignment.
     - How prevalent are topics within a document? If one topic within a document is more prevalent, it gets a higher weight.
     - Based on these two criteria, LDA then updates a word's topic and document assignment.

** Therefore, the more iterations of LDA you can run, the more accurate it gets **
- Unfortunately, 40 passes takes about an hour.

In [52]:
mash = big_mash.big_mash

stop_words = stopwords.words('english') #list of stop words

stop = list(stop_words) #copy ls since we're gonna mess with it
stop.append('&') #this ampersand is giving us fits
stop.extend(['', 'data', 'dataset', 'datasets', '//data', 'http', 
             'https', 'html', 'www', '//www', "=", "gov", "gov)", "(gov"]) #add words that don't specify given subject
stop.extend("•") #special char that showed up in earlier model iterations
stop.extend("–") #ditto as above
stop.extend(np.arange(101).astype(str)) #remove common numbers 0-100
stop.extend(np.arange(1980, 2025).astype(str)) #remove common years 1979-2024

texts = [] #blank list to append to

for ls in mash:
    lowers = [word.lower() for word in ls]
    stopped_tokens = [word for word in lowers if not word in stop]
    texts.append(stopped_tokens)

In [53]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# !!! ~One hour run time warning !!!

## 50 Topics & 45 passes

In [54]:
lda_50_45 = gensim.models.ldamodel.LdaModel(corpus, num_topics=50, id2word = dictionary, 
                                         passes = 45, random_state=7)

We've (hopefully) created groups of words that tend to co-occur -- "topics".

One can assume the decimals returned for each word is the distribution of that word over the topic itself. So obviously, larger decimals are more important.

In [55]:
lda_50_45.show_topics(num_topics=50, formatted=False)


[(0,
  [('new', 0.10316757798764456),
   ('york', 0.071569777494110365),
   ('state', 0.053206212403621514),
   ('prevention', 0.032032819059224719),
   ('environment', 0.0259229189793997),
   ('waste', 0.023565802574361239),
   ('section', 0.019080124537761374),
   ('taxes', 0.018383886834403274),
   ('environmental', 0.011596182265864707),
   ('site', 0.011555240674199723)]),
 (1,
  [('food', 0.054992544288208108),
   ('cost', 0.042269310008768754),
   ('facility', 0.042127112534745534),
   ('inspection', 0.03847589835551201),
   ('inspections', 0.032707388728011114),
   ('health', 0.028980025847417542),
   ('release', 0.027962755993273639),
   ('county', 0.027773584270816991),
   ('facilities', 0.026376692147202847),
   ('weekly', 0.021455540181697839)]),
 (2,
  [('recreation', 0.087322849450364337),
   ('fire', 0.066376996472196026),
   ('emergency', 0.050154897104409624),
   ('public', 0.049353844405411552),
   ('parks', 0.043831285217989721),
   ('safety', 0.04053988618797006),
 

In [56]:
lda_45_fifty = gensim.models.ldamodel.LdaModel(corpus, num_topics=45, id2word = dictionary, 
                                         passes = 50, random_state=7)

In [57]:
lda_45_fifty.show_topics(num_topics=45, formatted=False)

[(0,
  [('health', 0.28889349052335084),
   ('care', 0.038152752914044963),
   ('county', 0.019615084197074441),
   ('community', 0.019350158353068463),
   ('home', 0.018423974405805475),
   ('medicaid', 0.016124394901775481),
   ('managed', 0.015703304770523105),
   ('system', 0.012857080046039977),
   ('state', 0.011904354997325322),
   ('medical', 0.011775910916319483)]),
 (1,
  [('chart', 0.069111335913462274),
   ('county', 0.03741621452915532),
   ('assistance', 0.034746374264368704),
   ('historic', 0.032832750195864284),
   ('income', 0.031630079455261646),
   ('arts', 0.025084058341459166),
   ('governance', 0.024586485403403292),
   ('program', 0.023034810339547393),
   ('finance', 0.017740679523431474),
   ('insurance', 0.017411275747355712)]),
 (2,
  [('census', 0.064906883684765576),
   ('demographics', 0.037242956403767423),
   ('community', 0.025265700083693006),
   ('u', 0.015312570141907301),
   ('bureau', 0.014669806766646965),
   ('township', 0.014231272416557694),
 

In [58]:
#Try 48 topics
lda_48_forty = gensim.models.ldamodel.LdaModel(corpus, num_topics=48, id2word = dictionary, 
                                         passes = 40, random_state=7)

In [59]:
lda_48_forty.show_topics(num_topics=48, formatted=False)

[(0,
  [('health', 0.34111006609046857),
   ('care', 0.042087730089601157),
   ('"about"', 0.025842829053547376),
   ('cases', 0.020041568984374279),
   ('services', 0.018166861041868612),
   ('department', 0.016766608007023197),
   ('demographics', 0.01431613410975068),
   ('medical', 0.013670961940252566),
   ('facilities', 0.013508392154239044),
   ('public', 0.012063311465395103)]),
 (1,
  [('transportation', 0.3229756258776596),
   ('transit', 0.034970034774981389),
   ('infrastructure', 0.030128117457588435),
   ('bay', 0.02689792222626438),
   ('seattle', 0.02496197177494526),
   ('routes', 0.022608948604884809),
   ('treatment', 0.021597817911408388),
   ('road', 0.020160147773203036),
   ('longitude', 0.0119131594008866),
   ('latitude', 0.011611884088580328)]),
 (2,
  [('goal', 0.074563284390782775),
   ('payments', 0.043503900545184188),
   ('endorsed', 0.038016772158128598),
   ('demographics', 0.033722379753645233),
   ('crest', 0.030642760605548446),
   ('economy', 0.0297

In [60]:
# 50 topics looks pretty damn good... try 49 and a bunch of passes just to compare.

In [61]:
lda_49_sixty = gensim.models.ldamodel.LdaModel(corpus, num_topics=49, id2word = dictionary, 
                                         passes = 60, random_state=7)

In [62]:
lda_49_sixty.show_topics(num_topics=49, formatted=False)

[(0,
  [('new', 0.19401806380593659),
   ('york', 0.080494504471462794),
   ('prevention', 0.035488842816394882),
   ('percentage', 0.027175453469101662),
   ('inspection', 0.025312274756160352),
   ('inspections', 0.024283191255110546),
   ('case', 0.01966651872670919),
   ('reported', 0.017813933632749392),
   ('cases', 0.014989217362823921),
   ('department', 0.01423899182353321)]),
 (1,
  [('environment', 0.077563792232817577),
   ('recreation', 0.068552759547399977),
   ('gas', 0.037045144234832891),
   ('parks', 0.035580036085293414),
   ('natural', 0.027961643955858952),
   ('historic', 0.021556676519912169),
   ('energy', 0.021233097162056038),
   ('park', 0.019746229255702165),
   ('demographics', 0.018809940818879642),
   ('greenhouse', 0.016907553791280894)]),
 (2,
  [('fire', 0.1134373467679646),
   ('ny', 0.063013405573689465),
   ('currently', 0.019268395851742632),
   ('important', 0.016565157256828302),
   ('forest', 0.013552797076731593),
   ('set', 0.01240071567118871

In [83]:
#lda_50_forty.save('lda_50_forty_clean_data_clusters') #only need to run this once

In [63]:
lda_52_sixty = gensim.models.ldamodel.LdaModel(corpus, num_topics=52, id2word = dictionary, 
                                         passes = 60, random_state=7)

In [65]:
lda_52_sixty.show_topics(num_topics=52, formatted=False)

[(0,
  [('prevention', 0.039228265770020823),
   ('hospital', 0.038899194905431915),
   ('ny', 0.034882850346069173),
   ('healthy', 0.032344049932466652),
   ('statewide', 0.029269391777562728),
   ('hospitals', 0.022294582283033028),
   ('inpatient', 0.021854135158083089),
   ('healthcare', 0.018663035226976647),
   ('api', 0.018299534915519441),
   ('quality-safety-costs', 0.016526700960621438)]),
 (1,
  [('chart', 0.069963659585634108),
   ('children', 0.059274004768666677),
   ('income', 0.03908399441304488),
   ('historic', 0.025093228857131812),
   ('home', 0.021852521187430106),
   ('assistance', 0.018091177706023875),
   ('homes', 0.017940795661987072),
   ('families', 0.017918354840225928),
   ('low', 0.017857714487521079),
   ('pay', 0.017610064270261648)]),
 (2,
  [('population', 0.13151520321707977),
   ('county', 0.091960434049722567),
   ('demographics', 0.084275005086619365),
   ('age', 0.0434468663431666),
   ('total', 0.028221558698175012),
   ('king', 0.0194688109256

In [66]:
lda_52_sixty.save('lda_52_sixty_good_model')

## Discussion of Results:

This is a very decent topic analysis. By my count, **we've created 48 genuinely useful "clusters" of latent topics.** Since topic analysis is probabilistic, and our data contains "noise" (that is, words either far too specific or far too common to add real meaning) to begin with, **we have a few topics that won't help us.**
- Topic 32 is just too vague. It probably has something to do with some open government performance metrics, but there are no words to really a distinguish it.  This is not a failure of the algorithm; I don't doubt these words really are occuring together alot in the data. But "open, created, items" doesn't tell us much.
- Topic 24 clearly has to do with some "special" software/gis for open data (probably having to do with nursing) -- it looks like through the openmichigan portal. Again, these words are quite likely appearing together. But the practical application of a topic like this when it comes to determinging what is popular in the real world is limited.
- Topic 42 has a similar problem to topics 32 and 42 topics, but is a little more clear for content type. It looks like political information for several years. But we can't really tell more specifics.
- Topic 43 is what I'd call a "parochial" topic; it's clearly about common information like jobs and licensing from new york and michigan. Again, not useful as its own topic - but my spin is that this helps isolate "new york" and "ny" to keep them from inflating other topic views.

**Other topics are useful, but may mix content. It could be a vagary of the English language, or it might actually reveal new insights:**
- Topic 51 is a topic that tells us something, but it appears to be a mix of youth court cases and youth college enrollment. This could be that "enrollment" is used in English in the context of college and court-mandated programs. Then again, this could be "at-risk youth outcome statistics - either court or college (or both).
- Topic 37 appears to be about the construction, permits, and financials of building projects; since these are are pretty closely related in real life, however, I'd argue this is a good topic formation.

**Many topics contain a proper name, but clearly identify something useful:**
- Topic 9 -- "recreation", "parks", "jersey", "park", "centers", "neighborhood", "centers", etc. is clearly about parks and recreation facilities. It just contains jersey. Because of how we'll award our views/downloads counts to each topic, "jersey" should only skew this slightly. Again, this is just "error" (but not really error, according to the model - jersey probably had a lot of parks and recreation datasets) we have to tolerate if we can't exclude proper names as stop words.

**Topic 13 has a seemingly weird outlier:**
- Topic 13 is clearly about gas & fuel emissions, but also contains "food". I'm guessing "food" is linked to "gas" by the word "natural" across many datasets (natural food, natural gas). This is just noise we will have to tolerate.

**Many topics are absolutely beautiful. A few examples:**
- Topic 11 -- energy, environment, electricity, air, sustainable, action, climate, city, clean, facilities -- identifies a city's environmental initiatives with words I wouldn't have even thought of to group together.
- Topic 31 -- politics, government, election, campaign, elections, commissions, results, etc -- leaves no doubt about its content.
- Topic 5 -- locations, bacteria, hours, culture, county, levels, contact, directory, e [as in e coli], contains -- is amazing. Words that could be all sorts of different topics that become so clear in context together. This is about local bacteria levels! (Presumably in lakes)

**Also to note**: I chose 50 topics after extensive trial and error. Fewer and more topics results in worse "human eye" evaluation of topics. It appears you need to provide enough topics to draw out parochial and general words into their own topics, without choosing so many topics that you stretch them too thin.

### Topic Composition of Documents

In [67]:
corpus_lda = lda_52_sixty[corpus] #this is just a wrapper; calculates on the fly when you call it

In [68]:
#view topic composition of documents
for doc in corpus_lda[10:13]:
    print(doc) 

[(36, 0.060921471085349795), (47, 0.88565972549584659)]
[(25, 0.11324786324786323), (30, 0.18654121408011587), (38, 0.22435897435897434), (40, 0.10694707536980026), (50, 0.26847752251589579)]
[(0, 0.059954751131221624), (9, 0.1073825278351072), (16, 0.35215278936410371), (31, 0.058460290192871016), (33, 0.12027274148133688), (49, 0.13209364207680635), (51, 0.11877828054298625)]


In [69]:
corpus_lda_list = list(corpus_lda) #bit of a run time here, as corpus_lda was just a wrapper; this calcs on the fly

#we're going to use this list later for a df

### Our corpus index numbers and df index numbers DO line up, as we can see below:
- corpus[] returns integer ids and frequency for each string
- dictionary.token2id shows us the id for each string so we can look'
- big_mash.big_mash[] shows that cell's strings in our df

In [85]:
corpus[500]

[(6, 1)]

In [86]:
dictionary.token2id

{'point': 0,
 'time': 1,
 'homelessness': 2,
 'ss': 3,
 'dchs': 4,
 'housing': 5,
 'transportation': 6,
 'delete': 7,
 'office': 8,
 'fair': 9,
 'well': 10,
 'rent': 11,
 'relief': 12,
 'well-being': 13,
 'safety': 14,
 'older': 15,
 'adults': 16,
 'resident': 17,
 'survey': 18,
 'city': 19,
 'strategic': 20,
 'plan': 21,
 'finance': 22,
 'relevant': 23,
 'summary': 24,
 'reports': 25,
 'technical': 26,
 'appendices': 27,
 '<a': 28,
 'href="https': 29,
 'alexandriava': 30,
 'gov/performance/default': 31,
 'aspx?id=89091">click': 32,
 'here</a>': 33,
 'assisted': 34,
 'rental': 35,
 'education': 36,
 'level': 37,
 'increase': 38,
 'employment': 39,
 'workforce': 40,
 'development': 41,
 'center': 42,
 'worfkforce': 43,
 'ranges': 44,
 'local': 45,
 'gross': 46,
 'domestic': 47,
 'product': 48,
 'youth': 49,
 'work': 50,
 'experience': 51,
 'politics': 52,
 'force': 53,
 'participants': 54,
 'behavioral': 55,
 'health': 56,
 'detox': 57,
 'drug': 58,
 'use': 59,
 'report': 60,
 'alaina':

In [88]:
big_mash.big_mash[500]

['transportation']

### Evaluating Topics Composition of Specific Dataset:

**Dataset with only one word to identify it:**

In [89]:
big_mash.big_mash[500] #random topic with one word

['transportation']

The human eye would say the topic above is about electric vehicle charging stations in Austin, and more broadly green energy and transportation.

In [90]:
#cribbed from here: http://nbviewer.jupyter.org/gist/boskaiolo/cc3e1341f59bfbd02726 
for index, score in sorted(lda_52_sixty[corpus[500]], key=lambda tup: -1*tup[1]): #500th document
    print("Score: {}\t Topic: {} \n".format(score, lda_52_sixty.print_topic(index, 15))) #15 word topics

Score: 0.5096153846153855	 Topic: 0.165*"transportation" + 0.038*"traffic" + 0.030*"street" + 0.026*"parking" + 0.024*"infrastructure" + 0.024*"city" + 0.024*"safe" + 0.023*"vehicle" + 0.022*"streets" + 0.016*"bike" + 0.013*"vehicles" + 0.012*"road" + 0.012*"motor" + 0.010*"bicycle" + 0.010*"routes" 



**Dataset with many words to identify it**

In [91]:
big_mash.big_mash[600]

['rating',
 'customer',
 'animal',
 'plano',
 'environment',
 'social',
 'services',
 'government',
 'Data',
 'on',
 'the',
 'City',
 'of',
 'Plano',
 'Animal',
 "Service's",
 'Department',
 'This',
 'data',
 'set',
 'contains',
 'information',
 'on',
 'the',
 "Department's",
 'customer',
 'feedback',
 'ratings',
 'These',
 'scores',
 'are',
 'tabulated',
 'once',
 'a',
 'quarter']

In [92]:
for index, score in sorted(lda_52_sixty[corpus[600]], key=lambda tup: -1*tup[1]): #600th document
    print("Score: {}\t Topic: {} \n".format(score, lda_52_sixty.print_topic(index, 15))) #15 word topics

Score: 0.248182024694236	 Topic: 0.275*"services" + 0.096*"social" + 0.054*"human" + 0.053*"dfps" + 0.035*"information" + 0.033*"programs" + 0.030*"us" + 0.023*"definition" + 0.022*"visit" + 0.020*"state" + 0.015*"animal" + 0.014*"support" + 0.013*"environment" + 0.012*"agenda" + 0.010*"agency" 

Score: 0.16746794871794857	 Topic: 0.061*"official" + 0.050*"account" + 0.048*"accounts" + 0.041*"(openmichigan@michigan" + 0.029*"special" + 0.029*"nursing" + 0.027*"software" + 0.024*"gis" + 0.023*"use" + 0.022*"consumer" + 0.017*"esri" + 0.016*"required" + 0.016*"portal" + 0.015*"chicago" + 0.014*"via" 

Score: 0.14285535723202014	 Topic: 0.131*"property" + 0.052*"consumption" + 0.046*"real" + 0.031*"value" + 0.025*"properties" + 0.022*"significant" + 0.022*"easy" + 0.020*"usage" + 0.020*"assessment" + 0.015*"company" + 0.015*"percentages" + 0.015*"city" + 0.014*"estate" + 0.014*"values" + 0.012*"government" 

Score: 0.10521562143486674	 Topic: 0.087*"service" + 0.035*"requests" + 0.033*"in

**Very interesting (and promising, for our model's purposes):**

As the human eye can tell, this dataset is customer ratings about the City of Plano's animal department. That is....highly specific. And yet our model's top topic for it, in terms of composition, is a catch-all human and social services category, which includes an animal tag!


** *HOWEVER, we can see some potential issues:* **

A property value topic also shows some affinity with this (I bet this topic, which has "assessment" in the context of property values, also contains words like "feedback" and "ratings" deeper in the tag. It's important to remember that these topics go deeper than the first 10 words that we've been displaying. Words that are less likely to appear in the topic "count" for less in defining that topic, but they're still there. More simply, a fiscal year topic also shows some affinity, probably due to the dataset having "government" repeated several times. **This shows the importance of an affinity cut-off (perhaps .2 or above) or only giving the topic with the most affinity credit when it comes to calculating popularity.**

## Also To Note:
- Our topics are sparse in terms of probability/composition. As mentioned above, they look very good the human eye, and can be useful, but remember that they go on much longer than 10 words, and that your top 10 words only compose like 10-15% of the topic (very approximate)
- **A significant weakness** of this model is that all topics formed are the same size; there is no way to make "clusters" (topics) of varying density. But in real life, we know we have very specific topics that really only make sense with 3-4 words, whereas broad categories (public safety) can make sense with a ton of words
- Normally, another LDA weakness is that it is "bag of words"; it doesn't take into account the placement of words in a sentence. However, since we are mainly using tags ("public safety", "health", etc), that's not much of an issue here!
- This model allows for words to be re-used in topics. This is good for our purposes - e.g. "public safety" and "public records".

## Where are there errors and uncertainty?
**Again, not every dataset is tagged thoroughy, accurately, or appropriately.** 
  - Some cities/portals just give their datasets weird names or use stock descriptions for every single category of open data.
    
**Proper names obviously skew results somewhat; a proper name doesn't really tell us about the content of a dataset.**
  - However, it's just not feasible to remove every proper name as a stopword; at least not without extensive trial and error
  - This effect is mitigated by using topic composition for calculating the percentage of views that a topic gets. "Maryland" is in a community capital projects group, but a Maryland police department mash row will only match around .05 (very approximate)- so, the erroneous boost in views is neglible.
  
**The human brain interprets our clusters at the end. It's up to us to make up "category" or "topic" names. Two LDA topics may be very similar IRL topics (we can see this with taxes/public assistance - probably because that's such a prevalent IRL category).** 
  - Then again, all of this is labeled by humans. Back to our first point, there is always room for disagreement/debate in what "subject" a dataset is about, and how narrow to make subjects.

# Calculate Popularity of Each Topic Tag:
** We have 27k+ datasets, each with their own "mash". We also have 42 topics. Each mash is composed of X% of a handful of topics (usually 3-6). We are going to give each topic proportional credit for a given dataset's popularity. So, if topic 38 composes 30% of a dataset's mash, it get's 30% of that dataset's quantitative value.**
- However, since an LDA model is probabilistic, it by nature finds topics to be 5-10% of a "document" (our mash). These topics are very marginal at best. So if a topic doesn't account for at least 10% of a document, it doesn't get any points.
    
** We also need to devise a metric for a dataset's popularity. It will be a weighted combination of that dataset's views and downloads. **
- There's no way to make this statistically exact, really. We want to give downloads more credit than just combining their raw totals with views, as a download indicates a dataset is more useful and utilized (what if people are just viewing a dataset, but do little with it?).
    
*The takeaway from all this is that our final metrics will be best read as proportional, not absolute comparisons.* 
- We are losing some interpretability; we certainly won't be able to take a proposed dataset and predict the views/downloads it will get. However, we will be able to say that certain categories are more broadly popular than others. 

In [93]:
stats = big_mash.copy()
stats = stats.assign(topic_comp = corpus_lda_list)
stats.head() #the topic_comp column are actual Python lists

Unnamed: 0,name,description,attribution,columns_field_name,columns_name,type,categories,domain_category,domain_tags,provenance,download_count,page_views_last_month,page_views_last_week,page_views_total,page_views_total_log,domain,mash,big_mash,topic_comp
0,Homelessness PIT Transitional Age Youth,[],,[location_on_the_night_of_the_count_total_pers...,"[Location on the night of the count, Total Per...",chart,[],,"[point in time, homelessness, ss, dchs]",official,4.0,46.0,5.0,319.0,8.322,dashboard.alexandriava.gov,"[point, in, time, homelessness, ss, dchs]","[point, in, time, homelessness, ss, dchs]","[(49, 0.836538461538)]"
1,Fair Housing Complaints,[],,"[violations, percent_found_to_be_compliant, si...","[Number of complaints, Percent of sites found ...",chart,[],,[housing],official,21.0,31.0,1.0,278.0,8.124,dashboard.alexandriava.gov,[housing],[housing],"[(49, 0.509615384615)]"
2,Parking Complaints Bar Chart,[],,[percent_of_valid_parking_meter_problem_servic...,[Percent of valid parking meter problem servic...,chart,[transportation],,[],official,17.0,17.0,4.0,274.0,8.103,dashboard.alexandriava.gov,[transportation],[transportation],"[(3, 0.509615384615)]"
3,NVMHI Admissions,[],,"[lipos_admissions_per_100k, nvmhi_admission_pe...","[LIPOS Admissions per 100K, NVMHI Admission pe...",chart,[],,"[delete, dchs]",official,5.0,38.0,5.0,263.0,8.044,dashboard.alexandriava.gov,"[delete, dchs]","[delete, dchs]","[(16, 0.339743589744), (26, 0.339743589744)]"
4,Property Owners Trainined,"[Office, of, Housing, Data]",,"[number_of_property_owners_trained, percent_of...","[Number of property owners trained, Percent of...",chart,[],,[housing],official,11.0,30.0,4.0,246.0,7.948,dashboard.alexandriava.gov,[housing],"[housing, Office, of, Housing, Data]","[(19, 0.254807692308), (49, 0.504807692308)]"


## One way of scoring - only when a topic composes more than .2 of a doc does it get "credit" for its share of that doc's views.

In [94]:
def calculate_topic_pop(df):
    results_dict = {}
    
    for row_num in df.index:
        for tup in df.topic_comp[row_num]:
            if not tup[0] in results_dict:
                if tup[1] >= 0.1:
                    results_dict[tup[0]] = (tup[1] * (df.iloc[row_num].download_count + 
                                             df.iloc[row_num].page_views_total_log))
                else:
                    pass
            if tup[0] in results_dict:
                if tup[1] >= 0.1:
                    results_dict[tup[0]] += (tup[1] * (df.iloc[row_num].download_count + 
                                             df.iloc[row_num].page_views_total_log))
    return results_dict

### Ok, let's make sure this function works and is calculating accurately:

In [95]:
small_tester = stats.head(5)

In [96]:
d = calculate_topic_pop(small_tester)

In [97]:
d

{3: 25.586043343189221,
 16: 8.8634985682820346,
 19: 9.6563794545575803,
 26: 8.8634985682820346,
 49: 45.022915362457141}

Topic 49 leads the pack in our metrics. We can see from our LDA model that it is the following:

In [99]:
lda_52_sixty.show_topic(49)

[('development', 0.17602775675090593),
 ('housing', 0.15221265764931263),
 ('economic', 0.058854109443175158),
 ('community', 0.04859194305354804),
 ('infrastructure', 0.019040125293198554),
 ('managed', 0.018304696035360443),
 ('medicaid', 0.017876406351462711),
 ('department', 0.014023829088608938),
 ('buildings', 0.013775405992175799),
 ('economy', 0.012347394313369113)]

In [100]:
small_tester

Unnamed: 0,name,description,attribution,columns_field_name,columns_name,type,categories,domain_category,domain_tags,provenance,download_count,page_views_last_month,page_views_last_week,page_views_total,page_views_total_log,domain,mash,big_mash,topic_comp
0,Homelessness PIT Transitional Age Youth,[],,[location_on_the_night_of_the_count_total_pers...,"[Location on the night of the count, Total Per...",chart,[],,"[point in time, homelessness, ss, dchs]",official,4.0,46.0,5.0,319.0,8.322,dashboard.alexandriava.gov,"[point, in, time, homelessness, ss, dchs]","[point, in, time, homelessness, ss, dchs]","[(49, 0.836538461538)]"
1,Fair Housing Complaints,[],,"[violations, percent_found_to_be_compliant, si...","[Number of complaints, Percent of sites found ...",chart,[],,[housing],official,21.0,31.0,1.0,278.0,8.124,dashboard.alexandriava.gov,[housing],[housing],"[(49, 0.509615384615)]"
2,Parking Complaints Bar Chart,[],,[percent_of_valid_parking_meter_problem_servic...,[Percent of valid parking meter problem servic...,chart,[transportation],,[],official,17.0,17.0,4.0,274.0,8.103,dashboard.alexandriava.gov,[transportation],[transportation],"[(3, 0.509615384615)]"
3,NVMHI Admissions,[],,"[lipos_admissions_per_100k, nvmhi_admission_pe...","[LIPOS Admissions per 100K, NVMHI Admission pe...",chart,[],,"[delete, dchs]",official,5.0,38.0,5.0,263.0,8.044,dashboard.alexandriava.gov,"[delete, dchs]","[delete, dchs]","[(16, 0.339743589744), (26, 0.339743589744)]"
4,Property Owners Trainined,"[Office, of, Housing, Data]",,"[number_of_property_owners_trained, percent_of...","[Number of property owners trained, Percent of...",chart,[],,[housing],official,11.0,30.0,4.0,246.0,7.948,dashboard.alexandriava.gov,[housing],"[housing, Office, of, Housing, Data]","[(19, 0.254807692308), (49, 0.504807692308)]"


The human eye indicates that this sample is mainly about housing/property and community development. Two of our datasets are explicitly about housing, and a third is closely related (youth homelessness). Most of these datasets have similar log_views and downloads. **This is very much an eyeball test an an extremely limited slice of our df, but it looks right so far**.

# Calculate Stats for Full 27k Row DF:

In [101]:
to_count = stats.copy()
to_count = to_count.fillna(0) #have to fill NaNs for downloads, log views etc with 0 or we get all NaNs

In [102]:
topic_stats = calculate_topic_pop(to_count)

In [103]:
#topic_stats #uncomment to view all stats in a dict

Pop this into a dataframe and add our topics' top 10 words:

In [104]:
topic_pop = pd.DataFrame.from_dict(topic_stats, orient='index')
topic_pop = topic_pop.rename(index=str, columns={0:"Adjusted_Popularity"}) #rename column

In [108]:
topic_tuple_lists = list(lda_52_sixty.show_topics(num_topics=52, formatted=False))
topics_list = []

for tup in topic_tuple_lists:
    topics_list.append(tup[1])

In [109]:
topic_pop = topic_pop.assign(Topic = topics_list)
topic_pop = topic_pop.sort_values(by='Adjusted_Popularity', ascending=False)

topic_pop.head(10)

Unnamed: 0,Adjusted_Popularity,Topic
28,92505937.889,"[(health, 0.305311895849), (san, 0.06110177203..."
41,866205.286,"[(water, 0.0961247088917), (environment, 0.048..."
23,599549.333,"[(financial, 0.0959611006916), (permits, 0.066..."
3,559885.634,"[(chart, 0.0699636595856), (children, 0.059274..."
21,471289.932,"[(health, 0.0945969708127), (disease, 0.043755..."
39,406099.043,"[(energy, 0.123710849454), (environment, 0.090..."
31,355623.123,"[(finance, 0.164742098166), (year, 0.062366577..."
48,346728.373,"[(transportation, 0.0493422015093), (plans, 0...."
26,344660.582,"[(transportation, 0.164542015423), (traffic, 0..."
43,340707.468,"[(state, 0.175638296746), (new, 0.133403517998..."


In [110]:
topics = topic_pop.copy()
topics[["topic1", "topic2", "topic3", "topic4", 
           "topic5", "topic6", "topic7", "topic8", "topic9", "topic10"]] = topics.Topic.apply(pd.Series)

In [111]:
topics.head(10)

Unnamed: 0,Adjusted_Popularity,Topic,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10
28,92505937.889,"[(health, 0.305311895849), (san, 0.06110177203...","(health, 0.305311895849)","(san, 0.0611017720363)","(county, 0.0586392706518)","(care, 0.0280055059751)","(counties, 0.0199253952569)","(system, 0.0163770295818)","(facilities, 0.0140076033687)","(network, 0.0135614573066)","(home, 0.0112061646845)","(francisco, 0.0103104720032)"
41,866205.286,"[(water, 0.0961247088917), (environment, 0.048...","(water, 0.0961247088917)","(environment, 0.0481019608081)","(protection, 0.0206283827247)","(tx, 0.0202036902737)","(filed, 0.0195698643711)","(waste, 0.0192716246599)","(site, 0.0192005256969)","(quality, 0.0185576020412)","(environmental, 0.0173950954393)","(monitoring, 0.0158884561679)"
23,599549.333,"[(financial, 0.0959611006916), (permits, 0.066...","(financial, 0.0959611006916)","(permits, 0.0668632739602)","(building, 0.0631953900546)","(permit, 0.054456521697)","(expenditures, 0.0429247389481)","(guide, 0.0250580534259)","(issued, 0.0232434329642)","(construction, 0.0228846174877)","(information, 0.0227968624179)","(filter, 0.021954851646)"
3,559885.634,"[(chart, 0.0699636595856), (children, 0.059274...","(chart, 0.0699636595856)","(children, 0.0592740047687)","(income, 0.039083994413)","(historic, 0.0250932288571)","(home, 0.0218525211874)","(assistance, 0.018091177706)","(homes, 0.017940795662)","(families, 0.0179183548402)","(low, 0.0178577144875)","(pay, 0.0176100642703)"
21,471289.932,"[(health, 0.0945969708127), (disease, 0.043755...","(health, 0.0945969708127)","(disease, 0.0437556864768)","(000, 0.0390249577363)","(per, 0.0309178121906)","(rate, 0.0290345540868)","(rates, 0.0262263352287)","(chronic, 0.0250697947817)","(risk, 0.0223843182628)","(agency, 0.0216210266927)","(due, 0.0195463066794)"
39,406099.043,"[(energy, 0.123710849454), (environment, 0.090...","(energy, 0.123710849454)","(environment, 0.0906900208835)","(electricity, 0.0275225911326)","(air, 0.0244810000559)","(sustainable, 0.0241394711405)","(action, 0.0236316969298)","(climate, 0.0231125236045)","(city, 0.0191503642982)","(clean, 0.0190989188479)","(facilities, 0.0184715188825)"
31,355623.123,"[(finance, 0.164742098166), (year, 0.062366577...","(finance, 0.164742098166)","(year, 0.062366577351)","(state, 0.0533822539548)","(monthly, 0.0506962481995)","(fiscal, 0.0476222221357)","(payments, 0.0357199197181)","(government, 0.0264540232587)","(june, 0.0225770287471)","(july, 0.0216759115561)","(report, 0.0215900104843)"
48,346728.373,"[(transportation, 0.0493422015093), (plans, 0....","(transportation, 0.0493422015093)","(plans, 0.0463408207807)","(iowa, 0.0372400214617)","(area, 0.031583902009)","(transit, 0.0297466527633)","(operations, 0.029650998681)","(region, 0.0286308933708)","(bus, 0.0239404049245)","(priority, 0.0197652573751)","(people, 0.0157596265395)"
26,344660.582,"[(transportation, 0.164542015423), (traffic, 0...","(transportation, 0.164542015423)","(traffic, 0.0382494150922)","(street, 0.0297088546702)","(parking, 0.0260484479646)","(infrastructure, 0.024499661836)","(city, 0.0242484880067)","(safe, 0.0239084069945)","(vehicle, 0.0231351100665)","(streets, 0.0219852292388)","(bike, 0.0155102577616)"
43,340707.468,"[(state, 0.175638296746), (new, 0.133403517998...","(state, 0.175638296746)","(new, 0.133403517998)","(michigan, 0.10266572561)","(york, 0.0777270773871)","(information, 0.0292838745522)","(check, 0.0203220904165)","(measurements, 0.0197080431536)","(jobs, 0.0170402721275)","(licensing, 0.0169768940674)","(ny, 0.0167776417369)"


Just the most superficial and speculative of conclusions looking at this but... it seems like datasets about health care facilities are astoundingly popular. Environment and water quality and permitting are a distant second. There is ANOTHER health dataset -- clearly about public health and disease rates -- in the top 10. And we can clearly tell that two types of transportation data sets -- one that's more planning-oriented, another that seems more focused on traffic, parking, and bikes -- also show up prominently.

# Recalculate based on different metrics - either full credit, or only credit for above a .1.

### ANOVA test of statistical significance:

### Different Metrics for Popularity
- Take into account views last week/month

## Display titles of strongest document matches for each "cluster" (topic)