<a href="https://colab.research.google.com/github/mmotztulane/CSCapstone/blob/main/PublicRecords_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
%%capture
!pip install bertopic
import pandas as pd
import spacy
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the public-records requests and keep only the request text as a list of strings.
requests_df = pd.read_csv('Public_Records_Requests.csv')
# Blank cells are read by pandas as NaN (a float). Drop them and coerce the rest
# to str so every document handed to the topic models is actual text.
requests = requests_df['Request Text'].dropna().astype(str).tolist()

In [None]:
# Train model 1: the baseline, with only language / probabilities / verbosity set.
topic_model_1 = BERTopic(
    language="english",
    calculate_probabilities=True,
    verbose=True,
)
# One topic id per request, plus the per-topic probabilities requested above.
topics, probs = topic_model_1.fit_transform(requests)

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

In [None]:
# Persist the fitted model 1 to disk under "PublicRecords_Model1".
topic_model_1.save("PublicRecords_Model1")

In [None]:
# Topic frequency table for model 1, largest topics first.
# Topic -1 holds the outliers and should be ignored.
freq = topic_model_1.get_topic_info()
freq.head(20)

Unnamed: 0,Topic,Count,Name
0,-1,2530,-1_the_and_of_in
1,0,223,0_fire_report_occurred_incident
2,1,144,1_camera_footage_video_accident
3,2,135,2_fire_incident_report_occurred
4,3,132,3_rfp_proposals_selection_evaluation
5,4,104,4_public_records_fees_act
6,5,104,5_paid_taxes_nola_st
7,6,100,6_street_permits_orleans_new
8,7,93,7_incident_initial_report_view
9,8,93,8_paid_taxes_years_la


In [None]:
# (word, score) pairs for topic 0, the most frequent topic of model 1.
topic_model_1.get_topic(0)

[('fire', 0.05267922523855218),
 ('report', 0.03489122195061917),
 ('occurred', 0.03273741016577411),
 ('incident', 0.025259593276499192),
 ('la', 0.02479762762000258),
 ('involving', 0.02093564846224196),
 ('new', 0.018817773365469956),
 ('orleans', 0.01856295081160376),
 ('on', 0.017813844971275566),
 ('at', 0.01724144049364158)]

In [None]:
# (word, score) pairs for topic 1, the 2nd most frequent topic of model 1.
topic_model_1.get_topic(1)

[('camera', 0.03233983545989223),
 ('footage', 0.03190395171780871),
 ('video', 0.03158366868124489),
 ('accident', 0.03074310280447741),
 ('intersection', 0.030413903174532117),
 ('pm', 0.02656250932480622),
 ('traffic', 0.017989825800340278),
 ('cameras', 0.015957888315200414),
 ('at', 0.015665788564913737),
 ('on', 0.015453408727017172)]

In [None]:
# Interactive topic map for model 1; the size of each circle is the topic's frequency.
interactive_map = topic_model_1.visualize_topics()
interactive_map

In [None]:
# Custom labels for the top 10 topics of model 1, keyed by topic id.
model_1_labels = {
    0: "Fire Incident",
    1: "Traffic Camera Footage",
    2: "Fire Incident",
    3: "RFP",
    4: "Public Records Requests",
    5: "Tax Payment",
    6: "Street Permits",
    7: "Incident Reports",
    8: "Taxes",
    9: "Emails",
}
topic_model_1.set_topic_labels(model_1_labels)

# Bar chart of topic word scores for model 1, shown with the custom labels.
topic_model_1.visualize_barchart(
    top_n_topics=10,
    title='Model 1: Topic Word Scores',
    custom_labels=True,
)

In [None]:
# Vectorizer with ngram_range=(1, 3): each entry in a topic may now be a
# phrase of up to three words instead of a single word.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 3),
)

# Recompute model 1's topic representations from the same documents using it.
topic_model_1.update_topics(
    requests,
    vectorizer_model=vectorizer_model,
)

In [None]:
# Topic frequency table for model 1 after the ngram_range = (1, 3) update.
freq = topic_model_1.get_topic_info()
freq.head(20)

Unnamed: 0,Topic,Count,Name
0,-1,2374,-1_the_and_of_to
1,0,364,0_paid_tax_bill_taxes
2,1,223,1_fire_report_the fire_fire report
3,2,143,2_rfp_proposals_rfp no_proposal
4,3,134,3_fire_report_incident_fire report
5,4,132,4_camera_footage_accident_intersection
6,5,110,5_public_records_request_public records
7,6,96,6_bid_group_tabulations_bid tabulations
8,7,89,7_contract_the city_the city of_city
9,8,87,8_incident_initial incident_incident report_in...


In [None]:
# Train model 2: same settings as the baseline except min_topic_size=5,
# which changes the allowed topic sizes.
topic_model_2 = BERTopic(
    language="english",
    min_topic_size=5,
    calculate_probabilities=True,
    verbose=True,
)
topics, probs = topic_model_2.fit_transform(requests)

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

In [None]:
# Persist the fitted model 2 to disk under "PublicRecords_Model2".
topic_model_2.save("PublicRecords_Model2")

In [None]:
# Topic frequency table for model 2, largest topics first.
freq = topic_model_2.get_topic_info()
freq.head(20)

Unnamed: 0,Topic,Count,Name
0,-1,2122,-1_the_and_in_of
1,0,365,0_paid_bill_taxes_tax
2,1,243,1_fire_report_occurred_incident
3,2,132,2_video_footage_intersection_accident
4,3,79,3_rfp_proposals_sheets_scoring
5,4,77,4_incident_initial_view_report
6,5,74,5_permit_permits_seasons_issued
7,6,74,6_code_violations_open_parcel
8,7,69,7_bid_unit_prices_tabulations
9,8,68,8_body_footage_camera_cam


In [None]:
# (word, score) pairs for topic 0, the most frequent topic of model 2.
topic_model_2.get_topic(0)

[('paid', 0.030763668590070976),
 ('bill', 0.022534097061466675),
 ('taxes', 0.021453433857983782),
 ('tax', 0.01726269520375312),
 ('bills', 0.014618764186750375),
 ('payment', 0.013607243854968174),
 ('years', 0.013133040618896929),
 ('need', 0.011279474878138566),
 ('amount', 0.01105217765898051),
 ('property', 0.01071072975638051)]

In [None]:
# (word, score) pairs for topic 1, the 2nd most frequent topic of model 2.
topic_model_2.get_topic(1)

[('fire', 0.03495184621558374),
 ('report', 0.023251193681501966),
 ('occurred', 0.022447373691165347),
 ('incident', 0.018760099343314),
 ('la', 0.015005217922794222),
 ('involving', 0.014626335187983272),
 ('new', 0.010739276696985394),
 ('on', 0.010698407010312922),
 ('orleans', 0.010648460715898724),
 ('at', 0.010626758741238479)]

In [None]:
# Custom labels for the top 10 topics of model 2, keyed by topic id.
model_2_labels = {
    0: "Tax Payments",
    1: "Fire Incident",
    2: "Traffic Camera Footage",
    3: "RFP",
    4: "Incident Reports",
    5: "Permit Applications",
    6: "Code Violations",
    7: "Pricing",
    8: "Body Camera Footage",
    9: "Fire Incident",
}
topic_model_2.set_topic_labels(model_2_labels)

# Bar chart of topic word scores for model 2, shown with the custom labels.
topic_model_2.visualize_barchart(
    top_n_topics=10,
    title='Model 2: Topic Word Scores',
    custom_labels=True,
)

In [None]:
# Train model 3: min_topic_size=5 as in model 2, and additionally
# top_n_words=5 so each topic is described by five words.
topic_model_3 = BERTopic(
    language="english",
    top_n_words=5,
    min_topic_size=5,
    calculate_probabilities=True,
    verbose=True,
)
topics, probs = topic_model_3.fit_transform(requests)

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

In [None]:
# Persist the fitted model 3 to disk under "PublicRecords_Model3".
topic_model_3.save("PublicRecords_Model3")

In [None]:
# Topic frequency table for model 3, largest topics first.
freq = topic_model_3.get_topic_info()
freq.head(20)

Unnamed: 0,Topic,Count,Name
0,-1,2424,-1_the_and_in_all
1,0,92,0_paid_taxes_years_bill
2,1,86,1_contract_contracts_services_city
3,2,83,2_permit_permits_applications_issued
4,3,81,3_body_footage_camera_cam
5,4,78,4_accident_video_intersection_camera
6,5,78,5_street_4412a_dating_permits
7,6,76,6_incident_initial_view_report
8,7,76,7_emails_january_2018_received
9,8,75,8_public_fees_act_me


In [None]:
# (word, score) pairs for topic 0, the most frequent topic of model 3.
topic_model_3.get_topic(0)

[('paid', 0.023392599683124887),
 ('taxes', 0.018728375030529702),
 ('years', 0.018574413885267912),
 ('bill', 0.015169417880669438),
 ('la', 0.014078575994031483)]

In [None]:
# (word, score) pairs for topic 1, the 2nd most frequent topic of model 3.
topic_model_3.get_topic(1)

[('contract', 0.02239770790045691),
 ('contracts', 0.018548210489361925),
 ('services', 0.017404181860354244),
 ('city', 0.013677655011864817),
 ('agreements', 0.013673633835138364)]

In [None]:
# Custom labels for the top 10 topics of model 3, keyed by topic id.
model_3_labels = {
    0: "Tax Payment",
    1: "Contracts",
    2: "Permit Applications",
    3: "Body Camera Footage",
    4: "Traffic Camera Footage",
    5: "Permits",
    6: "Incident Reports",
    7: "Emails",
    8: "Public Records",
    9: "Tax Bills",
}
topic_model_3.set_topic_labels(model_3_labels)

# Bar chart of topic word scores for model 3, shown with the custom labels.
topic_model_3.visualize_barchart(
    top_n_topics=10,
    title='Model 3: Topic Word Scores',
    custom_labels=True,
)

In [None]:
# Train model 4: baseline settings plus diversity=0.5 for the topic words.
# BERTopic >= 0.14 no longer accepts `diversity` as a constructor argument
# (passing it raises TypeError), so the same setting is supplied through a
# MaximalMarginalRelevance representation model instead.
from bertopic.representation import MaximalMarginalRelevance

topic_model_4 = BERTopic(
    language="english",
    calculate_probabilities=True,
    representation_model=MaximalMarginalRelevance(diversity=0.5),
    verbose=True,
)
topics, probs = topic_model_4.fit_transform(requests)

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

In [None]:
# Persist the fitted model 4 to disk under "PublicRecords_Model4".
topic_model_4.save(
    "PublicRecords_Model4"
)

In [None]:
# Topic frequency table for model 4, largest topics first.
freq = topic_model_4.get_topic_info()
freq.head(20)

Unnamed: 0,Topic,Count,Name
0,-1,2320,-1_all_records_or_new
1,0,350,0_taxes_bills_st_nola
2,1,223,1_fire_orleans_happened_st
3,2,150,2_accident_intersection_traffic_crime
4,3,136,3_rfp_proposals_evaluation_submitted
5,4,135,4_public_records_fees_louisiana
6,5,123,5_hearing_rental_nicholls_recordings
7,6,115,6_fire_incident_2431_2020
8,7,110,7_group_tabulations_de_proposal
9,8,100,8_contracts_donation_deepwater_oil


In [None]:
# (word, score) pairs for topic 0, the most frequent topic of model 4.
topic_model_4.get_topic(0)

[('taxes', 0.02530271397460279),
 ('bills', 0.015073727022006201),
 ('st', 0.010985627874413149),
 ('nola', 0.010229897459317771),
 ('andor', 0.010098585806939566),
 ('estate', 0.008436083284670768),
 ('mail', 0.008332047192687285),
 ('presale', 0.008297518451924642),
 ('payments', 0.008249751830665961),
 ('notices', 0.008010579230723259)]

In [None]:
# (word, score) pairs for topic 1, the 2nd most frequent topic of model 4.
topic_model_4.get_topic(1)

[('fire', 0.05161375796259105),
 ('orleans', 0.01807454606534836),
 ('happened', 0.010803363303864126),
 ('st', 0.01030270774100855),
 ('need', 0.008678603942663536),
 ('avenue', 0.00765552346236769),
 ('department', 0.007624992426648681),
 ('investigation', 0.007187351008425922),
 ('june', 0.006314481830658009),
 ('70119', 0.006110321162769096)]

In [None]:
# Custom labels for the top 10 topics of model 4, keyed by topic id.
model_4_labels = {
    0: "Tax Bills",
    1: "Fire Incident",
    2: "Crime, Traffic Cameras",
    3: "RFP",
    4: "Public Records",
    5: "City Court",
    6: "Fire Incident",
    7: "Proposals",
    8: "Donations",
    9: "Incident Reports",
}
topic_model_4.set_topic_labels(model_4_labels)

# Bar chart of topic word scores for model 4, shown with the custom labels.
topic_model_4.visualize_barchart(
    top_n_topics=10,
    title='Model 4: Topic Word Scores',
    custom_labels=True,
)