In [1]:
# installing all packages and dependencies
!pip install bertopic[all]



In [2]:
from bertopic import BERTopic
import pandas as pd

In [3]:
# importing local file dataset and creating different document filters to pass into BERTopic model
# filtered_smaller_sample.csv can be found on our github repo

df = pd.read_csv('filtered_smaller_sample.csv', engine="python")
docs = df['data_a']
docs_filter = df['filtered_sentence']
docs_combined = df['combined']

In [None]:
# extracting topic from our docs - using only the answers
topic_model_answers_only = BERTopic()
topics, _ = topic_model_answers_only.fit_transform(docs)

In [None]:
# checking the number of topics generated, should equal total number of rows 536
len(topics)


536

In [None]:
# getting all topic information 
topic_model_answers_only.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,209,-1_this_your_it_you
1,0,107,0_the_his_film_name
2,1,100,1_san_washington_tour_city
3,2,74,2_john_michael_lewis_hart
4,3,36,3_queen_elizabeth_cleopatra_victoria
5,4,10,4_turtle_he_him_his


In [None]:
# extracting topic from our second set of docs - using only the filtered sentences (stop words pre-removed, includes both questions and answers)
topic_model_filtered = BERTopic()
topics, _ = topic_model_filtered.fit_transform(docs_filter)

In [None]:
# accessing information from filtered topic 
topic_model_filtered.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,504,-1_state_name_world_first
1,0,16,0_exile_evian_events_event
2,1,16,1_language_greek_word_example


In [None]:
# extracting topic from our second set of docs - using only combined questions and answers, with stop words
topic_model_combined = BERTopic()
topics, _ = topic_model_combined.fit_transform(docs_combined)

In [None]:
topic_model_combined.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,507,-1_the_in_of_on
1,0,16,0_zoo_exiled_exile_everlast
2,1,13,1_language_policy_curriculum_syllabus


In [25]:
# testing combined docs with an nr_topics = 20
topic_model_combined_20 = BERTopic(min_topic_size=8, embedding_model="paraphrase-MiniLM-L6-v2")
topics, _ = topic_model_combined_20.fit_transform(docs_combined)

topic_model_combined_20.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,115,-1_com_his_href_www
1,0,56,0_let_pill_emoticons_take
2,1,55,1_authors_poets_art_wrote
3,2,47,2_city_geography_island_etropolis
4,3,28,3_warhol_company_business_industry
5,4,23,4_state_college_superlatives_california
6,5,21,5_drama_film_movie_cinema
7,6,20,6_science_measuring_gases_energy
8,7,19,7_sports_sportsmen_athletes_espn
9,8,19,8_collective_animal_mythology_angels


In [26]:
topic_model_combined_20.topics

{-1: [('com', 0.016618828219322843),
  ('his', 0.015787865508182748),
  ('href', 0.015649930471539075),
  ('www', 0.015649930471539075),
  ('archive', 0.015649930471539075),
  ('old', 0.013591198328320302),
  ('country', 0.013295062575458275),
  ('from', 0.012794288897787399),
  ('history', 0.012519214365904),
  ('exile', 0.012510651158528382)],
 0: [('let', 0.03982132224053963),
  ('pill', 0.026308628180492705),
  ('emoticons', 0.026308628180492705),
  ('take', 0.024276791002828375),
  ('it', 0.023803567827908944),
  ('slang', 0.022124890714994207),
  ('rhymes', 0.02104690254439416),
  ('word', 0.019674496002545182),
  ('can', 0.018776353610839885),
  ('medicine', 0.016593668036245658)],
 1: [('authors', 0.03469994590687873),
  ('poets', 0.026860103740379726),
  ('art', 0.02573604975274745),
  ('wrote', 0.02478567564777052),
  ('literature', 0.02258866770901998),
  ('artists', 0.01800567614225246),
  ('novel', 0.01800567614225246),
  ('governor', 0.01800567614225246),
  ('book', 0.016

In [31]:
# testing combined docs with an nr_topics = 22
topic_model_combined_22_4 = BERTopic(min_topic_size=4, nr_topics=22, embedding_model="paraphrase-MiniLM-L6-v2")
topics, _ = topic_model_combined_22_4.fit_transform(docs_combined)

topic_model_combined_22_4.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,91,-1_name_he_greek_his
1,0,48,0_flag_people_www_href
2,1,44,1_emoticons_consonants_pill_money
3,2,31,2_warhol_authors_literature_art
4,3,28,3_gov_civil_old_penn
5,4,24,4_airline_travel_devices_measuring
6,5,23,5_oscar_film_movie_cinematic
7,6,22,6_state_annual_alaska_events
8,7,21,7_peach_female_royal_queen
9,8,18,8_capital_city_etropolis_tourism


In [32]:
topic_model_combined_22_4.topics

{-1: [('name', 0.018686479270611518),
  ('he', 0.017455325616521574),
  ('greek', 0.015312909669076079),
  ('his', 0.01528893758504579),
  ('angels', 0.014361587857446694),
  ('was', 0.014047902095905484),
  ('her', 0.013328545299396104),
  ('angel', 0.012097290861745394),
  ('american', 0.011107121082830088),
  ('who', 0.01085470973353267)],
 0: [('flag', 0.08008947895450935),
  ('people', 0.06838358773802622),
  ('www', 0.049294821505462376),
  ('href', 0.049294821505462376),
  ('flags', 0.042993295016236555),
  ('2007', 0.029112824565643017),
  ('07', 0.02764958816358262),
  ('architects', 0.026454036971491117),
  ('world', 0.025258067895334115),
  ('2010', 0.02138393934373058)],
 1: [('emoticons', 0.0348476214458376),
  ('consonants', 0.0348476214458376),
  ('pill', 0.0348476214458376),
  ('money', 0.03334083172026773),
  ('slang', 0.02935342642155742),
  ('it', 0.02729938574355063),
  ('re', 0.025768181043621823),
  ('words', 0.022089874099784794),
  ('rhymes', 0.02090857286750256

In [33]:
# saving most effective model

topic_model_combined_22_4.save("22-4-model")

In [35]:
# importing new testing dataset
new_df = pd.read_csv('jeopardy_combined.csv', engine="python")
new_df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer,Q_A_Combined
0,4680,12/31/2004,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus,"For the last 8 years of his life, Galileo was ..."
1,4680,12/31/2004,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,No. 2: 1912 Olympian; football star at Carlisl...
2,4680,12/31/2004,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona,The city of Yuma in this state has a record av...
3,4680,12/31/2004,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,"In 1963, live on ""The Art Linkletter Show"", th..."
4,4680,12/31/2004,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,"Signer of the Dec. of Indep., framer of the Co..."


In [37]:
# defining new variable with total docs of 5000
testing = new_df["Q_A_Combined"]
testing_docs = testing.loc[:5000]

In [38]:
# fitting 22_4 model into new data to see how it performs: 

topics, _ = topic_model_combined_22_4.fit_transform(testing_docs)
topic_model_combined_22_4.get_topic_info()


Unnamed: 0,Topic,Count,Name
0,-1,1881,-1_was_his_from_he
1,0,214,0_it_your_you_can
2,1,211,1_archive_jpg_2009_05
3,2,209,2_wrote_novel_book_poem
4,3,195,3_her_she_ballet_queen
5,4,191,4_island_sea_planet_islands
6,5,177,5_bird_fish_species_animal
7,6,176,6_president_kennedy_amendment_nixon
8,7,152,7_instrument_symphony_musical_composer
9,8,150,8_war_battle_film_military


In [39]:
topic_model_combined_22_4.topics

{-1: [('was', 0.016134720382434603),
  ('his', 0.013701418717281926),
  ('from', 0.01315353863623927),
  ('he', 0.012243063218269063),
  ('by', 0.011445828310780073),
  ('as', 0.010232966488224682),
  ('name', 0.008567823777639715),
  ('who', 0.007871218355545002),
  ('be', 0.007375611580859525),
  ('her', 0.007127261618597031)],
 0: [('it', 0.0335428335751567),
  ('your', 0.0217685844488914),
  ('you', 0.020382234804293447),
  ('can', 0.014536838353686266),
  ('frog', 0.01304893552892824),
  ('some', 0.01235307548770589),
  ('are', 0.012288579045731697),
  ('brain', 0.011996048636933756),
  ('hormone', 0.011187272605483179),
  ('bones', 0.010609340216920463)],
 1: [('archive', 0.07408139552471617),
  ('jpg', 0.0654802750941685),
  ('2009', 0.02076871342357267),
  ('05', 0.019375030904200622),
  ('wmv', 0.0192650238151954),
  ('2007', 0.01607870585219377),
  ('07', 0.01596247060329176),
  ('2010', 0.015913964495701222),
  ('2008', 0.015606530532176314),
  ('ha', 0.014658785632025165)],