# Bertopic with Kickstarter

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyreadr
import os

# Load data

In [2]:
def get_pandas_frame(path):
    return pyreadr.read_r(path)[None]


def get_pandas_r_file(week_nr, file_type):
    week_nr = str(week_nr)
    week_folder = f"Kickstarter/2019_week{week_nr}"
    pledge_name = os.path.join(week_folder, f"{file_type}.rds")
    return get_pandas_frame(pledge_name)

def select_all_project_id_from_category(df, category):
    """
    take all the project id with the column Category == category
    :param df:
    :param category:
    :return:
    """
    df = df[df["Category"] == category]
    return df["my_id"].unique()



In [3]:
info_week_19 = get_pandas_r_file(19, "info")
info_week_19

Unnamed: 0,my_id,creator_slug,project_slug,Blurb,Category,Location,Staff_recommended,Goal_USD,Pledge_USD,Number_Backers,...,Project_Community_top_countries,Project_Community_nb_new_backers,Project_Community_nb_returning_backers,Creator_description,Creator_nb_projects,Creator_date_joined,Creator_nb_backed,Creator_location,Creator_made_comments,Project_description
0,project_000001,2146868994,the-because-black-life-conference-2018,The Because Black Life Conference is a (Black)...,Art,Minneapolis | US,TRUE,10000,10150,72,...,|United States:57 backers|Australia:1 backer|S...,27,45,August 18th 10a-6pm Cultivated by www.BlackTab...,1,2018-04-04 18:17:30 -0400,Backed 0 projects,KO,KO,The Because Black Life Conference 2018 - This ...
1,project_000002,1805256969,petography-illustrations-of-your-pet,Show an appreciation for your pet by having a ...,Illustration,Victoria | CA,FALSE,156.13997,645.0844790565,34,...,|United States:23 backers|Canada:2 backers|Uni...,1,33,"As an entrepreneur at heart, I enjoy creativit...",5,2017-03-28 18:37:59 -0400,Backed 2 projects,KO,24,Risks and challenges Having done print focuse...
2,project_000003,352302209,dessert-cuties-enamel-pins,Kawaii Style Enamel Pins featuring yummy treats!,Art,Boston | US,FALSE,800,815,33,...,|United States:24 backers|Australia:1 backer,7,26,I am lover of all things cute and cuddly and i...,2,2017-08-12 17:05:11 -0400,Backed 6 projects,"Boston, MA",4,Who doesn't love cute enamel pins? I madeÂ sev...
3,project_000004,melaniecollins,adorable-deadpool-chibi-sticker,Help me get my Deadpool stickers printed and g...,Illustration,Kettering | US,FALSE,400,870,61,...,|United States:49 backers|United Kingdom:5 bac...,0,61,My name is Melanie Collins (aka Melbaka) and I...,10,2016-03-06 18:10:08 -0500,Backed 3 projects,"Dayton, OH",3,My name is Melanie (Melbaka) and welcome to my...
4,project_000005,17929745,japanese-mythos-enamel-pins,Enamel pins based on creatures from Japanese M...,Art,Minneapolis | US,FALSE,1050,11069,293,...,|United States:239 backers|Canada:11 backers|U...,6,287,I am very excited to create new projects. I ho...,5,2013-10-12 23:41:38 -0400,Backed 67 projects,"Minneapolis, MN",42,â¦ï¸â¦ï¸â¦ï¸ Information â¦ï¸â¦ï¸â¦...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73795,,,,,,,,,,,...,,,,,,,,,,
73796,,,,,,,,,,,...,,,,,,,,,,
73797,,,,,,,,,,,...,,,,,,,,,,
73798,,,,,,,,,,,...,,,,,,,,,,


In [4]:
software_project_ids = select_all_project_id_from_category(info_week_19, "Software")
software_project_ids

array(['project_005539', 'project_005540', 'project_005541', ...,
       'project_007598', 'project_007599', 'project_007600'], dtype=object)

In [5]:
posts_week_19 = get_pandas_r_file(19, "posts")
posts_week_19

Unnamed: 0,Project_my_id,creator_slug,project_slug,Post_Nb,Post_Title,Post_Date,Post_Text,Post_id
0,project_000001,2146868994,the-because-black-life-conference-2018,Update #1,Can you believe it!! 8 Endorsements!!!,"May 18, 2018",The Because Black Life Conference is a (Black)...,project_0000011
1,project_000002,1805256969,petography-illustrations-of-your-pet,Update #5,Petography End of July Update!,"Jul 24, 2018",Hi everyone! Hope your summer is going well! ...,project_0000025
2,project_000002,1805256969,petography-illustrations-of-your-pet,Update #4,Digital Update!,"Jul 11, 2018",ONLY FOR BACKERS,project_0000024
3,project_000002,1805256969,petography-illustrations-of-your-pet,Update #3,Petography Update!,"Jun 13, 2018",ONLY FOR BACKERS,project_0000023
4,project_000002,1805256969,petography-illustrations-of-your-pet,Update #2,Name of your pets!,"May 22, 2018",ONLY FOR BACKERS,project_0000022
...,...,...,...,...,...,...,...,...
446868,project_073361,rowanlakejr,new-altervice,Update #4,A Bit of Info,"Mar 14, 2019",Just in case you never saw what the larger siz...,project_0733614
446869,project_073361,rowanlakejr,new-altervice,Update #3,Altervice Website Part 2,"Mar 11, 2019",The Altervice website is almost complete. I've...,project_0733613
446870,project_073361,rowanlakejr,new-altervice,Update #2,Altervice Website,"Mar 5, 2019",That's right folks! I'm currently working on a...,project_0733612
446871,project_073361,rowanlakejr,new-altervice,Update #1,New Altervice Commercial,"Mar 1, 2019",Hey guys! Thanks so much for support! Here's a...,project_0733611


# Now we select only the posts from the projects that are in the category software

In [6]:
def select_posts_from_the_available_project_ids(posts_df, project_ids):
    """
    select all the posts from the project ids
    :param posts_df:
    :param project_ids:
    :return:
    """
    return posts_df[posts_df["Project_my_id"].isin(project_ids)]

In [7]:
posts_software_projects = select_posts_from_the_available_project_ids(posts_week_19, software_project_ids)
posts_software_projects

Unnamed: 0,Project_my_id,creator_slug,project_slug,Post_Nb,Post_Title,Post_Date,Post_Text,Post_id
38385,project_005539,damicareer,purpose-connect,Update #2,The WHY...,"Apr 23, 2019",I received this from one of my clients last ye...,project_0055392
38386,project_005539,damicareer,purpose-connect,Update #1,Timeline Expectations - Day 2 Update,"Apr 20, 2019","Hey guys, itâs Day 2 and I just want to firs...",project_0055391
38387,project_005546,kidunot89,wpgraphql-woocommerce,Update #1,Release v0.0.3-beta,"Apr 25, 2019",Version 0.0.3 was released. Full Changelog...,project_0055461
38388,project_005547,465222059,table-university,Update #2,new collaboration started,"Apr 9, 2019",We started a new collaboration with tesilike. ...,project_0055472
38389,project_005547,465222059,table-university,Update #1,additional information!,"Apr 5, 2019",Hi all! I would add just an information to be...,project_0055471
...,...,...,...,...,...,...,...,...
43378,project_007590,102287676,the-hangupmachine-a-spam-filter-for-your-phone,Update #1,Half way point,"May 5, 2014",ONLY FOR BACKERS,project_0075901
43379,project_007592,1224571211,research-assistant-tools-to-accelerate-research,Update #2,We can now search the Public Library of Scienc...,"Apr 25, 2014","Research Assistant R502 is now available, maki...",project_0075922
43380,project_007592,1224571211,research-assistant-tools-to-accelerate-research,Update #1,First suggestion and a great one!,"Apr 21, 2014",Many thanks to everyone who have shown interes...,project_0075921
43381,project_007595,croosle,croosle-all-in-one-educational-and-experimenta...,Update #2,Croosle Smart Kit - stay up-to-date!,"May 12, 2014",ONLY FOR BACKERS,project_0075952


In [14]:
animation_projects = select_all_project_id_from_category(info_week_19, "Animation")
posts_animation_projects = select_posts_from_the_available_project_ids(posts_week_19, animation_projects)
posts_animation_projects

Unnamed: 0,Project_my_id,creator_slug,project_slug,Post_Nb,Post_Title,Post_Date,Post_Text,Post_id
141480,project_022394,ivanricardi,creation-of-the-official-animated-video-clip-p...,Update #2,"gracias por tu apoyo, hoy despertamos optimist...","Apr 25, 2019","gracias por tu apoyo, hoy despertamos optimist...",project_0223942
141481,project_022394,ivanricardi,creation-of-the-official-animated-video-clip-p...,Update #1,preparing the letter to plan the project,"Apr 24, 2019",With much emotion I share that we are preparin...,project_0223941
141482,project_022396,katina,make-krushia-story-an-animated-movie-to-promot...,Update #1,I launched my campaign several days ago and I ...,"Apr 29, 2019",I currently have Sharp Eye Animation Company w...,project_0223961
141499,project_022424,1209343473,fleegix,Update #2,"Alan Mendelsohn, The Boy From Mars.","Apr 25, 2019",#fleegix is an adaptation of this book by Dani...,project_0224242
141500,project_022424,1209343473,fleegix,Update #1,DAY 1: HOLY COW!!!!!,"Apr 23, 2019",Almost a third of the way funded on the first ...,project_0224241
...,...,...,...,...,...,...,...,...
292258,project_050242,doejo,homeless-cop,Update #3,Kato watching TV!,"Sep 3, 2011",Someday he will see himself on there!,project_0502423
292259,project_050242,doejo,homeless-cop,Update #2,Kato gets a job?!,"Aug 26, 2011","Imagine the cool cat we all know as Kato, if h...",project_0502422
292260,project_050242,doejo,homeless-cop,Update #1,Nelson &amp; Kato,"Aug 12, 2011","Just 2 good friends in a show about,........st...",project_0502421
292261,project_050246,twointhepink,music-video-for-reso-morn,Update #1,new video!,"Aug 6, 2011",We've uploaded a new video--check it out!Â,project_0502461


In [13]:
videogames_projects = select_all_project_id_from_category(info_week_19, "Video Games")
posts_videogames_projects = select_posts_from_the_available_project_ids(posts_week_19, videogames_projects)
posts_videogames_projects

Unnamed: 0,Project_my_id,creator_slug,project_slug,Post_Nb,Post_Title,Post_Date,Post_Text,Post_id
149664,project_024250,220356443,corruption-a-history-of-lestive,Update #4,Novelisation of Corruption,"Apr 29, 2019","Hey all, as you all know the Level 5 reward ti...",project_0242504
149665,project_024250,220356443,corruption-a-history-of-lestive,Update #3,Races in Corruption (and the rest of the Thorn...,"Apr 28, 2019",Hey all. I said I would discuss race in the ga...,project_0242503
149666,project_024250,220356443,corruption-a-history-of-lestive,Update #2,The History of the Thorntop World (or why I ma...,"Apr 27, 2019",I mentioned briefly in my main post about how ...,project_0242502
149667,project_024250,220356443,corruption-a-history-of-lestive,Update #1,Awesome Start!,"Apr 27, 2019",Thank you so much for the pledges so far. We'v...,project_0242501
149668,project_024251,125520822,romance-divine-adult-visual-novel,Update #3,"100 backers, 20% there!","May 1, 2019",Special update to commemorate getting our 100t...,project_0242513
...,...,...,...,...,...,...,...,...
377750,project_058917,2079547763,jagged-alliance-flashback,Update #36,Developer Diary 1 on Character Development,"Nov 15, 2013","Dear Backers, Today we have a real treat for ...",project_05891736
446816,project_073338,timseastudios,adome-action-platformer-and-exploration-adventure,Update #4,Timsea Studios announcement.,"Apr 26, 2019",In Timsea Studios we have made the decision to...,project_0733384
446817,project_073338,timseastudios,adome-action-platformer-and-exploration-adventure,Update #3,Working the art of displacement!,"Apr 22, 2019",In Timsea Studios we are paying special attent...,project_0733383
446818,project_073338,timseastudios,adome-action-platformer-and-exploration-adventure,Update #2,Gameplay!,"Apr 18, 2019",Adome gameplay is based on a satisfying sense ...,project_0733382


# Bertopic

# Method 1 where you specify each building block

In [17]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

def get_standard_bertopic():
    # Step 1 - Extract embeddings
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Step 2 - Reduce dimensionality
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

    # Step 4 - Tokenize topics
    vectorizer_model = CountVectorizer(stop_words="english")

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer()

    # Step 6 - (Optional) Fine-tune topic representations with
    # a `bertopic.representation` model
    representation_model = KeyBERTInspired()

    # All steps together
    topic_model = BERTopic(
      embedding_model=embedding_model,          # Step 1 - Extract embeddings
      umap_model=umap_model,                    # Step 2 - Reduce dimensionality
      hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
      vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
      ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
      representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
    )
    return topic_model


## make a topic model for software

In [43]:
topic_model_software = get_standard_bertopic()
topics_software, probs_software = topic_model_software.fit_transform(posts_software_projects["Post_Text"])

In [44]:
topic_model_software.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1803,-1_kickstarter_development_design_create,"[kickstarter, development, design, create, sof...",[With only 8 days left till the end of the c...
1,0,733,0_backers_sports_cordor_ducor,"[backers, sports, cordor, ducor, , , , , , ]","[ONLY FOR BACKERS, ONLY FOR BACKERS, ONLY FOR ..."
2,1,336,1_funding_funded_pledged_kickstarter,"[funding, funded, pledged, kickstarter, donate...","[Hey guys, itâs Day 2 and I just want to fir..."
3,2,208,2_kickstarter_funding_funded_projects,"[kickstarter, funding, funded, projects, succe...","[All, When we started our Kickstarter, we tho..."
4,3,173,3_beta_releases_release_updates,"[beta, releases, release, updates, upcoming, i...",[Just one week to go before our beta 1 release...
5,4,165,4_ios_apps_android_app,"[ios, apps, android, app, appstore, mobile, ip...",[Long time no talk! Sorry we haven't posted an...
6,5,143,5_chapters_ebook_chapter_paperback,"[chapters, ebook, chapter, paperback, books, b...","[Hi, I just wanted to let everyone know that ..."
7,6,135,6_ko_kristy__,"[ko, kristy, , , , , , , , ]","[KO, KO, KO]"
8,7,117,7_xiki_xsh_kickstarter_surveys,"[xiki, xsh, kickstarter, surveys, coming, surv...",[Hi all. Thanks again for your awesome support...
9,8,62,8_moviesandbox_vimeo_films_videos,"[moviesandbox, vimeo, films, videos, movieslic...","[Dear Kickstarters, Moviesandbox Version 1.0..."


In [45]:
topic_model_software.get_topic(0)

[('backers', 0.7565515),
 ('sports', 0.22999015),
 ('cordor', 0.21286517),
 ('ducor', 0.20531508),
 ('', 0.19963846),
 ('', 0.19963846),
 ('', 0.19963846),
 ('', 0.19963846),
 ('', 0.19963846),
 ('', 0.19963846)]

In [46]:
topic_model_software.visualize_topics()


In [47]:
topic_model_software.visualize_barchart(top_n_topics=32)


## make a topic model for animation

In [31]:
topic_model_animation = get_standard_bertopic()
topics_animation, probs_animation = topic_model_animation.fit_transform(posts_animation_projects["Post_Text"])

In [32]:
topic_model_animation.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3233,-1_kickstarter_making_production_work,"[kickstarter, making, production, work, update...",[Hello everyone! We're very sorry about the sl...
1,0,1509,0_backers_backer_promotion_,"[backers, backer, promotion, , , , , , , ]","[ONLY FOR BACKERS, ONLY FOR BACKERS, ONLY FOR ..."
2,1,225,1_pledge_pledges_pledged_pledging,"[pledge, pledges, pledged, pledging, kickstart...",[Hey p:13 backers ! project:13 needs a miracl...
3,2,195,2_festival_fest_festivals_cinema,"[festival, fest, festivals, cinema, cannes, sc...",[Happy December! It's been a little over a mo...
4,3,178,3_replay_html5_sound_play,"[replay, html5, sound, play, browser, enjoy, v...",[4 days to go! Still on time to choose your fa...
5,4,155,4_donated_donations_donation_contributions,"[donated, donations, donation, contributions, ...",[Thank you to everyone so far for helping us g...
6,5,144,5____,"[, , , , , , , , , ]","[, , cool!]"
7,6,141,6_ko___,"[ko, , , , , , , , , ]","[KO, KO, KO]"
8,7,125,7_film_vimeo_trailer_movie,"[film, vimeo, trailer, movie, donated, progres...",[Hi there. We are officially half way through...
9,8,124,8_sky_puppets_film_ghosts,"[sky, puppets, film, ghosts, monster, films, p...","[Hello dear backers! Â It's been a crazy, but ..."


In [33]:
topic_model_animation.get_topic(0)

[('backers', 0.7565515),
 ('backer', 0.6901392),
 ('promotion', 0.22582535),
 ('', 0.1996383),
 ('', 0.1996383),
 ('', 0.1996383),
 ('', 0.1996383),
 ('', 0.1996383),
 ('', 0.1996383),
 ('', 0.1996383)]

In [34]:
topic_model_animation.visualize_topics()

In [48]:
topic_model_animation.visualize_barchart(top_n_topics=32)


## make a topic model for videogames

In [36]:
topic_model_videogames = get_standard_bertopic()
topics_videogames, probs_videogames = topic_model_videogames.fit_transform(posts_videogames_projects["Post_Text"])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [37]:
topic_model_videogames.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,13965,-1_updates_build_progress_kickstarter,"[updates, build, progress, kickstarter, releas...","[Alrighty, this is going to be a gigantic upda..."
1,0,3774,0_backers_supporter_kickstarters_trades,"[backers, supporter, kickstarters, trades, weâ...","[ONLY FOR BACKERS, ONLY FOR BACKERS, ONLY FOR ..."
2,1,1025,1_ko___,"[ko, , , , , , , , , ]","[KO, KO, KO]"
3,2,771,2_reached_foundation_stretch_goal,"[reached, foundation, stretch, goal, kickstart...",[The Final Countdown Begins - 69 Hours remaini...
4,3,441,3_soundtracks_instruments_music_musical,"[soundtracks, instruments, music, musical, mus...",[--- 1. Tell us about yourself. How did you s...
...,...,...,...,...,...
135,134,16,134_forums_forum_beta_kickstarter,"[forums, forum, beta, kickstarter, community, ...","[Greetings, Adventurers! The dust settled on ..."
136,135,16,135_princess_galactic_galacticprincess_galaxy,"[princess, galactic, galacticprincess, galaxy,...","[Hi Captains, this is the recent progress on ..."
137,136,16,136_crashes_alpha_a17_issues,"[crashes, alpha, a17, issues, bug, 14, build, ...","[Hey Survivalists, The team has been working h..."
138,137,15,137_copies_mastertronic_cassette_releases,"[copies, mastertronic, cassette, releases, edi...",[It's Mastertronic Monday once again and that ...


In [38]:
topic_model_videogames.get_topic(0)

[('backers', 0.7565515),
 ('supporter', 0.26904738),
 ('kickstarters', 0.227015),
 ('trades', 0.21733657),
 ('weâ', 0.21639884),
 ('street', 0.21249054),
 ('protect', 0.20895839),
 ('loyal', 0.20319381),
 ('', 0.19963834),
 ('', 0.19963834)]

In [39]:
topic_model_videogames.visualize_topics()

In [42]:
topic_model_videogames.visualize_barchart(top_n_topics=32)
