In [1]:
import pandas as pd
# The `status` column is always "Open" in this dataset, so it carries no
# information and is dropped right after loading.
# `thousands=','` lets pandas parse comma-grouped numbers as numeric values.
df = (
    pd.read_csv('/kaggle/input/replit-bounties-dataset/replit_bounties.csv', thousands=',')
    .drop(columns=['status'])
)
df.head()

Unnamed: 0,title,bounty,coins,company,description,time
0,Need art people in discord,$45.00,4500,NFtalk,Bounty Description Problem Description Im buil...,due 3 weeks from now
1,AI Presentation Analyzer,$90.00,9000,flixr,Objective Develop a single-page application wh...,due 7 days from now
2,Open AI Vision Website Analyzer,$90.00,9000,flixr,Objective Develop a single-page website where ...,due 4 days from now
3,WordPress Plugin - Security Plugin,$82.80,8280,wpupgrader,We would like to hire a freelancer who can bui...,due 1 month from now
4,Static Blog Page - Nuxt,$11.25,1125,lawtechusa,Need a developer who can work with me to modif...,due 7 days from now


In [2]:
# Summarize the loaded frame: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        240 non-null    object
 1   bounty       240 non-null    object
 2   coins        240 non-null    int64 
 3   company      240 non-null    object
 4   description  240 non-null    object
 5   time         240 non-null    object
dtypes: int64(1), object(5)
memory usage: 11.4+ KB


In [3]:
# Count the number of distinct values in each column.
df.nunique()

title          240
bounty          69
coins           69
company        213
description    240
time            30
dtype: int64

In [4]:
from plotly.express import histogram
# Histogram of bounty sizes in coins. `log_y=True` puts the bin counts on a
# logarithmic axis. A title and a readable x-axis label are set so the figure
# can be understood on its own when the notebook is skimmed.
histogram(data_frame=df, x='coins', log_y=True,
          title='Distribution of bounty sizes in coins (log-scaled counts)',
          labels={'coins': 'Bounty size (coins)'})

We would like to build a word-cloud alternative that tells us what these bounties want done: more the *what* than the *how*. To that end we try to filter out high-frequency terms that are about the Replit process itself. Ideally we would also filter out words dealing with software development in general and focus on the problem domain, but that is probably too much to ask.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# n-gram sizes to count: single words and two-word phrases.
NGRAM_RANGE = (1, 2)

# Count terms in the bounty descriptions. English stop words are removed and
# text is lowercased; a term is kept only if it occurs in at least 15
# descriptions (min_df) and in at most 20% of them (max_df).
model = CountVectorizer(
    encoding='utf-8',
    stop_words='english',
    lowercase=True,
    ngram_range=NGRAM_RANGE,
    min_df=15,
    max_df=0.2,
)
count_result = model.fit_transform(df['description'].values)

# Summing the document-term matrix over documents (axis 0) gives the total
# number of occurrences of each term across all descriptions.
counts_df = pd.DataFrame({
    'word': model.get_feature_names_out().tolist(),
    'count': count_result.toarray().sum(axis=0).tolist(),
})
counts_df.sort_values(by='count', ascending=False).head()

Unnamed: 0,word,count
2,ai,85
60,project,73
16,code,71
79,time,68
83,users,68


In [6]:
from plotly.express import treemap
# Build a readable n-gram label for the title. Interpolating NGRAM_RANGE
# directly renders the raw tuple, e.g. "(1, 2)-grams"; this produces
# "1- to 2-grams" instead (or just "1-grams" when both bounds are equal).
min_n, max_n = NGRAM_RANGE
if min_n == max_n:
    ngram_label = '{}-grams'.format(min_n)
else:
    ngram_label = '{}- to {}-grams'.format(min_n, max_n)

# One tile per retained term, sized and colored by its total count.
treemap(data_frame=counts_df, path=['word'], names='word', values='count', color='count', height=800,
        color_continuous_scale='bluered',
        title='Interesting {} non-stop {} by count'.format(len(counts_df), ngram_label))

The interesting words here are probably AI, GPT, audio, image, video, ChatGPT and maybe generate. 