In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import seaborn as sns

## Product Hunt 2022 data- Exploratory Data Analysis

#### Structure investigation and adjustments

Importing the dataset in csv format. Data extracted from https://www.producthunt.com/time-travel using https://www.octoparse.com/ 

In [80]:
product_hunt= pd.read_csv('/Users/martinahackbartt/Documents/producthunt_2022_EDA/producthunt_scraping_dec23.csv', sep=';', decimal=',')
product_hunt= product_hunt.drop(columns = ["image"])
display(product_hunt)

Unnamed: 0,product_name,tagline,comments,price,URL,category,upvotes,date
0,World Explorer by Insured Nomads,Insurance meets travel tech for the global wor...,84,,https://www.producthunt.com/topics/global-nomad,Global Nomad,389.0,"Posts for January 1, 2022 | Product Hunt | Pro..."
1,Tailwind Box Shadows,Curated list of box shadows for your cards to ...,25,Free,https://www.producthunt.com/topics/productivity,Productivity,203.0,"Posts for January 1, 2022 | Product Hunt | Pro..."
2,24me Smart Personal Assistant,Keep new year's resolutions and get organized ...,8,Free,https://www.producthunt.com/topics/productivity,Productivity,127.0,"Posts for January 1, 2022 | Product Hunt | Pro..."
3,Habitify Challenge,Track & build new habits with your friends in ...,10,Free,https://www.producthunt.com/topics/productivity,Productivity,168.0,"Posts for January 1, 2022 | Product Hunt | Pro..."
4,Sunflower iOS App,Rewire your brain to associate sobriety with r...,14,Free,https://www.producthunt.com/topics/ios,iOS,105.0,"Posts for January 1, 2022 | Product Hunt | Pro..."
...,...,...,...,...,...,...,...,...
6459,Backpaper,The 'No BS' wallpaper app,5,Free,https://www.producthunt.com/topics/android,Android,24.0,"Posts for December 23, 2022 | Product Hunt | P..."
6460,Attribuly Attribution for Shopify,Maximize ROAS with real-time marketing attribu...,7,Free Options,https://www.producthunt.com/topics/saas,SaaS,23.0,"Posts for December 23, 2022 | Product Hunt | P..."
6461,Avatar Maker - Add a Santa Hat,Add a santa hat or approval badge to your soci...,17,Free Options,https://www.producthunt.com/topics/funny-games,Funny Games,28.0,"Posts for December 23, 2022 | Product Hunt | P..."
6462,Ultimate Life Planner 1.4,"Manage Everything in your life, entirely in No...",4,Payment Required,https://www.producthunt.com/topics/productivity,Productivity,21.0,"Posts for December 23, 2022 | Product Hunt | P..."


Changing data types and reformatting the 'date' field

In [81]:
product_hunt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6464 entries, 0 to 6463
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  6464 non-null   object 
 1   tagline       6464 non-null   object 
 2   comments      6464 non-null   object 
 3   price         6365 non-null   object 
 4   URL           6464 non-null   object 
 5   category      6464 non-null   object 
 6   upvotes       6464 non-null   float64
 7   date          6464 non-null   object 
dtypes: float64(1), object(7)
memory usage: 404.1+ KB


In [82]:
#Strings
product_hunt[["product_name", "tagline", "URL", "category","price"]] = product_hunt[["product_name", "tagline", "URL", "category", "price"]].astype(str)
#Float
product_hunt[["upvotes", "comments"]] = product_hunt[["upvotes", "comments"]].astype(int)
#Date
date_format = 'Posts for %B %d, %Y | Product Hunt | Product Hunt'
product_hunt["date"] = pd.to_datetime(product_hunt["date"], format=date_format)

Adding a 'day of the week' column which will be used in the visualization

In [83]:
def get_day_of_week(timestamp):
    date_str = timestamp.strftime('%Y-%m-%d')
    dt = datetime.strptime(date_str, '%Y-%m-%d')
    return dt.strftime('%A')

product_hunt['day_of_week'] = product_hunt['date'].apply(get_day_of_week)

In [84]:
product_hunt.head(5)

Unnamed: 0,product_name,tagline,comments,price,URL,category,upvotes,date,day_of_week
0,World Explorer by Insured Nomads,Insurance meets travel tech for the global wor...,84,,https://www.producthunt.com/topics/global-nomad,Global Nomad,389,2022-01-01,Saturday
1,Tailwind Box Shadows,Curated list of box shadows for your cards to ...,25,Free,https://www.producthunt.com/topics/productivity,Productivity,203,2022-01-01,Saturday
2,24me Smart Personal Assistant,Keep new year's resolutions and get organized ...,8,Free,https://www.producthunt.com/topics/productivity,Productivity,127,2022-01-01,Saturday
3,Habitify Challenge,Track & build new habits with your friends in ...,10,Free,https://www.producthunt.com/topics/productivity,Productivity,168,2022-01-01,Saturday
4,Sunflower iOS App,Rewire your brain to associate sobriety with r...,14,Free,https://www.producthunt.com/topics/ios,iOS,105,2022-01-01,Saturday


Structure of numerical features

In [85]:
product_hunt.describe()

Unnamed: 0,comments,upvotes
count,6464.0,6464.0
mean,47.940594,192.974629
std,84.458988,191.179484
min,0.0,1.0
25%,7.0,58.0
50%,21.0,123.0
75%,54.0,265.25
max,2257.0,997.0


#### Quality investigation

Missing values

In [86]:
product_hunt.isna().sum() #there are no missing values in my dataset

product_name    0
tagline         0
comments        0
price           0
URL             0
category        0
upvotes         0
date            0
day_of_week     0
dtype: int64

 Duplicates

In [87]:
#Returns True if there is at least a value in product_name seen more than once
boolean = not product_hunt["product_name"].is_unique
print(boolean)

# Find the duplicates in the "product_name" field
duplicates = product_hunt[product_hunt.duplicated(["product_name"])]
display(duplicates)

True


Unnamed: 0,product_name,tagline,comments,price,URL,category,upvotes,date,day_of_week
1413,Mage,Own your medical records,43,Free,https://www.producthunt.com/topics/android,Android,294,2022-03-19,Saturday
1915,Writerly,Start thinking like a writer,11,Free,https://www.producthunt.com/topics/writing,Writing,122,2022-04-16,Saturday
2349,Vessel,Your passport for the internet,74,Free,https://www.producthunt.com/topics/crypto,Crypto,356,2022-05-11,Wednesday
2562,Sobriety Hub,Quit your addiction with ease,3,Payment Required,https://www.producthunt.com/topics/health-fitness,Health & Fitness,54,2022-05-23,Monday
2928,Custom Wordle,Wordle for the classroom,0,Free,https://www.producthunt.com/topics/free-games,Free Games,33,2022-06-12,Sunday
3012,Letsmint,Сollab management for NFT collections,13,Free,https://www.producthunt.com/topics/tech,Tech,113,2022-06-16,Thursday
3069,Koala,"Free, open source and web based bear alternative.",8,Free,https://www.producthunt.com/topics/productivity,Productivity,167,2022-06-20,Monday
3094,Dioptra,The metrics engine for better data and better ...,5,Free Options,https://www.producthunt.com/topics/developer-t...,Developer Tools,36,2022-06-21,Tuesday
3136,Clay,If Apple's focus mode and Miro had a baby,3,Free,https://www.producthunt.com/topics/productivity,Productivity,29,2022-06-23,Thursday
3404,Campground,Sell your content as NFTS,2,Free,https://www.producthunt.com/topics/web3,Web3,37,2022-07-08,Friday


To differentiate multiple launches from the same product, I will add a number of occurrence to the name

In [88]:
# Create a new column called "product_name_numbered"
product_hunt["product_name_numbered"] = product_hunt["product_name"]

# Group the DataFrame by "product_name" and keep all of the columns
product_hunt = product_hunt.groupby("product_name").apply(lambda x: x)

# Add the number of occurrences to the product name for products that have been launched more than once
product_hunt.loc[product_hunt.duplicated(["product_name"]), "product_name_numbered"] = product_hunt["product_name"] + " " + (product_hunt.groupby("product_name").cumcount() + 1).astype(str)

# Rename the "product_name_numbered" column to "product_name" and drop the old "product_name" column
product_hunt= product_hunt.drop(columns=["product_name"])
product_hunt = product_hunt.rename(columns={"product_name_numbered": "product_name"})

display(product_hunt)

Unnamed: 0,tagline,comments,price,URL,category,upvotes,date,day_of_week,product_name
0,Insurance meets travel tech for the global wor...,84,,https://www.producthunt.com/topics/global-nomad,Global Nomad,389,2022-01-01,Saturday,World Explorer by Insured Nomads
1,Curated list of box shadows for your cards to ...,25,Free,https://www.producthunt.com/topics/productivity,Productivity,203,2022-01-01,Saturday,Tailwind Box Shadows
2,Keep new year's resolutions and get organized ...,8,Free,https://www.producthunt.com/topics/productivity,Productivity,127,2022-01-01,Saturday,24me Smart Personal Assistant
3,Track & build new habits with your friends in ...,10,Free,https://www.producthunt.com/topics/productivity,Productivity,168,2022-01-01,Saturday,Habitify Challenge
4,Rewire your brain to associate sobriety with r...,14,Free,https://www.producthunt.com/topics/ios,iOS,105,2022-01-01,Saturday,Sunflower iOS App
...,...,...,...,...,...,...,...,...,...
6459,The 'No BS' wallpaper app,5,Free,https://www.producthunt.com/topics/android,Android,24,2022-12-23,Friday,Backpaper
6460,Maximize ROAS with real-time marketing attribu...,7,Free Options,https://www.producthunt.com/topics/saas,SaaS,23,2022-12-23,Friday,Attribuly Attribution for Shopify
6461,Add a santa hat or approval badge to your soci...,17,Free Options,https://www.producthunt.com/topics/funny-games,Funny Games,28,2022-12-23,Friday,Avatar Maker - Add a Santa Hat
6462,"Manage Everything in your life, entirely in No...",4,Payment Required,https://www.producthunt.com/topics/productivity,Productivity,21,2022-12-23,Friday,Ultimate Life Planner 1.4


#### Saving the changes to a modified CSV 

In [89]:
product_hunt.to_csv('modified_producthunt_data.csv')