In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import seaborn as sns

## Product Hunt 2022 data- Data Wrangling & Cleaning

#### Structure investigation and adjustments

Importing the dataset in csv format. Data extracted from https://www.producthunt.com/time-travel using my own script (you can find it here)

In [52]:
product_hunt= pd.read_csv('/Users/martinahackbartt/Documents/producthunt_2022_EDA/productHunt2022_scrapingdata.csv', sep=',', decimal=',')
display(product_hunt)

Unnamed: 0,product_name,upvotes,category,tagline,comments,image link,pricing,position,date
0,World Explorer by Insured Nomads,389.0,Global Nomad,Insurance meets travel tech for the global wor...,84,https://ph-files.imgix.net/9bc773c8-1720-45b6-...,,1,"Posts for January 1, 2022 | Product Hunt | Pr..."
1,Tailwind Box Shadows,203.0,Productivity,Curated list of box shadows for your cards to ...,25,https://ph-files.imgix.net/f182991d-f034-43b8-...,Free,2,"Posts for January 1, 2022 | Product Hunt | Pr..."
2,24me Smart Personal Assistant,127.0,Productivity,Keep new year's resolutions and get organized ...,8,https://ph-files.imgix.net/9afadb00-151b-4f19-...,Free,3,"Posts for January 1, 2022 | Product Hunt | Pr..."
3,Habitify Challenge,168.0,Productivity,Track & build new habits with your friends in ...,10,https://ph-files.imgix.net/8941b193-9073-4864-...,Free,4,"Posts for January 1, 2022 | Product Hunt | Pr..."
4,Sunflower iOS App,108.0,iOS,Rewire your brain to associate sobriety with r...,14,https://ph-files.imgix.net/35f3ef3a-3a39-49b3-...,Free,5,"Posts for January 1, 2022 | Product Hunt | Pr..."
...,...,...,...,...,...,...,...,...,...
6602,Cloud Rebels,25.0,Tech,IT has never been easier,1,https://ph-files.imgix.net/a1da6bb2-796c-4020-...,Payment Required,12,"Posts for December 31, 2022 | Product Hunt | ..."
6603,SuenaGringo AI,24.0,Productivity,Helps Spanish immigrants write natural & engag...,5,https://ph-files.imgix.net/ab8126a3-206e-483c-...,Free Options,13,"Posts for December 31, 2022 | Product Hunt | ..."
6604,Gmax CRM Open Source,23.0,Productivity,Gmax CRM is an open source invoicing software,2,https://ph-files.imgix.net/90f32823-fb61-4b22-...,Free,14,"Posts for December 31, 2022 | Product Hunt | ..."
6605,Grocery Delivery App Development,18.0,Productivity,SpotnEats developing customized apps for you,2,https://ph-files.imgix.net/96b9a8f3-459b-4557-...,Payment Required,15,"Posts for December 31, 2022 | Product Hunt | ..."


Changing data types and reformatting the 'date' field

In [20]:
product_hunt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product_name  6607 non-null   object 
 1   upvotes       6607 non-null   float64
 2   category      6607 non-null   object 
 3   tagline       6607 non-null   object 
 4   comments      6607 non-null   object 
 5   image link    4666 non-null   object 
 6   pricing       6506 non-null   object 
 7   position      6607 non-null   int64  
 8   date          6607 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 464.7+ KB


In [53]:
#Strings
product_hunt[["product_name", "tagline", "category","pricing", "image link", "position"]] = product_hunt[["product_name", "tagline", "category", "pricing", "image link", "position"]].astype(str)
#Float
product_hunt[["upvotes", "comments"]] = product_hunt[["upvotes", "comments"]].astype(int)
#Date
date_format = ' Posts for %B %d, %Y | Product Hunt | Product Hunt'
product_hunt["date"] = pd.to_datetime(product_hunt["date"], format=date_format)

Adding a 'day of the week' column which will be used in the visualization

In [54]:
def get_day_of_week(timestamp):
    date_str = timestamp.strftime('%Y-%m-%d')
    dt = datetime.strptime(date_str, '%Y-%m-%d')
    return dt.strftime('%A')

product_hunt['day_of_week'] = product_hunt['date'].apply(get_day_of_week)

In [55]:
product_hunt.head(5)

Unnamed: 0,product_name,upvotes,category,tagline,comments,image link,pricing,position,date,day_of_week
0,World Explorer by Insured Nomads,389,Global Nomad,Insurance meets travel tech for the global wor...,84,https://ph-files.imgix.net/9bc773c8-1720-45b6-...,,1,2022-01-01,Saturday
1,Tailwind Box Shadows,203,Productivity,Curated list of box shadows for your cards to ...,25,https://ph-files.imgix.net/f182991d-f034-43b8-...,Free,2,2022-01-01,Saturday
2,24me Smart Personal Assistant,127,Productivity,Keep new year's resolutions and get organized ...,8,https://ph-files.imgix.net/9afadb00-151b-4f19-...,Free,3,2022-01-01,Saturday
3,Habitify Challenge,168,Productivity,Track & build new habits with your friends in ...,10,https://ph-files.imgix.net/8941b193-9073-4864-...,Free,4,2022-01-01,Saturday
4,Sunflower iOS App,108,iOS,Rewire your brain to associate sobriety with r...,14,https://ph-files.imgix.net/35f3ef3a-3a39-49b3-...,Free,5,2022-01-01,Saturday


Structure of numerical features

In [56]:
product_hunt.describe()

Unnamed: 0,upvotes,comments
count,6607.0,6607.0
mean,192.507038,47.699561
std,190.26315,84.171545
min,1.0,0.0
25%,58.0,7.0
50%,123.0,21.0
75%,265.0,54.0
max,997.0,2257.0


#### Quality investigation

Missing values

In [57]:
product_hunt.isna().sum() #there are no missing values in my dataset

product_name    0
upvotes         0
category        0
tagline         0
comments        0
image link      0
pricing         0
position        0
date            0
day_of_week     0
dtype: int64

 Duplicates

In [58]:
#Returns True if there is at least a value in product_name seen more than once
boolean = not product_hunt["product_name"].is_unique
print(boolean)

# Find the duplicates in the "product_name" field
duplicates = product_hunt[product_hunt.duplicated(["product_name"])]
display(duplicates)

True


Unnamed: 0,product_name,upvotes,category,tagline,comments,image link,pricing,position,date,day_of_week
1413,Mage,295,Android,Own your medical records,43,https://ph-files.imgix.net/a391cf58-088e-45f8-...,Free,1,2022-03-19,Saturday
1915,Writerly,122,Writing,Start thinking like a writer,11,https://ph-files.imgix.net/84987aab-56a8-4249-...,Free,4,2022-04-16,Saturday
2349,Vessel,358,Crypto,Your passport for the internet,74,https://ph-files.imgix.net/e0868038-4dc6-44c1-...,Free,7,2022-05-11,Wednesday
2562,Sobriety Hub,54,Health & Fitness,Quit your addiction with ease,3,,Payment Required,11,2022-05-23,Monday
2928,Custom Wordle,33,Free Games,Wordle for the classroom,0,https://ph-files.imgix.net/bb9d9e52-383b-4c57-...,Free,12,2022-06-12,Sunday
3012,Letsmint,114,Tech,Сollab management for NFT collections,13,https://ph-files.imgix.net/9ab2b6fd-b30f-4fc3-...,Free,19,2022-06-16,Thursday
3069,Koala,167,Productivity,"Free, open source and web based bear alternative.",8,https://ph-files.imgix.net/33eb4129-3601-451d-...,Free,10,2022-06-20,Monday
3094,Dioptra,36,Developer Tools,The metrics engine for better data and better ...,5,https://ph-files.imgix.net/0b826642-92d2-4738-...,Free Options,15,2022-06-21,Tuesday
3136,Clay,29,Productivity,If Apple's focus mode and Miro had a baby,3,https://ph-files.imgix.net/5c7cd867-9e90-40f8-...,Free,17,2022-06-23,Thursday
3404,Campground,38,Web3,Sell your content as NFTS,2,https://ph-files.imgix.net/08acebca-c0a3-4793-...,Free,16,2022-07-08,Friday


To differentiate multiple launches from the same product, I will add a number of occurrence to the name

In [63]:
# Create a new column called "product_name_numbered" and add it to the dataset
product_hunt["product_name_numbered"] = product_hunt["product_name"]
product_hunt['product_name_numbered'] = product_hunt['product_name_numbered']

# Group the DataFrame by "product_name" and keep all of the columns
product_hunt = product_hunt.groupby("product_name").apply(lambda x: x)

#Add the number of occurrences to the product name for products that have been launched more than once
product_hunt.loc[product_hunt.duplicated(["product_name"]), "product_name_numbered"] = product_hunt["product_name"] + " " + (product_hunt.groupby("product_name").cumcount() + 1).astype(str)

#Rename the "product_name_numbered" column to "product_name" and drop the old "product_name" column
product_hunt= product_hunt.drop(columns=["product_name"])
product_hunt = product_hunt.rename(columns={"product_name_numbered": "product_name"})

display(product_hunt)

Unnamed: 0,upvotes,category,tagline,comments,image link,pricing,position,date,day_of_week,product_name
0,389,Global Nomad,Insurance meets travel tech for the global wor...,84,https://ph-files.imgix.net/9bc773c8-1720-45b6-...,,1,2022-01-01,Saturday,World Explorer by Insured Nomads
1,203,Productivity,Curated list of box shadows for your cards to ...,25,https://ph-files.imgix.net/f182991d-f034-43b8-...,Free,2,2022-01-01,Saturday,Tailwind Box Shadows
2,127,Productivity,Keep new year's resolutions and get organized ...,8,https://ph-files.imgix.net/9afadb00-151b-4f19-...,Free,3,2022-01-01,Saturday,24me Smart Personal Assistant
3,168,Productivity,Track & build new habits with your friends in ...,10,https://ph-files.imgix.net/8941b193-9073-4864-...,Free,4,2022-01-01,Saturday,Habitify Challenge
4,108,iOS,Rewire your brain to associate sobriety with r...,14,https://ph-files.imgix.net/35f3ef3a-3a39-49b3-...,Free,5,2022-01-01,Saturday,Sunflower iOS App
...,...,...,...,...,...,...,...,...,...,...
6602,25,Tech,IT has never been easier,1,https://ph-files.imgix.net/a1da6bb2-796c-4020-...,Payment Required,12,2022-12-31,Saturday,Cloud Rebels
6603,24,Productivity,Helps Spanish immigrants write natural & engag...,5,https://ph-files.imgix.net/ab8126a3-206e-483c-...,Free Options,13,2022-12-31,Saturday,SuenaGringo AI
6604,23,Productivity,Gmax CRM is an open source invoicing software,2,https://ph-files.imgix.net/90f32823-fb61-4b22-...,Free,14,2022-12-31,Saturday,Gmax CRM Open Source
6605,18,Productivity,SpotnEats developing customized apps for you,2,https://ph-files.imgix.net/96b9a8f3-459b-4557-...,Payment Required,15,2022-12-31,Saturday,Grocery Delivery App Development


#### Saving the changes to a modified CSV 

In [64]:
product_hunt.to_csv('modified_producthunt_data.csv', index= False)