In [56]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import seaborn as sns

## Product Hunt 2022 data- Exploratory Data Analysis

#### Structure investigation and adjustments

Importing the dataset in csv format. Data extracted from https://www.producthunt.com/time-travel using https://www.octoparse.com/ 

In [42]:
product_hunt= pd.read_csv('/Users/martinahackbartt/Documents/producthunt_scraping_dec23.csv', sep=';', decimal=',')
product_hunt= product_hunt.drop(columns = ["image"])
display(product_hunt)

Unnamed: 0,product_name,tagline,comments,price,URL,category,upvotes,date
0,World Explorer by Insured Nomads,Insurance meets travel tech for the global wor...,84,,https://www.producthunt.com/topics/global-nomad,Global Nomad,389.0,"Posts for January 1, 2022 | Product Hunt | Pro..."
1,Tailwind Box Shadows,Curated list of box shadows for your cards to ...,25,Free,https://www.producthunt.com/topics/productivity,Productivity,203.0,"Posts for January 1, 2022 | Product Hunt | Pro..."
2,24me Smart Personal Assistant,Keep new year's resolutions and get organized ...,8,Free,https://www.producthunt.com/topics/productivity,Productivity,127.0,"Posts for January 1, 2022 | Product Hunt | Pro..."
3,Habitify Challenge,Track & build new habits with your friends in ...,10,Free,https://www.producthunt.com/topics/productivity,Productivity,168.0,"Posts for January 1, 2022 | Product Hunt | Pro..."
4,Sunflower iOS App,Rewire your brain to associate sobriety with r...,14,Free,https://www.producthunt.com/topics/ios,iOS,105.0,"Posts for January 1, 2022 | Product Hunt | Pro..."
...,...,...,...,...,...,...,...,...
6459,Backpaper,The 'No BS' wallpaper app,5,Free,https://www.producthunt.com/topics/android,Android,24.0,"Posts for December 23, 2022 | Product Hunt | P..."
6460,Attribuly Attribution for Shopify,Maximize ROAS with real-time marketing attribu...,7,Free Options,https://www.producthunt.com/topics/saas,SaaS,23.0,"Posts for December 23, 2022 | Product Hunt | P..."
6461,Avatar Maker - Add a Santa Hat,Add a santa hat or approval badge to your soci...,17,Free Options,https://www.producthunt.com/topics/funny-games,Funny Games,28.0,"Posts for December 23, 2022 | Product Hunt | P..."
6462,Ultimate Life Planner 1.4,"Manage Everything in your life, entirely in No...",4,Payment Required,https://www.producthunt.com/topics/productivity,Productivity,21.0,"Posts for December 23, 2022 | Product Hunt | P..."


Changing data types and reformatting the 'date' field

In [57]:
product_hunt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6464 entries, 0 to 6463
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   product_name  6464 non-null   object        
 1   tagline       6464 non-null   object        
 2   comments      6464 non-null   int64         
 3   price         6464 non-null   object        
 4   URL           6464 non-null   object        
 5   category      6464 non-null   object        
 6   upvotes       6464 non-null   int64         
 7   date          6464 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 404.1+ KB


In [102]:
#Strings
product_hunt[["product_name", "tagline", "URL", "category","price"]] = product_hunt[["product_name", "tagline", "URL", "category", "price"]].astype(str)
#Float
product_hunt[["upvotes", "comments"]] = product_hunt[["upvotes", "comments"]].astype(int)
#Date
date_format = 'Posts for %B %d, %Y | Product Hunt | Product Hunt'
product_hunt["date"] = pd.to_datetime(product_hunt["date"], format=date_format)

Adding a 'day of the week' column which will be used in the visualization

In [105]:
def get_day_of_week(timestamp):
    date_str = timestamp.strftime('%Y-%m-%d')
    dt = datetime.strptime(date_str, '%Y-%m-%d')
    return dt.strftime('%A')

product_hunt['day_of_week'] = product_hunt['date'].apply(get_day_of_week)

In [106]:
product_hunt.head(5)

Unnamed: 0,product_name,tagline,comments,price,URL,category,upvotes,date,day_of_week
0,World Explorer by Insured Nomads,Insurance meets travel tech for the global wor...,84,,https://www.producthunt.com/topics/global-nomad,Global Nomad,389,2022-01-01,Saturday
1,Tailwind Box Shadows,Curated list of box shadows for your cards to ...,25,Free,https://www.producthunt.com/topics/productivity,Productivity,203,2022-01-01,Saturday
2,24me Smart Personal Assistant,Keep new year's resolutions and get organized ...,8,Free,https://www.producthunt.com/topics/productivity,Productivity,127,2022-01-01,Saturday
3,Habitify Challenge,Track & build new habits with your friends in ...,10,Free,https://www.producthunt.com/topics/productivity,Productivity,168,2022-01-01,Saturday
4,Sunflower iOS App,Rewire your brain to associate sobriety with r...,14,Free,https://www.producthunt.com/topics/ios,iOS,105,2022-01-01,Saturday


Structure of numerical features

In [67]:
product_hunt.describe()

Unnamed: 0,comments,upvotes
count,6464.0,6464.0
mean,47.940594,192.974629
std,84.458988,191.179484
min,0.0,1.0
25%,7.0,58.0
50%,21.0,123.0
75%,54.0,265.25
max,2257.0,997.0


#### Quality investigation

Missing values

In [83]:
product_hunt.isna().sum() #there are no missing values in my dataset

product_name    0
tagline         0
comments        0
price           0
URL             0
category        0
upvotes         0
date            0
dtype: int64

 Duplicates

In [111]:
# Check number of duplicates while ignoring the index feature
n_duplicates = product_hunt.drop(labels=["product_name"], axis=1).duplicated().sum()
print(f"There seem to be {n_duplicates} duplicates in the database.")

There seem to be 0 duplicates in the database.


#### Saving the changes to a modified CSV 

In [109]:
product_hunt.to_csv('modified_producthunt_data.csv')