# Bored Panda Sentiment Analysis - Data Prep

In [1]:
#import libraries
import os
import pandas as pd

In [2]:
# Specfify directories and gather list of csv files to merge
data_dir = "scraping/csv/"
csv_files = os.listdir(data_dir)
csv_files

['scraping_result_pages_1-20_2025-01-27_11-32-11.csv',
 'scraping_result_pages_21-40_2025-01-27_11-34-38.csv',
 'scraping_result_pages_41-60_2025-01-27_11-37-54.csv',
 'scraping_result_pages_61-80_2025-01-27_11-40-05.csv',
 'scraping_result_pages_81-100_2025-01-27_11-44-13.csv']

In [3]:
# Concat datafiles
df = pd.read_csv(data_dir+csv_files.pop(0))

while csv_files:
    file = csv_files.pop(0)
    temp_df = pd.read_csv(data_dir+file)
    df = pd.concat([df, temp_df], axis=0)
    
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2135 entries, 0 to 423
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              2135 non-null   object 
 1   title                   2135 non-null   object 
 2   voting                  2135 non-null   int64  
 3   nr_comments             2006 non-null   float64
 4   has_experimental_title  2135 non-null   bool   
 5   page_link               2135 non-null   object 
 6   link                    2135 non-null   object 
 7   date_posted             2101 non-null   object 
 8   categories              2124 non-null   object 
 9   nr_postings_total       2135 non-null   int64  
 10  nr_postings_shown       2135 non-null   int64  
 11  authors                 2123 non-null   object 
 12  authors_role            2124 non-null   object 
 13  error_occured           2135 non-null   bool   
dtypes: bool(2), float64(1), int64(3), object(8)
me

In [4]:
# Rename id column; reset index
df = df.rename(columns={"Unnamed: 0": "post_id"})
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2135 entries, 0 to 2134
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   post_id                 2135 non-null   object 
 1   title                   2135 non-null   object 
 2   voting                  2135 non-null   int64  
 3   nr_comments             2006 non-null   float64
 4   has_experimental_title  2135 non-null   bool   
 5   page_link               2135 non-null   object 
 6   link                    2135 non-null   object 
 7   date_posted             2101 non-null   object 
 8   categories              2124 non-null   object 
 9   nr_postings_total       2135 non-null   int64  
 10  nr_postings_shown       2135 non-null   int64  
 11  authors                 2123 non-null   object 
 12  authors_role            2124 non-null   object 
 13  error_occured           2135 non-null   bool   
dtypes: bool(2), float64(1), int64(3), object

## Sort Dataframe

Having used asyncio tasks for the data scraping script, the stored data is not in its original order (e.g. page 20 is listed before page 1)

In [5]:
# Sort by page nr, preserving original order on page
df["page_link_nr"] = df["page_link"].str.extract('(\d+)')
df["df_index"] = df.index.to_list()

df = df.sort_values(by=["page_link_nr", "df_index"])

# drop columns created for sorting
df = df.drop(["page_link_nr", "df_index"], axis=1)

df.head()

Unnamed: 0,post_id,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured
160,8078562,"80 Posts So Unhinged, You May End Up Laughing ...",61,6.0,True,https://www.boredpanda.com/page/1/,https://www.boredpanda.com/surreal-weird-image...,"Jan 27, 2025",Curiosities,80,80,Adelaide Ross | Ieva Pečiulytė,"Writer, BoredPanda staff | Writer, BoredP...",False
161,8090374,"""It Was A Warning"": Woman Receives Disturbing ...",26,4.0,True,https://www.boredpanda.com/page/1/,https://www.boredpanda.com/stranger-approaches...,"Jan 27, 2025","Lifestyle, News",0,0,Binitha Jacob | Donata Leskauskaite,"Writer, BoredPanda staff | Writer, BoredP...",False
162,8077890,Woman Reveals She Understood Every Insult From...,31,29.0,False,https://www.boredpanda.com/page/1/,https://www.boredpanda.com/hiding-speak-husban...,"Jan 27, 2025","Family, Relationships",0,0,Nikita Manot | Shelly Fourer,"Writer, BoredPanda staff | Writer, BoredP...",False
163,8089627,Ryan Reynolds’ “Horrifically Mean” Treatment O...,-10,6.0,True,https://www.boredpanda.com/page/1/,https://www.boredpanda.com/tj-miller-speaks-ou...,"Jan 27, 2025","Celebrities, News",0,0,Marina Urman | Karina Babenok,"Writer, BoredPanda staff | Writer, BoredP...",False
164,8091431,Zoe Saldaña’s Blackface Scandal Comes Back To ...,22,4.0,True,https://www.boredpanda.com/page/1/,https://www.boredpanda.com/zoe-saldana-blackfa...,"Jan 27, 2025","Celebrities, News",0,0,Marina Urman | Lei RV,"Writer, BoredPanda staff | Writer, BoredP...",False


## Remove Duplicates

The same posts are sometimes repeated on later pages of the website. They cover the same title, link, and id. 

Note that duplicate IDs were already somewhat accounted for in the scraping script. As it was not 100% clear whether IDs might be reused, the script added "_2" to the ID of the currently scraped post if this particular ID was already existent in the stored data. Since pages were scraped in blocks (1-20; 21-40,...) it is still possible that a duplicate ID did not get suffixed with "_2" as the duplicate may have appeared in a different block.

In [6]:
# Define columns which should be used for duplicates flagging
dupl_cols_short = ["title", "link"]

print(df.duplicated(subset=dupl_cols_short, keep=False).sum())
# -> 238 duplicates

dupl_ids_title_link = df[df.duplicated(subset=dupl_cols_short, keep=False)].index.to_list()

238


In [7]:
# Define a stricter subset; i.e. include more columns.
dupl_cols_long = ["title", 
                  "voting", 
                  "nr_comments",
                  "categories", 
                  "nr_postings_total", 
                  "nr_postings_shown", 
                  "link", 
                  "authors",
                  "authors_role"]

print(df.duplicated(subset=dupl_cols_long, keep=False).sum())
# -> 228 duplicates

dupl_ids_more_cols = df[df.duplicated(subset=dupl_cols_long, keep=False)].index.to_list()

228


In [8]:
# Show ids flagged by "title" and "link", but not flagged by stricter subset postings.
dupl_ids_dif = list(set(dupl_ids_title_link) - set(dupl_ids_more_cols))
df.loc[dupl_ids_dif].sort_values(by="title")

Unnamed: 0,post_id,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured
225,7986310,20 Cat Drawings By This Artist That Are Probab...,74,6.0,False,https://www.boredpanda.com/page/11/,https://www.boredpanda.com/cute-cat-art-mjmajc...,"Jan 16, 2025",Art,20,20,Hidrėlėy,"Author, Pro member | Author, Pro member ...",False
388,7986310_2,20 Cat Drawings By This Artist That Are Probab...,75,6.0,False,https://www.boredpanda.com/page/15/,https://www.boredpanda.com/cute-cat-art-mjmajc...,"Jan 16, 2025",Art,20,20,Hidrėlėy,"Author, Pro member | Author, Pro member ...",False
228,7937504,32 Ameowsing Pics Of Stuff On Cats That Might ...,77,7.0,False,https://www.boredpanda.com/page/11/,https://www.boredpanda.com/funny-stuff-cats-st...,"Jan 16, 2025","Funny, Funny Memes",32,32,Ivan Ayliffe | Denis Krotovas,"Writer, BoredPanda staff | Writer, BoredP...",False
327,7937504_2,32 Ameowsing Pics Of Stuff On Cats That Might ...,80,8.0,False,https://www.boredpanda.com/page/14/,https://www.boredpanda.com/funny-stuff-cats-st...,"Jan 16, 2025","Funny, Funny Memes",32,32,Ivan Ayliffe | Denis Krotovas,"Writer, BoredPanda staff | Writer, BoredP...",False
393,8018680_2,Lab Coat Laughs: 50 Science Memes To Leave You...,135,19.0,False,https://www.boredpanda.com/page/15/,https://www.boredpanda.com/science-funniest-pi...,"Jan 15, 2025","Funny, Funny Memes",105,49,Justin Sandberg | Ilona Baliūnaitė,"Writer, BoredPanda staff | Writer, BoredP...",False
213,8018680,Lab Coat Laughs: 50 Science Memes To Leave You...,134,19.0,False,https://www.boredpanda.com/page/11/,https://www.boredpanda.com/science-funniest-pi...,"Jan 15, 2025","Funny, Funny Memes",105,49,Justin Sandberg | Ilona Baliūnaitė,"Writer, BoredPanda staff | Writer, BoredP...",False
224,8023271,This Artist From Taiwan Creates A Magical Worl...,69,8.0,False,https://www.boredpanda.com/page/11/,https://www.boredpanda.com/surreal-cats-photos...,"Jan 16, 2025","Animals, Cats",17,17,Hidrėlėy,"Author, Pro member | Author, Pro member ...",False
387,8023271_2,This Artist From Taiwan Creates A Magical Worl...,70,8.0,False,https://www.boredpanda.com/page/15/,https://www.boredpanda.com/surreal-cats-photos...,"Jan 16, 2025","Animals, Cats",17,17,Hidrėlėy,"Author, Pro member | Author, Pro member ...",False
328,7981652_2,“Today I Learned”: 30 Interesting And Weird Fa...,72,6.0,False,https://www.boredpanda.com/page/14/,https://www.boredpanda.com/weird-cool-til-facts/,"Jan 16, 2025",Facts,97,29,Kornelija Viečaitė | Evelina Šiukšterytė | Ind...,"Writer, BoredPanda staff | Writer, BoredP...",False
216,7981652,“Today I Learned”: 30 Interesting And Weird Fa...,71,6.0,False,https://www.boredpanda.com/page/11/,https://www.boredpanda.com/weird-cool-til-facts/,"Jan 16, 2025",Facts,97,29,Kornelija Viečaitė | Evelina Šiukšterytė | Ind...,"Writer, BoredPanda staff | Writer, BoredP...",False


In [9]:
# Voting and number of comments have changed. Likely more recent posts that had active
#  users interacting with it (in the few minutes between scraping)

# See what happens if these two columns are removed from subset.
dupl_cols_long2 = ["title",   "categories", "nr_postings_total", "nr_postings_shown", "link", "authors", "authors_role"]
dupl_ids_more_cols2 = df[df.duplicated(subset=dupl_cols_long2, keep=False)].index.to_list()
dupl_ids_dif2 = list(set(dupl_ids_title_link) - set(dupl_ids_more_cols2))
dupl_ids_dif2
# -> Empty list, i.e. duplicate results are the same.

[]

In [10]:
# Observe duplicates based on title and link only
df[df.duplicated(subset=["title", "link"], keep=False)].sort_values(by=["title"])

Unnamed: 0,post_id,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured
212,8036948,10 David Lynch Quotes That Will Alter How You ...,43,9.0,False,https://www.boredpanda.com/page/11/,https://www.boredpanda.com/10-david-lynch-quot...,"Jan 16, 2025","Celebrities, News",10,10,Abel Musa Miño | Renan Duarte,"Writer, BoredPanda staff | Writer, BoredP...",False
331,8036948_2,10 David Lynch Quotes That Will Alter How You ...,43,9.0,False,https://www.boredpanda.com/page/14/,https://www.boredpanda.com/10-david-lynch-quot...,"Jan 16, 2025","Celebrities, News",10,10,Abel Musa Miño | Renan Duarte,"Writer, BoredPanda staff | Writer, BoredP...",False
1028,7833620,10 Game-Changing Sleep Tips From Redditors Who...,-53,,False,https://www.boredpanda.com/page/59/,https://www.boredpanda.com/shopping/solutions/...,"Dec 09, 2024","Problem Solvers, Shopping",25,25,Justina Čiapaitė,"Author, BoredPanda staff | Author, BoredP...",False
1318,7833620_2,10 Game-Changing Sleep Tips From Redditors Who...,-53,,False,https://www.boredpanda.com/page/49/,https://www.boredpanda.com/shopping/solutions/...,"Dec 09, 2024","Problem Solvers, Shopping",25,25,Justina Čiapaitė,"Author, BoredPanda staff | Author, BoredP...",False
251,8036794,15 Cringe-Worthy Moments That Defined 2024 Awa...,-11,3.0,False,https://www.boredpanda.com/page/11/,https://www.boredpanda.com/15-most-awkward-mom...,"Jan 16, 2025","Celebrities, News",15,15,Marina Urman | Lei RV,"Writer, BoredPanda staff | Writer, BoredP...",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,7936936,“Unforgivable”: Amazon Faces Backlash For Airi...,20,39.0,False,https://www.boredpanda.com/page/37/,https://www.boredpanda.com/amazon-cut-importan...,"Dec 27, 2024","Movies&TV, News",0,0,Abel Musa Miño | Renan Duarte,"Writer, BoredPanda staff | Writer, BoredP...",False
1649,7720406,“What Is This?”: 50 Times People Spotted Somet...,75,29.0,False,https://www.boredpanda.com/page/65/,https://www.boredpanda.com/tiny-miniature-things/,"Nov 24, 2024",Curiosities,80,49,Justin Sandberg | Gabija Saveiskyte,"Writer, BoredPanda staff | Writer, BoredP...",False
1939,7720406,“What Is This?”: 50 Times People Spotted Somet...,75,29.0,False,https://www.boredpanda.com/page/82/,https://www.boredpanda.com/tiny-miniature-things/,"Nov 24, 2024",Curiosities,80,49,Justin Sandberg | Gabija Saveiskyte,"Writer, BoredPanda staff | Writer, BoredP...",False
383,7969922_2,“Who’s Babysitting The Kids?”: Women-Only Nigh...,28,15.0,False,https://www.boredpanda.com/page/15/,https://www.boredpanda.com/mama-goes-dancing-g...,"Jan 16, 2025","Lifestyle, News",0,0,Marina Urman | Karina Babenok,"Writer, BoredPanda staff | Writer, BoredP...",False


In [11]:
# Remove duplicates. As seen above, identical posts were found on different pages.
#  The posts that were scraped later show more comments in a few instances. Hence, 
#  the last occurance of the duplicated values are kept to reflect latest count.
len_df_pre = len(df)
print("# rows before duplicates removal:", len(df))
df = df.drop_duplicates(subset=["title", "link"], keep="last", )
print("# rows after duplicates removal:", len(df))
print("# rows removed:", len_df_pre - len(df))

# rows before duplicates removal: 2135
# rows after duplicates removal: 2016
# rows removed: 119


## Post ID

Change type to string and remove "_2" suffix.

In [12]:
# Clean id column
df["post_id"] = df["post_id"].astype(str)
df["post_id"].str.contains("_").value_counts(dropna=False)

post_id
False    1989
True       27
Name: count, dtype: int64

In [13]:
df["post_id_new"] = df["post_id"].str.replace(r"_.*$", "", regex=True)

In [14]:
df.loc[df["post_id"].str.contains("_"), ["post_id", "post_id_new"]].head()

Unnamed: 0,post_id,post_id_new
327,7937504_2,7937504
328,7981652_2,7981652
329,8023496_2,8023496
330,6807053_2,6807053
331,8036948_2,8036948


In [15]:
print("len df:", len(df))
print("unique IDs:", df["post_id_new"].nunique())

len df: 2016
unique IDs: 2016


In [16]:
# drop old column, rename new column
df = df.drop("post_id", axis=1)
df = df.rename(columns={"post_id_new": "post_id"})

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2016 entries, 160 to 2010
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   title                   2016 non-null   object 
 1   voting                  2016 non-null   int64  
 2   nr_comments             1896 non-null   float64
 3   has_experimental_title  2016 non-null   bool   
 4   page_link               2016 non-null   object 
 5   link                    2016 non-null   object 
 6   date_posted             1984 non-null   object 
 7   categories              2007 non-null   object 
 8   nr_postings_total       2016 non-null   int64  
 9   nr_postings_shown       2016 non-null   int64  
 10  authors                 2006 non-null   object 
 11  authors_role            2007 non-null   object 
 12  error_occured           2016 non-null   bool   
 13  post_id                 2016 non-null   object 
dtypes: bool(2), float64(1), int64(3), object(8)

## Errors / Missings

### NAs

In [18]:
# Count NAs in columns
df.isna().sum()

title                       0
voting                      0
nr_comments               120
has_experimental_title      0
page_link                   0
link                        0
date_posted                32
categories                  9
nr_postings_total           0
nr_postings_shown           0
authors                    10
authors_role                9
error_occured               0
post_id                     0
dtype: int64

In [19]:
# 145 postings with containing a missing in any column
len(df[df.isna().any(axis=1)])

145

#### NAs in Number of comments

In [20]:
# Missings in nr_comments stem from the fact that the particular postings are ads/shopping posts
#  and do not provide commenting per default.
df[df["nr_comments"].isna()].head()

Unnamed: 0,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured,post_id
64,Home Office Heroes: 24 Products That’ll Help Y...,-58,,False,https://www.boredpanda.com/page/10/,https://www.boredpanda.com/shopping/office/ele...,"Jan 21, 2025","Office Essentials, Shopping",24,24,Eligijus Sinkunas,"Author, BoredPanda staff | Author, BoredP...",False,7963845
65,24 Fixes For Your Most Embarrassing Yet Unavoi...,-8,,False,https://www.boredpanda.com/page/10/,https://www.boredpanda.com/shopping/beauty-sho...,"Jan 21, 2025","Beauty & Self Care, Shopping",24,24,Eligijus Sinkunas,"Author, BoredPanda staff | Author, BoredP...",False,8033934
1951,Transforming Rentals (And Keeping Landlords Ha...,-59,,False,https://www.boredpanda.com/page/100/,https://www.boredpanda.com/shopping/home-shopp...,"Nov 12, 2024","Home & Garden, Shopping",21,21,Eligijus Sinkunas,"Author, BoredPanda staff | Author, BoredP...",False,7667524
1952,"23 Products With Reviews So Good, They’ll Make...",-91,,False,https://www.boredpanda.com/page/100/,https://www.boredpanda.com/shopping/deals/thin...,"Nov 12, 2024","Shopping, Shopping Trends and Deals",23,23,Eligijus Sinkunas,"Author, BoredPanda staff | Author, BoredP...",False,7652990
1953,21 White Elephant Gifts That Prove Laughter Is...,-83,,False,https://www.boredpanda.com/page/100/,https://www.boredpanda.com/shopping/funny-shop...,"Nov 12, 2024","Funny And Unique Finds, Shopping",21,21,Mariia Tkachenko,"Author, BoredPanda staff | Author, BoredP...",False,7601955


In [21]:
df.loc[df["nr_comments"].isna(), "categories"].value_counts()

categories
Gift Guides, Shopping                  29
Shopping, Shopping Trends and Deals    27
Problem Solvers, Shopping              17
Home & Garden, Shopping                16
Funny And Unique Finds, Shopping        8
Office Essentials, Shopping             5
Beauty & Self Care, Shopping            3
Shopping, Travel & Adventure            3
Shopping, Tech & Gadgets                2
Home & Garden, Problem Solvers          1
Pets, Shopping                          1
Name: count, dtype: int64

All but 1 posting with missing "nr_comments" contains the category "Shopping"

In [22]:
# See whether the reverse is also true (those containing the word "Shopping" having NA in nr_comments)
df.loc[df["categories"].str.contains("Shopping") == True, "nr_comments"].value_counts(dropna=False)

nr_comments
NaN     111
0.0       4
1.0       2
3.0       1
26.0      1
8.0       1
7.0       1
Name: count, dtype: int64

In [23]:
df[(df["categories"].str.contains("Shopping") == True) & (df["nr_comments"].isna() == False)]

Unnamed: 0,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured,post_id
1211,25 Gifts For People Who Love Their Car More Th...,-85,3.0,False,https://www.boredpanda.com/page/57/,https://www.boredpanda.com/shopping/gifts-shop...,"Dec 10, 2024","Gift Guides, Shopping",25,25,Eligijus Sinkunas,"Author, BoredPanda staff | Author, BoredP...",False,7835693
1552,23 Amazon Finds You’ll Use More Than Your Phone,-132,0.0,False,https://www.boredpanda.com/page/66/,https://www.boredpanda.com/shopping/solutions/...,"Dec 04, 2024","Problem Solvers, Shopping",22,22,Eligijus Sinkunas,"Author, BoredPanda staff | Author, BoredP...",False,7805237
1497,2025 Success Starts Now: 25 Ways To Begin Purs...,-324,0.0,False,https://www.boredpanda.com/page/70/,https://www.boredpanda.com/new-year-new-me-2025/,"Dec 02, 2024",Shopping,25,25,Karolina Kondratavičiūtė,"Author, BoredPanda staff | Author, BoredP...",False,6142301
1533,Socks Are So Last Year. These 26 Gifts Are *Wa...,-202,1.0,False,https://www.boredpanda.com/page/75/,https://www.boredpanda.com/shopping/funny-shop...,"Nov 28, 2024","Funny And Unique Finds, Shopping",26,26,Karolina Kondratavičiūtė,"Author, BoredPanda staff | Author, BoredP...",False,6022889
1462,25 Kitchen Gadgets That Have Reviewers Raving ...,-218,26.0,False,https://www.boredpanda.com/page/77/,https://www.boredpanda.com/shopping/solutions/...,"Nov 27, 2024","Problem Solvers, Shopping",25,25,Karolina Kondratavičiūtė,"Author, BoredPanda staff | Author, BoredP...",False,6412753
1419,23 Giftable Things That You’ll End Up Buying F...,-187,8.0,False,https://www.boredpanda.com/page/78/,https://www.boredpanda.com/shopping/gifts-shop...,"Nov 26, 2024","Gift Guides, Shopping",23,23,Justina Čiapaitė,"Author, BoredPanda staff | Author, BoredP...",False,6387236
2078,13 Gifts That’ll Make Any Travel Buff Say “For...,-280,0.0,False,https://www.boredpanda.com/page/86/,https://www.boredpanda.com/shopping/travel-ide...,"Nov 21, 2024","Shopping, Travel & Adventure",13,13,Karolina Kondratavičiūtė,"Author, BoredPanda staff | Author, BoredP...",False,6049502
1802,"Work-From-Home Warriors, Rejoice! 22 Office Es...",-348,1.0,False,https://www.boredpanda.com/page/87/,https://www.boredpanda.com/shopping/office/hom...,"Nov 20, 2024","Office Essentials, Shopping",21,21,Justina Čiapaitė,"Author, BoredPanda staff | Author, BoredP...",False,6198501
1755,22 Travel Gadgets That’ll Make Your Holiday Jo...,-239,0.0,False,https://www.boredpanda.com/page/98/,https://www.boredpanda.com/shopping/travel-ide...,"Nov 13, 2024","Shopping, Travel & Adventure",22,22,Justina Čiapaitė,"Author, BoredPanda staff | Author, BoredP...",False,6414581
1760,24 Presents That’ll Have 12-Year-Olds Losing T...,-229,7.0,False,https://www.boredpanda.com/page/98/,https://www.boredpanda.com/shopping/gifts-shop...,"Nov 13, 2024","Gift Guides, Shopping",24,24,Justina Čiapaitė,"Author, BoredPanda staff | Author, BoredP...",False,5981618


Those seem to be ads as well.

In [24]:
# Check NAs for those not being ads again.
df[((df["categories"].str.contains("Shopping") != True)
    & (df.isna().any(axis=1))
    & (df["link"].str.contains(r"%cat%") != True))].isna().sum()

title                      0
voting                     0
nr_comments                1
has_experimental_title     0
page_link                  0
link                       0
date_posted               23
categories                 0
nr_postings_total          0
nr_postings_shown          0
authors                    1
authors_role               0
error_occured              0
post_id                    0
dtype: int64

In [25]:
# Some posts do in fact not provide a date. Hence the remaining NAs to inspect are those
#  in nr_comments (1) and authors (1)
df[((df["categories"].str.contains("Shopping") != True)
    & (df.isna().any(axis=1))
    & (df["link"].str.contains(r"%cat%") != True)
    & (df["nr_comments"].isna() | df["authors"].isna()))]

Unnamed: 0,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured,post_id
791,Tom Brady’s Net Worth: How The NFL Legend Buil...,-59,4.0,False,https://www.boredpanda.com/page/26/,https://www.boredpanda.com/tom-brady-net-worth/,"Jan 07, 2025","Celebrities, Entertainment",0,0,,"Author, BoredPanda staff | Author, BoredP...",False,7978709
1531,Is Your Apartment Tiny? These 23 Space-Saving ...,-181,,False,https://www.boredpanda.com/page/75/,https://www.boredpanda.com/shopping/solutions/...,"Nov 28, 2024","Home & Garden, Problem Solvers",23,23,Justina Čiapaitė,"Author, BoredPanda staff | Author, BoredP...",False,6272633


In [26]:
# The 1 post should have an author. It will be added manually.
print(df.loc[791, "authors"])
df.loc[791, "authors"] = "Tegan Springate"
print(df.loc[791, "authors"])

nan
Tegan Springate


In [27]:
# The other post with NaN in nr_comments is an ad and should be marked as such in the next step.

#### NAs Date Posted

In [28]:
# Quick check
df.loc[df["date_posted"].isna(), "link"].to_list()[0]

# -> Post does not show date posted
# --> Ok!

'https://www.boredpanda.com/funny-sketchbook-comics-toonhole-chris/'

### Errors

During scraping, posts were flagged as "error_occured" if certain information could not been collected. These will be checked, as well as NAs in different columns.

In [29]:
df["error_occured"].value_counts()

error_occured
False    2007
True        9
Name: count, dtype: int64

In [30]:
# Show rows where error occured during scraping
df[df["error_occured"]]

Unnamed: 0,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured,post_id
502,"Boring Tasks, But Make It Fun: 22 Items That’l...",-66,,False,https://www.boredpanda.com/page/24/,https://www.boredpanda.com/%cat%/novelty-produ...,,,0,0,,,True,7963820
484,25 Hobbies That’ll Make You The Most Interesti...,-12,,False,https://www.boredpanda.com/page/27/,https://www.boredpanda.com/%cat%/new-hobbies-2...,,,0,0,,,True,7962525
811,Need It Now: 21 Cool Items That Are Calling Ou...,-61,,False,https://www.boredpanda.com/page/30/,https://www.boredpanda.com/%cat%/coolest-thing...,,,0,0,,,True,7901801
658,Beauty Meets Brains: 52 Kitchen Finds That Che...,-116,,False,https://www.boredpanda.com/page/31/,https://www.boredpanda.com/%cat%/kitchen-thing...,,,0,0,,,True,7875591
733,20 Products That Are Totally Dividing The Inte...,-80,,False,https://www.boredpanda.com/page/35/,https://www.boredpanda.com/%cat%/random-things...,,,0,0,,,True,7868010
1029,Shark Tank Success Stories: 25 Products That B...,-50,,False,https://www.boredpanda.com/page/59/,https://www.boredpanda.com/%cat%/best-shark-ta...,,,0,0,,,True,7833512
1560,A Year’s Worth Of Weird: 19 Amazon Discoveries...,-138,13.0,False,https://www.boredpanda.com/page/64/,https://www.boredpanda.com/%cat%/weirdest-thin...,,,0,0,,,True,7805588
2082,26 Novelty Kitchen Items That Just Want To Bri...,-230,,False,https://www.boredpanda.com/page/86/,https://www.boredpanda.com/%cat%/kitchen-novel...,,,0,0,,,True,6922388
1741,24 Novelty Products That’ll Add A Dose Of Fun ...,-243,,False,https://www.boredpanda.com/page/92/,https://www.boredpanda.com/%cat%/unheard-novel...,,,0,0,,,True,6864253


In [31]:
# -> Those are all ads and do not contain most of the elements scraped.
# They will be flagged as ads in the next step

## Ads

In [32]:
# Mark postings as ads to drop them potentially
df["is_ad"] = df["categories"].str.contains("Shopping") == True

In [33]:
# Take link page into account
df[(df["link"].str.contains(r"/shopping")) & (df["categories"].str.contains("Shopping") != True)]

Unnamed: 0,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured,post_id,is_ad
1531,Is Your Apartment Tiny? These 23 Space-Saving ...,-181,,False,https://www.boredpanda.com/page/75/,https://www.boredpanda.com/shopping/solutions/...,"Nov 28, 2024","Home & Garden, Problem Solvers",23,23,Justina Čiapaitė,"Author, BoredPanda staff | Author, BoredP...",False,6272633,False
1466,21 White Elephant Gifts That Prove Good Things...,-332,0.0,False,https://www.boredpanda.com/page/77/,https://www.boredpanda.com/shopping/gifts-shop...,"Nov 27, 2024","Funny And Unique Finds, Gift Guides",21,21,Karolina Kondratavičiūtė,"Author, BoredPanda staff | Author, BoredP...",False,6052706,False


In [34]:
df.loc[((df["link"].str.contains(r"/shopping"))
         & (df["categories"].str.contains("Shopping") != True)), 
         "is_ad"] = True

In [35]:
# Some ads also contain the directory "%cat%" in their link
df.loc[(df["link"].str.contains(r"%cat%"))]

Unnamed: 0,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured,post_id,is_ad
502,"Boring Tasks, But Make It Fun: 22 Items That’l...",-66,,False,https://www.boredpanda.com/page/24/,https://www.boredpanda.com/%cat%/novelty-produ...,,,0,0,,,True,7963820,False
484,25 Hobbies That’ll Make You The Most Interesti...,-12,,False,https://www.boredpanda.com/page/27/,https://www.boredpanda.com/%cat%/new-hobbies-2...,,,0,0,,,True,7962525,False
811,Need It Now: 21 Cool Items That Are Calling Ou...,-61,,False,https://www.boredpanda.com/page/30/,https://www.boredpanda.com/%cat%/coolest-thing...,,,0,0,,,True,7901801,False
658,Beauty Meets Brains: 52 Kitchen Finds That Che...,-116,,False,https://www.boredpanda.com/page/31/,https://www.boredpanda.com/%cat%/kitchen-thing...,,,0,0,,,True,7875591,False
733,20 Products That Are Totally Dividing The Inte...,-80,,False,https://www.boredpanda.com/page/35/,https://www.boredpanda.com/%cat%/random-things...,,,0,0,,,True,7868010,False
1029,Shark Tank Success Stories: 25 Products That B...,-50,,False,https://www.boredpanda.com/page/59/,https://www.boredpanda.com/%cat%/best-shark-ta...,,,0,0,,,True,7833512,False
1560,A Year’s Worth Of Weird: 19 Amazon Discoveries...,-138,13.0,False,https://www.boredpanda.com/page/64/,https://www.boredpanda.com/%cat%/weirdest-thin...,,,0,0,,,True,7805588,False
2082,26 Novelty Kitchen Items That Just Want To Bri...,-230,,False,https://www.boredpanda.com/page/86/,https://www.boredpanda.com/%cat%/kitchen-novel...,,,0,0,,,True,6922388,False
1741,24 Novelty Products That’ll Add A Dose Of Fun ...,-243,,False,https://www.boredpanda.com/page/92/,https://www.boredpanda.com/%cat%/unheard-novel...,,,0,0,,,True,6864253,False


In [36]:
df.loc[((df["link"].str.contains(r"%cat%"))
         & (df["categories"].str.contains("Shopping") != True)), 
         "is_ad"] = True

In [37]:
# Check missings again
df[df["is_ad"] == False].isna().sum()

title                      0
voting                     0
nr_comments                0
has_experimental_title     0
page_link                  0
link                       0
date_posted               23
categories                 0
nr_postings_total          0
nr_postings_shown          0
authors                    0
authors_role               0
error_occured              0
post_id                    0
is_ad                      0
dtype: int64

In [38]:
# Inspect some distributions.
df[df["is_ad"]].describe()

Unnamed: 0,voting,nr_comments,nr_postings_total,nr_postings_shown
count,132.0,12.0,132.0,132.0
mean,-94.469697,4.916667,22.272727,22.272727
std,69.519885,7.856188,12.406654,12.406654
min,-348.0,0.0,0.0,0.0
25%,-115.25,0.0,20.0,20.0
50%,-78.5,1.0,22.0,22.0
75%,-50.0,7.25,24.0,24.0
max,25.0,26.0,100.0,100.0


In [39]:
# Show postings with positive voting albeit being an ad
#  (on Bored Panda, ads are usually always strongly downvoted.)
df[(df["is_ad"]) & (df["voting"] > 0)]

Unnamed: 0,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured,post_id,is_ad
14,Reddit’s WFH Royalty Just Dropped These 17 Gam...,25,,True,https://www.boredpanda.com/page/2/,https://www.boredpanda.com/shopping/office/red...,"Jan 27, 2025","Office Essentials, Shopping",17,17,Eligijus Sinkunas,"Author, BoredPanda staff | Author, BoredP...",False,8063265,True
2072,23 Rarely-Reduced Black Friday Finds To Look F...,11,,False,https://www.boredpanda.com/page/86/,https://www.boredpanda.com/shopping/deals/blac...,"Nov 21, 2024","Shopping, Shopping Trends and Deals",23,23,Eligijus Sinkunas,"Author, BoredPanda staff | Author, BoredP...",False,7742813,True
43,26 Retro Finds That Are Just The Right Amount ...,9,,False,https://www.boredpanda.com/page/9/,https://www.boredpanda.com/shopping/funny-shop...,"Jan 22, 2025","Funny And Unique Finds, Shopping",26,26,Eligijus Sinkunas,"Author, BoredPanda staff | Author, BoredP...",False,8053282,True


In [40]:
df[(df["is_ad"]) & (df["voting"] > 0)]["link"].to_list()

['https://www.boredpanda.com/shopping/office/reddit-tips-for-working-from-home/?cexp_id=115721&cexp_var=17&_f=homepage_featured',
 'https://www.boredpanda.com/shopping/deals/black-friday-deals-that-rarely-go-on-sale/',
 'https://www.boredpanda.com/shopping/funny-shopping/retro-fever/']

In [41]:
# These are definitely ads. The latter even shows a negative voting by now.

## Categories

In [42]:
df["categories"].value_counts()

categories
Family, Relationships             222
Curiosities                       160
Entitled People, Social Issues    134
Couples, Relationships            112
Comics                             92
                                 ... 
Art, Food                           1
Funny, Home & Design                1
Christmas, News                     1
Home & Design, Organizing           1
Comics, Funny                       1
Name: count, Length: 90, dtype: int64

In [43]:
df["categories"].str.split(", ", expand=True).head()
# -> 2 categories used max.

Unnamed: 0,0,1
160,Curiosities,
161,Lifestyle,News
162,Family,Relationships
163,Celebrities,News
164,Celebrities,News


In [44]:
# Split and expand categories column into 2 seperate columns
df[["category_1", "category_2"]] = df["categories"].str.split(", ", expand=True)

df.filter(like="categor")

Unnamed: 0,categories,category_1,category_2
160,Curiosities,Curiosities,
161,"Lifestyle, News",Lifestyle,News
162,"Family, Relationships",Family,Relationships
163,"Celebrities, News",Celebrities,News
164,"Celebrities, News",Celebrities,News
...,...,...,...
2006,News,News,
2007,"Celebrities, Entertainment",Celebrities,Entertainment
2008,Art,Art,
2009,"Comics, Funny",Comics,Funny


In [45]:
# Check column for missings.
print(len(df[df["categories"].isna()]))
# 9 rows do not contain values for "category". Those postings are ads and 
#  do not come with comments by default.
# --> Ok!
df[df["categories"].isna()]

9


Unnamed: 0,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured,post_id,is_ad,category_1,category_2
502,"Boring Tasks, But Make It Fun: 22 Items That’l...",-66,,False,https://www.boredpanda.com/page/24/,https://www.boredpanda.com/%cat%/novelty-produ...,,,0,0,,,True,7963820,True,,
484,25 Hobbies That’ll Make You The Most Interesti...,-12,,False,https://www.boredpanda.com/page/27/,https://www.boredpanda.com/%cat%/new-hobbies-2...,,,0,0,,,True,7962525,True,,
811,Need It Now: 21 Cool Items That Are Calling Ou...,-61,,False,https://www.boredpanda.com/page/30/,https://www.boredpanda.com/%cat%/coolest-thing...,,,0,0,,,True,7901801,True,,
658,Beauty Meets Brains: 52 Kitchen Finds That Che...,-116,,False,https://www.boredpanda.com/page/31/,https://www.boredpanda.com/%cat%/kitchen-thing...,,,0,0,,,True,7875591,True,,
733,20 Products That Are Totally Dividing The Inte...,-80,,False,https://www.boredpanda.com/page/35/,https://www.boredpanda.com/%cat%/random-things...,,,0,0,,,True,7868010,True,,
1029,Shark Tank Success Stories: 25 Products That B...,-50,,False,https://www.boredpanda.com/page/59/,https://www.boredpanda.com/%cat%/best-shark-ta...,,,0,0,,,True,7833512,True,,
1560,A Year’s Worth Of Weird: 19 Amazon Discoveries...,-138,13.0,False,https://www.boredpanda.com/page/64/,https://www.boredpanda.com/%cat%/weirdest-thin...,,,0,0,,,True,7805588,True,,
2082,26 Novelty Kitchen Items That Just Want To Bri...,-230,,False,https://www.boredpanda.com/page/86/,https://www.boredpanda.com/%cat%/kitchen-novel...,,,0,0,,,True,6922388,True,,
1741,24 Novelty Products That’ll Add A Dose Of Fun ...,-243,,False,https://www.boredpanda.com/page/92/,https://www.boredpanda.com/%cat%/unheard-novel...,,,0,0,,,True,6864253,True,,


## Authors

In [46]:
df.filter(like="author").head()

Unnamed: 0,authors,authors_role
160,Adelaide Ross | Ieva Pečiulytė,"Writer, BoredPanda staff | Writer, BoredP..."
161,Binitha Jacob | Donata Leskauskaite,"Writer, BoredPanda staff | Writer, BoredP..."
162,Nikita Manot | Shelly Fourer,"Writer, BoredPanda staff | Writer, BoredP..."
163,Marina Urman | Karina Babenok,"Writer, BoredPanda staff | Writer, BoredP..."
164,Marina Urman | Lei RV,"Writer, BoredPanda staff | Writer, BoredP..."


In [47]:
# Check missings
df[df["authors"].isna()]

Unnamed: 0,title,voting,nr_comments,has_experimental_title,page_link,link,date_posted,categories,nr_postings_total,nr_postings_shown,authors,authors_role,error_occured,post_id,is_ad,category_1,category_2
502,"Boring Tasks, But Make It Fun: 22 Items That’l...",-66,,False,https://www.boredpanda.com/page/24/,https://www.boredpanda.com/%cat%/novelty-produ...,,,0,0,,,True,7963820,True,,
484,25 Hobbies That’ll Make You The Most Interesti...,-12,,False,https://www.boredpanda.com/page/27/,https://www.boredpanda.com/%cat%/new-hobbies-2...,,,0,0,,,True,7962525,True,,
811,Need It Now: 21 Cool Items That Are Calling Ou...,-61,,False,https://www.boredpanda.com/page/30/,https://www.boredpanda.com/%cat%/coolest-thing...,,,0,0,,,True,7901801,True,,
658,Beauty Meets Brains: 52 Kitchen Finds That Che...,-116,,False,https://www.boredpanda.com/page/31/,https://www.boredpanda.com/%cat%/kitchen-thing...,,,0,0,,,True,7875591,True,,
733,20 Products That Are Totally Dividing The Inte...,-80,,False,https://www.boredpanda.com/page/35/,https://www.boredpanda.com/%cat%/random-things...,,,0,0,,,True,7868010,True,,
1029,Shark Tank Success Stories: 25 Products That B...,-50,,False,https://www.boredpanda.com/page/59/,https://www.boredpanda.com/%cat%/best-shark-ta...,,,0,0,,,True,7833512,True,,
1560,A Year’s Worth Of Weird: 19 Amazon Discoveries...,-138,13.0,False,https://www.boredpanda.com/page/64/,https://www.boredpanda.com/%cat%/weirdest-thin...,,,0,0,,,True,7805588,True,,
2082,26 Novelty Kitchen Items That Just Want To Bri...,-230,,False,https://www.boredpanda.com/page/86/,https://www.boredpanda.com/%cat%/kitchen-novel...,,,0,0,,,True,6922388,True,,
1741,24 Novelty Products That’ll Add A Dose Of Fun ...,-243,,False,https://www.boredpanda.com/page/92/,https://www.boredpanda.com/%cat%/unheard-novel...,,,0,0,,,True,6864253,True,,


In [48]:
# -> Those showing NA in "authors" are all ads

### Author Names

In [49]:
# Author count (names are separated by "|")
df["nr_authors"] = df["authors"].str.count("\|") + 1
df.loc[df["nr_authors"].isna(), "nr_authors"] = 0
df["nr_authors"] = df["nr_authors"].astype(int)

print(df["nr_authors"].describe())
df["nr_authors"].value_counts()

count    2016.000000
mean        1.815972
std         0.566530
min         0.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         4.000000
Name: nr_authors, dtype: float64


nr_authors
2    1331
1     515
3     160
0       9
4       1
Name: count, dtype: int64

In [50]:
# Expand list of authors into individual columns
author_cols = [f"author_{n}" for n in range(1, max(df["nr_authors"])+1)]

df[author_cols] = df["authors"].str.split(" \| ", expand=True)

df.filter(like="author").head()

Unnamed: 0,authors,authors_role,nr_authors,author_1,author_2,author_3,author_4
160,Adelaide Ross | Ieva Pečiulytė,"Writer, BoredPanda staff | Writer, BoredP...",2,Adelaide Ross,Ieva Pečiulytė,,
161,Binitha Jacob | Donata Leskauskaite,"Writer, BoredPanda staff | Writer, BoredP...",2,Binitha Jacob,Donata Leskauskaite,,
162,Nikita Manot | Shelly Fourer,"Writer, BoredPanda staff | Writer, BoredP...",2,Nikita Manot,Shelly Fourer,,
163,Marina Urman | Karina Babenok,"Writer, BoredPanda staff | Writer, BoredP...",2,Marina Urman,Karina Babenok,,
164,Marina Urman | Lei RV,"Writer, BoredPanda staff | Writer, BoredP...",2,Marina Urman,Lei RV,,


### Author roles

In [51]:
# See example
df.loc[0, ["authors", "nr_authors", "authors_role"]].to_list()

['Kornelija Viečaitė | Greta Jaruševičiūtė | Ieva Pečiulytė',
 3,
 ' Writer,  BoredPanda staff  |  Writer,  BoredPanda staff  |  Author,  BoredPanda staff  |  Author,  BoredPanda staff  |  Author,  BoredPanda staff  |  Author,  BoredPanda staff ']

In [52]:
# Each author's role was stored twice successively (website had divs for both mobile 
#  and desktop view. While both were not visibile at the same time, the content appears twice)
# -> Keep only odd numbered role column

# Create list containing each column name twice
author_role_cols = [2*[f"author_{n}_role"] for n in range(1, max(df["nr_authors"])+1)]
# Flatten above list
author_role_cols = [x for x_sub in author_role_cols for x in x_sub]
# Add prefix "DROP_" to every even-numbered column name
author_role_cols = [f"DROP_{x}" if i % 2 == 0 else x for i, x in enumerate(author_role_cols)]

# Expand author role column into 8 new columns
df[author_role_cols] = df["authors_role"].str.split(" \| ", expand=True)

# Drop columns
drop_cols = df.filter(like="DROP_").columns.to_list()
df = df.drop(drop_cols, axis=1)

# Show result
df.filter(like="author").head()

Unnamed: 0,authors,authors_role,nr_authors,author_1,author_2,author_3,author_4,author_1_role,author_2_role,author_3_role,author_4_role
160,Adelaide Ross | Ieva Pečiulytė,"Writer, BoredPanda staff | Writer, BoredP...",2,Adelaide Ross,Ieva Pečiulytė,,,"Writer, BoredPanda staff","Author, BoredPanda staff",,
161,Binitha Jacob | Donata Leskauskaite,"Writer, BoredPanda staff | Writer, BoredP...",2,Binitha Jacob,Donata Leskauskaite,,,"Writer, BoredPanda staff","Author, BoredPanda staff",,
162,Nikita Manot | Shelly Fourer,"Writer, BoredPanda staff | Writer, BoredP...",2,Nikita Manot,Shelly Fourer,,,"Writer, BoredPanda staff","Author, BoredPanda staff",,
163,Marina Urman | Karina Babenok,"Writer, BoredPanda staff | Writer, BoredP...",2,Marina Urman,Karina Babenok,,,"Writer, BoredPanda staff","Author, BoredPanda staff",,
164,Marina Urman | Lei RV,"Writer, BoredPanda staff | Writer, BoredP...",2,Marina Urman,Lei RV,,,"Writer, BoredPanda staff","Author, BoredPanda staff",,


## Add compilation column

Add a column depicting whether a post is a compilation based on how many entries it has. Note that on the website, compaliations may ultimately be shortened, resulting in less posts shown than initially uploaded. This appears to be based on voting of the individal entries. The original list in its full length can still be displayed. While the full post is hidden behind a paywall, the details for all entries remain in the html doc.

Also note that the content paywal usually starts at and with post e.g. 30, 40, 50; which explains the odd numbers of postings shown (e.g. 29, 39, 49).

In [53]:
print(df["nr_postings_shown"].value_counts()[:5])
print("\n", df["nr_postings_total"].value_counts()[:5])

df["is_compilation"] = df["nr_postings_shown"].map(lambda x: x > 0)

print("\n", df["is_compilation"].value_counts())

nr_postings_shown
0     1023
29     219
49     204
30      66
39      43
Name: count, dtype: int64

 nr_postings_total
0     1023
30      66
80      45
50      45
20      37
Name: count, dtype: int64

 is_compilation
False    1023
True      993
Name: count, dtype: int64


## Date Posted

In [54]:
df["date_posted"].value_counts()

date_posted
Nov 21, 2024    49
Dec 09, 2024    48
Dec 10, 2024    48
Jan 16, 2025    46
Nov 29, 2024    45
                ..
Dec 22, 2024     8
Jan 04, 2025     8
Dec 14, 2024     8
Jan 19, 2025     3
Jan 01, 2025     2
Name: count, Length: 79, dtype: int64

In [55]:
# Change string to datetime format
df["date_posted_formatted"] = pd.to_datetime(df['date_posted'])

print(df["date_posted_formatted"].dtype)

df.filter(like="date").head()

datetime64[ns]


Unnamed: 0,date_posted,date_posted_formatted
160,"Jan 27, 2025",2025-01-27
161,"Jan 27, 2025",2025-01-27
162,"Jan 27, 2025",2025-01-27
163,"Jan 27, 2025",2025-01-27
164,"Jan 27, 2025",2025-01-27


## Export Cleaned Dataset

### Prepare Export

In [56]:
df.columns

Index(['title', 'voting', 'nr_comments', 'has_experimental_title', 'page_link',
       'link', 'date_posted', 'categories', 'nr_postings_total',
       'nr_postings_shown', 'authors', 'authors_role', 'error_occured',
       'post_id', 'is_ad', 'category_1', 'category_2', 'nr_authors',
       'author_1', 'author_2', 'author_3', 'author_4', 'author_1_role',
       'author_2_role', 'author_3_role', 'author_4_role', 'is_compilation',
       'date_posted_formatted'],
      dtype='object')

In [57]:
# Select and rename columns
drop_cols = ['has_experimental_title', 'date_posted', 'categories', 'authors', 'authors_role', 'error_occured']
df = df.drop(drop_cols, axis=1)

df = df.rename(columns={"date_posted_formatted": "date_posted"})

df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   title              2016 non-null   object        
 1   voting             2016 non-null   int64         
 2   nr_comments        1896 non-null   float64       
 3   page_link          2016 non-null   object        
 4   link               2016 non-null   object        
 5   nr_postings_total  2016 non-null   int64         
 6   nr_postings_shown  2016 non-null   int64         
 7   post_id            2016 non-null   object        
 8   is_ad              2016 non-null   bool          
 9   category_1         2007 non-null   object        
 10  category_2         1358 non-null   object        
 11  nr_authors         2016 non-null   int32         
 12  author_1           2007 non-null   object        
 13  author_2           1492 non-null   object        
 14  author_3

In [58]:
df.head()

Unnamed: 0,title,voting,nr_comments,page_link,link,nr_postings_total,nr_postings_shown,post_id,is_ad,category_1,...,author_1,author_2,author_3,author_4,author_1_role,author_2_role,author_3_role,author_4_role,is_compilation,date_posted
0,"80 Posts So Unhinged, You May End Up Laughing ...",61,6.0,https://www.boredpanda.com/page/1/,https://www.boredpanda.com/surreal-weird-image...,80,80,8078562,False,Curiosities,...,Adelaide Ross,Ieva Pečiulytė,,,"Writer, BoredPanda staff","Author, BoredPanda staff",,,True,2025-01-27
1,"""It Was A Warning"": Woman Receives Disturbing ...",26,4.0,https://www.boredpanda.com/page/1/,https://www.boredpanda.com/stranger-approaches...,0,0,8090374,False,Lifestyle,...,Binitha Jacob,Donata Leskauskaite,,,"Writer, BoredPanda staff","Author, BoredPanda staff",,,False,2025-01-27
2,Woman Reveals She Understood Every Insult From...,31,29.0,https://www.boredpanda.com/page/1/,https://www.boredpanda.com/hiding-speak-husban...,0,0,8077890,False,Family,...,Nikita Manot,Shelly Fourer,,,"Writer, BoredPanda staff","Author, BoredPanda staff",,,False,2025-01-27
3,Ryan Reynolds’ “Horrifically Mean” Treatment O...,-10,6.0,https://www.boredpanda.com/page/1/,https://www.boredpanda.com/tj-miller-speaks-ou...,0,0,8089627,False,Celebrities,...,Marina Urman,Karina Babenok,,,"Writer, BoredPanda staff","Author, BoredPanda staff",,,False,2025-01-27
4,Zoe Saldaña’s Blackface Scandal Comes Back To ...,22,4.0,https://www.boredpanda.com/page/1/,https://www.boredpanda.com/zoe-saldana-blackfa...,0,0,8091431,False,Celebrities,...,Marina Urman,Lei RV,,,"Writer, BoredPanda staff","Author, BoredPanda staff",,,False,2025-01-27


### Export as csv

In [59]:
df.to_csv("bored_panda_posts.csv", header=True, index=False)