In [2]:
import pandas as pd

df = pd.read_csv('goodreads_books_dataset.csv', encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,rank,percentile_rank,book_id,title,author,rating,rating_category,rating_tier,is_high_rated,title_length,title_complexity,word_count,author_count,author_name_length,has_series_info,series_number,title_type,has_subtitle,has_middle_name,estimated_popularity
0,1,0.0,56859736,Metaphysics of Sound,NataÅ¡a PantoviÄ,4.93,Excellent,Tier_10,True,20,Simple,3,1,15,False,,Standard,False,False,High
1,2,0.1,41212190,Learn Spanish with stories and audios as workb...,Anton Hager,4.92,Excellent,Tier_10,True,166,Very Complex,24,1,11,False,,Subtitle,True,False,High
2,3,0.1,38479831,Rivers Never Fill The Sea,Giselle V. Steele,4.88,Excellent,Tier_10,True,25,Moderate,5,1,17,False,,Standard,False,True,High
3,4,0.1,29380718,Secret of the Cassin's Family Curse (Castle of...,Julie-Anne Gamble,4.88,Excellent,Tier_10,True,61,Very Complex,10,1,17,True,1.0,Series,False,False,High
4,5,0.2,35514861,What Healing Should Be: How to relieve pain an...,George Alexandru,4.88,Excellent,Tier_10,True,62,Very Complex,11,1,17,False,,Subtitle,True,False,High


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3045 entries, 0 to 3044
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   rank                  3045 non-null   int64  
 1   percentile_rank       3045 non-null   float64
 2   book_id               3045 non-null   int64  
 3   title                 3045 non-null   object 
 4   author                3045 non-null   object 
 5   rating                3045 non-null   float64
 6   rating_category       3043 non-null   object 
 7   rating_tier           3045 non-null   object 
 8   is_high_rated         3045 non-null   bool   
 9   title_length          3045 non-null   int64  
 10  title_complexity      3045 non-null   object 
 11  word_count            3045 non-null   int64  
 12  author_count          3045 non-null   int64  
 13  author_name_length    3045 non-null   int64  
 14  has_series_info       3045 non-null   bool   
 15  series_number        

In [5]:
df.isnull().sum()

rank                       0
percentile_rank            0
book_id                    0
title                      0
author                     0
rating                     0
rating_category            2
rating_tier                0
is_high_rated              0
title_length               0
title_complexity           0
word_count                 0
author_count               0
author_name_length         0
has_series_info            0
series_number           1851
title_type                 0
has_subtitle               0
has_middle_name            0
estimated_popularity       0
dtype: int64

In [6]:
def get_series_number(title):
    if '#' in title and ')' in title:
        start = title.find('#') + 1
        end = title.find(')', start)
        number_part = title[start:end]
        try:
            return float(number_part)
        except:
            return None
    return None

df['series_number'] = df['title'].apply(get_series_number)
df['has_series_info'] = df['series_number'].notna()
df['title_type'] = df['has_series_info'].apply(lambda x: "Series" if x else "Standalone")

print(f"Found {df['has_series_info'].sum()} books in a series")

Found 1143 books in a series


In [7]:
df.loc[df['rating'] == 0, 'rating category'] = 'unrated'

In [8]:
unrated_entry = df['rating category'] == 'unrated'
print(df[unrated_entry])

      rank  percentile_rank   book_id                            title  \
3043  3044            100.0  35179967  Medley of Mayhem: A Manic State   
3044  3045            100.0    334230   Understanding Graciliano Ramos   

                       author  rating rating_category rating_tier  \
3043       Geoffrey C. Porter     0.0             NaN      Tier_1   
3044  Celso Lemos de Oliveira     0.0             NaN      Tier_1   

      is_high_rated  title_length  ... word_count  author_count  \
3043          False            31  ...          6             1   
3044          False            30  ...          3             1   

      author_name_length  has_series_info  series_number  title_type  \
3043                  18            False            NaN  Standalone   
3044                  23            False            NaN  Standalone   

     has_subtitle  has_middle_name  estimated_popularity rating category  
3043         True             True                   Low         unrated  


In [12]:
author_counts = df.groupby('author')['title'].count()
df['author_book_count'] = df['author'].map(author_counts)

df['series_popularity'] = 'Standalone'
df.loc[df['has_series_info'] == True, 'series_popularity'] = 'Part of Series'

df['is_top_performer'] = 0
df.loc[df['percentile_rank'] >= 90, 'is_top_performer'] = 1

print("3 new features:")
print(f"  - author_book_count: How many books each author has in top list")
print(f"  - series_popularity: Series vs Standalone")
print(f"  - is_top_performer: Flags top 10% books")

3 new features:
  - author_book_count: How many books each author has in top list
  - series_popularity: Series vs Standalone
  - is_top_performer: Flags top 10% books


In [13]:
df.to_csv('GOODREADS_dataset_cleaned.csv', index=False)