In [2]:
import pandas as pd

In [3]:
# Load the raw quotes, trying encodings from strictest to most permissive.
# cp1252 is attempted before latin1 because latin1 decodes bytes 0x80-0x9F
# (where Windows-1252 stores curly quotes such as “ and ”) into invisible
# control characters, which would leave the text silently corrupted and
# without any literal “ / ” characters to clean up afterwards.
# latin1 remains the last resort: it maps every byte value, so it cannot
# raise UnicodeDecodeError and the loop always ends with `df` defined.
for encoding in ('utf-8', 'cp1252', 'latin1'):
    try:
        df = pd.read_csv('quotes.csv', encoding=encoding)
    except UnicodeDecodeError:
        # This codec cannot represent the file's bytes; try the next one.
        continue
    break


In [4]:
# Overview of the loaded frame: banner and dimensions in one print call,
# followed by pandas' own column / dtype / non-null summary.
print(
    "--- Initial Data Info ---",
    f"Shape of the data (rows, columns): {df.shape}",
    sep="\n",
)
df.info()

--- Initial Data Info ---
Shape of the data (rows, columns): (100, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   author  100 non-null    object
 1   tags    100 non-null    object
 2   text    100 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [5]:
# Preview the top of the raw frame as plain text, under a heading line.
print("\nFirst 5 rows of the raw data:", df.head(), sep="\n")


First 5 rows of the raw data:
         author                  tags  \
0  Steve Martin  humor,obvious,simile   
1  Steve Martin  humor,obvious,simile   
2  Steve Martin  humor,obvious,simile   
3  Steve Martin  humor,obvious,simile   
4  Steve Martin  humor,obvious,simile   

                                                text  
0  “A day without sunshine is like, you know, nig...  
1  “A day without sunshine is like, you know, nig...  
2  “A day without sunshine is like, you know, nig...  
3  “A day without sunshine is like, you know, nig...  
4  “A day without sunshine is like, you know, nig...  


In [6]:
# Report how many null entries each column contains.
print(
    "\n--- Handling Missing Values ---",
    "Missing values per column:",
    df.isna().sum(),  # isna is the method that isnull aliases
    sep="\n",
)


--- Handling Missing Values ---
Missing values per column:
author    0
tags      0
text      0
dtype: int64


In [7]:
# Collect every row that repeats an earlier row (the first occurrence is
# not flagged) and report how many such rows exist.
print("\n--- Handling Duplicates ---")
duplicate_rows = df.loc[df.duplicated(keep='first')]
print(f"Number of duplicate rows: {duplicate_rows.shape[0]}")


--- Handling Duplicates ---
Number of duplicate rows: 90


In [8]:
# Remove repeated rows, keeping the first occurrence of each.
# Rebinding `df` (rather than mutating with inplace=True) leaves any frame
# object displayed or referenced earlier untouched and makes the data flow
# explicit. Index labels of the surviving rows are preserved as-is.
df = df.drop_duplicates()
print(f"Shape after dropping duplicates: {df.shape}")

Shape after dropping duplicates: (10, 3)


In [9]:
# Remove every left (“) and right (”) curly double quote from the quote text
# in a single pass using a character class.
df['text'] = df['text'].str.replace('[“”]', '', regex=True)

In [10]:
# Show the top of the cleaned frame as plain text beneath its headings.
print(
    "\n--- Final Cleaned Data ---",
    "First 5 rows of the processed data:",
    df.head(),
    sep="\n",
)


--- Final Cleaned Data ---
First 5 rows of the processed data:
            author                                               tags  \
0     Steve Martin                               humor,obvious,simile   
10  Allen Saunders  fate,life,misattributed-john-lennon,planning,p...   
20      Bob Marley                                              music   
30    George Eliot                                      inspirational   
40   James Baldwin                                               love   

                                                 text  
0    A day without sunshine is like, you know, night.  
10  Life is what happens to us while we are making...  
20  One good thing about music, when it hits you, ...  
30  It is never too late to be what you might have...  
40  Love does not begin and end the way we seem to...  


In [11]:
# Write the cleaned frame to disk without the index column, then confirm.
df.to_csv(path_or_buf='quotes_cleaned.csv', index=False)
print("\n Cleaned data has been successfully saved to quotes_cleaned.csv")


 Cleaned data has been successfully saved to quotes_cleaned.csv
