In [1]:
import pandas as pd

In [2]:
# Load the dataset.
# `file_path` holds only the CSV file name; the file itself is read from the
# ../data directory relative to this notebook. The read path is built from
# `file_path` so the variable and the file actually loaded cannot drift apart
# (previously `file_path` was assigned but never used).
file_path = 'sales_leads_dataset_1000_leads.csv'
df = pd.read_csv(f'../data/{file_path}')

In [3]:
# Preview the raw data: the first rows, then the column / dtype summary.
print("--- Data Before Preprocessing ---")
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

--- Data Before Preprocessing ---
     lead_id company_size     industry  annual_revenue_lkr    location  \
0  LEAD_0001       Medium  Agriculture           116701433       Kandy   
1  LEAD_0002       Medium      Finance           142242154  Kurunegala   
2  LEAD_0003       Medium  IT/Software           134980432  Kurunegala   
3  LEAD_0004        Small      Tourism             7239690     Colombo   
4  LEAD_0005       Medium  IT/Software           128031305      Jaffna   

   engagement_score  website_visits  email_opens demo_requested  \
0                87              25           16             No   
1                77              45            4             No   
2               100              27            7            Yes   
3               100              33           11             No   
4                48              17            3             No   

   days_since_first_contact contact_level  budget_indicated_lkr  \
0                        15       Manager          

In [4]:
# Data-quality check: null count per column and number of fully duplicated rows
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()
print(f"Missing values:\n{missing_per_column}")
print(f"\nTotal duplicate rows: {duplicate_row_count}")

Missing values:
lead_id                     0
company_size                0
industry                    0
annual_revenue_lkr          0
location                    0
engagement_score            0
website_visits              0
email_opens                 0
demo_requested              0
days_since_first_contact    0
contact_level               0
budget_indicated_lkr        0
competitor_using            0
referral_source             0
converted                   0
dtype: int64

Total duplicate rows: 0


In [5]:
# Data cleaning: strip leading/trailing whitespace from the object-dtype
# (text) columns.
# Only values that are actually `str` are stripped; anything else is passed
# through unchanged. Casting the whole column with astype(str) first would
# turn missing values (NaN/None) into the literal strings 'nan'/'None',
# which would hide them from any later isnull() check.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

In [6]:
# Show the first rows after cleaning, under a header line
print("\n--- Cleaned Data Preview ---", df.head(), sep="\n")


--- Cleaned Data Preview ---
     lead_id company_size     industry  annual_revenue_lkr    location  \
0  LEAD_0001       Medium  Agriculture           116701433       Kandy   
1  LEAD_0002       Medium      Finance           142242154  Kurunegala   
2  LEAD_0003       Medium  IT/Software           134980432  Kurunegala   
3  LEAD_0004        Small      Tourism             7239690     Colombo   
4  LEAD_0005       Medium  IT/Software           128031305      Jaffna   

   engagement_score  website_visits  email_opens demo_requested  \
0                87              25           16             No   
1                77              45            4             No   
2               100              27            7            Yes   
3               100              33           11             No   
4                48              17            3             No   

   days_since_first_contact contact_level  budget_indicated_lkr  \
0                        15       Manager              

In [7]:
# Persist the cleaned frame (index column omitted) as input for the next
# step (Preprocessing)
cleaned_csv_path = '../data/cleaned_sales_leads_dataset.csv'
df.to_csv(cleaned_csv_path, index=False)