In [2]:
import pandas as pd
import numpy as np

# Read the raw campaign export into a DataFrame; the path is kept in a named
# constant so the input file is easy to spot and change.
INPUT_CSV = 'marketing_campaign_dataset.csv'
df = pd.read_csv(filepath_or_buffer=INPUT_CSV)
print("Dataset loaded successfully!")

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Dataset loaded successfully!


Unnamed: 0,Campaign_ID,Company,Campaign_Type,Target_Audience,Duration,Channel_Used,Conversion_Rate,Acquisition_Cost,ROI,Location,Language,Clicks,Impressions,Engagement_Score,Customer_Segment,Date
0,1,Innovate Industries,Email,Men 18-24,30 days,Google Ads,0.04,"$16,174.00",6.29,Chicago,Spanish,506,1922,6,Health & Wellness,2021-01-01
1,2,NexGen Systems,Email,Women 35-44,60 days,Google Ads,0.12,"$11,566.00",5.61,New York,German,116,7523,7,Fashionistas,2021-01-02
2,3,Alpha Innovations,Influencer,Men 25-34,30 days,YouTube,0.07,"$10,200.00",7.18,Los Angeles,French,584,7698,1,Outdoor Adventurers,2021-01-03
3,4,DataTech Solutions,Display,All Ages,60 days,YouTube,0.11,"$12,724.00",5.55,Miami,Mandarin,217,1820,7,Health & Wellness,2021-01-04
4,5,NexGen Systems,Email,Men 25-34,15 days,YouTube,0.05,"$16,452.00",6.5,Los Angeles,Mandarin,379,4201,3,Health & Wellness,2021-01-05


In [3]:
# Structural overview: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\n--- Data types of each column ---")
df.info()

# describe() returns a DataFrame of summary statistics, so it is printed.
print("\n--- Summary statistics for numerical columns ---")
print(df.describe())

# Per-column count of null entries.
print("\n--- Missing values count per column ---")
print(df.isnull().sum())

# List the distinct labels of each categorical column so unexpected or
# inconsistent spellings can be spotted by eye.
print("\n--- Unique values for key categorical columns ---")
for column in ['Campaign_Type', 'Target_Audience', 'Channel_Used', 'Location', 'Language', 'Customer_Segment']:
    unique_values = df[column].unique()
    print(f"\nColumn: {column}")
    print(unique_values)


--- Data types of each column ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Campaign_ID       200000 non-null  int64  
 1   Company           200000 non-null  object 
 2   Campaign_Type     200000 non-null  object 
 3   Target_Audience   200000 non-null  object 
 4   Duration          200000 non-null  object 
 5   Channel_Used      200000 non-null  object 
 6   Conversion_Rate   200000 non-null  float64
 7   Acquisition_Cost  200000 non-null  object 
 8   ROI               200000 non-null  float64
 9   Location          200000 non-null  object 
 10  Language          200000 non-null  object 
 11  Clicks            200000 non-null  int64  
 12  Impressions       200000 non-null  int64  
 13  Engagement_Score  200000 non-null  int64  
 14  Customer_Segment  200000 non-null  object 
 15  Date              200000 non-null

**Observations:**

- **Campaign_ID, Clicks, Impressions, Engagement_Score** are int64 – Good.

- **Conversion_Rate, ROI** are float64 – Good.

- **Duration** is object - Should be a number (e.g., number of days).

- **Acquisition_Cost** is object - Should be a numerical value (cost).

- **Date** is object - Need to convert this to a datetime object to enable time-based analysis.

- All other categorical columns (**Company, Campaign_Type, Target_Audience, Channel_Used, Location, Language, Customer_Segment**) are object which is expected.

In [4]:
# Convert Date to datetime object so time-based operations are available.
df['Date'] = pd.to_datetime(df['Date'])
print(f"Data type of 'Date' after conversion: {df['Date'].dtype}")

# Convert 'Acquisition_Cost' to numeric: strip the '$' and ',' characters
# (e.g. "$16,174.00" -> "16174.00"), then parse. errors='coerce' turns any
# value that still cannot be parsed into NaN, which the missing-values report
# at the end of this cell will expose.
df['Acquisition_Cost'] = df['Acquisition_Cost'].astype(str).str.replace(r'[$,]', '', regex=True)
df['Acquisition_Cost'] = pd.to_numeric(df['Acquisition_Cost'], errors='coerce')
print(f"Data type of 'Acquisition_Cost' after conversion: {df['Acquisition_Cost'].dtype}")

# Convert 'Duration' to numeric: keep the first run of digits
# (e.g. "30 days" -> "30"), then parse.
# The pattern is a raw string so that \d reaches the regex engine as written;
# in a normal string literal "\d" is an invalid escape sequence that Python
# warns about and will eventually reject.
df['Duration'] = df['Duration'].astype(str).str.extract(r'(\d+)', expand=False)
df['Duration'] = pd.to_numeric(df['Duration'], errors='coerce')

# Any value that failed to parse above is now NaN; confirm there are none.
print("\n--- Missing values count after conversions ---")
print(df.isnull().sum())


Data type of 'Date' after conversion: datetime64[ns]
Data type of 'Acquisition_Cost' after conversion: float64

--- Missing values count after conversions ---
Campaign_ID         0
Company             0
Campaign_Type       0
Target_Audience     0
Duration            0
Channel_Used        0
Conversion_Rate     0
Acquisition_Cost    0
ROI                 0
Location            0
Language            0
Clicks              0
Impressions         0
Engagement_Score    0
Customer_Segment    0
Date                0
dtype: int64


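**The dataset is now clean: `Date`, `Acquisition_Cost`, and `Duration` have been converted to proper types, no column contains missing values, and the data is ready for deeper analysis.**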

In [5]:
# Persist the cleaned frame to CSV. index=False leaves the DataFrame index out
# of the file, so no extra index column is written.
output_file_path = 'marketing_campaign_data_cleaned.csv'
df.to_csv(
    path_or_buf=output_file_path,
    index=False,
)
print(f"Cleaned dataset saved successfully to: {output_file_path}")

Cleaned dataset saved successfully to: marketing_campaign_data_cleaned.csv
