# DATA PREPROCESSING

In [1]:
#Library Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import os
import csv
from io import StringIO


In [2]:
# Load the CSV file
# The raw file contains malformed lines, so it is read line by line as text and
# only lines that parse to the expected number of fields are handed to pandas.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the project root so the notebook can run on other machines.
input_path = "/Users/patriciajaquez/Documents/GitHub/module1_project/data/raw/marketingcampaigns.csv"
rows = []
expected_columns = 10
skipped_rows = 0

with open(input_path, 'r', encoding='utf-8') as infile:
    for line in infile:
        if not line.strip():
            # Blank line: nothing to keep and nothing worth reporting
            continue
        # Count fields with the csv parser instead of a plain split(','), so a
        # quoted field that contains a comma is not miscounted and dropped.
        fields = next(csv.reader([line]))
        if len(fields) == expected_columns:
            rows.append(line)
        else:
            skipped_rows += 1

# Make the amount of discarded data visible instead of dropping it silently
print(f"Kept {len(rows)} lines (including header), skipped {skipped_rows} malformed lines")

# Join the clean rows and load into pandas
clean_data = pd.read_csv(StringIO(''.join(rows)))

In [3]:
# Preview the top of the DataFrame (first 5 rows)
clean_data.head(n=5)


Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
0,Public-key multi-tasking throughput,2023-04-01,2024-02-23,8082.3,0.35,email,B2B,organic,0.4,709593.48
1,De-engineered analyzing task-force,2023-02-15,2024-04-22,17712.98,0.74,email,B2C,promotion,0.66,516609.1
2,Balanced solution-oriented Local Area Network,2022-12-20,2023-10-11,84643.1,0.37,podcast,B2B,paid,0.28,458227.42
3,Distributed real-time methodology,2022-09-26,2023-09-27,14589.75,0.47,webinar,B2B,organic,0.19,89958.73
4,Front-line executive infrastructure,2023-07-07,2024-05-15,39291.9,0.3,social media,B2B,promotion,0.81,47511.35


In [4]:
# Preview the bottom of the DataFrame (last 5 rows)
clean_data.tail(n=5)

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
1027,No revenue campaign,2023-02-01,2023-08-01,20000,0.3,social media,B2B,organic,0.5,
1028,Random mess,2023-06-06,,100000,,podcast,,referral,,300000.0
1029,Invalid budget,2022-12-01,2023-06-01,abc,,email,B2C,promotion,0.2,50000.0
1030,Overlapping dates,2023-03-01,2022-12-31,60000,0.6,webinar,B2B,paid,0.7,90000.0
1031,Too many conversions,2023-05-01,2023-11-01,40000,0.8,social media,B2C,organic,1.5,120000.0


In [5]:
# Structural summary of the DataFrame: number of entries, column names,
# non-null count per column and each column's data type (Dtype)
clean_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1032 non-null   object 
 1   start_date       1031 non-null   object 
 2   end_date         1030 non-null   object 
 3   budget           1029 non-null   object 
 4   roi              1028 non-null   float64
 5   type             1031 non-null   object 
 6   target_audience  1030 non-null   object 
 7   channel          1031 non-null   object 
 8   conversion_rate  1028 non-null   float64
 9   revenue          1029 non-null   float64
dtypes: float64(3), object(7)
memory usage: 80.8+ KB


In [6]:
# Build the null mask once and derive every missing-value metric from it
null_mask = clean_data.isnull()

# Missing values per column, followed by the grand total
empty_values = null_mask.sum()
print(empty_values)

print("Total of empty values: ", empty_values.sum())

# Number of rows that have at least one missing value
rows_with_missing = null_mask.any(axis=1).sum()
print(f"Rows with at least one missing value: {rows_with_missing}")

# Rows in which every column is missing
empty_rows = clean_data[null_mask.all(axis=1)]
print(f"Number of empty rows: {len(empty_rows)}")

campaign_name      0
start_date         1
end_date           2
budget             3
roi                4
type               1
target_audience    2
channel            1
conversion_rate    4
revenue            3
dtype: int64
Total of empty values:  21
Rows with at least one missing value: 11
Number of empty rows: 0


In [7]:
# Share of missing entries per column, as a percentage of the total row count
total_rows = len(clean_data)
empty_values_percentage = empty_values.div(total_rows).mul(100)
print(empty_values_percentage)

campaign_name      0.000000
start_date         0.096899
end_date           0.193798
budget             0.290698
roi                0.387597
type               0.096899
target_audience    0.193798
channel            0.096899
conversion_rate    0.387597
revenue            0.290698
dtype: float64


In [8]:
# List the distinct values of each categorical column to spot unexpected categories
cat_cols = ['type', 'target_audience', 'channel']
for col in cat_cols:
    distinct_values = clean_data[col].unique()
    print(f"\nUnique values in {col}:", distinct_values, sep="\n")


Unique values in type:
['email' 'podcast' 'webinar' 'social media' nan 'event' 'B2B']

Unique values in target_audience:
['B2B' 'B2C' 'social media' nan]

Unique values in channel:
['organic' 'promotion' 'paid' 'referral' nan]


In [9]:
# Descriptive statistics for the numeric columns; min/max values that sit far
# from the quartiles point to possible outliers
clean_data.describe()


Unnamed: 0,roi,conversion_rate,revenue
count,1028.0,1028.0,1029.0
mean,0.533804,0.541936,511591.195277
std,0.261869,0.267353,287292.729847
min,-0.2,0.0,108.21
25%,0.31,0.3,267820.25
50%,0.53,0.55,518001.77
75%,0.76,0.77,765775.14
max,0.99,1.5,999712.49


## Data Issues Identified

During data preprocessing, the following data quality issues were found:

1. **Rows with Incorrect Number of Columns**  
    - Some rows in the raw CSV did not match the expected number of columns and were excluded.

2. **Missing Values**  
    - Several columns contained missing values, including `start_date`, `end_date`, `budget`, `roi`, `type`, `target_audience`, `channel`, `conversion_rate`, and `revenue`.

3. **Invalid (Non-numeric) Values in Numeric Columns**  
    - A non-numeric value (`abc`) was present in `budget`, a column expected to be numeric. The other numeric columns (`roi`, `conversion_rate`, `revenue`) were parsed as floats and contain no non-numeric entries.

4. **Incorrect Column Data Type**  
    - The `budget` column was read as object but is expected to be float.

5. **Empty Values**  
    - Empty values were found in all columns except `campaign_name`.

6. **Unexpected Categorical Values**  
    - The `type` and `target_audience` columns contained values outside the expected categories or possible misplacements.

7. **Outliers**  
    - Outliers were present in numeric columns, especially in `conversion_rate` (values > 100%) and `budget` (values much higher than average).


## Cleaning Process

Checked the row where `type` is 'B2B' and `target_audience` is 'social media'.  
**Resolution:** This row was dropped from the dataset because it is mostly null, with many missing values, and is not relevant for analysis.

In [10]:
# Flag rows whose categorical values look misplaced: 'B2B' appearing in `type`
# or 'social media' appearing in `target_audience`
misplaced_mask = (clean_data['type'] == 'B2B') | (clean_data['target_audience'] == 'social media')
misplaced_rows = clean_data[misplaced_mask]
print(misplaced_rows)

# Drop exactly the rows displayed above by reusing the same mask, so the
# printed rows and the removed rows can never diverge. Take a copy so later
# column assignments act on an independent frame rather than a slice
# (avoids pandas' SettingWithCopyWarning).
clean_data = clean_data[~misplaced_mask].copy()

            campaign_name  start_date end_date budget  roi type  \
1024  Null-heavy campaign  2023-01-01      NaN    NaN  NaN  B2B   

     target_audience channel  conversion_rate  revenue  
1024    social media     NaN              NaN      NaN  


`roi` values were calculated for rows where `roi` was missing and both `revenue` and `budget` were present and valid, using the formula: (`revenue` - `budget`) / `budget`. To enable this, the `budget` column was converted from object to float (non-numeric entries become NaN), and rows with non-numeric or missing `budget` or `revenue` values were excluded from the calculation. This ensures all filled-in ROI values are based on valid, clean numeric data.

In [11]:
# Parse 'budget' as numbers; errors='coerce' replaces unparseable entries (like 'abc') with NaN
budget_numeric = pd.to_numeric(clean_data['budget'], errors='coerce')
clean_data['budget'] = budget_numeric

# Confirm the resulting data type of 'budget'
print(f"budget column is now: {clean_data['budget'].dtype}")

budget column is now: float64


In [12]:
# Rows where ROI is missing but can be derived: revenue and budget are both
# present, and budget is non-zero (a zero budget would make the ratio infinite)
mask = (
    clean_data['roi'].isnull()
    & clean_data['revenue'].notnull()
    & clean_data['budget'].notnull()
    & clean_data['budget'].ne(0)
)
rows_to_calculate = clean_data[mask]

print(f"Rows where ROI can be calculated: {len(rows_to_calculate)}")
print(rows_to_calculate[['revenue', 'budget', 'roi']])

# Fill the missing ROI for those rows using (revenue - budget) / budget,
# reusing the mask computed above instead of rebuilding it
known_revenue = clean_data.loc[mask, 'revenue']
known_budget = clean_data.loc[mask, 'budget']
clean_data.loc[mask, 'roi'] = (known_revenue - known_budget) / known_budget

Rows where ROI can be calculated: 2
       revenue    budget  roi
1021  120000.0   50000.0  NaN
1028  300000.0  100000.0  NaN


In [13]:
# Count the 'roi' entries that are still missing after the fill step
missing_roi = clean_data['roi'].isna().sum()
print(f"Missing ROI values after calculation: {missing_roi}")

Missing ROI values after calculation: 1


Rows with missing, invalid, or nonsensical dates (such as a `start_date` after `end_date`) were removed. All date fields were standardized to the `yyyy-mm-dd` format to ensure consistency and reliability for time-based analyses. This process improves data quality and ensures accurate temporal comparisons.

In [14]:
# Print rows where start date is after end date.
# The date columns are still plain strings at this point, so compare parsed
# datetimes rather than the raw text: a lexicographic string comparison is only
# correct when every value is already in yyyy-mm-dd form. Unparseable or
# missing dates become NaT, which never compares as "after".
start_parsed = pd.to_datetime(clean_data['start_date'], errors='coerce')
end_parsed = pd.to_datetime(clean_data['end_date'], errors='coerce')
invalid_date_range = clean_data[start_parsed > end_parsed]
print(f"Invalid date ranges (start date after end date): {invalid_date_range[['start_date', 'end_date']]}")


Invalid date ranges (start date after end date):       start_date    end_date
1030  2023-03-01  2022-12-31


In [15]:
# Convert start_date and end_date to datetime; invalid parsing becomes NaT (missing).
# assign() builds a new frame, so this does not write into a filtered slice.
clean_data = clean_data.assign(
    start_date=pd.to_datetime(clean_data['start_date'], errors='coerce'),
    end_date=pd.to_datetime(clean_data['end_date'], errors='coerce'),
)

# Remove rows where either start_date or end_date is missing or invalid
clean_data = clean_data.dropna(subset=['start_date', 'end_date'])

# Remove rows where start_date is after end_date (nonsensical date ranges).
# .copy() makes the result an independent frame so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
clean_data = clean_data[clean_data['start_date'] <= clean_data['end_date']].copy()

# Format dates as yyyy-mm-dd strings for consistency
clean_data['start_date'] = clean_data['start_date'].dt.strftime('%Y-%m-%d')
clean_data['end_date'] = clean_data['end_date'].dt.strftime('%Y-%m-%d')

Checked whether any values in the float columns (`budget`, `roi`, `conversion_rate`, `revenue`) contain a comma (`,`), which could indicate improper formatting or parsing issues. This helps ensure all numeric data is correctly recognized for analysis.

In [16]:
# Numeric columns to inspect for stray comma characters
float_cols = ['budget', 'roi', 'conversion_rate', 'revenue']

for col in float_cols:
    # Render the values as text and flag every entry that contains a comma
    comma_mask = clean_data[col].astype(str).str.contains(',', na=False)
    has_comma = comma_mask.any()
    print(f"Column '{col}' has values with commas: {has_comma}")

    # Show a few offending entries when any were found
    if has_comma:
        print(f"Examples from '{col}' with commas:")
        print(clean_data[comma_mask][col].head())

Column 'budget' has values with commas: False
Column 'roi' has values with commas: False
Column 'conversion_rate' has values with commas: False
Column 'revenue' has values with commas: False


Missing values in the 'type' and 'target_audience' columns were filled with 'Unknown' to maintain data integrity without introducing potentially misleading assumptions.

In [17]:
# Label missing 'type' and 'target_audience' entries as 'Unknown' rather than guessing a real category
unknown_fill_cols = ['type', 'target_audience']
clean_data[unknown_fill_cols] = clean_data[unknown_fill_cols].fillna('Unknown')

In [18]:
# Remove every row that still has a missing value in any column
clean_data = clean_data.dropna(axis=0, how='any')

In [19]:
# Progress check: there should be no missing values or invalid data types left

# Column data types first, then the per-column count of missing values
column_dtypes = clean_data.dtypes
remaining_nulls = clean_data.isnull().sum()
print(column_dtypes, remaining_nulls, sep="\n")

campaign_name       object
start_date          object
end_date            object
budget             float64
roi                float64
type                object
target_audience     object
channel             object
conversion_rate    float64
revenue            float64
dtype: object
campaign_name      0
start_date         0
end_date           0
budget             0
roi                0
type               0
target_audience    0
channel            0
conversion_rate    0
revenue            0
dtype: int64


In [20]:
# Summary statistics (count, mean, std, min, 25%, 50%, 75%, max) for the float columns.
# 'budget' is included as well, since it is numeric by this point.
numeric_summary = clean_data.describe()

print(numeric_summary)

             budget          roi  conversion_rate        revenue
count  1.021000e+03  1021.000000      1021.000000    1021.000000
mean   5.911153e+04     0.534878         0.542351  514061.251548
std    3.127481e+05     0.261452         0.267276  286658.491608
min    1.052570e+03     0.000000         0.000000     108.210000
25%    2.476960e+04     0.310000         0.300000  268528.690000
50%    4.691995e+04     0.530000         0.550000  520022.100000
75%    7.500000e+04     0.760000         0.770000  768567.700000
max    9.999999e+06     0.990000         1.500000  999712.490000


In [21]:
# Flag outliers in each float column with the 1.5 * IQR rule
for col in float_cols:
    # First and third quartiles in a single quantile call
    Q1, Q3 = clean_data[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    # A value is an outlier when it falls outside either fence
    is_outlier = (clean_data[col] < lower_fence) | (clean_data[col] > upper_fence)
    outliers = clean_data[is_outlier]
    print(f"\nOutliers in {col}:")
    print(outliers[[col]])
    


Outliers in budget:
         budget
1008  9999999.0

Outliers in roi:
Empty DataFrame
Columns: [roi]
Index: []

Outliers in conversion_rate:
      conversion_rate
1031              1.5

Outliers in revenue:
Empty DataFrame
Columns: [revenue]
Index: []


In [22]:
# Spot-check that the date columns look correctly formatted and ordered
date_preview = clean_data.loc[:, ['start_date', 'end_date']].head()
print(date_preview)

   start_date    end_date
0  2023-04-01  2024-02-23
1  2023-02-15  2024-04-22
2  2022-12-20  2023-10-11
3  2022-09-26  2023-09-27
4  2023-07-07  2024-05-15


In [23]:
# Find all exact duplicate rows. keep=False marks every member of a duplicate
# group, so the first occurrence of each duplicated row is included as well.
duplicate_mask = clean_data.duplicated(keep=False)
duplicates = clean_data[duplicate_mask]

print(f"Number of exact duplicate rows: {len(duplicates)}")
print(duplicates)



Number of exact duplicate rows: 27
                                      campaign_name  start_date    end_date  \
0               Public-key multi-tasking throughput  2023-04-01  2024-02-23   
1                De-engineered analyzing task-force  2023-02-15  2024-04-22   
2     Balanced solution-oriented Local Area Network  2022-12-20  2023-10-11   
3                 Distributed real-time methodology  2022-09-26  2023-09-27   
4               Front-line executive infrastructure  2023-07-07  2024-05-15   
5            Upgradable transitional data-warehouse  2023-06-29  2023-12-13   
6            Innovative context-sensitive framework  2023-03-01  2024-02-23   
7          User-friendly client-driven service-desk  2023-01-06  2023-12-11   
8                     Proactive neutral methodology  2022-09-06  2024-01-11   
9                      Intuitive responsive support  2022-11-25  2024-04-04   
10                Multi-lateral dedicated workforce  2023-06-15  2024-06-15   
11            Cro