In [56]:
#Library Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import os
import csv
from io import StringIO

# Load the raw campaign CSV, keeping only the lines that have the expected
# number of fields, then hand the surviving text to pandas.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a project-relative data directory so the notebook can run elsewhere.
input_path = "/Users/patriciajaquez/Documents/GitHub/module1_project/data/raw/marketingcampaigns.csv"
rows = []
expected_columns = 10

with open(input_path, 'r', encoding='utf-8') as infile:
    for line in infile:
        # Count fields with the csv parser instead of a plain split(','): a quoted
        # value that contains a comma (e.g. "Acme, Inc.") is one field, and a naive
        # split would miscount it and silently discard an otherwise valid row.
        fields = next(csv.reader([line.strip()]), [])
        if len(fields) == expected_columns:
            rows.append(line)

# Join the surviving raw lines (the header goes through the same filter) and parse them
clean_data = pd.read_csv(StringIO(''.join(rows)))

In [57]:
# Summarise the DataFrame: number of entries, column names, non-null count and
# dtype per column, and approximate memory usage.
clean_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   campaign_name    1032 non-null   object 
 1   start_date       1031 non-null   object 
 2   end_date         1030 non-null   object 
 3   budget           1029 non-null   object 
 4   roi              1028 non-null   float64
 5   type             1031 non-null   object 
 6   target_audience  1030 non-null   object 
 7   channel          1031 non-null   object 
 8   conversion_rate  1028 non-null   float64
 9   revenue          1029 non-null   float64
dtypes: float64(3), object(7)
memory usage: 80.8+ KB


In [58]:
# Preview the first 5 rows; as the last expression in the cell, the returned
# DataFrame is rendered as the cell output.
clean_data.head()


Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
0,Public-key multi-tasking throughput,2023-04-01,2024-02-23,8082.3,0.35,email,B2B,organic,0.4,709593.48
1,De-engineered analyzing task-force,2023-02-15,2024-04-22,17712.98,0.74,email,B2C,promotion,0.66,516609.1
2,Balanced solution-oriented Local Area Network,2022-12-20,2023-10-11,84643.1,0.37,podcast,B2B,paid,0.28,458227.42
3,Distributed real-time methodology,2022-09-26,2023-09-27,14589.75,0.47,webinar,B2B,organic,0.19,89958.73
4,Front-line executive infrastructure,2023-07-07,2024-05-15,39291.9,0.3,social media,B2B,promotion,0.81,47511.35


In [59]:
# Preview the last 5 rows; as the last expression in the cell, the returned
# DataFrame is rendered as the cell output.
clean_data.tail()

Unnamed: 0,campaign_name,start_date,end_date,budget,roi,type,target_audience,channel,conversion_rate,revenue
1027,No revenue campaign,2023-02-01,2023-08-01,20000,0.3,social media,B2B,organic,0.5,
1028,Random mess,2023-06-06,,100000,,podcast,,referral,,300000.0
1029,Invalid budget,2022-12-01,2023-06-01,abc,,email,B2C,promotion,0.2,50000.0
1030,Overlapping dates,2023-03-01,2022-12-31,60000,0.6,webinar,B2B,paid,0.7,90000.0
1031,Too many conversions,2023-05-01,2023-11-01,40000,0.8,social media,B2C,organic,1.5,120000.0


In [60]:
# Tally the missing cells column by column, then report the grand total
empty_values = clean_data.isna().sum()
print(empty_values)

print("Total of empty values: ", empty_values.sum())

campaign_name      0
start_date         1
end_date           2
budget             3
roi                4
type               1
target_audience    2
channel            1
conversion_rate    4
revenue            3
dtype: int64
Total of empty values:  21


In [61]:
# Missing cells per column as a percentage of the total number of rows
empty_values_percentage = empty_values.div(len(clean_data)).mul(100)
print(empty_values_percentage)

campaign_name      0.000000
start_date         0.096899
end_date           0.193798
budget             0.290698
roi                0.387597
type               0.096899
target_audience    0.193798
channel            0.096899
conversion_rate    0.387597
revenue            0.290698
dtype: float64


In [62]:
# A row counts as incomplete when at least one of its cells is missing
incomplete_row_flags = clean_data.isna().any(axis="columns")
rows_with_missing = incomplete_row_flags.sum()
print(f"Rows with at least one missing value: {rows_with_missing}")

Rows with at least one missing value: 11


In [63]:
# Flag every row that exactly repeats an earlier row, then count the flags
duplicate_flags = clean_data.duplicated()
duplicates = duplicate_flags.sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 15


In [64]:
# Build the de-duplicated frame unconditionally so that `data` always exists.
# Previously it was only bound inside the `if` branch, which left `data`
# undefined for any later cell whenever the input had no duplicates.
data = clean_data.drop_duplicates()

# Derive the count from the two frames rather than from a variable set in
# another cell, so the message is correct when this cell is re-run on its own.
dropped_count = len(clean_data) - len(data)
if dropped_count > 0:
    print(f"Duplicate rows dropped. Remaining rows: {data.shape[0]}")
else:
    print("No duplicate rows found.")

# NOTE(review): the cells that follow keep reading `clean_data`, not `data`, so
# they still see the duplicated rows; confirm which frame is intended downstream.


Duplicate rows dropped. Remaining rows: 1017


In [65]:
# Select the rows in which every single column is missing
fully_empty_flags = clean_data.isna().all(axis="columns")
empty_rows = clean_data.loc[fully_empty_flags]
print(f"Number of empty rows: {len(empty_rows)}")


Number of empty rows: 0


In [77]:
# Goal: find the rows whose ROI is absent or unparseable while revenue and budget
# are both usable numbers, i.e. the rows for which ROI could be derived.
# Each column is coerced with pd.to_numeric(errors='coerce'), which turns any
# non-numeric entry into NaN so that "missing" and "invalid" are handled alike.
coerced = {
    column: pd.to_numeric(clean_data[column], errors='coerce')
    for column in ('roi', 'revenue', 'budget')
}

# ROI unusable: NaN after coercion
roi_invalid = coerced['roi'].isna()
# Revenue / budget usable: still a number after coercion
revenue_valid = coerced['revenue'].notna()
budget_valid = coerced['budget'].notna()

# All three conditions must hold for a row to be selected
mask = roi_invalid & revenue_valid & budget_valid
rows_to_calculate = clean_data.loc[mask]

# Report how many rows qualify and show the three relevant columns
print(f"Rows where ROI is missing or invalid and can be calculated: {rows_to_calculate.shape[0]}")
print(rows_to_calculate[['revenue', 'budget', 'roi']])

Rows where ROI is missing or invalid and can be calculated: 2
       revenue  budget  roi
1021  120000.0   50000  NaN
1028  300000.0  100000  NaN


In [78]:
def _non_numeric_rows(frame, column):
    """Return the rows of `frame` whose `column` is NaN after numeric coercion (missing or non-numeric)."""
    unparseable = pd.to_numeric(frame[column], errors='coerce').isna()
    return frame[unparseable]


# Collect the offending rows for each of the three numeric columns
invalid_budget = _non_numeric_rows(clean_data, 'budget')
invalid_revenue = _non_numeric_rows(clean_data, 'revenue')
invalid_roi = _non_numeric_rows(clean_data, 'roi')

# Show only the offending column for each case
print(f"Invalid data in budget column: {invalid_budget[['budget']]}")
print(f"Invalid data in revenue column: {invalid_revenue[['revenue']]}")
print(f"Invalid data in ROI column: {invalid_roi[['roi']]}")

Invalid data in budget column:      budget
1003    NaN
1005    NaN
1024    NaN
1029    abc
Invalid data in revenue column:       revenue
1023      NaN
1024      NaN
1027      NaN
Invalid data in ROI column:       roi
1021  NaN
1024  NaN
1028  NaN
1029  NaN


Rows with a missing or invalid (non-numeric) value in the critical numeric column `budget` are removed in the next step, because accurate analysis and ROI calculation require valid numerical data.

In [None]:
# Coerce 'budget' to numbers; anything unparseable (like 'abc') becomes NaN
clean_data['budget'] = pd.to_numeric(clean_data['budget'], errors='coerce')

# Drop the rows with no usable budget (non-numeric or missing). The explicit
# .copy() makes the result an independent frame, so the .loc assignment below
# writes to it directly and cannot trigger a SettingWithCopyWarning.
clean_data = clean_data.dropna(subset=['budget']).copy()

# Back-fill ROI as (revenue - budget) / budget where ROI is missing and revenue
# is present. NaN budgets were dropped above; zero budgets are skipped here
# because dividing by them would write inf/NaN into 'roi'.
# NOTE(review): ROI values already in the data do not appear to follow this
# formula (the first row shown by head() has budget 8082.3, revenue 709593.48
# and roi 0.35), so filled values may be on a different scale; confirm the
# intended ROI definition.
mask = clean_data['roi'].isnull() & clean_data['revenue'].notnull() & clean_data['budget'].ne(0)
fill_revenue = clean_data.loc[mask, 'revenue']
fill_budget = clean_data.loc[mask, 'budget']
clean_data.loc[mask, 'roi'] = (fill_revenue - fill_budget) / fill_budget