In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats
from scipy.stats import zscore

## Read the data file

In [2]:
def load_data(file_path):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like or file-like
        Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, using ``pd.read_csv`` defaults.
    """
    frame = pd.read_csv(file_path)
    return frame

## Find missing values

In [3]:
def identify_missing_data(df):
    """Print and return a per-column summary of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with two columns:
        ``'Missing Count'`` (number of null cells) and
        ``'Missing Percentage'`` (that count as a share of ``len(df)``,
        scaled to 0-100).
    """
    null_counts = df.isna().sum()
    report = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing Percentage': null_counts / len(df) * 100,
    })

    # The report is echoed to stdout as well as returned.
    print("Missing Data Report:")
    print(report)
    return report

## Drop rows with missing values

In [4]:
def drop_missing(df, column):
    """Return ``df`` without the rows whose value in ``column`` is missing.

    The ``column`` argument is now honoured; previously the function always
    filtered on ``'TotalCharges'`` regardless of what was passed in. Calls
    that pass ``'TotalCharges'`` behave and log exactly as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : hashable
        Label of the column whose null entries decide which rows are dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows where ``column`` is not null.
        The original index labels are preserved.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df`` (raised by ``dropna``).
    """
    df_cleaned = df.dropna(subset=[column])
    print(f"Rows removed due to missing {column}: {len(df) - len(df_cleaned)}")
    return df_cleaned

## Find and drop duplicates

In [5]:
def find_duplicate_rows(df):
    """Return every row of ``df`` that is an exact duplicate of another row.

    ``keep=False`` flags all members of each duplicate group, so the first
    occurrence is included in the result alongside its repeats.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The duplicated rows, with their original index labels.
    """
    return df.loc[df.duplicated(keep=False)]


In [6]:
def drop_duplicates(df):
    """Remove repeated rows, keeping the first occurrence of each.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate; it is not modified.

    Returns
    -------
    tuple
        ``(deduplicated_frame, n_duplicates)`` where ``n_duplicates`` is the
        number of rows flagged by ``df.duplicated()`` before dropping.
    """
    n_duplicates = df.duplicated().sum()
    print(f"Number of duplicates before dropping: {n_duplicates}")
    return df.drop_duplicates(), n_duplicates

## Remove outliers

In [7]:
def remove_outliers(df, column_name, threshold=3):
    """Drop the rows whose ``column_name`` value is a z-score outlier.

    A row is an outlier when the absolute z-score of its value exceeds
    ``threshold``. Compared with the previous implementation:

    * the input frame is no longer modified (a temporary ``'z_score'``
      column used to be written into the caller's frame and left there);
    * an existing column called ``'z_score'`` is no longer overwritten and
      removed from the result;
    * missing values are ignored when computing the mean and standard
      deviation. Previously a single NaN made every z-score NaN, which
      caused every row to be dropped.

    Rows whose z-score is undefined (missing value, or a constant column)
    are kept: they are not outliers, and missing data is handled separately
    in this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column_name : hashable
        Label of the numeric column to test.
    threshold : float, default 3
        Maximum absolute z-score a row may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not outliers, with their original index
        labels and columns.
    """
    values = df[column_name].to_numpy(dtype=float, na_value=np.nan)

    # nan_policy='omit' keeps NaNs out of the mean/std; their own z-score
    # stays NaN, and NaN > threshold is False, so those rows are retained.
    z_scores = np.asarray(zscore(values, nan_policy='omit'))
    is_outlier = np.abs(z_scores) > threshold

    # A positional boolean mask avoids index alignment, so frames with
    # duplicate index labels are filtered correctly too.
    df_cleaned = df[~is_outlier]

    rows_affected = len(df) - len(df_cleaned)
    print(f"Number of rows affected by removing outliers: {rows_affected}")
    return df_cleaned

## Find data types

In [8]:
def get_dtypes(df):
    """Return the dtype of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        The value of ``df.dtypes``: one dtype per column, indexed by column
        label.
    """
    column_types = df.dtypes
    return column_types

## Change data type

In [9]:
def change_dtype(df, column, new_dtype):
    """Convert ``column`` of ``df`` to ``new_dtype`` via numeric parsing.

    The column is first parsed with ``pd.to_numeric(..., errors='coerce')``,
    so entries that cannot be read as numbers become NaN, and the parsed
    values are then cast with ``astype(new_dtype)``.

    Note: the conversion is written back into ``df`` itself; the returned
    object is the same frame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; modified in place.
    column : hashable
        Label of the column to convert.
    new_dtype : str or dtype
        Target dtype handed to ``Series.astype``.

    Returns
    -------
    pandas.DataFrame
        ``df``, with ``column`` replaced by the converted values.
    """
    parsed = pd.to_numeric(df[column], errors='coerce')
    df[column] = parsed.astype(new_dtype)
    return df

## Save cleaned data

In [10]:
def save_data(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, creating parent folders if needed.

    The row index is not written (``index=False``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    file_path : str or path-like
        Destination file. A bare filename (no directory part) is written to
        the current working directory.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname() is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when one is named.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(file_path, index=False)

In [12]:
# Paths are resolved against the kernel's current working directory
# (os.getcwd()), which is not necessarily the folder holding this notebook.
project_root = os.getcwd()
raw_data_path = os.path.join(project_root, "../data/raw.csv")
cleaned_data_path = os.path.join(project_root, "../data/cleaned.csv")

# Load raw data
raw_data = load_data(raw_data_path)

# Parse TotalCharges as float64; entries that are not numeric become NaN and
# show up in the missing-data report below. The result is bound to raw_data
# explicitly so the following steps do not depend on change_dtype modifying
# its argument in place.
raw_data = change_dtype(raw_data, 'TotalCharges', 'float64')
df = raw_data  # same frame under the name this cell originally assigned
missing_report = identify_missing_data(raw_data)

# Drop the rows where TotalCharges is missing, then re-run the report to
# confirm the result.
raw_data = drop_missing(raw_data, 'TotalCharges')
missing_report = identify_missing_data(raw_data)

# Show a preview of the duplicated rows before removing them (the label used
# to be printed with nothing after it).
duplicate_rows = find_duplicate_rows(raw_data)
print("Duplicate rows found:")
print(duplicate_rows.head())
raw_data, duplicate_count = drop_duplicates(raw_data)
print(f"Number of duplicates removed: {duplicate_count}")

# Outlier removal is currently disabled.
#raw_data = remove_outliers(raw_data, 'MonthlyCharges')
#raw_data = remove_outliers(raw_data, 'TotalCharges')

print(get_dtypes(raw_data))

# Saving the cleaned data is currently disabled.
#save_data(raw_data, cleaned_data_path)
#print(f"Cleaned data saved to {cleaned_data_path}")

# Last expression of the cell: rendered as the cell's output.
raw_data.head()

Missing Data Report:
                  Missing Count  Missing Percentage
customerID                    0            0.000000
gender                        0            0.000000
SeniorCitizen                 0            0.000000
Partner                       0            0.000000
Dependents                    0            0.000000
tenure                        0            0.000000
PhoneService                  0            0.000000
MultipleLines                 0            0.000000
InternetService               0            0.000000
OnlineSecurity                0            0.000000
OnlineBackup                  0            0.000000
DeviceProtection              0            0.000000
TechSupport                   0            0.000000
StreamingTV                   0            0.000000
StreamingMovies               0            0.000000
Contract                      0            0.000000
PaperlessBilling              0            0.000000
PaymentMethod                 0            

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
