# Data Cleaning & Analysis Template
This notebook provides a structured approach to data cleaning and preprocessing for both tabular and geospatial datasets.

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

## 1. Load Data

In [2]:
# ----------------------
# 1. Load Data
# ----------------------
DEFAULT_DATA_DIR = r"C:\Users\manoe.MC_ASUS\Documents\IRON HACK BOOTCAMP\DAFT_Feb 2025\projects\Project 5\1.0 - data\1.1 - raw"

def load_data(file_name):
    """Load a dataset from the default directory and return it as a DataFrame."""
    file_path = os.path.join(DEFAULT_DATA_DIR, file_name)
    return pd.read_csv(file_path)




In [None]:
#2. Exemple Usage 

if __name__ == "__main__":
    # Define datasets
    datasets = {
        "df_final_demo": "df_final_demo.txt",
        "df_final_experiment_clients": "df_final_experiment_clients.txt",
        "df_final_web_data_pt_1": "df_final_web_data_pt_1.txt",
        "df_final_web_data_pt_2": "df_final_web_data_pt_2.txt"
    }
    
    # Load datasets dynamically into separate variables and store them globally
    globals()["df_final_demo"] = load_data(datasets["df_final_demo"])
    globals()["df_final_experiment_clients"] = load_data(datasets["df_final_experiment_clients"])
    globals()["df_final_web_data_pt_1"] = load_data(datasets["df_final_web_data_pt_1"])
    globals()["df_final_web_data_pt_2"] = load_data(datasets["df_final_web_data_pt_2"])

    print("Datasets loaded and assigned to variables explicitly.")


Datasets loaded and assigned to variables explicitly.


## 2. Basic Inspection

In [4]:
def inspect_data(df):
    """Prints basic inspection details of a DataFrame."""
    print("\nDataset Info:")
    print(df.info())
    print("\nSummary Statistics:")
    print(df.describe(include='all'))
    print("\nMissing Values:")
    print(df.isnull().sum())
    print("\nFirst 5 Rows:")
    print(df.head())

def inspect_all_data(dfs):
    """Inspect all datasets dynamically."""
    for df_name, df in dfs.items():
        print(f"\n### Inspecting {df_name} ###")
        inspect_data(df)



In [None]:
inspect_data(df_final_demo)


#missing values: ~15 in each column compared to client_id


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   client_id         70609 non-null  int64  
 1   clnt_tenure_yr    70595 non-null  float64
 2   clnt_tenure_mnth  70595 non-null  float64
 3   clnt_age          70594 non-null  float64
 4   gendr             70595 non-null  object 
 5   num_accts         70595 non-null  float64
 6   bal               70595 non-null  float64
 7   calls_6_mnth      70595 non-null  float64
 8   logons_6_mnth     70595 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 4.8+ MB
None

Summary Statistics:
           client_id  clnt_tenure_yr  clnt_tenure_mnth      clnt_age  gendr  \
count   7.060900e+04    70595.000000      70595.000000  70594.000000  70595   
unique           NaN             NaN               NaN           NaN      4   
top              NaN             Na

In [25]:
inspect_data(df_final_experiment_clients)



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   client_id  70609 non-null  int64 
 1   Variation  50500 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.1+ MB
None

Summary Statistics:
           client_id Variation
count   7.060900e+04     50500
unique           NaN         2
top              NaN      Test
freq             NaN     26968
mean    5.004992e+06       NaN
std     2.877278e+06       NaN
min     1.690000e+02       NaN
25%     2.519329e+06       NaN
50%     5.016978e+06       NaN
75%     7.483085e+06       NaN
max     9.999839e+06       NaN

Missing Values:
client_id        0
Variation    20109
dtype: int64

First 5 Rows:
   client_id Variation
0    9988021      Test
1    8320017      Test
2    4033851   Control
3    1982004      Test
4    9294070   Control


In [26]:
inspect_data(df_final_web_data_pt_1)



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 343141 entries, 0 to 343140
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   client_id     343141 non-null  int64 
 1   visitor_id    343141 non-null  object
 2   visit_id      343141 non-null  object
 3   process_step  343141 non-null  object
 4   date_time     343141 non-null  object
dtypes: int64(1), object(4)
memory usage: 13.1+ MB
None

Summary Statistics:
           client_id             visitor_id                     visit_id  \
count   3.431410e+05                 343141                       343141   
unique           NaN                  62936                        75256   
top              NaN  699275239_82397698587  712824876_8175482950_365042   
freq             NaN                     66                           61   
mean    4.996097e+06                    NaN                          NaN   
std     2.875839e+06                    N

In [27]:
inspect_data(df_final_web_data_pt_2)


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412264 entries, 0 to 412263
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   client_id     412264 non-null  int64 
 1   visitor_id    412264 non-null  object
 2   visit_id      412264 non-null  object
 3   process_step  412264 non-null  object
 4   date_time     412264 non-null  object
dtypes: int64(1), object(4)
memory usage: 15.7+ MB
None

Summary Statistics:
           client_id            visitor_id                      visit_id  \
count   4.122640e+05                412264                        412264   
unique           NaN                 71042                         82841   
top              NaN  722943003_3441581446  875138661_34710212496_881092   
freq             NaN                   104                           104   
mean    5.028227e+06                   NaN                           NaN   
std     2.881828e+06                   Na

## 3. Handle Missing Values

In [8]:
def handle_missing_values(df, method='ffill'):
    if method == 'drop':
        return df.dropna()
    return df.fillna(method=method)

## 4. Handle Duplicates

In [9]:
def remove_duplicates(df):
    return df.drop_duplicates()

## 5. Data Type Conversion

In [10]:
def convert_data_types(df, date_columns=None, category_columns=None):
    if date_columns:
        for col in date_columns:
            df[col] = pd.to_datetime(df[col])
    if category_columns:
        for col in category_columns:
            df[col] = df[col].astype('category')
    return df

## 6. Detect Outliers

In [11]:
def detect_outliers(df, column):
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=df[column])
    plt.title(f'Outlier Detection - {column}')
    plt.show()

## 7. Feature Engineering

In [12]:
def create_features(df):
    if 'column1' in df.columns and 'column2' in df.columns:
        df['new_feature'] = df['column1'] / (df['column2'] + 1e-9)  # Avoid division by zero
    return df

## 8. Normalize/Scale Data

In [13]:
def normalize_data(df, columns):
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

## 9. Save Cleaned Data

In [15]:
def save_data(df, file_path, is_geospatial=False):
    if is_geospatial:
        df.to_file(file_path, driver='GeoJSON')
    else:
        df.to_csv(file_path, index=False)

## Example Usage

In [16]:
if __name__ == "__main__":
    file_path = "your_dataset.csv"
    is_geospatial = False
    
    df = load_data(file_path, is_geospatial)
    inspect_data(df)
    df = handle_missing_values(df, method='ffill')
    df = remove_duplicates(df)
    df = convert_data_types(df, date_columns=['date_column'], category_columns=['category_column'])
    df = create_features(df)
    df = normalize_data(df, columns=['numeric_column'])
    
    if is_geospatial:
        df = clean_geospatial_data(df)
    
    save_data(df, "cleaned_data.csv", is_geospatial)
    print("Data cleaning complete and saved.")

TypeError: load_data() takes 1 positional argument but 2 were given