In [467]:
# Relevant libraries
import os

import numpy as np
import pandas as pd

# Class 1: Data Understanding

In [469]:
class DataUnderstanding:
    """Load the three raw CSV files and print exploratory summaries.

    The constructor reads the training features, training labels and test
    features with ``pd.read_csv``. Every other method only prints to stdout
    and returns ``None``.
    """

    def __init__(self, train_values_path, train_labels_path, test_values_path):
        """Read each CSV path into a DataFrame stored on the instance."""
        self.train_values = pd.read_csv(train_values_path)
        self.train_labels = pd.read_csv(train_labels_path)
        self.test_values = pd.read_csv(test_values_path)

    @staticmethod
    def _print_heading(subject, label):
        """Print a blank line, ``"<subject> - <label>:"`` and a 30-dash rule."""
        print(f"\n{subject} - {label}:\n{'-' * 30}")

    def basic_properties_dataframe(self, df, name="DataFrame"):
        """Print the head, shape, ``info()`` output and column list of ``df``.

        Args:
            df: DataFrame to describe.
            name: Label used in every section heading.
        """
        row_count, column_count = df.shape

        self._print_heading(name, "First 5 Rows")
        print(df.head())

        self._print_heading(name, "Shape")
        print(f"Rows: {row_count}, Columns: {column_count}")

        self._print_heading(name, "Info")
        df.info()  # writes its report directly to stdout

        self._print_heading(name, "Columns")
        print(df.columns.tolist())

    def summarize_statistics(self, df, name="DataFrame"):
        """Print ``describe()`` for ``df`` and value counts per text column.

        Args:
            df: DataFrame to summarise.
            name: Label used in the section headings.
        """
        self._print_heading(name, "Summary Statistics")

        # describe() with default arguments
        self._print_heading(name, "Numerical Columns Summary")
        print(df.describe())

        # One value-count table per object/category column
        self._print_heading(name, "Categorical Columns Summary")
        text_like = df.select_dtypes(include=['object', 'category'])
        for column in text_like.columns:
            self._print_heading(column, "Value Counts")
            print(df[column].value_counts())
            print('-' * 30)

    def missing_values_summary(self, df, name="DataFrame"):
        """Print count and percentage of missing values per affected column.

        Only columns with at least one missing value are shown, ordered from
        the highest to the lowest percentage.

        Args:
            df: DataFrame to inspect.
            name: Label used in the section heading.
        """
        self._print_heading(name, "Missing Values Summary")

        null_counts = df.isna().sum()
        null_share = null_counts / len(df) * 100

        report = pd.DataFrame({
            'Missing Values': null_counts,
            'Percentage': null_share,
        })

        affected = report.loc[report['Missing Values'] > 0]
        print(affected.sort_values(by='Percentage', ascending=False))
    


# Class 2 : Data Cleaning

In [471]:
class DataCleaning:
    """Cleaning steps applied to already-loaded train/test DataFrames."""

    def __init__(self, train_values, train_labels, test_values):
        """Store the already-loaded frames; nothing is read from disk here."""
        self.train_values = train_values
        self.train_labels = train_labels
        self.test_values = test_values

    def combine_train_data(self, train_values, train_labels):
        """Return features and labels side by side, without their ``id`` columns.

        The two frames are concatenated column-wise, so rows are paired by
        index label rather than by ``id`` value.
        NOTE(review): assumes both frames list the same ids in the same row
        order - confirm against the caller.
        """
        X_train = train_values.drop(columns=['id'])
        y_train = train_labels.drop(columns=['id'])
        return pd.concat([X_train, y_train], axis=1)

    def handle_missing_values(self, df):
        """Impute or drop the columns known to contain missing values.

        Six categorical columns are filled with their most frequent value,
        ``subvillage`` is filled with ``'Unknown'`` and ``scheme_name`` is
        dropped. A table of remaining missing values is printed.

        The fills are written into ``df`` itself; the returned frame is a new
        object without ``scheme_name``.

        Returns:
            The cleaned DataFrame.

        Raises:
            KeyError: if one of the expected columns is absent from ``df``.
        """
        categorical_columns = ['funder', 'installer', 'public_meeting', 'scheme_management', 'permit', 'wpt_name']
        for col in categorical_columns:
            modes = df[col].mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to fill with,
                # so leave it untouched instead of failing on modes[0].
                continue
            df[col] = df[col].fillna(modes.iloc[0])

        # Impute 'subvillage' with 'Unknown'
        df['subvillage'] = df['subvillage'].fillna('Unknown')

        # Drop the 'scheme_name' column due to high missing values
        df = df.drop(columns='scheme_name')

        # Verifying missing values after handling
        missing_counts = df.isnull().sum()
        missing_percentages = (missing_counts / len(df)) * 100
        print("Missing values after handling:")
        print(pd.DataFrame({
            "Missing Count": missing_counts,
            "Missing Percentage (%)": missing_percentages.round(2)
        }))

        return df

    def drop_columns(self, df, columns_to_drop):
        """Return ``df`` without ``columns_to_drop``."""
        # `columns=` already implies the column axis; passing axis=1 as well
        # was redundant.
        return df.drop(columns=columns_to_drop)

    def extract_date_parts(self, df, date_column):
        """Replace ``date_column`` with integer ``year``/``month``/``day`` columns.

        ``df`` itself receives the parsed dates and the three new columns; the
        returned frame is a new object without ``date_column``.
        """
        df[date_column] = pd.to_datetime(df[date_column])
        df['year'] = df[date_column].dt.year
        df['month'] = df[date_column].dt.month
        df['day'] = df[date_column].dt.day
        return df.drop(columns=[date_column])

    def convert_to_title_case(self, df):
        """Return a copy of ``df`` with every string cell title-cased."""
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the
        # old name is deprecated; prefer the new one when this pandas has it.
        elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
        return elementwise(lambda x: x.title() if isinstance(x, str) else x)

    def convert_column_names_to_title_case(self, df):
        """Title-case the column labels of ``df`` in place and return it."""
        df.columns = df.columns.str.title()
        return df

    def strip_whitespace(self, df):
        """Strip leading/trailing whitespace from string cells, in place.

        Only ``str`` values in object/category columns are changed. Other
        values in those columns (numbers, booleans, missing values) are left
        as they are; the ``.str`` accessor used previously replaced them with
        NaN. Category columns come back as object dtype.

        Returns:
            The same ``df`` object.
        """
        text_columns = df.select_dtypes(include=['object', 'category']).columns
        if len(text_columns) == 0:
            return df

        def _strip(value):
            return value.strip() if isinstance(value, str) else value

        df[text_columns] = df[text_columns].apply(
            lambda col: col.astype(object).map(_strip)
        )
        return df


# Class 3 : Encoding Categorical Columns

In [473]:
class CategoricalEncoder:
    """Encode object-typed predictors and an ordinal target.

    Columns with more distinct values than ``high_cardinality_threshold`` are
    frequency-encoded; the rest are replaced with integer category codes.
    The mappings learned with ``fit=True`` are kept on the instance so another
    frame (for example the test set) can be encoded with ``fit=False`` using
    exactly the same mapping.
    """

    def __init__(self, high_cardinality_threshold=15):
        """Set the cardinality cut-off and start with no learned mappings."""
        self.high_cardinality_threshold = high_cardinality_threshold
        self.ordinal_encoder = None
        self.frequency_maps = {}    # column -> Series of relative frequencies
        self.label_categories = {}  # column -> Index of categories, in code order

    def identify_categorical_columns(self, df):
        """Return the names of the object-dtype columns of ``df`` as a list."""
        return df.select_dtypes(include='object').columns.tolist()

    def frequency_encode(self, df, columns, fit=True):
        """Replace each value in ``columns`` with its relative frequency.

        Args:
            df: Frame to encode; modified in place and returned.
            columns: Column names to encode.
            fit: When true (default) frequencies are computed from ``df`` and
                remembered. When false the remembered frequencies are applied
                instead; values never seen while fitting become NaN.

        Raises:
            KeyError: with ``fit=False`` for a column that was never fitted.
        """
        for col in columns:
            if fit:
                freq = df[col].value_counts(normalize=True)
                self.frequency_maps[col] = freq
            else:
                freq = self.frequency_maps[col]
            df[col] = df[col].map(freq)
        return df

    def label_encode(self, df, columns, fit=True):
        """Replace each value in ``columns`` with an integer category code.

        Missing values, and with ``fit=False`` values never seen while
        fitting, are encoded as ``-1``.

        Args:
            df: Frame to encode; modified in place and returned.
            columns: Column names to encode.
            fit: When true (default) the categories are taken from ``df`` and
                remembered; when false the remembered categories are reused.

        Raises:
            KeyError: with ``fit=False`` for a column that was never fitted.
        """
        for col in columns:
            if fit:
                as_category = df[col].astype('category')
                self.label_categories[col] = as_category.cat.categories
                df[col] = as_category.cat.codes
            else:
                known = self.label_categories[col]
                df[col] = pd.Categorical(df[col], categories=known).codes
        return df

    def encode_predictors(self, df, fit=True):
        """Encode every object column of ``df``.

        With ``fit=True`` (default) the high/low cardinality split is decided
        from ``df``. With ``fit=False`` a column is frequency-encoded exactly
        when it was frequency-encoded during fitting, so both frames end up
        with the same representation.
        """
        categorical_columns = self.identify_categorical_columns(df)
        if fit:
            high_card_cols = [
                col for col in categorical_columns
                if df[col].nunique() > self.high_cardinality_threshold
            ]
        else:
            high_card_cols = [col for col in categorical_columns if col in self.frequency_maps]
        low_card_cols = [col for col in categorical_columns if col not in high_card_cols]

        df = self.frequency_encode(df, high_card_cols, fit=fit)
        df = self.label_encode(df, low_card_cols, fit=fit)
        return df

    def encode_target(self, df, target_column, ordinal_categories):
        """Ordinal-encode ``target_column`` following ``ordinal_categories``.

        A new encoder is built on every call. The previous version kept the
        first encoder it created, so a later call with a different category
        order was silently encoded with the old one. The encoder used last
        remains available as ``self.ordinal_encoder``.
        """
        self.ordinal_encoder = OrdinalEncoder(categories=[ordinal_categories])
        df[target_column] = self.ordinal_encoder.fit_transform(df[[target_column]])
        return df


# Class 4: Boolean To Numeric Conversion

In [475]:
class BooleanToNumericConverter:
    """Turn ``bool`` columns into integer columns.

    ``identify_boolean_columns`` must run first: it records which columns
    ``convert_booleans_to_numeric`` will later cast.
    """

    def __init__(self):
        """Start with no boolean columns recorded."""
        self.boolean_columns = []

    def identify_boolean_columns(self, df):
        """Record and return the names of the bool-dtype columns of ``df``."""
        bool_frame = df.select_dtypes(include='bool')
        self.boolean_columns = bool_frame.columns.tolist()
        return self.boolean_columns

    def convert_booleans_to_numeric(self, df):
        """Cast every recorded boolean column of ``df`` to ``int``, in place.

        Returns:
            The same ``df`` object. Nothing changes when no columns have been
            recorded.
        """
        for name in self.boolean_columns:
            as_int = df[name].astype(int)
            df[name] = as_int
        return df


# Class 5: Outliers Detection And Treatment

In [477]:
class OutlierDetectionTreatment:
    """Detect, plot and remove outliers using a widened quantile range.

    A value is an outlier when it lies more than ``iqr_factor`` times the
    quantile spread below the lower quantile or above the upper quantile.
    The defaults use the 5th and 95th percentiles, not the classic 25th/75th
    quartiles, so the "IQR" here is the 5%-95% spread.
    """

    def __init__(self, iqr_factor=1.5, lower_quantile=0.05, upper_quantile=0.95):
        """Store the fence multiplier and the two quantiles (as fractions)."""
        self.iqr_factor = iqr_factor
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile

    def _bounds(self, series):
        """Return ``(lower_bound, upper_bound)`` of the outlier fence for ``series``."""
        low = series.quantile(self.lower_quantile)
        high = series.quantile(self.upper_quantile)
        margin = self.iqr_factor * (high - low)
        return low - margin, high + margin

    def detect_outliers_iqr(self, df):
        """Return the numeric columns of ``df`` holding at least one outlier."""
        columns_with_outliers = []
        for column in df.select_dtypes(include='number').columns:
            lower_bound, upper_bound = self._bounds(df[column])
            outside = (df[column] < lower_bound) | (df[column] > upper_bound)
            if outside.any():
                columns_with_outliers.append(column)
        return columns_with_outliers

    def visualize_outliers(self, df, columns_with_outliers):
        """Draw one box plot per column, side by side.

        Does nothing for an empty column list. A single column also works:
        ``squeeze=False`` keeps the axes in an array however many there are.
        """
        if len(columns_with_outliers) == 0:
            return

        fig, axes = plt.subplots(
            1, len(columns_with_outliers), figsize=(18, 6), squeeze=False
        )

        for axis, column in zip(axes[0], columns_with_outliers):
            sns.boxplot(x=df[column], ax=axis)
            axis.set_title(f'Box Plot of {column}')
            axis.set_xlabel(column)

        # Adjust layout for better display
        plt.tight_layout()
        plt.show()

    def remove_outliers_iqr(self, df, columns_with_outliers):
        """Return a copy of ``df`` without rows that hold an outlier.

        Bounds are computed from the original ``df`` for every column, so
        removing rows for one column does not move the fence of the next.
        Entries of ``columns_with_outliers`` that are not numeric columns of
        ``df`` are ignored. Rows whose value is missing are kept: a missing
        value is not outside the bounds, which matches
        ``detect_outliers_iqr``.
        """
        numeric_columns = set(df.select_dtypes(include='number').columns)
        df_cleaned = df.copy()

        for column in columns_with_outliers:
            if column not in numeric_columns:
                continue
            lower_bound, upper_bound = self._bounds(df[column])
            values = df_cleaned[column]
            outside = (values < lower_bound) | (values > upper_bound)
            df_cleaned = df_cleaned[~outside]

        return df_cleaned


# Class 6: Handling Construction Year

In [479]:
class HandleConstructionYear:
    """Replace zero values in ``Construction_Year`` with the non-zero median."""

    def __init__(self, df):
        """Keep a reference to the frame that will be modified."""
        self.df = df

    def _show_year_histogram(self, color, title):
        """Plot a 20-bin histogram of the current ``Construction_Year`` values."""
        plt.figure(figsize=(10, 6))
        self.df['Construction_Year'].plot(kind='hist', bins=20, color=color, title=title)
        plt.xlabel('Construction Year')
        plt.ylabel('Frequency')
        plt.show()

    def handle_zero_construction_year(self):
        """Impute zero construction years and show before/after histograms.

        The median is computed with zeros treated as missing, then every zero
        is replaced by it. The column is overwritten on ``self.df``.

        Returns:
            The modified DataFrame.
        """
        self._show_year_histogram(
            'lightblue',
            'Distribution of Construction Year (Before Handling Zeros)',
        )

        years = self.df['Construction_Year']
        # Zeros are masked so they do not pull the median down.
        median_year = years.replace(0, np.nan).median()
        self.df['Construction_Year'] = years.replace(0, median_year)

        self._show_year_histogram(
            'lightgreen',
            'Distribution of Construction Year (After Handling Zeros)',
        )

        return self.df


# Class 7: Exploratory Data Analysis (Univariate Analysis)

In [481]:
class UnivariateEDA:
    """Single-variable plots and a skewness-driven log transform for ``df``."""

    def __init__(self, df):
        """Keep a reference to the frame being analysed."""
        self.df = df

    def plot_target_distribution(self, target_column):
        """Show a count plot of ``target_column``."""
        plt.figure(figsize=(8, 6))
        sns.countplot(x=target_column, data=self.df)
        plt.title(f'Distribution of {target_column}')
        plt.show()

    def plot_histograms(self, column_list):
        """Show 30-bin histograms of ``column_list`` side by side (shared y axis).

        Does nothing for an empty list.
        """
        num_cols = len(column_list)
        if num_cols == 0:
            return

        # squeeze=False always yields a 2-D axes array, so a single column
        # needs no special case.
        fig, axes = plt.subplots(1, num_cols, figsize=(14, 6), sharey=True, squeeze=False)

        for axis, column in zip(axes[0], column_list):
            axis.hist(self.df[column], bins=30, color='skyblue', edgecolor='black')
            axis.set_title(f'Distribution of {column}', fontsize=14)
            axis.set_xlabel(column, fontsize=12)
            axis.set_ylabel('Frequency', fontsize=12)
            axis.grid(axis='y', linestyle='--', alpha=0.7)

        plt.tight_layout()
        plt.show()

    def plot_boxplot(self, column_list):
        """Show box plots of ``column_list`` with whiskers at the 5th/95th percentiles."""
        plt.figure(figsize=(18, 6))
        for i, column in enumerate(column_list, 1):
            plt.subplot(1, len(column_list), i)
            # A pair passed as `whis` is read as percentiles on a 0-100 scale,
            # so the 5%/95% whiskers must be written [5, 95]; [0.05, 0.95]
            # placed them at roughly the minimum.
            sns.boxplot(x=self.df[column], color='skyblue', whis=[5, 95])
            plt.title(f'Boxplot of {column}')
            plt.xlabel(column)
        plt.tight_layout()
        plt.show()

    def check_and_transform_skewness(self, threshold=1):
        """Apply ``log1p`` to float64/int64 columns whose skewness exceeds ``threshold``.

        Columns containing a value of -1 or less are skipped, because
        ``log1p`` is undefined there and would turn those cells into NaN or
        -inf. Columns are overwritten on ``self.df``.

        Args:
            threshold: Absolute skewness above which a column is transformed.

        Returns:
            The modified DataFrame.
        """
        columns = self.df.select_dtypes(include=['float64', 'int64']).columns
        for column in columns:
            values = self.df[column]
            # Written as `not (... > ...)` so an undefined (NaN) skewness is skipped.
            if not abs(values.skew()) > threshold:
                continue
            if values.min() <= -1:
                continue
            self.df[column] = np.log1p(values)
        return self.df

# Class 8: Exploratory Data Analysis (Bivariate Analysis)

In [483]:
class BivariateEDA:
    """Two-variable plots, feature importances and pairwise correlations for ``df``."""

    def __init__(self, df):
        """Keep a reference to the frame being analysed."""
        self.df = df

    def plot_scatter(self, x_column, y_column):
        """Show a scatter plot of ``y_column`` against ``x_column``."""
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=x_column, y=y_column, data=self.df, color='blue')
        plt.title(f'{x_column} vs {y_column}')
        plt.xlabel(x_column)
        plt.ylabel(y_column)
        plt.show()

    def plot_feature_target_relationship(self, feature_column, target_column):
        """Show box plots of ``feature_column`` grouped by ``target_column``."""
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=target_column, y=feature_column, data=self.df)
        plt.title(f'Relationship between {feature_column} and {target_column}')
        plt.show()

    def plot_feature_importance(self, target_column):
        """Fit a random forest on all other columns and print/plot importances."""
        X = self.df.drop(columns=[target_column])
        y = self.df[target_column]
        # Seeded like the other estimators in this file so the ranking is
        # reproducible between runs.
        model = RandomForestClassifier(random_state=42)
        model.fit(X, y)
        importance_df = pd.DataFrame({
            'Feature': X.columns,
            'Importance': model.feature_importances_
        }).sort_values(by='Importance', ascending=False)

        # Print the importance of features
        print("\nFeature Importance:")
        print(importance_df)

        # Plotting feature importance
        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=importance_df)
        plt.title('Feature Importance')
        plt.show()

    def plot_countplot(self, x_column, hue_column=None):
        """Show a count plot of ``x_column``, optionally split by ``hue_column``."""
        plt.figure(figsize=(10, 6))
        sns.countplot(x=x_column, hue=hue_column, data=self.df)
        plt.title(f'Distribution of {x_column}' + (f' by {hue_column}' if hue_column else ''))
        plt.xlabel(x_column)
        plt.ylabel('Count')
        plt.show()

    def _strong_correlations(self, threshold=0.3, top_n=None):
        """Return absolute correlations above ``threshold``, strongest first.

        Each unordered pair of distinct numeric columns appears once, keyed by
        ``(first, second)`` in column order. Pairs are taken from the upper
        triangle of the matrix, so a perfectly correlated pair of different
        columns is kept while a column's correlation with itself is never
        included. Undefined (NaN) correlations are dropped.

        Args:
            threshold: Minimum absolute correlation (exclusive).
            top_n: Keep only this many strongest pairs; ``None`` keeps all.
        """
        # numeric_only=True: non-numeric columns would otherwise raise on pandas >= 2.
        corr_matrix = self.df.corr(numeric_only=True).abs()
        names = list(corr_matrix.columns)
        values = corr_matrix.to_numpy()

        pairs = {
            (first, names[j]): values[i, j]
            for i, first in enumerate(names)
            for j in range(i + 1, len(names))
        }
        if not pairs:
            return pd.Series(dtype=float)

        ranked = pd.Series(pairs).sort_values(ascending=False)
        strong = ranked[ranked > threshold]
        if top_n is not None:
            strong = strong.head(top_n)
        return strong

    def plot_sorted_correlation(self, top_n=20):
        """Print and plot the ``top_n`` strongest feature-pair correlations above 0.3.

        Absolute correlation values are used. When no pair exceeds 0.3 the
        (empty) result is printed and no figure is drawn.
        """
        strong_corr = self._strong_correlations(threshold=0.3, top_n=top_n)

        # Print strong correlations
        print("\nStrong Correlations (>0.3):")
        print(strong_corr)

        if strong_corr.empty:
            # An empty Series cannot be drawn as a bar chart.
            return

        # Plotting the strong correlations
        plt.figure(figsize=(12, 8))
        strong_corr.plot(kind='bar', color='red', alpha=0.7)
        plt.title('Strong Feature Correlations (>0.3)')
        plt.xlabel('Feature Pair')
        plt.ylabel('Correlation Value')
        plt.xticks(rotation=90)
        plt.show()


# Class 9: Exploratory Data Analysis (Multivariate Analysis)

In [485]:
class MultivariateEDA:
    """Correlation-with-target and variance-inflation analysis for ``df``."""

    def __init__(self, df):
        """Keep a reference to the frame being analysed."""
        self.df = df

    def _correlation_with_target(self, target_column):
        """Return each numeric feature's correlation with the target, descending.

        The target's correlation with itself is left out, since it is not a
        feature.

        Raises:
            KeyError: if ``target_column`` is not a numeric column of the frame.
        """
        # numeric_only=True: non-numeric columns would otherwise raise on pandas >= 2.
        correlations = self.df.corr(numeric_only=True)[target_column]
        return correlations.drop(labels=target_column).sort_values(ascending=False)

    def plot_correlation_with_target(self, target_column):
        """Show a bar plot of every numeric feature's correlation with the target."""
        target_corr = self._correlation_with_target(target_column)
        plt.figure(figsize=(10, 6))
        sns.barplot(x=target_corr.index, y=target_corr.values)
        plt.xticks(rotation=90)
        plt.title(f"Correlation of Features with Target Variable ({target_column})")
        plt.show()

    def calculate_vif(self):
        """Compute the VIF of every numeric column and plot those below 5.

        An intercept column is added before the computation and removed from
        the result.

        Returns:
            DataFrame with ``Feature`` and ``VIF`` columns, sorted by VIF in
            descending order and restricted to rows with VIF < 5.
        """
        # Select numeric features
        X = self.df.select_dtypes(include=[np.number])

        # Add constant (intercept) to the data
        X = add_constant(X)

        # Calculate VIF for each feature
        vif_data = pd.DataFrame()
        vif_data["Feature"] = X.columns
        vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

        # Remove the constant term (intercept) from VIF output
        vif_data = vif_data[vif_data["Feature"] != "const"]

        # Sort VIF values in descending order
        vif_data = vif_data.sort_values(by="VIF", ascending=False)

        # Filter for features with VIF less than 5
        vif_filtered = vif_data[vif_data["VIF"] < 5]

        # Plot VIF values for features with VIF less than 5
        plt.figure(figsize=(10, 6))
        sns.barplot(x="VIF", y="Feature", data=vif_filtered, color='green', alpha=0.7)
        plt.title('Variance Inflation Factor (VIF) for Features (VIF < 5)')
        plt.show()

        return vif_filtered


# Class 10: Feature Engineering

In [487]:
class FeatureEngineering:
    """Derive interaction, age and age-group features on ``df``.

    The methods read the title-cased columns ``Population``,
    ``Management_Group``, ``Water_Quality``, ``Quantity``,
    ``Construction_Year``, ``Gps_Height`` and ``Waterpoint_Type`` and write
    new columns onto the stored frame.
    """

    # Age-group labels in ascending order; a label's position is its code.
    _AGE_LABELS = ['0-10', '11-20', '21-50', '50+']

    def __init__(self, df, reference_year=2024):
        """Store the frame and the year from which ages are measured.

        Args:
            df: Frame to extend.
            reference_year: Year that ``Construction_Year`` is subtracted
                from. Defaults to 2024, the value previously hard-coded.
        """
        self.df = df
        self.reference_year = reference_year

    def create_interaction_terms(self):
        """Add ``Pop_Management``, ``Waterpoint_Quality``, ``Years_Since_Construction``
        and ``Years_Height_Interaction``."""
        self.df['Pop_Management'] = self.df['Population'] * self.df['Management_Group']
        self.df['Waterpoint_Quality'] = self.df['Water_Quality'] * self.df['Quantity']
        self.df['Years_Since_Construction'] = self.reference_year - self.df['Construction_Year']

        # Interaction term for years since construction and GPS height
        self.df['Years_Height_Interaction'] = self.df['Years_Since_Construction'] * self.df['Gps_Height']

    def binning_construction_age(self):
        """Bin ``Years_Since_Construction`` into ``Construction_Age_Group``.

        Bins are [0, 10], (10, 20], (20, 50] and (50, inf). The lowest edge is
        inclusive and the last bin is open-ended, so an age of 0 and ages above
        100 get a group instead of becoming NaN. Negative ages still map to NaN.
        """
        self.df['Construction_Age_Group'] = pd.cut(
            self.df['Years_Since_Construction'],
            bins=[0, 10, 20, 50, float('inf')],
            labels=self._AGE_LABELS,
            include_lowest=True,
        )

    def encode_construction_age_group(self):
        """Replace ``Construction_Age_Group`` with ``Construction_Age_Group_Encoded``.

        Codes follow the fixed label order (``'0-10'`` -> 0 ... ``'50+'`` -> 3)
        whichever groups occur in the data; a missing group is encoded as -1.
        """
        groups = pd.Categorical(
            self.df['Construction_Age_Group'],
            categories=self._AGE_LABELS,
            ordered=True,
        )
        self.df['Construction_Age_Group_Encoded'] = groups.codes
        self.df = self.df.drop(columns=['Construction_Age_Group'])  # Dropping the original categorical column

    def apply_feature_engineering(self):
        """Run interaction terms, age binning and age-group encoding in order."""
        self.create_interaction_terms()
        self.binning_construction_age()
        self.encode_construction_age_group()

    def add_custom_features(self):
        """Add ``Waterpoint_Age``, ``Population_Management_Interaction`` and
        ``Waterpoint_Performance``."""
        self.df['Waterpoint_Age'] = self.reference_year - self.df['Construction_Year']
        self.df['Population_Management_Interaction'] = self.df['Population'] * self.df['Management_Group']
        self.df['Waterpoint_Performance'] = self.df['Waterpoint_Type'] * self.df['Water_Quality']

    def get_df(self):
        """Return the current frame, including every feature added so far."""
        return self.df


# Class 11: Solving for Class Imbalance

In [489]:
class ClassImbalanceSolver:
    """Oversample the minority classes of ``df`` with SMOTE and plot the effect."""

    def __init__(self, df, target_column):
        """Split ``df`` into predictors ``X`` and target ``y``.

        Args:
            df: Frame holding the predictors and the target.
            target_column: Name of the target column.
        """
        self.df = df
        self.target_column = target_column
        self.X = df.drop(columns=[target_column])
        self.y = df[target_column]
        # Filled in by apply_smote().
        self.X_resampled = None
        self.y_resampled = None

    def apply_smote(self):
        """Resample ``X``/``y`` with ``SMOTE(random_state=42)``.

        Returns:
            Tuple ``(X_resampled, y_resampled)``; both are also stored on the
            instance.
        """
        smote = SMOTE(random_state=42)
        self.X_resampled, self.y_resampled = smote.fit_resample(self.X, self.y)
        return self.X_resampled, self.y_resampled

    def visualize_class_distribution(self):
        """Show bar charts of the class counts before and after SMOTE.

        If ``apply_smote`` has not been called yet it is called here first.
        Previously the method drew the first chart and then failed on the
        missing resampled target.
        """
        if self.y_resampled is None:
            self.apply_smote()

        # Before SMOTE
        plt.figure(figsize=(8, 5))
        self.y.value_counts().plot(kind='bar', color='lightblue', title="Class Distribution Before SMOTE")
        plt.show()

        # After SMOTE
        plt.figure(figsize=(8, 5))
        self.y_resampled.value_counts().plot(kind='bar', color='lightgreen', title="Class Distribution After SMOTE")
        plt.show()


# Class 12: Standardizing using StandardScaler

In [491]:
class DataScaler:
    """Standardise a train/test pair with one shared ``StandardScaler``."""

    def __init__(self, X_train, X_test):
        """Store both feature sets and create an unfitted scaler."""
        self.X_train, self.X_test = X_train, X_test
        self.scaler = StandardScaler()

    def scale_data(self):
        """Fit the scaler on the training data and transform both sets.

        The test set is transformed with the statistics learned from the
        training set; it is never used for fitting.

        Returns:
            Tuple ``(X_train_scaled, X_test_scaled)``.
        """
        scaled_train = self.scaler.fit_transform(self.X_train)
        scaled_test = self.scaler.transform(self.X_test)
        return scaled_train, scaled_test


# Class 13: Outputting and Loading processed data

In [493]:
class OutputPreprocessedData:
    """Write the scaled feature sets and their labels to CSV files."""

    def __init__(self, X_train_scaled, X_test_scaled, y_train, y_test, output_path="Preprocessed Data/"):
        """Store the four data sets and the directory they will be written to."""
        self.X_train_scaled = X_train_scaled
        self.X_test_scaled = X_test_scaled
        self.y_train = y_train
        self.y_test = y_test
        self.output_path = output_path

    def save_data(self):
        """Write the four data sets as CSV files (without index) into ``output_path``.

        The directory, including missing parents, is created when needed.
        Files written: ``X_train_scaled.csv``, ``X_test_scaled.csv``,
        ``y_train.csv`` and ``y_test.csv``. Existing files are overwritten.
        """
        # exist_ok=True replaces the separate exists() check, which could
        # fail if the directory appeared between the check and the creation.
        os.makedirs(self.output_path, exist_ok=True)

        outputs = {
            'X_train_scaled.csv': self.X_train_scaled,
            'X_test_scaled.csv': self.X_test_scaled,
            'y_train.csv': self.y_train,
            'y_test.csv': self.y_test,
        }
        for filename, data in outputs.items():
            target = os.path.join(self.output_path, filename)
            pd.DataFrame(data).to_csv(target, index=False)

        print(f"Preprocessed data saved to {self.output_path}")


# Class 14: Modeling the 3 models(Logistic Model, Untuned and Tuned Decision Tree) 

In [495]:
class ModelTrainer:
    """Fit the three classifiers compared in this project on the training split."""

    def __init__(self, X_train, y_train, X_test, y_test):
        """Store the train/test splits; only the training split is used for fitting."""
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test

    @staticmethod
    def _default_param_grid():
        """Return the grid searched when ``tuned_decision_tree`` gets no grid."""
        return {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 5, 10, 15, 20, 30],
            'min_samples_split': [2, 5, 10, 20],
            'min_samples_leaf': [1, 2, 5, 10],
            'ccp_alpha': [0.0, 0.01, 0.05, 0.1],
            'max_features': [None, 'sqrt', 'log2'],
        }

    def logistic_regression(self):
        """Return an L2-regularised logistic regression fitted on the training data."""
        classifier = LogisticRegression(
            penalty='l2',
            C=1.0,
            solver='lbfgs',
            random_state=42,
            max_iter=1000,
        )
        classifier.fit(self.X_train, self.y_train)
        return classifier

    def untuned_decision_tree(self):
        """Return a default-parameter decision tree fitted on the training data."""
        tree = DecisionTreeClassifier(random_state=42)
        tree.fit(self.X_train, self.y_train)
        return tree

    def tuned_decision_tree(self, param_grid=None):
        """Grid-search a class-balanced decision tree and return the best estimator.

        Args:
            param_grid: Mapping of parameter name to candidate values. When
                ``None`` a built-in grid over criterion, depth, split/leaf
                sizes, pruning strength and feature count is used.

        Returns:
            The ``best_estimator_`` of a 5-fold ``GridSearchCV`` run on all
            available cores.
        """
        grid = self._default_param_grid() if param_grid is None else param_grid

        base_tree = DecisionTreeClassifier(random_state=42, class_weight='balanced')
        search = GridSearchCV(estimator=base_tree, param_grid=grid, cv=5, n_jobs=-1, verbose=2)
        search.fit(self.X_train, self.y_train)
        return search.best_estimator_


# Class 15: Evaluation Metrics and the 3 models(Logistic Model, Untuned and Tuned Decision Tree)

In [497]:
class ModelEvaluator:
    """Score a fitted classifier on the train and test splits and plot diagnostics."""

    def __init__(self, X_train, y_train, X_test, y_test):
        """Store the train/test splits used for every evaluation."""
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    def evaluate(self, model):
        """Compute accuracy, classification reports and confusion matrices.

        Also shows the test confusion matrix and the test ROC curve(s).
        Each split is predicted once and the predictions are reused for all
        metrics.

        Args:
            model: Fitted classifier exposing ``predict``, ``predict_proba``
                and ``classes_``.

        Returns:
            Dict with keys ``train_accuracy``, ``test_accuracy``,
            ``train_report``, ``test_report``, ``train_conf_matrix`` and
            ``test_conf_matrix``.
        """
        y_train_pred = model.predict(self.X_train)
        y_test_pred = model.predict(self.X_test)

        train_accuracy = accuracy_score(self.y_train, y_train_pred)
        test_accuracy = accuracy_score(self.y_test, y_test_pred)

        train_report = classification_report(self.y_train, y_train_pred)
        test_report = classification_report(self.y_test, y_test_pred)

        train_conf_matrix = confusion_matrix(self.y_train, y_train_pred)
        test_conf_matrix = confusion_matrix(self.y_test, y_test_pred)

        # Visualize Confusion Matrix for Test Data
        self.plot_confusion_matrix(test_conf_matrix, model)

        # Visualize ROC-AUC Curve for Test Data
        self.plot_roc_curve(model)

        return {
            "train_accuracy": train_accuracy,
            "test_accuracy": test_accuracy,
            "train_report": train_report,
            "test_report": test_report,
            "train_conf_matrix": train_conf_matrix,
            "test_conf_matrix": test_conf_matrix
        }

    def plot_confusion_matrix(self, cm, model):
        """Show ``cm`` as an annotated heat map labelled with ``model.classes_``."""
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=model.classes_, yticklabels=model.classes_)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted Labels')
        plt.ylabel('True Labels')
        plt.show()

    def plot_roc_curve(self, model):
        """Show the test-set ROC curve(s) of ``model``.

        With exactly two classes a single curve is drawn from the
        probabilities of the second class. Otherwise one one-vs-rest curve is
        drawn per class, because ``roc_curve`` only accepts binary targets.
        """
        probabilities = model.predict_proba(self.X_test)
        classes = model.classes_

        # Compute every curve before opening a figure, so a failure here does
        # not leave an empty plot behind.
        curves = []
        if len(classes) == 2:
            fpr, tpr, _ = roc_curve(self.y_test, probabilities[:, 1])
            curves.append((fpr, tpr, f'ROC curve (area = {auc(fpr, tpr):.2f})', 'darkorange'))
        else:
            truth = np.asarray(self.y_test).ravel()
            for index, label in enumerate(classes):
                fpr, tpr, _ = roc_curve(truth == label, probabilities[:, index])
                curves.append((fpr, tpr, f'Class {label} vs rest (area = {auc(fpr, tpr):.2f})', None))

        # Plot ROC Curve
        plt.figure(figsize=(8, 6))
        for fpr, tpr, legend_label, color in curves:
            plt.plot(fpr, tpr, color=color, lw=2, label=legend_label)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc='lower right')
        plt.show()
