# EDA and Modeling Classes for Machine Learning and Data Science Projects

In this notebook I developed a set of classes that let me do EDA faster than working through it step by step, and that reduce the number of lines of code you need to write.
You can write classes for repetitive processes, for example regression, classification, and logistic regression models, among others. I hope this notebook helps you code more efficiently.

## Import libraries for the project

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
#%matplotlib inline
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas_gbq as gbq
import db_dtypes
plt.rcParams['lines.linewidth'] = 1.5
#plt.style.use('fivethirtyeight')
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import SilhouetteVisualizer
from scipy.stats import zscore
from sklearn.cluster import KMeans
from termcolor import colored as cl # text customization

## Outlier detection and treatment class
This class works as follows:
1. First we find the outliers of our dataset using the IQR method; this step is only for exploring the data
2. Next we treat the outliers with different methods: Drop, Cap, or Replace

In [None]:
class OutliersStats:
    """Detect and treat outliers in numeric pandas data.

    The IQR-based methods (``find_outliers_iqr``, ``outliers_drop_method``,
    ``outliers_replace_method``) operate on the whole object passed to the
    constructor and work for both a ``Series`` and a ``DataFrame``.
    ``outliers_cap_method`` works on one named column and therefore needs a
    ``DataFrame``.

    NOTE(review): the IQR methods call ``quantile`` on the whole object, so
    they presumably expect numeric-only data - confirm against callers.
    """

    # Multiplier applied to the IQR to build the lower/upper fences.
    _IQR_FACTOR = 1.5
    # Number of standard deviations used by the capping method.
    _SIGMA_FACTOR = 3

    def __init__(self, dataframe_process):
        """Store the pandas object (Series or DataFrame) to analyse."""
        self.dataframe_process = dataframe_process

    def _iqr_fences(self):
        """Return ``(lower_fence, upper_fence)`` as ``q1 - 1.5*IQR, q3 + 1.5*IQR``."""
        q1 = self.dataframe_process.quantile(0.25)
        q3 = self.dataframe_process.quantile(0.75)
        spread = self._IQR_FACTOR * (q3 - q1)
        return q1 - spread, q3 + spread

    def _outlier_mask(self):
        """Return a boolean mask that is True where a value lies outside the fences."""
        lower_fence, upper_fence = self._iqr_fences()
        data = self.dataframe_process
        return (data < lower_fence) | (data > upper_fence)

    def find_outliers_iqr(self):
        """Print a short summary of the IQR outliers and return them.

        Returns:
            The data indexed by the outlier mask. For a Series this contains
            only the outlier rows; for a DataFrame it has the original shape
            with every non-outlier cell set to NaN.
        """
        mask = self._outlier_mask()
        outliers = self.dataframe_process[mask]
        # Count flagged cells from the mask itself: masking a DataFrame keeps
        # its shape, so len(outliers) would report the total number of rows.
        n_outliers = int(mask.to_numpy().sum())
        print("number of outliers: " + str(n_outliers))
        print("max outlier value: " + str(outliers.max()))
        print("min outlier value: " + str(outliers.min()))
        return outliers

    def outliers_drop_method(self):
        """Return the data without outliers.

        Rows containing NaN after masking are removed with ``dropna`` (this
        also removes rows that already contained NaN), and the index is reset
        with ``reset_index()``, so the previous index is kept as a column.
        """
        not_outliers = self.dataframe_process[~self._outlier_mask()]
        return not_outliers.dropna().reset_index()

    def outliers_cap_method(self, column_outlier):
        """Cap one column at mean +/- 3 standard deviations, in place.

        Args:
            column_outlier: Name of the column to cap.

        Returns:
            The capped column. The stored DataFrame is modified as well.
        """
        column = self.dataframe_process[column_outlier]
        center = column.mean()
        margin = self._SIGMA_FACTOR * column.std()
        upper_limit = center + margin
        lower_limit = center - margin
        print("Upper limit: " + str(upper_limit))
        print("Lower limit: " + str(lower_limit))
        self.dataframe_process[column_outlier] = np.where(
            column > upper_limit,
            upper_limit,
            np.where(column < lower_limit, lower_limit, column),
        )
        return self.dataframe_process[column_outlier]

    def outliers_replace_method(self):
        """Replace IQR outliers with the mean and return a NumPy array.

        The replacement thresholds are the largest and smallest values that
        are not outliers. The mean is computed over all values, outliers
        included. The result comes from ``np.where`` and is therefore an
        ``ndarray``, not a pandas object.
        """
        data = self.dataframe_process
        lower_fence, upper_fence = self._iqr_fences()
        upper = data[~(data > upper_fence)].max()
        lower = data[~(data < lower_fence)].min()
        replacement = data.mean()
        return np.where((data > upper) | (data < lower), replacement, data)

## Read from Google Cloud Platform
This class lets you load data from databases in Google Cloud Platform by providing your query and a client for your account, created with the JSON credentials of your project

In [None]:
class ReadFromGCP:
    """Run a SQL query through a client object and return the result as a DataFrame.

    NOTE(review): ``client`` is presumably a ``google.cloud.bigquery.Client``
    (the file imports ``bigquery``); this class only relies on it exposing
    ``query(sql, location=...)`` whose result supports
    ``.result().to_dataframe(progress_bar_type=...)``.
    """

    def __init__(self, query, client):
        """Store the SQL text and the client used to execute it."""
        self.query = query
        self.client = client

    def data_read(self, location="US", progress_bar_type="tqdm"):
        """Execute the stored query and return its rows as a DataFrame.

        Args:
            location: Value forwarded as ``location`` to ``client.query``.
                Defaults to ``"US"``, the value that was previously hard-coded.
            progress_bar_type: Value forwarded to ``to_dataframe``. Defaults
                to ``"tqdm"``, the value that was previously hard-coded.

        Returns:
            Whatever ``to_dataframe`` returns for the finished query.
        """
        query_job = self.client.query(self.query, location=location)
        return query_job.result().to_dataframe(progress_bar_type=progress_bar_type)


## Class for EDA Analysis

This class bundles many functions that let us run some of the classic analyses commonly used in EDA
1. Dimension: get the dimension of the dataframe in rows and columns
2. Mean: get the mean of the dataset in a specific column
3. Standard Deviation: get the sd of the dataset in a specific column
4. Missing Values Barplot: the barplot provides a simple plot where each bar represents a column within the dataframe. The height of the bar indicates how complete that column is, i.e., how many non-null values are present
5. Group Mean: we can calculate the mean of specific grouped column like a pivot table in order to get some metrics faster
6. Proportion: proportion of the data in a column

And other interesting functions

In [None]:
class EDA:
    """Shortcuts for common exploratory-data-analysis steps on one DataFrame.

    Printing helpers write to stdout, plotting helpers draw with
    matplotlib/seaborn/missingno and call ``plt.show()``, and
    ``group_mean_metric`` / ``transformacion_base`` return new DataFrames.
    """

    def __init__(self, data):
        """Store the DataFrame that every method works on."""
        self.data = data

    @staticmethod
    def _colorblind_style():
        """Return the installed name of the seaborn colorblind style, or None.

        Matplotlib renamed its bundled seaborn styles with a ``seaborn-v0_8-``
        prefix and later removed the old names, so the name is looked up
        instead of being hard-coded.
        """
        for candidate in ("seaborn-v0_8-colorblind", "seaborn-colorblind"):
            if candidate in plt.style.available:
                return candidate
        return None

    @staticmethod
    def _numeric_corr(frame):
        """Return the correlation matrix of the numeric/bool columns of ``frame``.

        Non-numeric columns are filtered out first because ``DataFrame.corr``
        raises on them in recent pandas versions.
        """
        return frame.select_dtypes(include=["number", "bool"]).corr()

    def get_dim(self):
        """Print the number of rows and columns."""
        print('Filas:', len(self.data))
        print('Columnas:', len(self.data.columns))

    def get_mean(self, column):
        """Print the mean of ``column``."""
        print(f"Mean {column}:", self.data[column].mean())

    def get_standard_dev(self, column):
        """Print the standard deviation of ``column``."""
        print(f"STD {column}:", self.data[column].std())

    def missing_values_bar(self):
        """Show the missingno bar chart of the data."""
        msno.bar(self.data)
        plt.tight_layout()
        plt.show()

    def group_mean_metric(self, column, valor):
        """Return the mean and median of ``valor`` for each group of ``column``.

        Returns:
            DataFrame with the group column, ``'Promedio del <valor>'`` (mean)
            and ``'Media del <valor>'`` (median).
        """
        grouped = self.data.groupby(column)[valor]
        metrica_mean = grouped.mean().to_frame().reset_index()
        metrica_mean = metrica_mean.rename(columns={valor: 'Promedio del ' + valor})
        # NOTE(review): this column holds the median although it is labelled
        # 'Media'; the label is kept because callers may select it by name.
        metrica_mediana = grouped.median().to_frame().reset_index()
        metrica_mediana = metrica_mediana.rename(columns={valor: 'Media del ' + valor})
        return metrica_mean.merge(metrica_mediana, on=column, how='left')

    def proporcion_variables(self, column):
        """Show a horizontal bar chart of the relative frequency of each value in ``column``."""
        plt.figure(figsize=(15, 10))
        self.data[column].value_counts(normalize=True).plot.barh(width=0.4)
        plt.yticks(fontsize=12)
        plt.title("Proporción de ventas por categoría", fontsize=12)
        plt.tight_layout()
        plt.show()

    def get_barplot_agrupado(self, column_categorica, value, funcion_agrupadora):
        """Show a bar chart of ``value`` aggregated per ``column_categorica``.

        Args:
            column_categorica: Column to group by (x axis).
            value: Column to aggregate (bar height).
            funcion_agrupadora: Aggregation passed to ``DataFrame.aggregate``.
        """
        # The style has to be active before drawing to affect this figure.
        # It is applied globally, so later plots use it as well.
        style_name = self._colorblind_style()
        if style_name is not None:
            plt.style.use(style_name)
        grupos = self.data.groupby(by=[column_categorica]).aggregate(
            {value: funcion_agrupadora}).reset_index()
        plt.bar(grupos[column_categorica], grupos[value])
        plt.xticks(rotation=30, ha='right', fontsize=7)
        plt.ylabel(value, fontsize=10)
        plt.yticks(fontsize=10)
        plt.title("Variable categórica: " + column_categorica +
                  " y Métrica: " + value, fontsize=9)
        plt.suptitle("Análisis a nivel de " + column_categorica, fontsize=15)
        plt.tight_layout()
        plt.show()

    def get_pairplot(self, column_categorica, value, indice):
        """Show a seaborn pairplot of the pivoted data (see ``transformacion_base``)."""
        analisis_df = self.transformacion_base(column_categorica, value, indice).reset_index()
        sns.set()
        sns.pairplot(analisis_df)
        # Title the whole figure: plt.title would only label the last subplot.
        plt.suptitle(f"{column_categorica} PairPlot")
        plt.subplots_adjust(top=0.95)
        plt.show()

    def get_heatmap_corr(self, column_categorica, value, indice):
        """Show a correlation heatmap of the pivoted data (see ``transformacion_base``)."""
        analisis_df = self.transformacion_base(column_categorica, value, indice).reset_index()
        plt.figure(figsize=(16, 6))
        correlaciones = self._numeric_corr(analisis_df)
        axis = sns.heatmap(correlaciones, vmin=-1, vmax=1, annot=True)
        axis.set_title('Análisis de correlaciones', fontdict={'fontsize': 12}, pad=12)
        plt.show()

    def get_heatmap_improve(self, column_categorica, value, indice):
        """Alias of ``get_heatmap_corr``, kept so existing callers keep working."""
        self.get_heatmap_corr(column_categorica, value, indice)

    def get_heatmap_corr_v2(self):
        """Show a correlation heatmap of the numeric columns of the raw data."""
        plt.figure(figsize=(10, 8))
        sns.heatmap(self._numeric_corr(self.data),
                    annot=True,
                    linewidths=.5,
                    center=0,
                    cbar=False,
                    cmap="YlGnBu")
        plt.title('Mapa de calor - Correlación variables')
        plt.tight_layout()
        plt.show()

    def get_boxplot_horizontal(self, column_categorica, value):
        """Show horizontal boxplots of ``value``, one per ``column_categorica`` level."""
        axis = sns.boxplot(y=self.data[column_categorica], x=self.data[value])
        axis.set_xlabel(value, fontsize=10)
        axis.set_ylabel(column_categorica, fontsize=10)
        axis.set_title(value + ' Distribution by ' + column_categorica,
                       fontsize=12, fontweight='bold')
        plt.tight_layout()
        plt.show()

    def get_boxplot_vertical(self, column_categorica, value):
        """Show vertical boxplots of ``value`` per ``column_categorica`` level, with means marked."""
        plt.figure(figsize=(16, 6))
        sns.boxplot(x=self.data[column_categorica], y=self.data[value], showmeans=True)
        plt.tight_layout()
        plt.show()

    def transformacion_base(self, column_categorica, value, indice):
        """Return a pivot table summing ``value``.

        Rows are the distinct values of ``indice`` and columns are the
        distinct values of ``column_categorica``.
        """
        # The string "sum" selects the same aggregation as np.sum without the
        # FutureWarning recent pandas emits for the NumPy callable.
        return pd.pivot_table(self.data, values=value, index=[indice],
                              columns=[column_categorica], aggfunc="sum")