# Multivariate analysis

Multivariate analysis is a powerful toolkit for deriving deeper insights from data in which multiple variables interact. By examining the relationships and patterns among several variables simultaneously, this approach enhances our understanding of the data and helps us find hidden connections, identify the underlying factors that drive observed trends, and make more informed decisions.

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pyMechkar
import sweetviz as sv
#import pandas_profiling
from ydata_profiling import ProfileReport
from scipy.stats import chi2_contingency
from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import spearmanr
import matplotlib.backends.backend_pdf


Read the data

In [2]:
# Load the CSV below into a DataFrame; the first CSV column is used as the index.
df = pd.read_csv(
    filepath_or_buffer='../sql/home_credit_train_ff.csv',
    index_col=0,
)

As before, divide the features into numerical and categorical types.

In [3]:
# Split the columns of `df` into two lists:
#   cat_cols - columns treated as categorical
#   num_cols - everything else, treated as numeric
cat_cols = []
num_cols = []

for col in df.columns:
    series = df[col]
    # A column counts as categorical when it holds Python objects (typically
    # strings), has a pandas categorical dtype, or takes at most 8 distinct
    # values (flags, ratings and similar low-cardinality codes).
    # `isinstance(..., pd.CategoricalDtype)` replaces the deprecated
    # `pd.api.types.is_categorical_dtype`, which warns on recent pandas.
    is_categorical = (
        series.dtype == 'object'
        or isinstance(series.dtype, pd.CategoricalDtype)
        or series.nunique() <= 8
    )
    if is_categorical:
        cat_cols.append(col)
    else:
        num_cols.append(col)

print(f'string or categorical columns:\n {cat_cols}')
print('--------------------------------------------------------------------------------------------------------------------')
print(f'numeric columns:\n {num_cols}')


string or categorical columns:
 ['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOC

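We will display the numerical columns in scatter plots, plotting each numerical column against every other one and saving the plots to one PDF file per column.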

In [4]:
# Create one PDF file per numeric variable. Each page of that PDF is a scatter
# plot of the variable (x axis) against one of the other numeric variables
# (y axis).
for variable in num_cols:
    pdf_filename = f'{variable}_plots.pdf'
    with PdfPages(pdf_filename) as pdf:
        for other_variable in num_cols:
            if other_variable == variable:
                continue  # skip plotting a variable against itself

            # Build a fresh figure for every page. Creating the figure once
            # per PDF and closing it after the first page would leave the
            # remaining pages on implicitly created, default-sized figures.
            fig, ax = plt.subplots(figsize=(6, 4))
            # rasterized=True keeps the PDF small: the points are embedded as
            # a bitmap instead of one vector object per row.
            ax.scatter(df[variable], df[other_variable], color='blue', alpha=0.7, rasterized=True)
            ax.set_xlabel(variable)
            ax.set_ylabel(other_variable)
            ax.set_title(f'{variable} vs {other_variable}')
            ax.grid(True)
            pdf.savefig(fig)  # append this figure as a new page of the PDF
            plt.close(fig)    # release the figure so memory does not grow with the page count

print("PDF files with plots created.")


PDF files with plots created.


The categorical columns are displayed using count plots, again with one PDF file per column.

In [5]:
# Create one PDF file per categorical variable. Each page of that PDF is a
# count plot of the variable (x axis) split by one of the other categorical
# variables (hue).
for variable in cat_cols:
    pdf_filename = f'{variable}_plots.pdf'
    with PdfPages(pdf_filename) as pdf:
        for other_variable in cat_cols:
            if other_variable == variable:
                continue  # skip plotting a variable against itself

            # Build a fresh figure for every page so that each page really
            # gets the requested size; a single figure created per PDF is
            # gone after the first close.
            fig, ax = plt.subplots(figsize=(6, 4))
            sns.countplot(data=df, x=variable, hue=other_variable, rasterized=True, ax=ax)
            ax.set_xlabel(variable)
            ax.set_ylabel('Count')
            ax.set_title(f'{variable} vs {other_variable}')
            ax.grid(True)
            ax.legend(title=other_variable)
            ax.tick_params(axis='x', labelrotation=45)
            # Save only after the legend and tick rotation are applied;
            # saving earlier would write the page without them.
            # bbox_inches='tight' keeps the rotated tick labels on the page.
            pdf.savefig(fig, bbox_inches='tight')
            plt.close(fig)  # release the figure so memory does not grow with the page count

print("PDF files with plots created.")

PDF files with plots created.


In [None]:
# Write a single PDF with one count plot per categorical column, each split by
# TARGET, and also show every plot inline in the notebook.
# The context manager finalises the PDF even if a plot raises part-way through
# the loop; a manual pdf.close() would be skipped in that case.
with PdfPages(r'countplot cat_cols vs target.pdf') as pdf:
    for col in cat_cols:
        if col == 'TARGET':
            continue  # TARGET is the hue, not a column to plot against itself

        fig, ax = plt.subplots(figsize=(4, 3))
        sns.countplot(data=df, x=col, hue='TARGET', ax=ax)
        ax.set_xlabel(col)
        ax.set_ylabel('Count')
        ax.set_title('countplot - ' + col + ' ' + 'vs TARGET')
        ax.legend(title='TARGET')
        ax.tick_params(axis='x', labelrotation=45)
        # bbox_inches='tight' keeps the rotated tick labels inside the page.
        pdf.savefig(fig, bbox_inches='tight', dpi=300)
        plt.show()
        plt.close(fig)  # make sure the figure is released whatever the backend does on show()

Conclusions from the plots created above will appear in the next notebook.