# Chapter 03: Descriptive Analysis
## Prof. Leandro Nunes de Castro

In [None]:
# This content was created as a supporting material for the textbook
# EXPLORATORY DATA ANALYSIS: ...
# ... Descriptive Analysis, Visualization and Dashboard Design (with codes in Python)
# authored by Leandro de Castro (c), 2023-2024
# All rights reserved

# Chapter 3 - Descriptive Analysis

In [None]:
# SUMMARY
# 0. Importing the Libraries and Loading the Chapter Data
# 1. Central Tendency and Dispersion Measures: One Variable at a Time
# 2. Central Tendency and Dispersion Measures: All Variables at Once
# 3. Association Measures
# 4. Analyzing Through Visualization
# Final challenge

##  0. Importing the Libraries and Loading the Chapter Data

In [None]:
import statistics as st # Built in Python library for descriptive statistics
import pandas as pd  # Data manipulation and analysis library
import researchpy as rp  # Open source library focused on univariate and bivariate analysis
import numpy as np  # General purpose array processing package
import seaborn as sns  # Data visualization library based on matplotlib
import matplotlib.pyplot as plt  # Data visualization library
import scipy.stats as spy  # Statistical library from Scipy

In [None]:
import statistics as st # Built in Python library for descriptive statistics
import pandas as pd  # Data manipulation and analysis library
import researchpy as rp  # Open source library focused on univariate and bivariate analysis
import numpy as np  # General purpose array processing package
import seaborn as sns  # Data visualization library based on matplotlib
import matplotlib.pyplot as plt  # Data visualization library
import scipy.stats as spy  # Statistical library from Scipy
from scipy.stats import norm, kurtosis, laplace, semicircular
from scipy import stats
from scipy.stats import gmean, hmean, trim_mean

In [None]:
# Dataset 1: Mammographic Mass (the source reports missing values)
# https://archive.ics.uci.edu/ml/datasets/Mammographic+Mass
mammo_path = 'mammographic_masses_nominal.csv'
dmammo = pd.read_csv(mammo_path)
dmammo.shape  # (number of rows, number of columns)

In [None]:
# Preview the first rows of the Mammographic dataset.
# head is a method: without the call the cell only shows the bound-method repr.
dmammo.head()

In [None]:
# Dataset 2: Forest Fires (the source reports no missing values)
# https://archive.ics.uci.edu/ml/datasets/forest+fires
forest_path = 'forestfires.csv'
dforest = pd.read_csv(forest_path)
dforest.shape  # (number of rows, number of columns)

In [None]:
# Preview the first rows of the Forest Fires dataset.
# head is a method: without the call the cell only shows the bound-method repr.
dforest.head()

## Section 3.3: Frequency Distributions 

In [None]:
# Frequency distribution, frequency table and pie chart of the nominal
# variable 'Shape' in the Mammographic dataset

SShape = dmammo['Shape']
ftable = SShape.value_counts()  # Absolute frequency per category (NaN excluded)

# Percentages are taken over the counted (non-missing) observations so that
# the table adds up to 100% and agrees with the pie chart below.
nvalid = ftable.sum()
rftable = ftable/nvalid*100  # Relative frequency (%)
cftable = ftable.cumsum()/nvalid*100  # Cumulative frequency (%)

# Keep the category labels as the table index instead of discarding them
df = pd.DataFrame({'Frequency': ftable,
                   'Relative Frequency': rftable,
                   'Cumulative Frequency': cftable})
df.index.name = 'Shape'
print(df)

fig, figftable = plt.subplots()
figftable.pie(ftable.to_list(), labels=ftable.index.to_list(),
              autopct='%1.2f%%')  # From Matplotlib
plt.show()

In [None]:
# Preview the first rows of the Forest Fires dataset (head must be called)
dforest.head()

In [None]:
# Frequency distribution, frequency table and histogram of a continuous
# variable in the Forest Fires dataset

var = 'temp'  # Choose the target variable
svalues = dforest[var]
nbins = 10
inflimit = 0; suplimit = max(svalues)
ampl = (suplimit - inflimit)/nbins  # Bin amplitude (width)

# nbins+1 equally spaced edges from inflimit to suplimit. linspace hits both
# end points exactly, whereas arange with a float step can add or drop the
# last edge because of rounding.
fbins = np.linspace(inflimit, suplimit, nbins + 1)

# pandas.cut assigns each value to a right-closed bin (a, b]; note that a
# value exactly equal to inflimit would fall outside the first bin.
# sort_index() puts the bins back in ascending order so that the cumulative
# frequency accumulates along the variable's axis, not by bin popularity.
ftable = pd.cut(svalues, fbins).value_counts().sort_index()  # Absolute frequency
rftable = ftable/len(svalues)*100  # Relative frequency (%)
cftable = ftable.cumsum()/len(svalues)*100  # Cumulative frequency (%)
df = pd.DataFrame({'Bins': ftable.index.to_list(),
                   'Frequency': ftable.to_list(),
                   'Relative Frequency': rftable.to_list(),
                   'Cumulative Frequency': cftable.to_list()})
print(df)

# Histogram drawn with the same bin edges as the table
fig, ax = plt.subplots()
sns.histplot(dforest, x=var, bins=fbins, kde=True, ax=ax)
ax.set_xticks(fbins)
plt.show()

In [None]:
# BONUS CODE
# Recover the number of bins, the bin width and the bin edges that Seaborn
# chose when bins='auto' is passed to histplot
dforest = pd.read_csv('forestfires.csv')
var = 'temp'
ax = sns.histplot(dforest, x=var, bins='auto', kde=True)

# Each histogram bar is one rectangle patch; order them from left to right
bars = sorted(ax.patches, key=lambda bar: bar.get_x())
num_bins = len(bars)
bin_width = (max(dforest[var]) - min(dforest[var]))/num_bins

# The edges are the left side of every bar plus the right side of the last
# one. (ax.get_xticks() would return axis tick positions, not bin edges.)
bin_edges = np.array([bar.get_x() for bar in bars]
                     + [bars[-1].get_x() + bars[-1].get_width()])
print(num_bins, bin_width, bin_edges)

In [None]:
# Plot distributions with different shapes
# Load the forest fires dataset from UCI
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
dforest = pd.read_csv(url)

# One histogram per variable; kde=True overlays a kernel density estimate
# (the parameter is a boolean flag). The next cell shows the loop version.
sns.histplot(dforest, x='month', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='day', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='FFMC', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='DMC', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='DC', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='ISI', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='RH', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='wind', bins='auto', kde=True); plt.show()

In [None]:
# Plot distributions with different shapes
# Load the forest fires dataset from UCI
# Alternative implementation: loop over the column names
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
dforest = pd.read_csv(url)
var = ['month','day','FFMC','DMC','DC','ISI','RH','wind']
for column in var:
    # kde=True overlays a kernel density estimate (boolean flag)
    sns.histplot(dforest, x=column, bins='auto', kde=True)
    plt.show()

In [None]:
# Contingency tables (cross-tabulations) for the Mammographic dataset
cols = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"

# The raw file has no header row and marks missing values with '?';
# every row containing a missing value is discarded
dmammo = pd.read_csv(url, names=cols, na_values='?').dropna()

# Cross each categorical attribute with the class variable 'Severity'
print('**Contingency Tables**')
var = ['Shape','Margin','Density']
for attribute in var:
    CT = pd.crosstab(dmammo[attribute], dmammo['Severity'])
    print('Variables',attribute, 'and Severity:\n',CT)

## Section 3.4: Central Tendency and Dispersion Measures 
### One Variable at a Time 

In [None]:
# Central tendency measures, one variable at a time, using the Statistics
# library: mean, median and midpoint for numeric variables, mode for
# nominal ones
print('**Forest Fires Dataset**')

# Numeric variables
for var in ['FFMC', 'temp']:
    values = dforest[var]
    midpoint = (max(values) + min(values))/2  # Average of the two extremes
    print('\n*Numeric Variable {}*'.format(var))
    print('Mean of variable {}: {:.2f}'.format(var, st.mean(values)))
    print('Median of variable {}: {:.2f}'.format(var, st.median(values)))
    print('Midpoint of variable {}: {:.2f}'.format(var, midpoint))

# Nominal variables
print('\n*Categorical Variables*')
for var in ['month', 'day']:
    print('Mode of nominal variable {}: {}'.format(var, st.mode(dforest[var])))

In [None]:
# Plot the central tendency measures over the histogram
var = 'temp'  # Choose the target variable
mean = st.mean(dforest[var])
median = st.median(dforest[var])
midpoint = (max(dforest[var]) + min(dforest[var]))/2
# Report the variable actually selected instead of a hard-coded name
print('Mean, median and midpoint for {}:'.format(var), mean, median, midpoint)

# Draw everything on one explicit Axes so the lines land on the histogram
fig, ax = plt.subplots()
sns.histplot(dforest, x=var, bins='auto', kde=True, ax=ax)
ax.axvline(x=mean, color='r', linestyle='--', label='Mean')
ax.axvline(x=median, color='g', linestyle='-', label='Median')
ax.axvline(x=midpoint, color='b', linestyle=':', label='Midpoint')
ax.legend()  # Add a legend
plt.show()

In [None]:
# BONUS CODE
# Mean of every numeric variable and mode of every nominal variable,
# computed one by one with the Statistics library
print('**Forest Fires Dataset**')

# Numeric variables
print('\n*Numeric Variables*')
for var in ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI',
            'temp', 'RH', 'wind', 'rain', 'area']:
    print('Mean of variable {}: {:.2f}'.format(var, st.mean(dforest[var])))

# Nominal variables
print('\n*Categorical Variables*')
for var in ['month', 'day']:
    print('Mode of nominal variable {}: {}'.format(var, st.mode(dforest[var])))

In [None]:
# Calculating the mean of a frequency distribution (Eq. 3.4)
# Create a DataFrame from the frequency distribution data
data = {'Bins': ['(16.65, 19.98]', '(19.98, 23.31]', '(13.32, 16.65]',
                 '(23.31, 26.64]', '(9.99, 13.32]', '(26.64, 29.97]',
                 '(3.33, 6.66]', '(6.66, 9.99]', '(29.97, 33.3]',
                 '(0.0, 3.33]'],
        'Frequency': [128, 119, 75, 69, 47, 30, 20, 15, 13, 1]}
df = pd.DataFrame(data)

# Parse the lower and upper limit of each bin label '(a, b]'.
# Stripping the bracket characters literally avoids str.replace, whose
# treatment of '(' depends on the pandas default for the regex parameter.
limits = (df['Bins'].str.strip('(]')
          .str.split(', ', expand=True)
          .astype(float))

# Midpoint of each bin (interval)
df['Midpoint'] = (limits[0] + limits[1])/2
# Multiply the midpoint by the frequency to get the product
df['Product_fx'] = df['Midpoint'] * df['Frequency']
# Sum the products and frequencies
sprod = df['Product_fx'].sum()
sfreq = df['Frequency'].sum()
# Mean of the frequency distribution: sum(f*x) / sum(f)
mean = sprod / sfreq
print(df)
print('Mean of the frequency distribution: {:.2f}'.format(mean))

In [None]:
# Calculate the weighted average (Eq. 3.5), geometric (Eq. 3.6)
# harmonic (Eq. 3.7), and trimmed (Eq. 3.8) means

# Seeded generator so the illustrative random weights are reproducible
rng = np.random.default_rng(42)

for position, var in enumerate(['FFMC', 'temp']):
    values = dforest[var]
    # Weights must be non-negative: uniform draws in [0, 1). Standard-normal
    # weights can be negative and sum close to zero, which makes the
    # weighted average arbitrarily large or negative.
    weights = rng.random(len(values))
    wavg = np.average(values, weights=weights)
    gavg = spy.gmean(values)  # From Scipy library
    havg = spy.hmean(values)  # From Scipy library
    tavg = spy.trim_mean(values, 0.05)  # Cuts 5% of the data from each tail

    if position > 0:
        print()  # Blank line between the two variables
    print('Weighted average of variable {}: {:.2f}'.format(var, wavg))
    print('Geometric mean of variable {}: {:.2f}'.format(var, gavg))
    print('Harmonic mean of variable {}: {:.2f}'.format(var, havg))
    print('Trimmed mean of variable {}: {:.2f}'.format(var, tavg))

In [None]:
# Central tendency measures for selected Forest Fires columns, collected
# in a nested dictionary {column: {measure: value}} and then printed
names = ['FFMC','DMC','DC', 'ISI', 'temp', 'RH', 'wind', 'rain']

# One Series per column of interest
ffmc, dmc, dc, isi, temp, rh, wind, rain = (dforest[name] for name in names)

CTM = {}
for col_name, col_data in zip(names, (ffmc, dmc, dc, isi, temp, rh, wind, rain)):
    CTM[col_name] = {
        'Mean': np.mean(col_data),
        'Median': np.median(col_data),
        # Midpoint: average of the largest and smallest value
        'Midpoint': (np.max(col_data) + np.min(col_data)) / 2,
        # Each observation is weighted by the burned area of that record
        'Weighted Mean': np.average(col_data, weights=dforest['area']),
        'Geometric Mean': spy.gmean(col_data),
        'Harmonic Mean': spy.hmean(col_data),
        # Cuts 10% of the data from each tail before averaging
        'Trimmed Mean': spy.trim_mean(col_data, proportiontocut=0.1),
    }

# Report: column name followed by one indented line per measure
for col_name in CTM:
    print(col_name)
    for stat_name, stat_value in CTM[col_name].items():
        print(f"\t{stat_name}: {stat_value:.2f}")

In [None]:
# Variability measures computed with Numpy: range (Eq. 3.9), IQR (Eq. 3.10),
# sIQR (Eq. 3.11), variance (Eq. 3.12), std (Eq. 3.14), and CV (Eq. 3.16)

var = 'FFMC'
column = dforest[var]

drange = np.max(column) - np.min(column)
Q1, Q3 = np.percentile(column, [25,75])  # First and third quartiles
IQR = Q3 - Q1
sIQR = IQR / 2  # Semi-interquartile range
# np.var / np.std are called with their default arguments here
dvar = np.var(column)
dstd = np.std(column)
CV = dstd / np.mean(column) * 100  # Coefficient of variation, in %

print('*Variability Measures*')
report = [('Range of variable FFMC: {:.2f}', drange),
          ('IQR of variable FFMC: {:.2f}', IQR),
          ('sIQR of variable FFMC: {:.2f}', sIQR),
          ('Variance of variable FFMC: {:.2f}', dvar),
          ('Standard deviation of variable FFMC: {:.2f}', dstd),
          ('Variation coefficient of variable FFMC: {:.2f}', CV)]
for template, value in report:
    print(template.format(value))

## Measures of Shape 

In [None]:
# Skewness and Skewed distributions
# Generate random data with a right-skewed distribution
from scipy.stats import skew

rng = np.random.default_rng(42)  # Seeded so the example is reproducible
data = rng.beta(a=1, b=5, size=1000)  # Beta distribution
mean = st.mean(data)
median = st.median(data)

# Continuous samples almost never repeat a value, so st.mode() would just
# return an arbitrary observation. Estimate the mode as the centre of the
# most populated histogram bin instead.
counts, edges = np.histogram(data, bins='auto')
peak = np.argmax(counts)
mode = (edges[peak] + edges[peak + 1])/2

print('Mean, median and mode: {:.2f} {:.2f} {:.2f}'
      .format(mean,median,mode))
print('Skewness (Fischer-Pearson Coefficient): {:.2f}'
      .format(skew(data)))
# Pearson's first skewness coefficient: (mean - mode) / standard deviation
print('Skewness (First Skewness Coefficient): {:.2f}'
      .format((mean-mode)/np.std(data)))

fig, ax = plt.subplots()
sns.histplot(data, bins='auto', kde=True, ax=ax)
ax.axvline(x=mean, color='r', linestyle='--', label='Mean')
ax.axvline(x=median, color='g', linestyle='-', label='Median')
ax.axvline(x=mode, color='b', linestyle=':', label='Mode')
ax.legend(); plt.show()

In [None]:
# Skewness and Skewed distributions
# Generate random data with a left-skewed distribution
from scipy.stats import skew

rng = np.random.default_rng(42)  # Seeded so the example is reproducible
data_neg = rng.beta(a=5, b=1, size=1000)  # Beta distribution
mean = st.mean(data_neg)
median = st.median(data_neg)

# Continuous samples almost never repeat a value, so st.mode() would just
# return an arbitrary observation. Estimate the mode as the centre of the
# most populated histogram bin instead.
counts, edges = np.histogram(data_neg, bins='auto')
peak = np.argmax(counts)
mode = (edges[peak] + edges[peak + 1])/2

print('Mean, median and mode: {:.2f} {:.2f} {:.2f}'
      .format(mean,median,mode))
print('Skewness (Fischer-Pearson Coefficient): {:.2f}'
      .format(skew(data_neg)))
# Pearson's first skewness coefficient: (mean - mode) / standard deviation
print('Skewness (First Skewness Coefficient): {:.2f}'
      .format((mean-mode)/np.std(data_neg)))

fig, ax = plt.subplots()
sns.histplot(data_neg, bins='auto', kde=True, ax=ax)
ax.axvline(x=mean, color='r', linestyle='--', label='Mean')
ax.axvline(x=median, color='g', linestyle='-', label='Median')
ax.axvline(x=mode, color='b', linestyle=':', label='Mode')
ax.legend(); plt.show()

In [None]:
# Kurtosis: mesokurtic, platykurtic and leptokurtic distributions

# Draw 10000 samples from each reference distribution
dnorm = norm.rvs(size=10000)                    # Normal (Mesokurtic)
dunif = np.random.uniform(0.01, 0.10, 10000)    # Uniform (Platykurtic)
dlap = laplace.rvs(loc=0, scale=1, size=10000)  # Laplace (Leptokurtic)
dwigner = semicircular.rvs(size=10000)          # Wigner semicircle

samples = [('Normal', dnorm),
           ('Uniform', dunif),
           ('Laplace', dlap),
           ('Wigner Semicircle', dwigner)]

# Same report for every distribution: print the kurtosis computed by
# scipy.stats.kurtosis and show it in the title of the histogram
for name, sample in samples:
    k = spy.kurtosis(sample)
    print('Kurtosis: {:.2f}'.format(k))
    sns.histplot(sample, bins='auto', kde=True)
    plt.title(f"{name} Distribution - Kurtosis: {k:.2f}")
    plt.show()

In [None]:
# Skewness and kurtosis of the Forest Fires variables, with a kurtosis label

# Load the dataset and keep only the columns being analysed
dforest = pd.read_csv("forestfires.csv")
shape_cols = ['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']
selected = dforest[shape_cols]

# Pandas computes one value per column
skewness = selected.skew()
kurt = selected.kurtosis()

# Print both measures; the sign of the kurtosis decides the label
# (positive: leptokurtic, negative: platykurtic, otherwise mesokurtic)
for var in skewness.index:
    print(f"{var} Skewness: {skewness[var]:.2f}")
    if kurt[var] > 0:
        print(f"{var} Kurtosis: {kurt[var]:.2f} (Leptokurtic)")
    elif kurt[var] < 0:
        print(f"{var} Kurtosis: {kurt[var]:.2f} (Platykurtic)")
    else:
        print(f"{var} Kurtosis: {kurt[var]:.2f} (Mesokurtic)")

In [None]:
# Plot Normal Distributions from the density formula

# Mean and standard deviation of each curve (paired position by position)
vmu = np.array([-1,0,1])
vsigma = np.array([.5,1,1.5])

# Points at which the densities are evaluated
x = np.linspace(-5,5,100)

# One curve per (mu, sigma) pair
for mu, sigma in zip(vmu, vsigma):
    scale = 1/(sigma*np.sqrt(2*np.pi))    # Normalising constant
    exponent = -((x-mu)**2)/(2*sigma**2)
    y = scale * np.exp(exponent)
    plt.plot(x, y, label=f'μ={mu},σ={sigma}')

# Title, axis labels and legend
plt.title('Normal Distributions')
plt.xlabel('x')
plt.ylabel('Probability density')
plt.legend()
plt.show()

In [None]:
# BONUS CODE
# Plot Normal Distributions using the pdf() function from Scipy

# Mean and standard deviation of each curve (paired position by position)
vmu = np.array([-1,0,1])
vsigma = np.array([.5,1,1.5])

# Points at which the densities are evaluated
x = np.linspace(-5,5,100)

# One curve per (mu, sigma) pair; norm.pdf takes the mean as loc and the
# standard deviation as scale
for mu, sigma in zip(vmu, vsigma):
    y = norm.pdf(x, loc=mu, scale=sigma)
    plt.plot(x, y, label=f'μ={mu},σ={sigma}')

# Title, axis labels and legend
plt.title('Normal Distributions')
plt.xlabel('x')
plt.ylabel('Probability density')
plt.legend()
plt.show()

In [None]:
# Plot the standard Normal Distribution and mark the points located
# -3, -2, -1, 0, 1, 2 and 3 standard deviations from the mean

std = 1; mean = 0

# Range of x values (which correspond to z-scores)
x = np.linspace(-4, 4, 1000)

# Probability density function (PDF) of the standard normal distribution
y = (1 / (np.sqrt(2 * np.pi))) * np.exp(-(x ** 2) / 2)

# Curve plus one dashed vertical line per multiple of the standard deviation
fig, vax = plt.subplots()
vax.plot(x, y)
for multiple in range(-3, 4):
    vax.axvline(mean + multiple * std, color='r', linestyle='--')

# Axis labels and title
vax.set(xlabel="x", ylabel="PDF",
        title="Normal Distribution with Standard Deviation Lines")
plt.show()

## Section 3.5: Measures of Association 

In [None]:
# Covariance matrix of the numeric features of the Forest Fires dataset

# Load the dataset
dforest = pd.read_csv('forestfires.csv')

# Numeric features that enter the matrix
features = ['X','Y','FFMC','DMC','DC','ISI','temp','RH','wind','rain']

# Show floats with two decimal places (this display option stays in effect
# for the following cells as well)
pd.set_option('display.float_format', '{:.2f}'.format)

# Pairwise covariance between the selected columns
Mcov = dforest.loc[:, features].cov()
print(Mcov)

In [None]:
# Calculate Correlation Coefficients: PCC, SRCC and KRCC
# for the numerical variables of the Forest Fires dataset

# Read the data into a pandas dataframe
dforest = pd.read_csv("forestfires.csv")
pd.options.display.float_format = '{:.2f}'.format

# Correlation is only defined for numeric columns. Select them explicitly:
# recent pandas versions raise an error when corr() meets the string
# columns 'month' and 'day'.
dnumeric = dforest.select_dtypes(include='number')

# Calculate PCC, SRCC, KRCC
print('**Forest Fires Dataset: PCC, SRCC, KRCC**')
PCC = dnumeric.corr(method='pearson')
print('Pearson Correlation Coefficient (PCC)\n',PCC)
SRCC = dnumeric.corr(method='spearman')
print('\nSpearman Rank Correlation Coefficient (SRCC)\n',SRCC)
KRCC = dnumeric.corr(method='kendall')
print('\nKendall Rank Correlation Coefficient (KRCC)\n',KRCC)

In [None]:
# Calculate association measures: Chi-square, Cramer's V and Point Biserial
# for the categorical variables of the Mammographic dataset
from scipy import stats
from scipy.stats import pointbiserialr

# Read the data from URL into a pandas dataframe
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"
cols = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']
dmammo = pd.read_csv(url, names=cols, na_values='?')
dmammo.dropna(inplace=True)  # Remove rows with missing values

# Chi-square and Cramer's V
cvars = ['Shape', 'Margin', 'Density', 'Severity']  # Categorical variables
# Pre-allocate square tables so rows and columns share the same order
# (the diagonal stays NaN: a variable is not compared with itself)
chis = pd.DataFrame(index=cvars, columns=cvars, dtype=float)
phi = pd.DataFrame(index=cvars, columns=cvars, dtype=float)
for var1 in cvars:
    for var2 in cvars:
        if var1 != var2:
            ctab = pd.crosstab(dmammo[var1], dmammo[var2])
            chi2, p, dof, ex = stats.chi2_contingency(ctab)
            n = ctab.to_numpy().sum()  # Observations in the table
            chis.loc[var1, var2] = chi2
            # Cramer's V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
            phi.loc[var1, var2] = np.sqrt(chi2 / (n * (min(ctab.shape) - 1)))
print('\n**Mammographic Dataset: Chi-Square, Cramer\'s V, PBCC**\n')
print('Chi-Square Correlation Coefficient (chi^2)\n',chis)
print('\nCramer\'s V Correlation Coefficient (phi)\n',phi)

# Point Biserial Correlation between Age and Severity
PBCC, pval = pointbiserialr(dmammo['Severity'], dmammo['Age'])
print('\nPBCC between Age and Severity: {:.2f}'.format(PBCC))

In [None]:
# Scatter plots between pairs of variables from the Forest Fires dataset

# Load the dataset and keep only the relevant columns in a separate frame.
# dforest itself is left untouched: later cells still need its other
# columns (X, Y, month, day) and its original column positions.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
cols = ['DMC', 'DC', 'FFMC', 'ISI', 'temp', 'RH', 'wind', 'area', 'rain']
df = pd.read_csv(url)[cols]

# (x, y) column pairs, one per panel, filled row by row
pairs = [('DMC', 'DC'), ('FFMC', 'ISI'),
         ('temp', 'RH'), ('temp', 'wind'),
         ('wind', 'area'), ('rain', 'DC')]

# Generate the scatter plots; each title shows the Pearson correlation
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))
plt.subplots_adjust(wspace=0.4, hspace=0.4)
for ax, (xcol, ycol) in zip(axs.flat, pairs):
    ax.scatter(df[xcol], df[ycol], alpha=0.5)
    ax.set_xlabel(xcol); ax.set_ylabel(ycol)
    ax.set_title('PCC: {:.2f}'.format(df[xcol].corr(df[ycol], method='pearson')))

plt.show()

In [None]:
import pandas as pd
import seaborn as sns

# Heatmap of the Kendall rank correlations between the columns of the
# Mammographic dataset (dmammo as loaded from the UCI file with missing
# rows removed)
corr_matrix = dmammo.corr(method='kendall')

# Correlations live in [-1, 1]: pin the colour scale to that range so the
# diverging palette is centred on zero
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Scatter plot matrix of selected Forest Fires variables
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
cols = ['DMC', 'DC', 'FFMC', 'ISI', 'temp', 'RH', 'wind', 'area', 'rain']

# Load the dataset and keep the relevant columns, in this order
df = pd.read_csv(url)[cols]

# Pairwise scatter plots with a histogram of each variable on the diagonal
pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(12, 12), diagonal='hist')
plt.show()

In [None]:
# Other measures from the Statistics library for variable temp:
# median, harmonic mean, sample variance and population variance
temp_col = dforest['temp']
print('Median of variable temp: {v1}'.format(v1=st.median(temp_col)))
print('Harmonic mean of variable temp: {v1}'.format(v1=st.harmonic_mean(temp_col)))
print('Variance of variable temp: {v1}'.format(v1=st.variance(temp_col)))
print('pVariance (population var) of variable temp: {v1}'.format(v1=st.pvariance(temp_col)))

In [None]:
# Calculating the standard deviations (stdev) using the Statistics library
print('Forest Fires Dataset\n')
for var in ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI',
            'temp', 'RH', 'wind', 'rain', 'area']:
    # st.stdev is the sample standard deviation
    print('Std of variable {}: {}'.format(var, st.stdev(dforest[var])))

In [None]:
# Summarizing the data using the Pandas library
# Select the nominal columns by name rather than by position, so the split
# does not depend on the column order of the frame
nominal = ['month','day']
dforest1 = dforest[nominal]  # Nominal variables only
dforest2 = dforest.drop(columns=nominal)  # Numeric variables only

# Numeric variables
print('Mean of the numeric variables\n',dforest2.mean())
print('\nStd of the numeric variables\n',dforest2.std())
print('\nMin of the numeric variables\n',dforest2.min())
print('\nQuartiles of the numeric variables\n',dforest2.quantile([.25,.5,.75]))
print('\nMax of the numeric variables\n',dforest2.max())
print('\nAmplitude of the numeric variables\n',dforest2.max()-dforest2.min())
print('\nCoefficient of Variation (%) of the numeric variables\n',(dforest2.std()/dforest2.mean())*100)

# Nominal variables
print('\nMode of the nominal variables\n',dforest1.mode())

## Extra Codes: Central Tendency and Dispersion Measures
### All Variables at Once 

In [None]:
# Summary statistics from the describe() method of the Pandas library
numeric_summary = dforest.describe().round(2)  # Numeric columns, 2 decimals
month_summary = dforest['month'].describe()
day_summary = dforest['day'].describe()

print('Forest Fires Dataset\n')
print('Numerical variables \n',numeric_summary)
print('\nNominal variable: month \n',month_summary)
print('\nNominal variable: day \n',day_summary)

In [None]:
# Shape measures from the skew() and kurtosis() methods of the Pandas library
# dforest2 contains only the numeric variables
numeric_skew = dforest2.skew()
numeric_kurtosis = dforest2.kurtosis()

print('Forest Fires Dataset\n')
print('Numerical variables skew\n',numeric_skew)
print('Numerical variables kurtosis\n',numeric_kurtosis)

## Extra Codes: Association Measures 

In [None]:
# Calculating covariance between variables (pairwise) using Numpy
# Each entry: (text shown in the report, first column, second column)
cov_pairs = [('FFMC and DMC', 'FFMC', 'DMC'),
             ('DMC and DC', 'DMC', 'DC'),
             ('Temperature and Relative Humidity', 'temp', 'RH'),
             ('Rain and Temperature', 'rain', 'temp'),
             ('Wind and Temperature', 'wind', 'temp'),
             ('Wind and Rain', 'wind', 'rain')]
for label, col1, col2 in cov_pairs:
    # np.cov returns a 2x2 matrix; entry [0][1] is the covariance between
    # the first and the second variable
    print('Covariance between {}:'.format(label),
          np.cov(dforest[col1], dforest[col2])[0][1])

In [None]:
# Calculating the Covariance Matrix of the numeric variables using Numpy
# dforest2 has one row per observation and one column per variable, so
# rowvar=False is required: by default np.cov treats each ROW as a variable
# and would return an observations-by-observations matrix.
np.cov(dforest2, rowvar=False)

In [None]:
# Calculating correlation between variables (pairwise) using Numpy
# Each entry: (text shown in the report, first column, second column)
corr_pairs = [('FFMC and DMC', 'FFMC', 'DMC'),
              ('DMC and DC', 'DMC', 'DC'),
              ('Temperature and Relative Humidity', 'temp', 'RH'),
              ('Rain and Temperature', 'rain', 'temp'),
              ('Wind and Temperature', 'wind', 'temp'),
              ('Wind and Rain', 'wind', 'rain')]
for label, col1, col2 in corr_pairs:
    # np.corrcoef returns a 2x2 matrix; entry [0][1] is the correlation
    # between the first and the second variable
    print('Correlation between {}:'.format(label),
          np.corrcoef(dforest[col1], dforest[col2])[0][1])

In [None]:
# Pearson correlation matrix among all numeric variables, using Pandas
# (dforest2 holds only the numeric columns)
dforest2.corr(method='pearson')

## Extra Codes: Analysis Through Visualization 

In [None]:
# Plotting all frequency distributions in a single figure to analyze
# Skewness and Kurtosis using Matplotlib and Seaborn
# Variables X, Y, rain and area are left out
hist_cols = ['month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind']

# 3x3 grid, one histogram (with kernel density estimate) per variable
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
for ax, col in zip(axes.flat, hist_cols):
    sns.histplot(dforest[col], bins='auto', kde=True, ax=ax)
plt.show()

In [None]:
# Box plots of the main numeric variables using Matplotlib
# Variables X, Y, month and day were discarded
# Each entry: (column name, label shown under the box plot)
box_vars = [('FFMC', 'FFMC'), ('DMC', 'DMC'), ('DC', 'DC'),
            ('ISI', 'ISI'), ('temp', 'Temperature'), ('RH', 'RH'),
            ('wind', 'Wind'), ('rain', 'Rain'), ('area', 'Area')]

# Columns are taken by name straight from the DataFrame, so every panel
# receives a numeric Series (no mixed-type array, no positional indices)
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
for ax, (col, label) in zip(axes.flat, box_vars):
    ax.boxplot(dforest[col])
    ax.set_xlabel(label)
plt.show()

In [None]:
# Scatter plots of the pairs of variables
# Each entry: (x column, y column, x label, y label)
scatter_vars = [('FFMC', 'DMC', 'FFMC', 'DMC'),
                ('DMC', 'DC', 'DMC', 'DC'),
                ('temp', 'RH', 'Temperature', 'RH'),
                ('rain', 'temp', 'Rain', 'Temperature'),
                ('wind', 'temp', 'Wind', 'Temperature'),
                ('wind', 'rain', 'Wind', 'Rain')]

# 3x2 grid; columns are selected by name instead of by position
fig, axes = plt.subplots(3, 2, figsize=(12, 12))
for ax, (xcol, ycol, xlabel, ylabel) in zip(axes.flat, scatter_vars):
    # Hollow red circles: red edge, no face colour
    ax.scatter(dforest[xcol], dforest[ycol],
               color='red', facecolors='none', marker='o')
    ax.set_title('Forest Fire')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
plt.show()

## Final Challenge 

In [None]:
# Repeat all the analyses performed for the Forest Fires dataset using the Mammographic dataset
# When necessary, replace missing values and transform nominal variables into numeric ones