# ISM 6404: BI and Data Visualization
# Descriptive Analysis
## Prof. Leandro Nunes de Castro

In [None]:
# This content was created as a supporting material for the course
# ISM 6404 - BI and Data Visualization
# Prof. Leandro de Castro (c), Spring 2024
# All rights reserved

# Florida Gulf Coast University

# 1. Frequency Distributions 

In [None]:
# Determining the frequency distribution, frequency table and pie chart
# of variable 'Shape' in the Mammographic dataset

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading dataset1
# https://archive.ics.uci.edu/ml/datasets/Mammographic+Mass
dmammo = pd.read_csv('mammographic_masses_nominal.csv')

# Print the first and last 5 rows. `head` has to be *called*: printing the
# bare attribute only shows the repr of the bound method.
print(pd.concat([dmammo.head(), dmammo.tail()]))

SShape = dmammo['Shape']
# value_counts() skips missing values and orders categories from the most
# to the least frequent; the percentages below are relative to all rows.
ftable = SShape.value_counts()  # Absolute frequency
rftable = ftable/len(SShape)*100  # Relative frequency (%)
cftable = ftable.cumsum()/len(SShape)*100  # Cumulative frequency (%)

# Keep the category labels in the table so that each row is identifiable
df = pd.DataFrame({'Shape':ftable.index.to_list(),
                   'Frequency':ftable.to_list(),
                   'Relative Frequency':rftable.to_list(),
                   'Cumulative Frequency':cftable.to_list()})
print(df)
fig, figftable = plt.subplots()

# Using a color palette with different levels of the same color
# (reversed so that the most frequent category gets the darkest shade)
colors = sns.color_palette("Blues", len(ftable))[::-1]

# Plotting the pie chart with the new color palette
figftable.pie(ftable.to_list(), labels=ftable.index.to_list(),
              autopct='%1.2f%%', colors=colors)
plt.show()


**CW 1**: Search for and briefly describe what each of the following libraries is used for: Pandas, Matplotlib, and Seaborn.

**CW 2**: Test the code above for other variables of the mammographic dataset

In [None]:
# Determining the frequency distribution, frequency table and histogram
# of continuous variables in the Forest Fire dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Loading dataset2
# https://archive.ics.uci.edu/ml/datasets/forest+fires
dforest = pd.read_csv('forestfires.csv')

# Print the first and last 5 rows. `head` has to be *called*: printing the
# bare attribute only shows the repr of the bound method.
print(pd.concat([dforest.head(), dforest.tail()]))

var = 'temp'  # Choose the target variable
SShape = dforest[var]
nbins = 10
inflimit = 0; suplimit = max(SShape)
ampl = (suplimit - inflimit)/nbins  # Bin width

# Define the bin edges: exactly nbins equal-width bins between the two
# limits. linspace avoids the extra (or missing) edge that a float step in
# arange can produce, and it honours inflimit.
fbins = np.linspace(inflimit, suplimit, nbins + 1)

# The pandas.cut function groups the data into bins; include_lowest keeps
# observations equal to the lower limit inside the first bin.
# sort_index() puts the counts back in bin order (value_counts orders by
# count), which is required for the cumulative column to be meaningful.
ftable = pd.cut(SShape, fbins, include_lowest=True).value_counts().sort_index()  # Absolute frequency
rftable = ftable/len(SShape)*100  # Relative frequency (%)
cftable = ftable.cumsum()/len(SShape)*100  # Cumulative frequency (%)
df = pd.DataFrame({'Bins':ftable.index.to_list(),
                   'Frequency':ftable.to_list(),
                   'Relative Frequency':rftable.to_list(),
                   'Cumulative Frequency':cftable.to_list()})
print(df)

# Histogram over the same bin edges, with a kernel density estimate on top
ax = sns.histplot(dforest, x=var, bins=fbins, kde=True)
ax.set_xticks(fbins)
plt.show()

**CW 3**: Test the code above for other variables of the forest fires dataset

In [None]:
# Plot distributions with different shapes
# Load the forest fires dataset from UCI

# Imported here as well so that the cell does not depend on names left
# behind by previously executed cells.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
dforest = pd.read_csv(url)

# One histogram per variable; kde=True overlays a kernel density estimate
# (the parameter is a boolean switch).
sns.histplot(dforest, x='month', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='day', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='FFMC', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='DMC', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='DC', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='ISI', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='RH', bins='auto', kde=True); plt.show()
sns.histplot(dforest, x='wind', bins='auto', kde=True); plt.show()

In [None]:
# Plot distributions with different shapes
# Load the forest fires dataset from UCI
# Alternative implementation: loop over the variable names

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
dforest = pd.read_csv(url)
var = ['month','day','FFMC','DMC','DC','ISI','RH','wind']
for column in var:
    # kde is a boolean switch: True overlays a kernel density estimate
    sns.histplot(dforest, x=column, bins='auto', kde=True)
    plt.show()

In [None]:
# Generate Contingency Tables for the Mammographic Dataset:
# each selected attribute is cross-tabulated against 'Severity'

import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"
cols = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']

# The raw file has no header row; '?' entries are parsed as missing values
# and every row containing at least one of them is discarded.
dmammo = pd.read_csv(url, names=cols, na_values='?').dropna()

# Print the contingency tables (rows: attribute value, columns: Severity)
print('**Contingency Tables**')
var = ['Shape','Margin','Density']
for attribute in var:
    CT = pd.crosstab(dmammo[attribute], dmammo['Severity'])
    print('Variables',attribute, 'and Severity:\n',CT)

# 2. Summary Measures  

In [None]:
# Calculating the mean, median, midpoint and mode one by one using the
# Statistics library.
# Uses the `dforest` DataFrame loaded in a previous cell.

import statistics as st

print('**Forest Fires Dataset**')

# Numeric variables: mean, median and midpoint (average of the extremes)
for name in ['FFMC', 'temp']:
    values = dforest[name]
    midpoint = (max(values)+min(values))/2
    print('\n*Numeric Variable {}*'.format(name))
    print('Mean of variable {}: {:.2f}'.format(name, st.mean(values)))
    print('Median of variable {}: {:.2f}'.format(name, st.median(values)))
    print('Midpoint of variable {}: {:.2f}'.format(name, midpoint))

# Nominal variables: the mode is the only central tendency measure used
print('\n*Categorical Variables*')
for name in ['month', 'day']:
    print('Mode of nominal variable {}: {v1}'
          .format(name, v1=st.mode(dforest[name])))

In [None]:
# Plot the central tendency measures over the histogram.
# Uses the `dforest` DataFrame loaded in a previous cell.

import statistics as st
import seaborn as sns
import matplotlib.pyplot as plt

var = 'temp'  # Choose the target variable
mean = st.mean(dforest[var])
median = st.median(dforest[var])
midpoint = (max(dforest[var])+min(dforest[var]))/2  # Average of the extremes

# The label follows `var`, so it stays correct when another variable is chosen
print('Mean, median and midpoint for {}:'.format(var),mean,median,midpoint)

# Histogram with a kernel density estimate, plus one vertical line per measure
sns.histplot(dforest, x=var, bins='auto', kde=True)
plt.axvline(x=mean, color='r', linestyle='--', label='Mean')
plt.axvline(x=median, color='g', linestyle='-', label='Median')
plt.axvline(x=midpoint, color='b', linestyle=':', label='Midpoint')
plt.legend() # Add a legend
plt.show()

In [None]:
# Z-scores and boxplot for variable 'temp'.
# Uses the `dforest` DataFrame loaded in a previous cell and adds a
# 'temp_zscore' column to it.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Mean and standard deviation of 'temp' (pandas' std() is the sample
# standard deviation, ddof=1), computed once and reused below
temp_mean = dforest['temp'].mean()
temp_std = dforest['temp'].std()

# Calculate z-scores for 'temp'
dforest['temp_zscore'] = (dforest['temp'] - temp_mean) / temp_std

# Print z-scores for specific values of 'temp' with two decimal places
temp_values = [5, 10, 15, 20, 25, 30]
for value in temp_values:
    z_score = round((value - temp_mean) / temp_std, 2)
    print(f"Z-score for temp value {value}: {z_score}")

# Plot vertically oriented boxplot for 'temp'
plt.figure(figsize=(8, 6))
sns.boxplot(data=dforest, y='temp', orient='v')
plt.title('Boxplot for Variable temp')
plt.show()

In [None]:
# Calculate the weighted average, geometric, harmonic, and trimmed means.
# Uses the `dforest` DataFrame loaded in a previous cell.

import numpy as np
import scipy.stats as spy

# Local, seeded generator: the illustrative weights are reproducible and the
# global NumPy random state used by other cells is left untouched.
rng = np.random.default_rng(42)

for position, var in enumerate(['FFMC', 'temp']):
    values = dforest[var]
    # Weights must be non-negative for a weighted average to make sense.
    # Normally distributed weights can be negative and can sum to ~0, so
    # uniform weights in [0, 1) are drawn instead.
    weights = rng.random(len(values))
    wavg = np.average(values, weights=weights)
    gavg = spy.gmean(values)  # From Scipy library
    havg = spy.hmean(values)  # From Scipy library
    tavg = spy.trim_mean(values, 0.05)  # Cuts 5% of the data from each tail

    # A blank line separates the report of each variable from the previous one
    separator = '' if position == 0 else '\n'
    print('{}Weighted average of variable {}: {:.2f}'.format(separator, var, wavg))
    print('Geometric mean of variable {}: {:.2f}'.format(var, gavg))
    print('Harmonic mean of variable {}: {:.2f}'.format(var, havg))
    print('Trimmed mean of variable {}: {:.2f}'.format(var, tavg))

In [None]:
# Central Tendency Measures for the Forest Fires Dataset
# Columns of interest: 'FFMC','DMC','DC', 'ISI', 'temp', 'RH', 'wind', 'rain'
# Uses the `dforest` DataFrame loaded in a previous cell.

import numpy as np
import scipy.stats as spy

columns_of_interest = ['FFMC','DMC','DC', 'ISI', 'temp', 'RH', 'wind', 'rain']

# CTM maps each column name to a dictionary {measure name: value}
CTM = {}
for col_name in columns_of_interest:
    col_data = dforest[col_name]
    CTM[col_name] = {
        'Mean': np.mean(col_data),
        'Median': np.median(col_data),
        # Average of the largest and the smallest observation
        'Midpoint': (np.max(col_data) + np.min(col_data)) / 2,
        # Each observation is weighted by the value of its 'area' column
        'Weighted Mean': np.average(col_data, weights=dforest['area']),
        'Geometric Mean': spy.gmean(col_data),
        'Harmonic Mean': spy.hmean(col_data),
        # Cuts 10% of the observations from each tail before averaging
        'Trimmed Mean': spy.trim_mean(col_data, proportiontocut=0.1),
    }

# Print the results: one header line per column, one indented line per measure
for col_name, col_results in CTM.items():
    print(col_name)
    for stat_name, stat_value in col_results.items():
        print(f"\t{stat_name}: {stat_value:.2f}")

In [None]:
# Calculate the variability measures range,
# IQR, sIQR, variance, std, and CV using Numpy.
# Uses the `dforest` DataFrame loaded in a previous cell.

import numpy as np

var = 'FFMC'  # Choose the target variable
values = dforest[var]

drange = np.max(values) - np.min(values)
Q1, Q3 = np.percentile(values, [25,75])  # First and third quartiles
IQR = Q3 - Q1
sIQR = IQR / 2  # Semi-interquartile range
# np.var / np.std default to ddof=0, i.e. the population formulas
dvar = np.var(values)
dstd = np.std(values)
CV = dstd / np.mean(values) * 100  # Coefficient of variation, in percent

# The labels follow `var`, so they stay correct when another variable is chosen
print('*Variability Measures*')
print('Range of variable {}: {:.2f}'.format(var, drange))
print('IQR of variable {}: {:.2f}'.format(var, IQR))
print('sIQR of variable {}: {:.2f}'.format(var, sIQR))
print('Variance of variable {}: {:.2f}'.format(var, dvar))
print('Standard deviation of variable {}: {:.2f}'.format(var, dstd))
print('Variation coefficient of variable {}: {:.2f}'.format(var, CV))

**CW 4**: For the Auto MPG dataset, do:
1. Calculate the central tendency measures for all variables, numerical and categorical.
2. Plot the frequency distribution (histogram) of all numerical variables and overlay the central tendency values on each histogram.
3. Plot the boxplot for all independent variables
4. Calculate the variability measures for all independent variables

# 3. Measures of Shape 

In [None]:
# Skewness and Skewed distributions
# Generate random data with a right-skewed distribution

import statistics as st
import numpy as np
from scipy.stats import skew
import seaborn as sns
import matplotlib.pyplot as plt

data = np.random.beta(a=1, b=5, size=1000) # Beta distribution
mean = st.mean(data)
median = st.median(data)

# In a continuous sample practically every value is unique, so
# statistics.mode would simply return the first observation. The mode is
# estimated instead as the centre of the most populated histogram bin.
counts, edges = np.histogram(data, bins='auto')
peak = np.argmax(counts)
mode = (edges[peak] + edges[peak + 1]) / 2

print('Mean, median and mode: {:.2f} {:.2f} {:.2f}'
      .format(mean,median,mode))
print('Skewness (Fischer-Pearson Coefficient): {:.2f}'
      .format(skew(data)))
# Pearson's first skewness coefficient: (mean - mode) / standard deviation
print('Skewness (First Skewness Coefficient): {:.2f}'
      .format((mean-mode)/np.std(data)))

# Histogram with a kernel density estimate and the three measures on top
sns.histplot(data, bins='auto', kde=True)
plt.axvline(x=mean, color='r', linestyle='--', label='Mean')
plt.axvline(x=median, color='g', linestyle='-', label='Median')
plt.axvline(x=mode, color='b', linestyle=':', label='Mode')
plt.legend(); plt.show()

In [None]:
# Skewness and Skewed distributions
# Generate random data with a left-skewed distribution

import statistics as st
import numpy as np
from scipy.stats import skew
import seaborn as sns
import matplotlib.pyplot as plt

data_neg = np.random.beta(a=5, b=1, size=1000)  # Beta distribution
mean = st.mean(data_neg)
median = st.median(data_neg)
midpoint = (max(data_neg) + min(data_neg)) / 2  # Calculate the midpoint

# Pearson's first skewness coefficient is defined with the mode, not the
# midpoint (with the midpoint it comes out positive for this left-skewed
# sample). The mode of a continuous sample is estimated as the centre of
# the most populated histogram bin.
counts, edges = np.histogram(data_neg, bins='auto')
peak = np.argmax(counts)
mode = (edges[peak] + edges[peak + 1]) / 2

print('Mean, median, and midpoint: {:.2f} {:.2f} {:.2f}'
      .format(mean, median, midpoint))
print('Estimated mode: {:.2f}'.format(mode))
print('Skewness (Fischer-Pearson Coefficient): {:.2f}'
      .format(skew(data_neg)))
print('Skewness (First Skewness Coefficient): {:.2f}'
      .format((mean - mode) / np.std(data_neg)))

# Histogram with a kernel density estimate and the measures on top
sns.histplot(data_neg, bins='auto', kde=True)
plt.axvline(x=mean, color='r', linestyle='--', label='Mean')
plt.axvline(x=median, color='g', linestyle='-', label='Median')
plt.axvline(x=midpoint, color='b', linestyle=':', label='Midpoint')
plt.axvline(x=mode, color='m', linestyle='-.', label='Mode')
plt.legend()
plt.show()

In [None]:
# Kurtosis: mesokurtic, platykurtic and leptokurtic distributions
# scipy.stats.kurtosis returns the excess kurtosis by default
# (Fisher's definition: a normal distribution scores 0).

import numpy as np
from scipy.stats import norm, laplace, semicircular, kurtosis
import seaborn as sns
import matplotlib.pyplot as plt

# One sample of 10000 points per distribution
dnorm = norm.rvs(size=10000)                    # Normal (Mesokurtic)
dunif = np.random.uniform(0.01, 0.10, 10000)    # Uniform (Platykurtic)
dlap = laplace.rvs(loc=0, scale=1, size=10000)  # Laplace (Leptokurtic)
dwigner = semicircular.rvs(size=10000)          # Wigner semicircle

samples = [('Normal', dnorm),
           ('Uniform', dunif),
           ('Laplace', dlap),
           ('Wigner Semicircle', dwigner)]

# Same report for every distribution: print the kurtosis, then plot the
# histogram (with a kernel density estimate) titled with that value
for label, sample in samples:
    k = kurtosis(sample)
    print('Kurtosis: {:.2f}'.format(k))
    sns.histplot(sample, bins='auto', kde=True)
    plt.title(f"{label} Distribution - Kurtosis: {k:.2f}")
    plt.show()

In [None]:
# Calculate the Skewness and Kurtosis for the Forest Fires variables

import pandas as pd
from scipy.stats import skew, kurtosis

# load dataset
dforest = pd.read_csv("forestfires.csv")

# Skewness and kurtosis for each variable, computed with the pandas methods
numeric_columns = ['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']
selected = dforest[numeric_columns]
skewness = selected.skew()
kurt = selected.kurtosis()

# Print the results and classify the kurtosis by its sign.
# NOTE: the mesokurtic label is only reached when the value is neither
# negative nor positive, i.e. exactly zero.
for var in skewness.index:
    print(f"{var} Skewness: {skewness[var]:.2f}")
    if kurt[var] < 0:
        print(f"{var} Kurtosis: {kurt[var]:.2f} (Platykurtic)")
    elif kurt[var] > 0:
        print(f"{var} Kurtosis: {kurt[var]:.2f} (Leptokurtic)")
    else:
        print(f"{var} Kurtosis: {kurt[var]:.2f} (Mesokurtic)")

**CW 5**: For the Iris dataset, calculate the Skewness and Kurtosis for the independent variables 

# 4. Measures of Association 

In [None]:
# Calculate and print the Covariance Matrix of the Forest Fires dataset

import pandas as pd

# Show every float with two decimal places (notebook-wide display option)
pd.set_option('display.float_format', '{:.2f}'.format)

# Load the dataset
dforest = pd.read_csv('forestfires.csv')

# Numeric features that take part in the covariance matrix
features = ['X','Y','FFMC','DMC','DC','ISI','temp','RH','wind','rain']

# Pairwise covariance between the selected features
Mcov = dforest.loc[:, features].cov()

# Print the covariance matrix
print(Mcov)

In [None]:
# Calculate Correlation Coefficients: PCC, SRCC and KRCC
# for the numerical variables of the Forest Fires dataset

import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau

# Read the data into a pandas dataframe
dforest = pd.read_csv("forestfires.csv")
pd.options.display.float_format = '{:.2f}'.format

# Correlations are only defined for numeric columns. The dataset also has
# the text columns 'month' and 'day', which make DataFrame.corr raise in
# pandas >= 2.0, so the numeric columns are selected explicitly.
numeric = dforest.select_dtypes(include='number')

# Calculate PCC, SRCC, KRCC
print('**Forest Fires Dataset: PCC, SRCC, KRCC**')
PCC = numeric.corr(method='pearson')
print('Pearson Correlation Coefficient (PCC)\n',PCC)
SRCC = numeric.corr(method='spearman')
print('\nSpearman Rank Correlation Coefficient (SRCC)\n',SRCC)
KRCC = numeric.corr(method='kendall')
print('\nKendall Rank Correlation Coefficient (KRCC)\n',KRCC)

In [None]:
# Scatter plots between pairs of variables from the Forest Fires dataset

import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
dforest = pd.read_csv(url)

# Extract the relevant columns
cols = ['DMC', 'DC', 'FFMC', 'ISI', 'temp', 'RH', 'wind', 'area', 'rain']
dforest = dforest[cols]
df = dforest

# (x, y) variable pairs, listed in the order the 3x2 grid is filled
# (left to right, top to bottom)
pairs = [('DMC', 'DC'), ('FFMC', 'ISI'),
         ('temp', 'RH'), ('temp', 'wind'),
         ('wind', 'area'), ('rain', 'DC')]

# Generate the scatter plots; each panel is titled with the Pearson
# correlation coefficient of its pair
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))
plt.subplots_adjust(wspace=0.4, hspace=0.4)
for ax, (xname, yname) in zip(axs.flat, pairs):
    ax.scatter(df[xname], df[yname], alpha=0.5)
    ax.set_xlabel(xname)
    ax.set_ylabel(yname)
    ax.set_title('PCC: {:.2f}'.format(df[xname].corr(df[yname], method='pearson')))

plt.show()

In [None]:
# Plot the heatmap of the correlation matrix.
# Uses the `dmammo` DataFrame loaded in a previous cell.

import pandas as pd
import seaborn as sns

# Correlation matrix based on Kendall's rank correlation
# (method='kendall'); this is not Cramer's V.
corr_matrix = dmammo.corr(method='kendall')

# Create the heatmap using seaborn, writing each coefficient in its cell
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,
)

In [None]:
# Calculate and plot the scatterplot matrix

import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset and keep only the relevant columns
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
cols = ['DMC', 'DC', 'FFMC', 'ISI', 'temp', 'RH', 'wind', 'area', 'rain']
df = pd.read_csv(url)[cols]

# Create a scatterplot matrix: one scatter plot per pair of columns and a
# histogram of each column on the diagonal
pd.plotting.scatter_matrix(df, alpha=0.2, figsize=(12, 12), diagonal='hist')
plt.show()

**CW 6**: For the independent variables of the Iris dataset, do:
1. Calculate and print the covariance matrix.
2. Calculate and print the correlation coefficient matrix.
3. Plot the scatterplot for all pairs of independent variables.
4. Plot the heatmap of the correlation matrix.
5. Plot the scatterplot matrix.

## Extra Code: All Variables at Once 

In [None]:
# Summarizing the data using the describe() method from the Pandas library
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
dforest = pd.read_csv(url)

# describe() on the whole frame summarizes the numerical columns;
# on a single text column it summarizes that nominal variable
numeric_summary = dforest.describe().round(2)
month_summary = dforest['month'].describe()
day_summary = dforest['day'].describe()

print('Forest Fires Dataset\n')
print('Numerical variables \n', numeric_summary)
print('\nNominal variable: month \n', month_summary)
print('\nNominal variable: day \n', day_summary)