## Final Project: Phase 2
Spring 2024  
Group: Michael Massone and Joseph Nelson Farrell   
DS 5230 Unsupervised Machine Learning  
Professor Steven Morin, PhD  
Due: 03/11/2024  
___

### Libraries

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os
import sys
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

import warnings

___

### Define File Paths

In [23]:
# resolve the notebook's working directory and the project root one level above it
nb_path = Path.cwd()
path = str(nb_path.parent)
print(nb_path)
print(path)

# project sub-folders (figs, data, src), kept as plain strings because
# later cells build file names by string concatenation
figs_path, data_path, src_path = (f'{path}/{folder}' for folder in ('figs', 'data', 'src'))
print(src_path)

# make the modules in the src folder importable
sys.path.append(src_path)

/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/notebooks
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/src


___

### Functions

In [24]:
from preprocessing_eda_utils import missingness_cols
from preprocessing_eda_utils import column_dtypes
from preprocessing_eda_utils import compute_vif
from preprocessing_eda_utils import generate_column_hist

___

### Parameters

In [25]:
# design matrix csv file name, relative to the data folder (leading slash included
# because it is appended to the data path by plain string concatenation)
design_matrix_file_name = "/curated/beans_design.csv"

# missingness threshold (proportion of missing values) for determining columns to drop
missingness_threshold = 0.20

# min frequency for nominal transformer; passed to OneHotEncoder(min_frequency=...)
min_frequency = 20

___

### Load Data

In [26]:
# build the full csv location, then read the design matrix from it
design_matrix_path = data_path + design_matrix_file_name
design_df = pd.read_csv(design_matrix_path)

___

## Attribute Exploration

___

### Identify Columns with Missing Value Count Above Threshold

In [27]:
# ask the helper which columns to flag for the configured missingness threshold
missingness_drop_list = missingness_cols(design_df, missingness_threshold)

# report the flagged columns as a numbered list, or state that there are none
if missingness_drop_list:
    print(f'The following columns have a missingness proportion greater than {missingness_threshold}.')
    print('These columns should be dropped:')
    for position, column_name in enumerate(missingness_drop_list, start=1):
        print(f'\t{position}. {column_name}')
else:
    print('There are no columns that require dropping based on missingness')

There are no columns that require dropping based on missingness


___

### Visually Inspect Design Matrix:

In [28]:
# preview the first ten rows of the design matrix
design_df.head(10)

Unnamed: 0.1,Unnamed: 0,id,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
0,0,0,28395.0,610.291,208.178117,173.888747,1.197191,0.549812,28715.0,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724
1,1,1,28734.0,638.018,200.524796,182.734419,1.097356,0.411785,29172.0,191.27275,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843
2,2,2,29380.0,624.11,212.82613,175.931143,1.209713,0.562727,29690.0,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066
3,3,3,30008.0,645.884,210.557999,182.516516,1.153638,0.498616,30724.0,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199
4,4,4,30140.0,620.134,201.847882,190.279279,1.060798,0.33368,30417.0,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166
5,5,5,30279.0,634.927,212.560556,181.510182,1.171067,0.520401,30600.0,196.347702,0.775688,0.98951,0.943852,0.923726,0.00702,0.003153,0.85327,0.999236
6,6,6,30477.0,670.033,211.050155,184.03905,1.146768,0.489478,30970.0,196.988633,0.762402,0.984081,0.85308,0.933374,0.006925,0.003242,0.871186,0.999049
7,7,7,30519.0,629.727,212.996755,182.737204,1.165591,0.51376,30847.0,197.12432,0.770682,0.989367,0.967109,0.92548,0.006979,0.003158,0.856514,0.998345
8,8,8,30685.0,635.681,213.534145,183.157146,1.165852,0.514081,31044.0,197.659696,0.771561,0.988436,0.95424,0.925658,0.006959,0.003152,0.856844,0.998953
9,9,9,30834.0,631.934,217.227813,180.897469,1.200834,0.553642,31120.0,198.139012,0.783683,0.99081,0.970278,0.912125,0.007045,0.003008,0.831973,0.999061


___

### Identify Feature Types: Nominal & Numerical

In [29]:
# split the columns into nominal / numerical / all-unique groups;
# the helper also prints a per-column dtype report
nominal_cols, numerical_cols, unique_value_cols = column_dtypes(design_df)

# nominal columns: numbered list, or a note that there are none
if nominal_cols:
    print('The nominal columns are:')
    for position, column_name in enumerate(nominal_cols, start=1):
        print(f'{position}: {column_name}')
else:
    print('There no nominal columns in the dataset.\n')

# numerical columns: count plus numbered list, or a note that there are none
if numerical_cols:
    print(f'The {len(numerical_cols)} numerical columns are:')
    for position, column_name in enumerate(numerical_cols, start=1):
        print(f'\t{position}: {column_name}')
else:
    print('There no numerical columns in the dataset.')

# columns in which every value is unique (candidate id columns)
if unique_value_cols:
    print(f'\nThe {len(unique_value_cols)} columns with all unique values are:')
    for position, column_name in enumerate(unique_value_cols, start=1):
        print(f'\t{position}: {column_name}')
else:
    print('\nThere no columns with all unique values in the dataset.')

Column: Unnamed: 0
Data Type: int64
Unique value count: 13611, DF length: 13611, Ratio: 1.0
***
FLAG column Unnamed: 0 for review 
***
__________________________________________________________

Column: id
Data Type: int64
Unique value count: 13611, DF length: 13611, Ratio: 1.0
***
FLAG column id for review 
***
__________________________________________________________

Column: Area
Data Type: float64
Unique value count: 12011, DF length: 13611, Ratio: 0.88
__________________________________________________________

Column: Perimeter
Data Type: float64
Unique value count: 13413, DF length: 13611, Ratio: 0.99
__________________________________________________________

Column: MajorAxisLength
Data Type: float64
Unique value count: 13543, DF length: 13611, Ratio: 1.0
__________________________________________________________

Column: MinorAxisLength
Data Type: float64
Unique value count: 13543, DF length: 13611, Ratio: 1.0
__________________________________________________________

Colum

___

### Create Non Machine Learning Attributes List

In [30]:
# display the column labels of the design matrix
design_df.columns

Index(['Unnamed: 0', 'id', 'Area', 'Perimeter', 'MajorAxisLength',
       'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea',
       'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness',
       'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4'],
      dtype='object')

In [31]:
# attributes not to be used for machine learning (kept in the dataframe, excluded from the pipeline)
non_ml_attr_list = ['id']

# add any attributes that should be dropped from dataframe
# 'Unnamed: 0' is a secondary id column and can be dropped.
drop_list = ['Unnamed: 0'] + missingness_drop_list

# drop cols in drop_list from design matrix; reassigning with errors='ignore'
# (instead of inplace=True) keeps this cell re-runnable without a KeyError
design_df = design_df.drop(columns=drop_list, errors='ignore')
remaining_drop_cols = set(drop_list) & set(design_df.columns)
assert not remaining_drop_cols, f'columns were not dropped: {remaining_drop_cols}'

# remove the non-ML and dropped column names from the nominal and numerical lists
excluded_cols = set(non_ml_attr_list) | set(drop_list)
nominal_cols = [col for col in nominal_cols if col not in excluded_cols]
numerical_cols = [col for col in numerical_cols if col not in excluded_cols]

# every remaining column must be accounted for in exactly one of the three lists
assert design_df.shape[1] == len(non_ml_attr_list) + len(numerical_cols) + len(nominal_cols)

print(f'Training X Dimensions: \n \t{design_df.shape}')

print('Non Machine Learning Attributes:')
for i, element in enumerate(non_ml_attr_list):
    print(f'\t {i+1}: {element}')

# numbered listing of each attribute group, or "None" when the group is empty
for heading, col_group in (('Numerical Columns:', numerical_cols),
                           ('Nominal Columns:', nominal_cols)):
    print(heading)
    if not col_group:
        print('\t None')
    else:
        for i, element in enumerate(col_group):
            print(f'\t {i+1}: {element}')

Training X Dimensions: 
 	(13611, 17)
Non Machine Learning Attributes:
	 1: id
Numerical Columns:
	 1: Area
	 2: Perimeter
	 3: MajorAxisLength
	 4: MinorAxisLength
	 5: AspectRation
	 6: Eccentricity
	 7: ConvexArea
	 8: EquivDiameter
	 9: Extent
	 10: Solidity
	 11: roundness
	 12: Compactness
	 13: ShapeFactor1
	 14: ShapeFactor2
	 15: ShapeFactor3
	 16: ShapeFactor4
Nominal Columns:


___

### Display Missingness of Machine Learning Attributes:

In [None]:
# machine learning attributes = numerical columns followed by nominal columns
ml_attribute_cols = numerical_cols + nominal_cols

# draw the missingness matrix for those attributes and title it
msno.matrix(design_df[ml_attribute_cols])
plt.title("Missingness: Machine Learning Attributes", weight='bold', fontsize=24)

# write the figure to the figs folder
missingness_fig_file = figs_path + "/missing_attributes_ml_attributes.png"
plt.savefig(missingness_fig_file, bbox_inches='tight')

In [None]:
# dtypes of the machine learning attributes (numerical first, then nominal)
ml_attribute_cols = numerical_cols + nominal_cols
print('Machine Learning Attributes Datatypes:')
design_df[ml_attribute_cols].dtypes

___

### Explore Correlation and VIF of Numerical Attributes:

In [None]:
# pairwise correlations of the numerical attributes, computed once and reused
corr_matrix = design_df[numerical_cols].corr()

# boolean mask covering the upper triangle (diagonal included),
# so the heatmap only shows the bottom triangle
upper_triangle_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# draw the annotated heatmap on a 10x7 figure with a fixed [-1, 1] colour scale
plt.figure(figsize=(10, 7))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    mask=upper_triangle_mask,
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Matrix of the Design Matrix', weight='bold', fontsize=16)

# write the figure to the figs folder
plt.savefig(figs_path + "/correlation_matrix.png", bbox_inches='tight')

#### VIF: All Numerical Attributes

In [None]:
# variance inflation factors computed by the project helper over all numerical attributes
vif = compute_vif(design_df, numerical_cols)

#### VIF without ShapeFactor Attributes

In [None]:
# exclude the ShapeFactor attributes by name rather than by position, so the
# selection does not silently change if the column order of the design matrix changes
non_shape_factor_cols = [col for col in numerical_cols if not col.startswith('ShapeFactor')]
vif = compute_vif(design_df, non_shape_factor_cols)


___

## Build Pipeline

___

### Numerical Transformer: Impute & Scale

In [None]:
# numerical branch: impute missing values (SimpleImputer defaults), then standardize
numerical_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

### Nominal Transformer: Impute & Encode

#### Determine min frequency for nominal transformer:

In [None]:
if len(nominal_cols) > 0:
    # show the category frequencies of each nominal column separately; the
    # encoder's min_frequency is applied per feature, so per-column counts are
    # what is needed to choose it (the former `.value_count()` call does not exist)
    for col in nominal_cols:
        print(design_df[col].value_counts())
    print('Set min_frequency for nominal transformer in perameters cell.')
else:
    # no nominal columns: disable infrequent-category grouping in the encoder
    min_frequency = None


### Nominal Transformer

In [None]:
# nominal branch: fill gaps with the most frequent category, then one-hot encode
# into a dense array, using the min_frequency chosen above
nominal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot_encoder', OneHotEncoder(sparse_output=False, min_frequency=min_frequency)),
]
nominal_transformer = Pipeline(steps=nominal_steps)


### Column Transformer:

In [None]:
# route each column group to its own transformer; columns in neither list
# (e.g. the non machine learning attributes) are not passed to any transformer
column_routes = [
    ('numerical', numerical_transformer, numerical_cols),
    ('nominal', nominal_transformer, nominal_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

### Execute Pipeline with Design Matrix

In [None]:
# fit the preprocessor on the design matrix and transform it in one step
transformed_values = preprocessor.fit_transform(design_df)

# wrap the result in a dataframe that keeps the original row index and
# takes its column names from the fitted preprocessor
trans_df = pd.DataFrame(
    transformed_values,
    index=design_df.index,
    columns=preprocessor.get_feature_names_out(),
)

In [None]:
# display transformed dataframe
trans_df

___
## Exploratory Data Analysis

In [None]:
# the transformed dataframe's columns are the machine learning attributes
attr_list = trans_df.columns

# print them as a numbered list
print('Attributes List:')
for position, attr_name in enumerate(attr_list, start=1):
    print(f'{position}: {attr_name}')

In [None]:
# (rows, columns) of the transformed dataframe restricted to the attribute list
transformed_shape = trans_df[attr_list].shape
print('Transformed Dataframe Dimensions:', transformed_shape)

In [None]:
# visually inspect the first rows of the transformed df
display(trans_df.head())

In [None]:
# display the general information summary of the transformed df
trans_df.info()

In [None]:
# number of missing entries (NA, None, np.nan) in each transformed column
na_counts = trans_df.isna().sum()
print('\nNA (np.nan or None) Count:\n', na_counts, sep='')

In [None]:
# share of missing entries (NA, None, np.nan) in each transformed column
row_count = trans_df.shape[0]
na_ratio = trans_df.isna().sum() / row_count
print('\nNA (np.nan or None) Ratio:\n', na_ratio, sep='')

### Pairplots

In [None]:
# silence FutureWarning messages for the rest of the session
warnings.filterwarnings('ignore', category=FutureWarning)

# full pairplot of every transformed attribute, with a title placed above the grid
pair_grid = sns.pairplot(data=trans_df)
pair_grid.fig.suptitle('Attribute Pairplots', fontsize=20, y=1.03)
plt.tight_layout()
plt.show();

In [None]:
def sub_divide_pairplot(trans_df, alpha, group_size=4):
    """
    Draw pairwise plots of a dataframe, one figure per consecutive group of columns.

    The columns are taken in order and split into groups of `group_size`
    (the last group may be smaller). Each group gets its own square grid of
    axes: histograms on the diagonal, scatter plots everywhere else.

    Parameters:
        trans_df: dataframe whose columns are plotted.
        alpha: transparency passed to the scatter plots.
        group_size: number of columns per figure; defaults to 4, which
            reproduces the previous fixed 4x4 grids.

    Returns:
        None. Each figure is shown with plt.show() and then closed.
    """
    columns = trans_df.columns

    # derive the group starts from the actual column count instead of a fixed
    # [0, 4, 8, 12], so narrower or wider dataframes are handled as well
    for start in range(0, len(columns), group_size):

        # subset of variables for this figure
        subset_cols = columns[start: start + group_size]
        n_subset = len(subset_cols)

        # squeeze=False keeps axs two-dimensional even for a single-column group
        fig, axs = plt.subplots(n_subset, n_subset, figsize=(15, 15), squeeze=False)

        # distinct index names so the outer loop variable is not shadowed
        for row, col1 in enumerate(subset_cols):
            for col, col2 in enumerate(subset_cols):
                ax = axs[row, col]
                if row == col:
                    # histograms on diagonals
                    ax.hist(trans_df[col1], bins='auto')
                    ax.set_xlabel(col1)
                    ax.set_ylabel("Frequency")
                else:
                    # scatter plots: column attribute on x, row attribute on y
                    ax.scatter(trans_df[col2], trans_df[col1], alpha=alpha, marker='o')
                    ax.set_xlabel(col2)
                    ax.set_ylabel(col1)

        plt.tight_layout()
        plt.show()

        # release the figure so many groups do not accumulate in memory
        plt.close(fig)

sub_divide_pairplot(trans_df, 0.02)

### Histograms of Numerical Columns

In [None]:
# split the attribute NAMES into groups of four; slicing the dataframe itself
# (as before) chunked its rows, producing one chunk per four observations
attr_names = list(attr_list)
divided_columns = [attr_names[i:i + 4] for i in range(0, len(attr_names), 4)]

for i, col_group in enumerate(divided_columns):
    # NOTE(review): assumes generate_column_hist accepts a list of column names — confirm
    generate_column_hist(trans_df, col_group)

    # save fig
    plt.savefig(figs_path + f"/attribute_hist_{i}.png", bbox_inches = 'tight')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Demonstration data: two synthetic clusters from a seeded generator.
# The four draws are made in the same order as before so the points are unchanged.
np.random.seed(0)
x_near_five = np.random.randn(100) + 5
x_near_zero = np.random.randn(100)
y_near_zero = np.random.randn(100)
y_near_five = np.random.randn(100) + 5
x = np.concatenate([x_near_five, x_near_zero])
y = np.concatenate([y_near_zero, y_near_five])

# Scatter plot of both clusters
plt.scatter(x, y)

# Label each cluster with an arrow: (text, arrow tip, text position)
cluster_labels = [
    ('Cluster 1', (5, 0), (7, -2)),
    ('Cluster 2', (0, 5), (-3, 7)),
]
for label, arrow_tip, text_position in cluster_labels:
    plt.annotate(label, xy=arrow_tip, xytext=text_position,
                 arrowprops=dict(facecolor='black', shrink=0.05),
                 fontsize=12)

plt.title('Scatter Plot with Annotated Clusters')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')

plt.show()
