# Wordle Solver

In [1]:
# import functions
from wordle_functions import *

In [2]:
# help(main)

In [10]:
# Clues for the current puzzle, passed straight to the solver.
# NOTE(review): positions look 1-based -- A at 3 and N at 4 produced 'guano' --
# and green/yellow presumably mean "fixed at" / "present but not at" the given
# positions; confirm against wordle_functions.main.
green_letters, green_positions = 'AN', [3, 4]
yellow_letters, yellow_positions = 'OUG', [2, 3, 4]
bad_letters = 'CRESPILTBH'

# Ask the solver for every word still compatible with the clues above.
words = main(
    green_letters,
    green_positions,
    yellow_letters,
    yellow_positions,
    bad_letters,
)
print(words)

['guano']


In [11]:
# Tally the letters that occur in the candidate words returned by the solver
# and print the resulting Letters/Count table without its index column.
# NOTE(review): letters_df is presumably a pandas DataFrame (it exposes
# .to_string(index=False)); confirm in wordle_functions.repetitive_letters.
letters_df = repetitive_letters(wordle_list=words)
print(letters_df.to_string(index=False))

Letters  Count
      G      1
      U      1
      A      1
      N      1
      O      1


#### Good words:
SALET, CRANE, SHOUT, AUDIO, SLICE, SLICK, SPLIT, STUDY, INCUR, CHOIR, CROWN, CROWD, CRONE, POISE, OLIVE, DUNCE, BINGO, BISON, MIDST, SPOUT

In [9]:
# Second query: no green clues, no excluded letters, and the yellow letters are
# given without any positions (empty list).
# NOTE(review): with an empty yellow_positions the solver presumably only
# requires the letters to be present somewhere; verify in wordle_functions.main.
green_letters, green_positions = '', []
yellow_letters, yellow_positions = 'GOBH', []
bad_letters = ''

# Ask the solver for every word still compatible with the clues above.
words = main(
    green_letters,
    green_positions,
    yellow_letters,
    yellow_positions,
    bad_letters,
)
print(words)

['bough', 'brogh']


In [None]:
# help(score)

In [None]:
# Load the ranking table stored in 'wordle_ranking.csv' and display it.
# NOTE(review): the file name is resolved inside load_data; confirm which
# directory it is read from.
df = load_data(file_name='wordle_ranking.csv')
df

In [None]:
# Update the score by calling score() and display whatever it returns.
# NOTE(review): score() presumably has side effects (e.g. it may prompt for
# input or rewrite the ranking file); check help(score) before re-running.
score_df = score()
score_df

In [None]:
# Print the score table as plain text, omitting the index column.
# Depends on score_df from the preceding cell having been run first.
print(score_df.to_string(index=False))

In [None]:
# help(reset_score)

#### Set custom score

In [None]:
# help(set_score)

In [None]:
# # Set custom score
# df = set_score(m_score=107, b_score=41, draw_score=105)
# df

#### Reset score

In [None]:
## Reset score
# reset_score()

In [None]:
% CVPR 2022 Paper Template
% based on the CVPR template provided by Ming-Ming Cheng (https://github.com/MCG-NKU/CVPR_Template)
% modified and extended by Stefan Roth (stefan.roth@NOSPAMtu-darmstadt.de)

\documentclass[10pt,twocolumn,letterpaper]{article}

%%%%%%%%% PAPER TYPE  - PLEASE UPDATE FOR FINAL VERSION
% \usepackage[review]{cvpr}      % To produce the REVIEW version
% \usepackage{cvpr}              % To produce the CAMERA-READY version
\usepackage[pagenumbers]{cvpr} % To force page numbers, e.g. for an arXiv version

% Include other packages here, before hyperref.
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}


% It is strongly recommended to use hyperref, especially for the review version.
% hyperref with option pagebackref eases the reviewers' job.
% Please disable hyperref *only* if you encounter grave issues, e.g. with the
% file validation for the camera-ready version.
%
% If you comment hyperref and then uncomment it, you should delete
% ReviewTemplate.aux before re-running LaTeX.
% (Or just hit 'q' on the first LaTeX run, let it finish, and you
%  should be clear).
\usepackage[pagebackref,breaklinks,colorlinks]{hyperref}


% Support for easy cross-referencing
\usepackage[capitalize]{cleveref}
\crefname{section}{Sec.}{Secs.}
\Crefname{section}{Section}{Sections}
\Crefname{table}{Table}{Tables}
\crefname{table}{Tab.}{Tabs.}


%%%%%%%%% PAPER ID  - PLEASE UPDATE
\def\cvprPaperID{*****} % *** Enter the CVPR Paper ID here
\def\confName{CVPR}
\def\confYear{2022}


%%%%%%%%% TITLE - PLEASE UPDATE
\title{Exploring Unsupervised Learning and Dimensionality Reduction}

\author{
    Murilo Gustineli\\
    Georgia Institute of Technology\\
    CS 7641: Machine Learning\\
    \texttt{murilogustineli@gatech.edu}
}

\begin{document}
\maketitle


%------------------------------------------------------------------------
%%%%%%%%% INTRODUCTION
\section{Introduction}
This study explores the performance of two unsupervised clustering learners and four dimensionality reduction algorithms on two datasets.
The datasets are the \href{https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(diagnostic)}{Breast Cancer Wisconsin (Diagnostic)} dataset and the \href{https://archive.ics.uci.edu/ml/datasets/wine+quality}{Wine Quality} dataset, both retrieved from the \href{https://archive.ics.uci.edu/ml/index.php}{UCI Machine Learning Repository}.
The datasets were chosen because of their interesting characteristics, such as heterogeneous columns, numerical features, no missing data, and similar class distribution while having different dimensionality sizes.
The Breast Cancer dataset contains 569 instances and 31 features, with each example labeled as either malignant or benign using a binary target variable. The distribution of the target classes is uneven, with 357 examples being benign and 212 being malignant.
Meanwhile, the Wine dataset was created by combining the red wine and white wine datasets from the Wine Quality UCI repository. A binary target variable was created based on the type of wine, with 0 indicating white wines and 1 indicating red wines. The final wine dataset has 6,497 instances and 13 features. The distribution of the target classes is also uneven, with 4,898 examples as white wine and 1,599 as red wine.


% \subsection{Performance Metrics}

\subsection{Methods and Objectives}
This study is divided into five parts:
\textbf{1.} Explore the behavior of two clustering algorithms, namely KMeans and Expectation Maximization (EM), on the datasets.
\textbf{2.} Investigate the performance of four dimensionality reduction algorithms in finding the optimal number of components for each dataset.
\textbf{3.} Examine how the best number of components for each dimensionality reduction algorithm affects clustering.
\textbf{4.} Analyze how the curse of dimensionality impacts the generalization and speed of learning. This was done by tuning a neural network to each of the reduced-dimensionality datasets and comparing their performance to a pre-established benchmark model.
\textbf{5.} Analyze how clustering influences learning generalization and speed. This was done by projecting clustering labels into a reduced dataset and examining the results.


% \begin{table}[ht]
% \centering
% \small
% \scalebox{1}{
%     \begin{tabular}{|p{1.3cm}||p{2.7cm}|p{2.4cm}|}
%      \hline
%      \multicolumn{3}{|c|}{Optimization problems hyperparameters} \\
%      \hline\hline
%       Algorithms & Parameter 1 & Parameter 2 \\
%      \hline
%      SA     & temperature (1.0) & Geometric Decay \\
%      RHC    & restart (0) &  \\
%      GA     & population size (200) & mutation rate (0.4) \\
%      MIMIC  & population size (300) & keep percent (0.5) \\
%      \hline
%     \end{tabular}
% }
% \caption{Randomized optimization hyperparameter values for all search problems. All RO algorithms used 1024 iterations and 100 max attempts.}
% % \label{table:1}
% \end{table}



%------------------------------------------------------------------------
%%%%%%%%% CLUSTERING
\section{Clustering}
In unsupervised learning tasks, choosing the best number of clusters is a crucial step in improving generalization as there
are no labels to predict.
KMeans and EM are two popular clustering algorithms that share the same objective: finding groups of data based on their similarity.
KMeans is centroid-based and partitions data into a pre-determined K number of clusters.
It works by randomly selecting K centroids, assigning each data point to its nearest centroid, and adjusting the centroids towards the mean of the data points.
This process is repeated until the centroids no longer move, indicating convergence.
On the other hand, EM is a probabilistic clustering algorithm that finds the maximum likelihood estimates of the parameters of a mixture model that best fits the data.
It works by randomly initializing the parameters of the mixture model and then iteratively performing two steps until convergence.
First, instances are assigned to clusters (referred to as the ``expectation step'').
Second, the clusters are updated (called the ``maximization step'').
In the expectation step, each instance in the dataset is assigned a probability of belonging to each cluster based on the current cluster parameters.
In the maximization step, the clusters are updated using all the instances in the dataset, with each instance weighted by the probability that it belongs to that cluster.
The update of each cluster is mainly influenced by the instances that it is most responsible for \cite{Geron}.



% What metrics did you use to evaluate/validate?
% How do the metrics work in terms of density and separation?
% How did you choose k?
% Describe what kind of clusters you got.
% Did the clusters line up with the original labels (e.g., binary classification)?
% How is the cluster separation?
% How is the cluster density?
% What changes you might make to improve cluster performance?


%------------------------------------------------------------------------
%%%%%%%%% CLUSTERING: KMeans
\subsection{KMeans}
The best number of K clusters was chosen in an unsupervised manner for both algorithms.
For KMeans, the silhouette score was used as the performance measure for choosing K.
The silhouette score is defined as the mean silhouette coefficient over all instances, and it is bounded between $-1$ and $+1$.
It measures the quality of the clustering by considering cluster density (the distance between the data points within a cluster) and separation (how far apart the clusters are from one another).
Mutual information measures the amount of information shared by two variables and it was used to validate the best K value.
According to the silhouette score, the best number of clusters for the Breast Cancer dataset was determined as K=4 (Figures 1 and 2).
Even though other K values have higher silhouette scores, such as K=2 or K=3, most of the instances in those clusters have lower coefficients than this score, stopping short on the left side of the dashed line (Figure 2).
In contrast, when K=4 and K=5, all the clusters extend beyond the dashed red line to the right (Figure 2), indicating that the clusters' coefficients are higher than the mean silhouette score.
Although K=4 and K=5 are both possible choices, the clusters are more similar in size when K=4 than when K=5.

Similar results were observed for the Wine dataset.
The best number of clusters was chosen using the silhouette score where K=4 (Figures 3 and 4).
Since both Breast Cancer and Wine are binary classification datasets, the number of chosen clusters did not line up with the original labels.


\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step1/Subplot_KMeans_breast_cancer.png}
    \caption{\textbf{KMeans, Breast Cancer dataset, K=4.}
    The best number of clusters  was chosen in an unsupervised manner using the silhouette score where K=4, and validated using mutual information. Smaller values of K perform better than larger values for both measures.}
\end{figure}


\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step1/KMeans_Silhouette7_Breast Cancer.png}
    \caption{\textbf{KMeans, Breast Cancer dataset, K=4.}
    Figure caption goes here...}
\end{figure}

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Wine/step1/Subplot_KMeans_wine.png}
    \caption{\textbf{KMeans, Wine dataset, K=4.}
    The best number of clusters  was chosen in an unsupervised manner using the silhouette score where K=4, and validated using mutual information. Smaller values of K perform better than larger values for both measures.}
\end{figure}

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Wine/step1/KMeans_Silhouette7_Wine.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}


%------------------------------------------------------------------------
%%%%%%%%% CLUSTERING: GMM
\subsection{GMM}
The silhouette score is not a reliable measure for non-spherical clusters or clusters with vastly different sizes.
Instead, a theoretical information criterion, such as the Bayesian information criterion (BIC) or the Akaike information criterion (AIC), was used as the selection measure for GMM: the model that minimizes the criterion is preferred.


\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step1/Subplot_GMM_breast_cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}



\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Wine/step1/Subplot_GMM_wine.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}



%------------------------------------------------------------------------
%%%%%%%%% DIMENSIONALITY REDUCTION
\section{Dimensionality Reduction}



%------------------------------------------------------------------------
%%%%%%%%% PCA
\subsection{PCA}
PCA.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step2/Subplot_PCA_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

\begin{figure}[ht]
    \centering
    \includegraphics[width=6cm]{Figures/Breast_Cancer/step2/2D_PCA_Joint_breast_cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Wine/step2/Subplot_PCA_Wine.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

\begin{figure}[ht]
    \centering
    \includegraphics[width=6cm]{Figures/Wine/step2/2D_PCA_Joint_wine.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}


%------------------------------------------------------------------------
%%%%%%%%% ICA
\subsection{ICA}
ICA.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step2/Subplot_ICA_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Wine/step2/Subplot_ICA_Wine.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}


%------------------------------------------------------------------------
%%%%%%%%% RP
\subsection{Random Projection}
Random Projection (RP).

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step2/Subplot_RP_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Wine/step2/Subplot_RP_Wine.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}


%------------------------------------------------------------------------
%%%%%%%%% RFECV
\subsection{RFECV}
RFECV.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step2/Subplot_RFECV_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Wine/step2/Subplot_RFECV_Wine.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}


%------------------------------------------------------------------------
%%%%%%%%% CLUSTERING ON DIMENSIONALITY REDUCED DATA
\section{Clustering on Dimensionality Reduction}
Clustering section, 16 combinations.


%------------------------------------------------------------------------
%%%%%%%%% KMeans: Dimensionality Reduction algorithms
\subsection{KMeans: PCA}
KMeans PCA.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step3/Silhouette_KMeans_PCA_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}


% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=8.35cm]{Figures/Wine/step3/Silhouette_KMeans_PCA_Wine.png}
%     \caption{\textbf{Figure name.}
%     Figure caption goes here...}
% \end{figure}


%------------------------------------------------------------------------
\subsection{KMeans: ICA}
KMeans ICA.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step3/Subplot_KMeans_ICA_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=8.35cm]{Figures/Wine/step3/Subplot_KMeans_ICA_Wine.png}
%     \caption{\textbf{Figure name.}
%     Figure caption goes here...}
% \end{figure}


%------------------------------------------------------------------------
\subsection{KMeans: RP}
KMeans RP.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step3/Subplot_KMeans_RP_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=8.35cm]{Figures/Wine/step3/Subplot_KMeans_RP_Wine.png}
%     \caption{\textbf{Figure name.}
%     Figure caption goes here...}
% \end{figure}


%------------------------------------------------------------------------
\subsection{KMeans: RFECV}
KMeans RFECV.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step3/Subplot_KMeans_RFECV_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=8.35cm]{Figures/Wine/step3/Subplot_KMeans_RFECV_Wine.png}
%     \caption{\textbf{Figure name.}
%     Figure caption goes here...}
% \end{figure}


%------------------------------------------------------------------------
%%%%%%%%% GMM: Dimensionality Reduction algorithms
\subsection{GMM: PCA}
GMM PCA.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step3/Subplot_GMM_PCA_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}


% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=8.35cm]{Figures/Wine/step3/Subplot_GMM_PCA_Wine.png}
%     \caption{\textbf{Figure name.}
%     Figure caption goes here...}
% \end{figure}


%------------------------------------------------------------------------
\subsection{GMM: ICA}
GMM ICA.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step3/Subplot_GMM_ICA_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=8.35cm]{Figures/Wine/step3/Subplot_GMM_ICA_Wine.png}
%     \caption{\textbf{Figure name.}
%     Figure caption goes here...}
% \end{figure}


%------------------------------------------------------------------------
\subsection{GMM: RP}
GMM RP.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step3/Subplot_GMM_RP_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=8.35cm]{Figures/Wine/step3/Subplot_GMM_RP_Wine.png}
%     \caption{\textbf{Figure name.}
%     Figure caption goes here...}
% \end{figure}


%------------------------------------------------------------------------
\subsection{GMM: RFECV}
GMM RFECV.

\begin{figure}[ht]
    \centering
    \includegraphics[width=8.35cm]{Figures/Breast_Cancer/step3/Subplot_GMM_RFECV_Breast Cancer.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}

% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=8.35cm]{Figures/Wine/step3/Subplot_GMM_RFECV_Wine.png}
%     \caption{\textbf{Figure name.}
%     Figure caption goes here...}
% \end{figure}




%------------------------------------------------------------------------
%%%%%%%%% NEURAL NETWORK, DIMENSIONALITY REDUCED DATA
\section{Neural Network, Dimensionality Reduction}



\begin{figure}[ht]
    \centering
    \includegraphics[width=7cm]{Figures/Breast_Cancer/step4/NN_Learning_Time.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}
 



%------------------------------------------------------------------------
%%%%%%%%% NEURAL NETWORK, CLUSTERING
\section{Neural Network, Clustering}




\begin{figure}[ht]
    \centering
    \includegraphics[width=7cm]{Figures/Breast_Cancer/step5/NN_Learning_Time.png}
    \caption{\textbf{Figure name.}
    Figure caption goes here...}
\end{figure}


%------------------------------------------------------------------------
%%%%%%%%% CONCLUSION
\section{Conclusion}

% \begin{figure}[ht]
%     \centering
%     \includegraphics[width=8.35cm]{}
%     \caption{\textbf{Figure name.}
%     Figure caption goes here...}
% \end{figure}




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%------------------------------------------------------------------------
%%%%%%%%% REFERENCES
% \break
{\small
% \bibliographystyle{ieee_fullname}
\bibliographystyle{unsrt}
\bibliography{egbib}
}

\end{document}
