In [22]:
import pandas as pd
import numpy as np
import random as rd
import matplotlib.pyplot as plt
import math
import re 
from sklearn.preprocessing import StandardScaler

# K Means Code was from here: https://github.com/aihubprojects/Machine-Learning-From-Scratch/blob/master/K-Means%20from%20Scratch.ipynb I will modify it as needed
# Followed this tutorial as well: https://www.geeksforgeeks.org/kmeans-clustering-and-pca-on-wine-dataset/

WINE_DATA = pd.read_csv("./wine/wine.data", index_col=0) # Do index_col=0 to drop the class identifier. 

IRIS_DATA = pd.read_csv("./Iris/iris.data")


class K_Means:
    
    def __init__(self, k=2, tolerance = 0.001, max_iter = 500):
        self.k = k
        self.max_iterations = max_iter
        self.tolerance = tolerance
    
    def euclidean_distance(self, x1, x2):
        return np.linalg.norm(x1- x2, axis=0)
        
    def predict(self,data):
        distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids]
        classification = distances.index(min(distances))
        return classification
    
    def fit(self, data):
        self.centroids = {}
        for i in range(self.k):
            self.centroids[i] = data[i]
        
        
        for i in range(self.max_iterations):
            self.classes = {}
            for j in range(self.k):
                self.classes[j] = []
                
            for point in data:
                distances = []
                for index in self.centroids:
                    distances.append(self.euclidean_distance(point,self.centroids[index]))
                cluster_index = distances.index(min(distances))
                self.classes[cluster_index].append(point)
            
            previous = dict(self.centroids)
            for cluster_index in self.classes:
                self.centroids[cluster_index] = np.average(self.classes[cluster_index], axis = 0)
            

                
            isOptimal = True
            
            for centroid in self.centroids:
                original_centroid = previous[centroid]
                curr = self.centroids[centroid]
                if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
                    isOptimal = False
            if isOptimal:
                break

def get_column_names():
    wine_columns = []
    iris_columns = []
    
    wine_line_counter = 0
    iris_line_counter = 0
    
    wine_names = "./wine/wine.names"
    with open(wine_names) as f:
        for line in f:
            wine_line_counter += 1
            if(wine_line_counter > 57 and wine_line_counter <= 70):
                line = line.strip()
                wine_columns.append(line)
    f.close()
    
    wine_columns =  [re.sub(r'^\d+\)\s*', '', column) for column in wine_columns]
    print(wine_columns)
    
    return wine_columns, iris_columns
            
    
    


def wine_classification(wine_columns):
    df = WINE_DATA
    df.columns = wine_columns
    df.info()
    scaler = StandardScaler()
    features = scaler.fit(df)
    features = features.transform(df)
    
    scaled_df = pd.DataFrame(features, columns=df.columns)
    scaled_df.head(2)
    
                
def main():
    wine_columns, iris_columns = get_column_names()
    wine_classification(wine_columns)

    
if __name__ == "__main__":
    main()

['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
<class 'pandas.core.frame.DataFrame'>
Index: 177 entries, 1 to 3
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Alcohol                       177 non-null    float64
 1   Malic acid                    177 non-null    float64
 2   Ash                           177 non-null    float64
 3   Alcalinity of ash             177 non-null    float64
 4   Magnesium                     177 non-null    int64  
 5   Total phenols                 177 non-null    float64
 6   Flavanoids                    177 non-null    float64
 7   Nonflavanoid phenols          177 non-null    float64
 8   Proanthocyanins               177 non-null    float64
 9   Color intensity               177 non-