In [6]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris, load_wine
from sklearn.datasets import fetch_openml
from sklearn.tree import DecisionTreeClassifier

In [7]:
datasets = {
    "iris": load_iris(as_frame=True).frame,
    "wine": load_wine(as_frame=True).frame,
    "abalone": fetch_openml(data_id=183, as_frame=True).frame,
    "adult": fetch_openml(data_id=1590, as_frame=True).frame,
    "digits": fetch_openml(data_id=554, as_frame=True).frame
}

In [9]:
def process_dataset(df, i_column):
    df = df.dropna()
    
    if i_column not in df.columns:
        print(f"Error: Target column '{i_column}' not found i dataset. Availabe columns: {df.columns.tolist()}")
        return None, None
    
    categori_columns = df.select_dtypes(include = ["object", "category"]).columns.tolist()
    
    if categori_columns:
        df = pd.get_dummies(df, columns=categori_columns, drop_first=True)
        
    X = df.drop(columns=[i_column])
    y = df[i_column]
    
    if y.dtype == "object":
        y = pd.factorize(y)[0]
        
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X,y
        

In [11]:
result = {}
def supervised (X,y):
    X_train , X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)


for name, dataset in datasets.items():
    print(f"Process dataset: {name}")
    print(f"Available columns: {dataset.columns.tolist()}")
    
    if name == "abalone":
        i_column = "Rings"
    if name == "adult":
        i_column = "income"
    if name == "digits" :
        i_column = "class"
    if name == "iris":
        i_column = "target"
    if name == "wine":
        i_column = "target"
    else:
        i_column = dataset.columns[-1]
        
    if i_column not in dataset.columns:
        print(f"Error: skipp dataset '{name}' as target column '{i_column}' does not exist")
        print(f"Availabe columns in '{name}' : {dataset.columns.tolist()}")
        
    try:
        X,y = process_dataset(dataset, i_column)
    except Exception as e:
        print(f"Error process_dataset '{name}' : {e}")
        continue
    
    if X is None or y is None:
        print(f"skipp dataset '{name}' to process error")
        continue
    # perform KMeans
    try:
        n_clusters = len(np.unique(y))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        labels = kmeans.fit_predict(X)
        silhouette = silhouette_score(X,labels)
    except Exception as e:
        print(f"Error perform KMeans on dataset '{name}' : {e}")
        silhouette = None
        
    # perform supervised
    try:
        accuracy = supervised(X,y)
    except Exception as e:
        print(f"Error perform supervised on dataset '{name}' : {e}")
        accuracy = None
        
    #store result
    
    result[name]= {
        "unsupervised" : silhouette,
        "supervised"  : accuracy
    } 
    
print("Results:")

for dataset, metrics in result.items():
    print(f"{dataset}:")
    
    if metrics["unsupervised"] is not None:
        print(f" Unsupervised ( Silhouette score): {metrics['unsupervised']:.4f}")
        
    else:
        print(" Unsupervised : Error calculating silhouette score")
        
    if metrics["supervised"] is not None:
        print(f"supervised (Accuracy): {metrics['supervised']:.4f}")
        
    else:
        print("Supervised : Error calcluting accuracy")
        
    
      













    

Process dataset: iris
Available columns: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'target']
Process dataset: wine
Available columns: ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline', 'target']
Process dataset: abalone
Available columns: ['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Class_number_of_rings']
Error process_dataset 'abalone' : "['Class_number_of_rings'] not found in axis"
Process dataset: adult
Available columns: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'class']
Error process_dataset 'adult' : "['class'] not found in axis"
Process dataset: digits
Available columns: 

Jawad Rajabi    
mohraj-0@student.ltu.se

1.	Names of all group members
Jawad Rajabi
mohraj-0@student.ltu.se

2.	Clear specification of the addressed grading criteria

For grade 3: Develop 1 unsupervised and 1 supervised classification model for 5 datasets of your choice from 121 UCI datasets. Report accuracy results

Iris, wine, abalone, adult and digit are for testing.
Models that used are KMeans and DecisionTreeClassifier.
The result is measured using silhouette score for KMeans and accuracy for DecisionTreeClassifier.

3.	Description of the datasets used in the miniproject

Iris:
Dataset for classifying flowers with three different species: Setosa, Versicolor and Virginica.
Size: Total 150 samples
Classes: Setosa, Versicolor and Virginica.
Majority percentage: 33.33% (All three classes are evenly distributed with 50 samples )
Features: sepal length (cm), sepal width (cm), petal length (cm), petal width (cm)
Wine:
Dataset that analyzes chemical properties of wine to classify them into different categories.
Size: Total 178 samples
Classes: [0, 1, 2] (Corresponding to different wine categories)
Features: alcohol, malic acid, ash, alkalinity_of_ash, magnesium ,flavonoids  etc.
Majority percentage: Varies depending on class distribution


Abalone:
Dataset that try to predict the age of abalone through its physical characteristics.
Functions: Length, Diameter, Height, Whole_weight, Shucked_weight, Viscera_weight, Shell_weight, Class_number_of_rings
Target column: Rings (represents the abalone's age in number of rings).
Problem in the code: The  Rings is not correctly identified (incorrect column name is used)

Adult
Dataset to predict income level (above/below 50K) based on demographic and work-related attributes.
Features: age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, etc.
Target column: income (represents whether income is >50K or <=50K).
Problem in the code: The code is looking for the wrong target column  which causes an error.

Digit
Handwritten digit data set for classification, where each image is represented by pixel values.
Features: Contains 64 features (one for each pixel in an 8x8 image)
Target column: class (represents the number 0-9 as shown in the image).
Problem in the code: The target column is not correctly identified in the dataset, which causes an error.

4.	Description of the models used in the miniproject

KMeans (unsupervised learning):
A clustering algorithm that make groups data points based on similarity. The number of clusters is set to the uniqe number in the target variable.
Performance is measured by the silhouette  which show how well clusters are separated.

DecisionTreeClassifier (supervised learning):
A classification algorithm that creates a tree structure based on decisions to predict the target variable.
Performance is measured by accuracy, which indicates the proportion of correct predictions on the test data.

5.	Description of the experimental methodology (datasets' splits, cross-validation, performance metricsetc)

Dataset Splits: 
The datasets are split into 70% for training and 30% for testing using train_test_split.
This helps to train models on a portion of the data and test its performance on unseen data.

Cross-Validation:
Det finns ingen avancerad korsvalidering (som k-fold) som används i koden.
70/30-delningen fungerar som ett enkelt sätt att testa modellen på ny data.

Performance Matrics:
Silhouette Score:
Used to check how well the data is grouped into clusters by the supervised model (KMeans). A score closer to 1 means that the clusters are good.
Accuracy:
Used to measure how well the supervised model (DecisionTreeClassifier) predicts the correct labels. Higher accuracy means better predictions.


6.	Description of the experimental results

rice dataset:
Unsupervised (Silhouette score): 0.4799
Supervised (Accuracy): 1.0000
Wine dataset:
Unsupervised (Silhouette score): 0.2849
Supervised (Accuracy): 0.9630
For Abalone, Adult, and Digits, errors occur because the target columns are not found.

7.	Conclusions

The Iris dataset performs very well for both unsupervised and supervised models, indicating that the data is well-structured and easy to cluster and classify.

The Wine dataset has a slightly lower silhouette score, indicating that the clusters are not as well separated. However, the high accuracy shows that the supervised model is very effective.

The errors that occur for the Abalone, Adult and Digits datasets indicate that the code's handling of target columns needs to be improved


