In [1]:
import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
def load_emnist_data():
    """Read the EMNIST balanced train and test CSV files.

    Returns:
        tuple: ``(train_frame, test_frame)`` -- the frames read from
        ``data/emnist-balanced-train.csv`` and ``data/emnist-balanced-test.csv``,
        in that order.
    """
    csv_paths = (
        "data/emnist-balanced-train.csv",
        "data/emnist-balanced-test.csv",
    )
    # Paths are read in order, so the training frame always comes first.
    train_frame, test_frame = (pd.read_csv(csv_path) for csv_path in csv_paths)
    return train_frame, test_frame

# Testar vilket antal träd som ger bäst träffsäkerhet.

In [3]:
def train_RF(n_estimators_options=(50, 100, 200), random_state=42):
    """Fit one random forest per tree count and return the most accurate one.

    Every candidate is trained on the full training frame and scored on the
    test frame. Accuracy and wall-clock timings are printed per candidate.

    Args:
        n_estimators_options: Iterable of tree counts to try.
        random_state: Seed forwarded to every ``RandomForestClassifier`` so the
            comparison is repeatable between runs. Pass ``None`` for unseeded
            forests.

    Returns:
        The fitted ``RandomForestClassifier`` with the highest test accuracy
        (the first candidate wins ties), or ``None`` if no options were given.
    """
    df_emnist_train, df_emnist_test = load_emnist_data()

    X_train = df_emnist_train.drop(columns=['label'])
    y_train = df_emnist_train['label']
    X_test = df_emnist_test.drop(columns=['label'])
    y_test = df_emnist_test['label']

    best_accuracy = 0
    best_n_estimators = 0
    best_rf_model = None

    # NOTE(review): candidates are ranked on the test frame, so the reported
    # best accuracy is tuned on test data; consider a separate validation split.
    for n_estimators in n_estimators_options:
        rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

        # perf_counter is a monotonic clock intended for measuring intervals.
        train_start = time.perf_counter()
        rf.fit(X_train, y_train)
        training_time = time.perf_counter() - train_start

        # Only predict() is timed; scoring happens after the clock is stopped
        # so "Prediction Time" does not include the accuracy computation.
        pred_start = time.perf_counter()
        y_pred = rf.predict(X_test)
        pred_time = time.perf_counter() - pred_start

        accuracy = accuracy_score(y_test, y_pred)
        total_time = training_time + pred_time

        print(f"RF Model with n_estimators = {n_estimators}, Accuracy: {accuracy:.3f}")
        print(f'Training Time: {training_time:.2f} seconds')
        print(f'Prediction Time: {pred_time:.2f} seconds')
        print(f'Total Time: {total_time:.2f} seconds')
        print('--------------------------------------------------------------------')

        # The None check guarantees a model is kept even if every accuracy is 0.
        if best_rf_model is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_n_estimators = n_estimators
            best_rf_model = rf

    print(f"Best RF Model n_estimators: {best_n_estimators}")
    print(f"Best RF Model Accuracy: {best_accuracy:.3f}")

    return best_rf_model

# Testar vilket värde på k som ger bäst träffsäkerhet.

In [4]:
def train_KNN(n_neighbors_options=(3, 5, 7, 9)):
    """Fit one k-nearest-neighbours classifier per k and return the most accurate one.

    Every candidate is trained on the full training frame and scored on the
    test frame. Accuracy and wall-clock timings are printed per candidate.

    Args:
        n_neighbors_options: Iterable of ``n_neighbors`` values to try.

    Returns:
        The fitted ``KNeighborsClassifier`` with the highest test accuracy
        (the first candidate wins ties), or ``None`` if no options were given.
    """
    df_emnist_train, df_emnist_test = load_emnist_data()

    X_train = df_emnist_train.drop(columns=['label'])
    y_train = df_emnist_train['label']
    X_test = df_emnist_test.drop(columns=['label'])
    y_test = df_emnist_test['label']

    best_accuracy = 0
    best_n_neighbors = 0
    best_knn_model = None

    # NOTE(review): candidates are ranked on the test frame, so the reported
    # best accuracy is tuned on test data; consider a separate validation split.
    for n_neighbors in n_neighbors_options:
        knn = KNeighborsClassifier(n_neighbors=n_neighbors)

        # perf_counter is a monotonic clock intended for measuring intervals.
        train_start = time.perf_counter()
        knn.fit(X_train, y_train)
        training_time = time.perf_counter() - train_start

        # Only predict() is timed; scoring happens after the clock is stopped
        # so "Prediction Time" does not include the accuracy computation.
        pred_start = time.perf_counter()
        y_pred = knn.predict(X_test)
        pred_time = time.perf_counter() - pred_start

        accuracy = accuracy_score(y_test, y_pred)
        total_time = training_time + pred_time

        print(f"KNN Model with n_neighbors = {n_neighbors}, Accuracy: {accuracy:.3f}")
        print(f'Training Time: {training_time:.2f} seconds')
        print(f'Prediction Time: {pred_time:.2f} seconds')
        print(f'Total Time: {total_time:.2f} seconds')
        print('--------------------------------------------------------------------')

        # The None check guarantees a model is kept even if every accuracy is 0.
        if best_knn_model is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_n_neighbors = n_neighbors
            best_knn_model = knn

    print(f"Best KNN Model n_neighbors: {best_n_neighbors}")
    print(f"Best KNN Model Accuracy: {best_accuracy:.3f}")

    return best_knn_model

## Testar olika inställningar för ANN på 10% av datan för att se vilken som ger bäst resultat.

In [5]:
def train_ANN():
    """Sweep a fixed list of MLP configurations and return the most accurate model.

    Each configuration is trained on a 10% subset of the training data (drawn
    with ``train_test_split(..., train_size=0.1, random_state=42)``) and scored
    on the full test frame. Accuracy and wall-clock timings are printed per
    configuration.

    Returns:
        The fitted ``MLPClassifier`` with the highest test accuracy (the first
        configuration wins ties).
    """
    df_emnist_train, df_emnist_test = load_emnist_data()

    # Features are divided by 255.0 (presumably 8-bit pixel intensities being
    # scaled to [0, 1] -- confirm against the CSV contents).
    X_train = df_emnist_train.drop(columns=['label']).values / 255.0
    y_train = df_emnist_train['label']
    X_test = df_emnist_test.drop(columns=['label']).values / 255.0
    y_test = df_emnist_test['label']

    X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train, train_size=0.1, random_state=42)

    # Define the configurations; every key is an MLPClassifier constructor argument.
    configurations = [
        {"hidden_layer_sizes": (32,), "learning_rate_init": 0.01, "max_iter": 300, "random_state": 42, "activation": 'relu'},
        {"hidden_layer_sizes": (64,), "learning_rate_init": 0.01, "max_iter": 300, "random_state": 42, "activation": 'relu'},
        {"hidden_layer_sizes": (128, 64), "learning_rate_init": 0.01, "max_iter": 300, "random_state": 42, "activation": 'relu'},
        {"hidden_layer_sizes": (256, 128), "learning_rate_init": 0.01, "max_iter": 300, "random_state": 42, "activation": 'relu'},
        {"hidden_layer_sizes": (128, 128, 64, 64), "learning_rate_init": 0.01, "max_iter": 300, "random_state": 42, "activation": 'relu'},
        {"hidden_layer_sizes": (256, 128, 64, 32), "learning_rate_init": 0.01, "max_iter": 300, "random_state": 42, "activation": 'relu'},
        {"hidden_layer_sizes": (128, 64), "learning_rate_init": 0.001, "max_iter": 300, "random_state": 42, "activation": 'relu'},
        {"hidden_layer_sizes": (128, 64), "learning_rate_init": 0.0001, "max_iter": 300, "random_state": 42, "activation": 'relu'},
        {"hidden_layer_sizes": (128, 64), "learning_rate_init": 0.01, "max_iter": 300, "random_state": 42, "activation": 'tanh'},
        {"hidden_layer_sizes": (128, 64), "learning_rate_init": 0.01, "max_iter": 300, "random_state": 42, "activation": 'logistic'},
        {"hidden_layer_sizes": (128, 64), "learning_rate_init": 0.01, "max_iter": 600, "random_state": 42, "activation": 'relu'},
        {"hidden_layer_sizes": (256, 128), "learning_rate_init": 0.01, "max_iter": 600, "random_state": 42, "activation": 'relu'}
    ]

    best_accuracy = 0
    best_config = None
    best_ann_model = None

    # NOTE(review): configurations are ranked on the test frame, so the reported
    # best accuracy is tuned on test data; consider a separate validation split.
    for config in configurations:
        # Forward the whole configuration so that every listed key -- including
        # max_iter and activation -- actually reaches the estimator.
        ann = MLPClassifier(**config)

        # perf_counter is a monotonic clock intended for measuring intervals.
        train_start = time.perf_counter()
        ann.fit(X_train_small, y_train_small)
        training_time = time.perf_counter() - train_start

        # Only predict() is timed; scoring happens after the clock is stopped
        # so "Prediction Time" does not include the accuracy computation.
        pred_start = time.perf_counter()
        y_pred = ann.predict(X_test)
        pred_time = time.perf_counter() - pred_start

        accuracy = accuracy_score(y_test, y_pred)
        total_time = training_time + pred_time

        print(f"ANN Model with config: {config}, Accuracy: {accuracy:.3f}")
        print(f'Training Time: {training_time:.2f} seconds')
        print(f'Prediction Time: {pred_time:.2f} seconds')
        print(f'Total Time: {total_time:.2f} seconds')
        print('--------------------------------------------------------------------')

        # The None check guarantees a model is kept even if every accuracy is 0.
        if best_ann_model is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_config = config
            best_ann_model = ann

    print(f"Best ANN Model Configuration: {best_config}")
    print(f"Best ANN Model Accuracy: {best_accuracy:.3f}")

    return best_ann_model

In [None]:
# Run the ANN configuration sweep; train_ANN returns the most accurate fitted model.
best_ann_model = train_ANN()

## Testar den inställning som gav bäst resultat på ANN på olika stora delar av datasetet.

In [6]:
# Best ANN Model Configuration: {'hidden_layer_sizes': (128, 64), 'learning_rate_init': 0.0001, 'max_iter': 300, 'random_state': 42, 'activation': 'relu'}
def train_ANN_with_different_data_sizes(data_fraction):
    """Train the fixed (128, 64) MLP on the leading part of the training set.

    The model is scored on the full test frame, and the fraction, accuracy and
    training time are printed.

    Args:
        data_fraction: Share of the training rows to use, with
            ``0 < data_fraction <= 1``. The first
            ``int(data_fraction * n_rows)`` rows are taken in file order.

    Returns:
        The fitted ``MLPClassifier``.

    Raises:
        ValueError: If ``data_fraction`` is outside ``(0, 1]`` or is so small
            that it selects no training rows.
    """
    # Reject bad fractions before the (slow) CSV load; values above 1 would
    # otherwise silently behave like 1.0 because slicing clamps to the length.
    if not 0 < data_fraction <= 1:
        raise ValueError(
            f"data_fraction must be in the interval (0, 1], got {data_fraction!r}"
        )

    df_emnist_train, df_emnist_test = load_emnist_data()

    # Features are divided by 255.0 (presumably 8-bit pixel intensities being
    # scaled to [0, 1] -- confirm against the CSV contents).
    X_train = df_emnist_train.drop(columns=['label']).values / 255.0
    y_train = df_emnist_train['label']
    X_test = df_emnist_test.drop(columns=['label']).values / 255.0
    y_test = df_emnist_test['label']

    # Take a subset of the training data.
    # NOTE(review): rows are taken in file order, not sampled; this is only a
    # representative subset if the CSV is already shuffled -- confirm.
    subset_size = int(data_fraction * len(X_train))
    if subset_size < 1:
        raise ValueError(
            f"data_fraction={data_fraction!r} selects no rows from {len(X_train)} training samples"
        )
    X_train_small = X_train[:subset_size]
    y_train_small = y_train[:subset_size]

    ann = MLPClassifier(hidden_layer_sizes=(128, 64), learning_rate_init=0.0001, max_iter=300, activation="relu" , random_state=42)

    # perf_counter is a monotonic clock intended for measuring intervals.
    start_time = time.perf_counter()
    ann.fit(X_train_small, y_train_small)
    training_time = time.perf_counter() - start_time

    accuracy = ann.score(X_test, y_test)

    print(f"Data Fraction: {data_fraction}")
    print(f"ANN Model Accuracy: {accuracy:.3f}")
    print(f'Training Time: {training_time:.2f} seconds')
    print('--------------------------------------------------------------------')

    return ann

# Train with different training-set sizes
fractions = [0.1, 0.3, 0.5, 0.7, 1.0]
for fraction in fractions:
    train_ANN_with_different_data_sizes(fraction)




Data Fraction: 0.1
ANN Model Accuracy: 0.735
Training Time: 130.99 seconds
--------------------------------------------------------------------




Data Fraction: 0.3
ANN Model Accuracy: 0.779
Training Time: 460.41 seconds
--------------------------------------------------------------------




Data Fraction: 0.5
ANN Model Accuracy: 0.803
Training Time: 744.63 seconds
--------------------------------------------------------------------




Data Fraction: 0.7
ANN Model Accuracy: 0.809
Training Time: 1215.60 seconds
--------------------------------------------------------------------




Data Fraction: 1.0
ANN Model Accuracy: 0.825
Training Time: 2273.98 seconds
--------------------------------------------------------------------


In [8]:
# Run the random-forest sweep; train_RF returns the most accurate fitted model.
best_rf_model = train_RF()

RF Model with n_estimators = 50, Accuracy: 0.800
Training Time: 59.05 seconds
Prediction Time: 0.50 seconds
Total Time: 59.55 seconds
--------------------------------------------------------------------
RF Model with n_estimators = 100, Accuracy: 0.812
Training Time: 115.86 seconds
Prediction Time: 0.99 seconds
Total Time: 116.84 seconds
--------------------------------------------------------------------
RF Model with n_estimators = 200, Accuracy: 0.817
Training Time: 235.41 seconds
Prediction Time: 1.98 seconds
Total Time: 237.40 seconds
--------------------------------------------------------------------
Best RF Model n_estimators: 200
Best RF Model Accuracy: 0.817


In [9]:
# Run the k-nearest-neighbours sweep; train_KNN returns the most accurate fitted model.
best_knn_model = train_KNN()

KNN Model with n_neighbors = 3, Accuracy: 0.780
Training Time: 0.25 seconds
Prediction Time: 41.38 seconds
Total Time: 41.63 seconds
--------------------------------------------------------------------
KNN Model with n_neighbors = 5, Accuracy: 0.785
Training Time: 0.29 seconds
Prediction Time: 50.46 seconds
Total Time: 50.75 seconds
--------------------------------------------------------------------
KNN Model with n_neighbors = 7, Accuracy: 0.784
Training Time: 0.31 seconds
Prediction Time: 47.83 seconds
Total Time: 48.13 seconds
--------------------------------------------------------------------
KNN Model with n_neighbors = 9, Accuracy: 0.782
Training Time: 0.26 seconds
Prediction Time: 48.09 seconds
Total Time: 48.36 seconds
--------------------------------------------------------------------
Best KNN Model n_neighbors: 5
Best KNN Model Accuracy: 0.785
