In [19]:
import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [20]:
def load_emnist_data():
    df_emnist_train = pd.read_csv("data/emnist-balanced-train.csv")
    df_emnist_test = pd.read_csv("data/emnist-balanced-test.csv")
    return df_emnist_train, df_emnist_test

# Testar vilket antal träd som ger bäst träffsäkerhet.

In [21]:
def train_RF():
    df_emnist_train, df_emnist_test = load_emnist_data()

    X_train = df_emnist_train.drop(columns=['label'])
    y_train = df_emnist_train['label']
    X_test = df_emnist_test.drop(columns=['label'])
    y_test = df_emnist_test['label']

    n_estimators_options = [50, 100, 200]
    best_accuracy = 0
    best_n_estimators = 0
    best_rf_model = None
    
    for n_estimators in n_estimators_options:
        start_time = time.time()  # Starta tidtagningen 1
        rf = RandomForestClassifier(n_estimators=n_estimators)
        rf.fit(X_train, y_train)
        end_time = time.time()  # Stoppa tidtagningen 1

        start_time_pred = time.time() # Starta tidtagningen 2
        y_pred = rf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        end_tme_pred = time.time() # Stoppa tidtagningen 2

        training_time = end_time - start_time  # Beräkna tidenerna
        pred_time = end_tme_pred - start_time_pred
        total_time = training_time + pred_time

        print(f"RF Model with n_estimators = {n_estimators}, Accuracy: {accuracy:.3f}")
        print(f'Training Time: {training_time:.2f} seconds')
        print(f'Prediction Time: {pred_time:.2f} seconds')
        print(f'Total Time: {total_time:.2f} seconds')
        print(f'--------------------------------------------------------------------')
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_n_estimators = n_estimators
            best_rf_model = rf
    
    print(f"Best RF Model n_estimators: {best_n_estimators}")
    print(f"Best RF Model Accuracy: {best_accuracy:.3f}")
    
    return best_rf_model

# Testar vilket värde på k som ger bäst träffsäkerhet.

In [22]:
def train_KNN():
    df_emnist_train, df_emnist_test = load_emnist_data()

    X_train = df_emnist_train.drop(columns=['label'])
    y_train = df_emnist_train['label']
    X_test = df_emnist_test.drop(columns=['label'])
    y_test = df_emnist_test['label']

    n_neighbors_options = [3, 5, 7, 9]
    best_accuracy = 0
    best_n_neighbors = 0
    best_knn_model = None
    
    for n_neighbors in n_neighbors_options:
        start_time = time.time()  # Starta tidtagningen 1
        knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(X_train, y_train)
        end_time = time.time()  # Stoppa tidtagningen 1

        start_time_pred = time.time() # Starta tidtagningen 2
        y_pred = knn.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        end_tme_pred = time.time() # Stoppa tidtagningen 2

        training_time = end_time - start_time  # Beräkna tidenerna
        pred_time = end_tme_pred - start_time_pred
        total_time = training_time + pred_time

        print(f"KNN Model with n_neighbors = {n_neighbors}, Accuracy: {accuracy:.3f}")
        print(f'Training Time: {training_time:.2f} seconds')
        print(f'Prediction Time: {pred_time:.2f} seconds')
        print(f'Total Time: {total_time:.2f} seconds')
        print(f'--------------------------------------------------------------------')
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_n_neighbors = n_neighbors
            best_knn_model = knn
    
    print(f"Best KNN Model n_neighbors: {best_n_neighbors}")
    print(f"Best KNN Model Accuracy: {best_accuracy:.3f}")
    
    return best_knn_model

## Testar olika inställningar för ANN på 10% av datan för att se vilken som ger bäst resultat.

In [23]:
def train_ANN():
    df_emnist_train, df_emnist_test = load_emnist_data()

    X_train = df_emnist_train.drop(columns=['label']).values / 255.0
    y_train = df_emnist_train['label']
    X_test = df_emnist_test.drop(columns=['label']).values / 255.0
    y_test = df_emnist_test['label']

    X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train, train_size=0.1, random_state=42)

    
    configurations = [
        # Färre lager och neuroner
        {"hidden_layer_sizes": (64, 32), "learning_rate_init": 0.001, "max_iter": 600, "random_state": 42},
        # Många lager och neuroner
        {"hidden_layer_sizes": (512, 256, 128, 64), "learning_rate_init": 0.001, "max_iter": 600, "random_state": 42},
        # Symmetriska antal neuroner
        {"hidden_layer_sizes": (128, 128, 64, 64), "learning_rate_init": 0.001, "max_iter": 600, "random_state": 42},
        # Färre lager med flera neuroner
        {"hidden_layer_sizes": (128, 128), "learning_rate_init": 0.001, "max_iter": 600, "random_state": 42},
        # Färre neuroner med fler lager
        {"hidden_layer_sizes": (64, 64, 32, 32, 16), "learning_rate_init": 0.001, "max_iter": 600, "random_state": 42}
    ]

    best_accuracy = 0
    best_config = None
    best_ann_model = None
    
    for config in configurations:
        ann = MLPClassifier(hidden_layer_sizes=config["hidden_layer_sizes"], learning_rate_init=config["learning_rate_init"], random_state=config["random_state"])
        
        start_time = time.time()  # Starta tidtagningen 1
        ann.fit(X_train_small, y_train_small)
        end_time = time.time()  # Stoppa tidtagningen 1

        training_time = end_time - start_time  # Beräkna tiden
        
        start_time_pred = time.time() # Starta tidtagningen 2
        y_pred = ann.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        end_tme_pred = time.time() # Stoppa tidtagningen 2

        training_time = end_time - start_time  # Beräkna tidenerna
        pred_time = end_tme_pred - start_time_pred
        total_time = training_time + pred_time
        
        print(f"ANN Model with config: {config}")
        print(f"Accuracy: {accuracy:.3f}")
        print(f'Training Time: {training_time:.2f} seconds')
        print(f'Prediction Time: {pred_time:.2f} seconds')
        print(f'Total Time: {total_time:.2f} seconds')
        print('--------------------------------------------------------------------')
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_config = config
            best_ann_model = ann

    print(f"Best ANN Model Configuration: {best_config}")
    print(f"Best ANN Model Accuracy: {best_accuracy:.3f}")
    
    return best_ann_model

In [24]:
best_ann_model = train_ANN()



ANN Model with config: {'hidden_layer_sizes': (64, 32), 'learning_rate_init': 0.0001, 'max_iter': 600, 'random_state': 42}
Accuracy: 0.692
Training Time: 38.62 seconds
Prediction Time: 0.04 seconds
Total Time: 38.67 seconds
--------------------------------------------------------------------




ANN Model with config: {'hidden_layer_sizes': (512, 256, 128, 64), 'learning_rate_init': 0.0001, 'max_iter': 600, 'random_state': 42}
Accuracy: 0.741
Training Time: 423.23 seconds
Prediction Time: 0.22 seconds
Total Time: 423.45 seconds
--------------------------------------------------------------------




ANN Model with config: {'hidden_layer_sizes': (128, 128, 64, 64), 'learning_rate_init': 0.0001, 'max_iter': 600, 'random_state': 42}
Accuracy: 0.728
Training Time: 122.63 seconds
Prediction Time: 0.08 seconds
Total Time: 122.72 seconds
--------------------------------------------------------------------




ANN Model with config: {'hidden_layer_sizes': (128, 128), 'learning_rate_init': 0.0001, 'max_iter': 600, 'random_state': 42}
Accuracy: 0.733
Training Time: 156.65 seconds
Prediction Time: 0.09 seconds
Total Time: 156.74 seconds
--------------------------------------------------------------------
ANN Model with config: {'hidden_layer_sizes': (64, 64, 32, 32, 16), 'learning_rate_init': 0.0001, 'max_iter': 600, 'random_state': 42}
Accuracy: 0.679
Training Time: 49.63 seconds
Prediction Time: 0.05 seconds
Total Time: 49.68 seconds
--------------------------------------------------------------------
Best ANN Model Configuration: {'hidden_layer_sizes': (512, 256, 128, 64), 'learning_rate_init': 0.0001, 'max_iter': 600, 'random_state': 42}
Best ANN Model Accuracy: 0.741




## Testar den inställning som gav bäst resultat på ANN på olika stora delar av datasetet.

In [25]:
# Best ANN Model Configuration: {'hidden_layer_sizes': (256, 128, 64), 'learning_rate_init': 0.001, 'max_iter': 600, 'random_state': 42, 'activation': 'relu'}
def train_ANN_with_different_data_sizes(data_fraction):
    df_emnist_train, df_emnist_test = load_emnist_data()
    
    X_train = df_emnist_train.drop(columns=['label']).values / 255.0
    y_train = df_emnist_train['label']
    X_test = df_emnist_test.drop(columns=['label']).values / 255.0
    y_test = df_emnist_test['label']
    
    # Ta en delmängd av träningsdatan
    subset_size = int(data_fraction * len(X_train))
    X_train_small = X_train[:subset_size]
    y_train_small = y_train[:subset_size]
    
    ann = MLPClassifier(hidden_layer_sizes=(256, 128, 64), learning_rate_init=0.001, max_iter=600, activation="relu" , random_state=42)
    start_time = time.time()
    ann.fit(X_train_small, y_train_small)
    end_time = time.time()
    training_time = end_time - start_time
    
    accuracy = ann.score(X_test, y_test)
    
    print(f"Data Fraction: {data_fraction}")
    print(f"ANN Model Accuracy: {accuracy:.3f}")
    print(f'Training Time: {training_time:.2f} seconds')
    print('--------------------------------------------------------------------')
    
    return ann

# Testar med olika träningsstorlekar
#fractions = [0.1,0.3,0.5,0.7,1.0]
#for fraction in fractions:
    #train_ANN_with_different_data_sizes(fraction)


In [26]:
#best_rf_model = train_RF()

In [27]:
#best_knn_model = train_KNN()