In [None]:
#import packages
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import hamming_loss,f1_score, ConfusionMatrixDisplay
from sklearn.tree import plot_tree


In [None]:
# Read the dataset from CSV and prepare it for training.
# The dropped columns were determined insignificant by the feature-selection
# process; "State" and "Name" are kept aside in their own Series first.
df = pd.read_csv('wind_dataset.csv')
state = df["State"]
name = df["Name"]
df = df.drop(columns=["State", "Name", "GHI"])
order_of_df = df.index  # row order before shuffling, stored for later
# Shuffle the rows. The fixed seed makes the shuffle (and therefore the order in
# which rows are processed downstream) identical after a kernel restart.
df = df.sample(frac=1, random_state=42)


In [None]:
# Helper functions for the semi-supervised (pseudo-labeling) workflow
def prepare_semi_supervised_data(data_frame, count):
    """Split a frame into labeled training data and a batch of unlabeled rows.

    The label is read from the LAST column of ``data_frame``. Rows whose label
    is 0 or 1 count as labeled, rows whose label is -1 count as unlabeled, and
    rows with any other label (such as -2) appear in neither result.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns "Temperature", "Wind_Speed", "Dew_Point" and
        "Pressure"; its last column holds the labels.
    count : int
        Controls the batch size: at most ``50 * (count + 1)`` unlabeled rows
        are returned, taken from the top of the frame in its current row order.

    Returns
    -------
    tuple
        ``(X_labeled, y_labeled, X_unknown)`` - features and labels of the
        labeled rows, and the features of the selected unlabeled rows.
    """
    feature_columns = ["Temperature", "Wind_Speed", "Dew_Point", "Pressure"]
    targets = data_frame.iloc[:, -1]
    feature_frame = data_frame[feature_columns]

    # Boolean masks over the rows.
    is_known = targets.eq(0) | targets.eq(1)
    is_pending = targets.eq(-1)
    batch_limit = 50 * (count + 1)

    X_labeled = feature_frame.loc[is_known]
    y_labeled = targets.loc[is_known]
    # Rows labeled -2 match neither mask: those predictions were not confident,
    # so they are ignored here.
    X_unknown = feature_frame.loc[is_pending].iloc[:batch_limit]

    return X_labeled, y_labeled, X_unknown

def add_pseudo_label_proba(df, probs, prob_threshold=0.8):
    """Write pseudo-labels into ``df["Suitability"]`` in place.

    ``probs[k]`` is applied to the k-th row of ``df`` (counting top to bottom
    in the frame's current row order) whose "Suitability" is -1. Rows are
    matched by POSITION, not by index label, so the assignment stays aligned
    with a batch selected positionally from a shuffled frame.

    For each matched row the column index of the largest probability is used
    as the label when that probability is at least ``prob_threshold``;
    otherwise the row is marked -2 (not confident enough to be useful).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Suitability" column; modified in place.
    probs : array-like of shape (n_rows, n_classes)
        Class probabilities, one row per unlabeled row to be pseudo-labeled.
        If there are more probability rows than unlabeled rows, the extras
        are ignored; if fewer, the remaining unlabeled rows stay at -1.
    prob_threshold : float, default 0.8
        Minimum winning probability needed to accept a pseudo-label. Lower it
        to end up with fewer -2 rows.

    Returns
    -------
    None
    """
    probs = np.asarray(probs)
    if probs.size == 0:
        return

    # Positions (not labels) of the still-unlabeled rows, in frame order.
    pending_positions = np.flatnonzero((df["Suitability"] == -1).to_numpy())
    n_assign = min(len(pending_positions), len(probs))
    if n_assign == 0:
        return

    batch_probs = probs[:n_assign]
    predicted_class = batch_probs.argmax(axis=1)
    is_confident = batch_probs.max(axis=1) >= prob_threshold
    new_labels = np.where(is_confident, predicted_class, -2)

    suitability_col = df.columns.get_loc("Suitability")
    df.iloc[pending_positions[:n_assign], suitability_col] = new_labels

def plot_iterations_scores(f1_scores, accuracy_scores):
    """Plot F1-score and accuracy against the iteration number and show the figure.

    Parameters
    ----------
    f1_scores : sequence of float
        One F1-score per iteration; its length sets the x-axis (1..len).
    accuracy_scores : sequence of float
        One accuracy value per iteration, plotted on the same axes.
    """
    x_values = range(1, len(f1_scores) + 1)

    _, ax = plt.subplots(figsize=(10, 6))
    curves = (
        (f1_scores, 'o', 'F1-Score'),
        (accuracy_scores, 's', 'Accuracy'),
    )
    for scores, marker_shape, legend_name in curves:
        ax.plot(x_values, scores, marker=marker_shape, linestyle='-', label=legend_name)

    ax.set_xlabel("Iterations")
    ax.set_ylabel("Score")
    ax.set_title("F1-Score and Accuracy vs. Iterations for the Wind RFC Model")
    ax.set_ylim(0.5, 1)  # values below 0.5 fall outside the visible range
    ax.grid(True)
    ax.legend()

    plt.show()

In [None]:
# Preparation dictionaries
# One empty list per metric.
results_dic = {
    metric_name: [] for metric_name in ('Accuracy', 'F1-Score', 'Hamming Loss')
}

# Random forest hyperparameters: these are the original values, not tuned to tmy.
params = dict(
    n_estimators=300,
    min_samples_split=4,
    min_samples_leaf=2,
    max_leaf_nodes=27,
    criterion='gini',
    max_depth=5,
)


In [None]:
# Self-training loop
# Each pass fits a fresh random forest on the rows currently labeled 0/1,
# records its scores on a 20% hold-out split, then pseudo-labels a batch of
# still-unlabeled rows so the next pass can train on them.
count = 0


while count < 7:   # 7 passes; pass `count` offers up to 50 * (count + 1) unlabeled rows
    # Fixed seeds keep the forest and the split reproducible across runs; a
    # "random_state" entry in `params`, if present, takes precedence.
    model = RandomForestClassifier(**{"random_state": 42, **params})
    X_labeled, y_labeled, X_unknown = prepare_semi_supervised_data(df, count)

    X_train_labeled, X_test, y_train_labeled, y_test = train_test_split(
        X_labeled, y_labeled, test_size=0.2, shuffle=True, random_state=42
    )

    model.fit(X_train_labeled, y_train_labeled)

    # predict() maps the winning probability column through model.classes_, so
    # the predictions are real class labels rather than raw column indices.
    y_pred = model.predict(X_test)

    accuracy = model.score(X_test, y_test)
    f1 = f1_score(y_test, y_pred)
    hamming_loss_value = hamming_loss(y_test, y_pred)

    results_dic["Accuracy"].append(accuracy)
    results_dic["F1-Score"].append(f1)
    results_dic["Hamming Loss"].append(hamming_loss_value)

    # Nothing to pseudo-label once no rows are left at -1.
    if len(X_unknown) > 0:
        prob_labels = model.predict_proba(X_unknown)
        add_pseudo_label_proba(df, prob_labels)

    # Optional Confusion matrix display for each iteration
    # ConfusionMatrixDisplay.from_estimator(model,X_test,y_test)
    # plt.show()

    count += 1

In [None]:
# Plotting and exporting

plot_iterations_scores(results_dic["F1-Score"], results_dic["Accuracy"])

# Re-attach the identifier columns at the front of the frame (pandas aligns the
# Series to df by index label). The guards make this cell safe to re-run:
# DataFrame.insert raises ValueError when the column already exists.
if "Name" not in df.columns:
    df.insert(0, "Name", name)
if "State" not in df.columns:
    df.insert(0, "State", state)

# df is the shuffled DataFrame; selecting by the index saved before shuffling
# puts the rows back in their original order.
restored_order_df = df.loc[order_of_df].reset_index(drop=True)

restored_order_df.to_csv("Labeled_Wind.csv", index=False)

print(f"Accuracy list: {results_dic['Accuracy']}")

# Mean F1-score over all recorded iterations.
print(np.mean(results_dic["F1-Score"]))
# Feature importances of `model` as left by the last loop iteration.
print(model.feature_importances_)