# Niklas Scholz - 3832620

# Imports

In [42]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.fft import fft
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Read Data

In [43]:
# Every input file is read with the same options: ";" as the field separator
# and no header row, so pandas assigns integer column labels.
CSV_OPTIONS = {"sep": ";", "header": None}

# Feature files
X_test = pd.read_csv("PEMS_test", **CSV_OPTIONS)
X_train = pd.read_csv("PEMS_train", **CSV_OPTIONS)

# Label files
y_test = pd.read_csv("PEMS_testlabels", **CSV_OPTIONS)
y_train = pd.read_csv("PEMS_trainlabels", **CSV_OPTIONS)

# Station identifiers, later used as column names of the feature frames
stationsList = pd.read_csv("stations_list", **CSV_OPTIONS)

In [44]:
def transposeColumn(df):
    """Turn a one-cell frame holding a bracketed vector into a one-column frame.

    The vector is expected as a single string in the first cell of ``df``
    (row 0, column 0), for example ``'["1" "1" "1" ... "0" "0" "0"]'`` or the
    same without quotes. It becomes one row per entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first cell contains the whitespace-separated vector,
        optionally wrapped in ``[`` and ``]``.

    Returns
    -------
    pandas.DataFrame
        A single column named ``"label"`` with one float per token. An empty
        vector (``"[]"``) yields an empty frame with that column.

    Raises
    ------
    ValueError
        If a token cannot be converted to ``float``.
    """
    raw = df.values[0][0]

    # Trim outer whitespace before removing the brackets, so a trailing space
    # or newline after "]" cannot cause a real digit to be cut off instead.
    inner = raw.strip().strip("[]")

    # split() without an argument collapses runs of whitespace; split(" ")
    # would produce empty tokens on double or trailing spaces and make float()
    # fail. Optional double quotes around a token are removed as well.
    values = [float(token.strip('"')) for token in inner.split()]

    return pd.DataFrame(values, columns=["label"])

# Replace the raw one-cell label frames by their parsed form: a single
# "label" column with one row per entry.
y_train, y_test = transposeColumn(y_train), transposeColumn(y_test)

# The station identifiers are parsed the same way and then cast to strings.
# They pass through float first, so an entry such as 5 ends up as "5.0".
stationsList = list(map(str, transposeColumn(stationsList)["label"].to_list()))

In [48]:
def _parseMeasurementCell(cell):
    """Convert one bracketed, whitespace-separated string into a list of floats."""
    # split() without an argument tolerates repeated or trailing whitespace.
    return [float(token) for token in cell.strip().strip("[]").split()]


def transposeMeasurments(df, stations=None):
    """Expand each station column of bracketed strings into numeric columns.

    Every cell of a station column holds a string such as ``"[0.1 0.2 0.3]"``.
    For each station the cells are parsed and spread over new columns named
    ``"<station> - Messwert <k>"`` (k starting at 1), one per measurement.
    The original station columns are not part of the result; any other
    columns of ``df`` are kept in front of the new ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing one string column per station.
    stations : iterable of column labels, optional
        Station columns to expand, in output order. Defaults to the
        notebook-level ``stationsList`` so existing one-argument calls keep
        working.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``; ``df`` itself is not
        modified.

    Raises
    ------
    ValueError
        If a value cannot be converted to ``float``, or if the rows of one
        station do not all contain the same number of measurements.
    """
    if stations is None:
        stations = stationsList
    stations = list(stations)

    expanded = []
    for station in stations:
        # Iterate positionally so the result does not depend on index labels.
        rows = [_parseMeasurementCell(cell) for cell in df[station].to_numpy()]

        # Ragged rows would otherwise be padded silently and end up in the
        # wrong place, so fail loudly instead.
        widths = {len(row) for row in rows}
        if len(widths) > 1:
            raise ValueError(
                f"Station {station!r} has rows with differing numbers of "
                f"measurements: {sorted(widths)}"
            )

        # rows is already (samples x measurements); no manual transpose needed.
        block = pd.DataFrame(rows, index=df.index, dtype=float)
        block.columns = [
            f"{station} - Messwert {index}" for index in range(1, block.shape[1] + 1)
        ]
        expanded.append(block)

    # One concat at the end instead of one per station avoids repeatedly
    # copying an ever-growing frame.
    remaining = df.drop(columns=stations)
    return pd.concat([remaining, *expanded], axis=1)

In [49]:
# Label the raw feature columns with the station identifiers, which is what
# transposeMeasurments uses to look the columns up.
for raw_frame in (X_test, X_train):
    raw_frame.columns = stationsList

# Expand every station column into one numeric column per measurement.
X_test, X_train = transposeMeasurments(X_test), transposeMeasurments(X_train)

# PCA

In [50]:
# Two-step preprocessing: standardise every feature, then project onto the
# principal components that together explain 95% of the variance.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
])

# Scaling statistics and components are learned from the training data only ...
pipeline.fit(X_train)

# ... and the fitted pipeline is then applied to both splits.
train_components = pipeline.transform(X_train)
test_components = pipeline.transform(X_test)

# Wrap the NumPy results in DataFrames with PC1..PCk column names; both splits
# share the same number of components, so the names are built once.
pc_names = [f'PC{i}' for i in range(1, train_components.shape[1] + 1)]
X_train_pca = pd.DataFrame(train_components, columns=pc_names)
X_test_pca = pd.DataFrame(test_components, columns=pc_names)


# FFT

In [51]:
# Discrete Fourier transform of every sample with SciPy. axis=-1 is scipy's
# default and is spelled out here: the transform runs along each row, i.e.
# across all feature columns of a sample at once.
# NOTE(review): this treats a whole row as one signal rather than transforming
# each station's measurements separately - confirm that this is intended.
X_train_fft = fft(X_train, axis=-1)
X_test_fft = fft(X_test, axis=-1)

# Predictions

In [52]:
# Train one random forest per feature representation and report its accuracy
# on the test split. Accuracy is the share of correctly classified samples,
# i.e. (TP + TN) / (TP + TN + FP + FN).
# The labels are stored as a one-column frame; ravel() flattens them into the
# 1d array that fit() expects.
train_labels = y_train.values.ravel()

# Model 1: PCA features
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train_pca, train_labels)
y_test_pred = rf.predict(X_test_pca)
pca_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Random Forest Classifier accuracy with PCA: {pca_accuracy}")

# Model 2: FFT features - only the real part of the complex spectrum is used
train_spectrum_real = X_train_fft.real
test_spectrum_real = X_test_fft.real
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(train_spectrum_real, train_labels)
y_test_pred = rf.predict(test_spectrum_real)
fft_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Random Forest Classifier with accuracy FFT: {fft_accuracy}")

Random Forest Classifier accuracy with PCA: 0.8439306358381503
Random Forest Classifier with accuracy FFT: 0.7225433526011561
