# Task:

- Exploration und Dimensionsreduktion mit PCA / FFT 
- Überwachte (supervised) Vorhersage des Wochentags (1 = Montag bis 7 = Sonntag)

|Teammitglied|MNr|
|-|-|
|Martin Schauer|7961802|
|Joel Bück|4860895|


In [1]:
# Imports
import os
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
from scipy.fft import fft
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.pipeline import Pipeline

## Step 1) Load data

Data-Format

-------------

There are two files for each fold, the data file and the labels file. We have split the 440 time series between train and test folds, but you are of course free to merge them to consider a different cross validation setting.
- The PEMS_train text file has 267 lines. Each line describes a time series provided as a matrix. The matrix syntax is that of Matlab, e.g. [ a b ; c d] is the matrix with row vectors [a b] and [c d] in that order. Each matrix describes the occupancy rates (963 rows, one for each station/detector) sampled every 10 minutes during the day (144 columns).
- The PEMS_trainlabel text describes, for each day of measurements described above, the day of the week on which the data was sampled, namely an integer between 1 (Mon.) and 7 (Sun.).

- PEMS_test and PEMS_testlabels are formatted in the same way, except that there are 173 test instances.

- The permutation that I used to shuffle the dataset is given in the randperm file. If you need to rearrange the data so that it follows the calendar order, you should merge train and test samples and reorder them using the inverse permutation of randperm.

In [2]:
def open_data(data_file,label_file)->tuple[pd.DataFrame,list]:
    """Load one PEMS fold (feature matrices plus weekday labels) from disk.

    Both files use Matlab matrix syntax: values are separated by blanks,
    matrix rows by ``;`` and the whole matrix is enclosed in ``[ ... ]``.

    Parameters
    ----------
    data_file : str
        Path (relative to the current working directory, or absolute) of the
        data file. Every non-blank line holds one matrix, which is flattened
        into a single DataFrame row.
    label_file : str
        Path of the label file, holding a single bracketed list of labels.

    Returns
    -------
    tuple[pd.DataFrame, list]
        The float-typed feature frame (one row per line of ``data_file``) and
        the labels as a list of floats.

    Raises
    ------
    ValueError
        If a token cannot be converted to ``float``.
    """
    def _tokens(text: str) -> list[str]:
        # Strip surrounding whitespace first, then the enclosing brackets, so
        # the result does not depend on how the line is terminated ("\n",
        # "\r\n" or nothing at all). Splitting on arbitrary whitespace avoids
        # empty tokens when blanks surround the ";" row separator.
        return text.strip().strip("[]").replace(";", " ").split()

    with open(os.path.join(os.getcwd(),data_file),"r") as of:
        # Skip blank lines (e.g. a trailing empty line at the end of the file).
        rows = [_tokens(line) for line in of if line.strip()]
    df = pd.DataFrame(data=rows).astype("float")

    with open(os.path.join(os.getcwd(),label_file),"r") as of:
        labels = [float(token) for token in _tokens(of.read())]
    return df, labels

In [3]:
# Read both folds; each call yields the flattened feature frame and its weekday labels.
train_set, trainlabels = open_data(data_file="PEMS_train", label_file="PEMS_trainlabels")
test_set, testlabels = open_data(data_file="PEMS_test", label_file="PEMS_testlabels")

## Step 2) Dimension reduction

### PCA

In [4]:
# A fractional n_components asks PCA for as many components as are needed
# to explain that share (here 90%) of the variance.
pca = PCA(n_components=0.9)

# The projection is fitted on the training fold only and then re-used
# unchanged for the test fold.
train_set_pca = pca.fit_transform(train_set)
test_set_pca = pca.transform(test_set)

print(
    f"train: {train_set_pca.shape}\n"
    f"test: {test_set_pca.shape}\n"
    f"lables: ({len(trainlabels)} & {len(testlabels)})"
)

train: (267, 98)
test: (173, 98)
lables: (267 & 173)


### FFT

In [10]:

# --> the FFT approach was explored in a separate notebook; no satisfactory result was achieved

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,138662,138663,138664,138665,138666,138667,138668,138669,138670,138671
0,7849.3605- 0.0000j,-116.675672- 49.075851j,-54.445511- 26.716478j,115.417684- 68.160759j,152.100970- 18.477380j,-80.872637+ 91.642832j,95.455432+ 66.964620j,-132.737180+ 33.479892j,-64.453599- 61.901430j,-117.034025- 57.777503j,...,-148.273949+ 44.868727j,-117.034025+ 57.777503j,-64.453599+ 61.901430j,-132.737180- 33.479892j,95.455432- 66.964620j,-80.872637- 91.642832j,152.100970+ 18.477380j,115.417684+ 68.160759j,-54.445511+ 26.716478j,-116.675672+ 49.075851j
1,7967.2563- 0.0000j,-127.593348- 73.672773j,-77.698074- 13.367225j,142.108033- 99.231013j,180.313009- 6.779441j,-99.737953+ 90.340507j,88.004020+ 76.669816j,-135.369061+ 26.738596j,-39.234323- 79.332410j,-112.292884- 53.669801j,...,-192.871744+ 45.086694j,-112.292884+ 53.669801j,-39.234323+ 79.332410j,-135.369061- 26.738596j,88.004020- 76.669816j,-99.737953- 90.340507j,180.313009+ 6.779441j,142.108033+ 99.231013j,-77.698074+ 13.367225j,-127.593348+ 73.672773j
2,7693.3056- 0.0000j,-84.171713- 47.764552j,-29.596806- 30.339704j,95.402665- 90.046476j,141.280689+ 22.524088j,-37.782701- 26.885945j,120.992048+ 63.026940j,-149.545106+ 29.405590j,-37.854055- 11.027108j,-118.797205- 63.621331j,...,-124.548367+ 13.779506j,-118.797205+ 63.621331j,-37.854055+ 11.027108j,-149.545106- 29.405590j,120.992048- 63.026940j,-37.782701+ 26.885945j,141.280689- 22.524088j,95.402665+ 90.046476j,-29.596806+ 30.339704j,-84.171713+ 47.764552j
3,8177.6446- 0.0000j,-151.689298-125.539831j,-290.876889+ 44.490078j,152.297116- 93.530641j,378.096157-145.912605j,-19.091108+ 16.415380j,12.454774+258.620485j,-103.453731+ 66.180403j,-33.572481-240.167501j,-153.345471-106.742700j,...,-43.565481-142.125212j,-153.345471+106.742700j,-33.572481+240.167501j,-103.453731- 66.180403j,12.454774-258.620485j,-19.091108- 16.415380j,378.096157+145.912605j,152.297116+ 93.530641j,-290.876889- 44.490078j,-151.689298+125.539831j
4,5540.0224- 0.0000j,-1.714268- 55.836627j,-30.730693-218.342970j,23.923997-152.879125j,-48.475982- 55.102648j,-70.917302+ 36.534380j,56.599857+198.508685j,-61.621603+118.332303j,104.711238+ 6.052865j,66.756985- 51.835813j,...,2.645941+136.481777j,66.756985+ 51.835813j,104.711238- 6.052865j,-61.621603-118.332303j,56.599857-198.508685j,-70.917302- 36.534380j,-48.475982+ 55.102648j,23.923997+152.879125j,-30.730693+218.342970j,-1.714268+ 55.836627j
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,7608.4676- 0.0000j,-201.859791-118.456064j,-163.487743- 95.042448j,156.577998- 13.209539j,200.537384-103.075510j,42.332342- 14.414352j,27.786675+161.366228j,-123.272881+ 23.272678j,66.565233- 69.513658j,-176.432965- 27.128810j,...,-134.899338+ 19.413267j,-176.432965+ 27.128810j,66.565233+ 69.513658j,-123.272881- 23.272678j,27.786675-161.366228j,42.332342+ 14.414352j,200.537384+103.075510j,156.577998+ 13.209539j,-163.487743+ 95.042448j,-201.859791+118.456064j
263,8005.5442- 0.0000j,-98.531070-169.753031j,-43.225891- 79.028486j,33.081777- 29.327416j,218.903038- 57.587849j,36.183356+ 26.466569j,111.363551+ 83.411665j,-188.868238- 10.906513j,1.334428- 41.287405j,-152.677327- 12.586834j,...,-50.576112+ 61.266765j,-152.677327+ 12.586834j,1.334428+ 41.287405j,-188.868238+ 10.906513j,111.363551- 83.411665j,36.183356- 26.466569j,218.903038+ 57.587849j,33.081777+ 29.327416j,-43.225891+ 79.028486j,-98.531070+169.753031j
264,8391.3841- 0.0000j,-193.722814-330.553437j,-157.355762+ 32.860201j,134.514413- 91.679670j,170.077894-133.213583j,-26.882650+ 46.734370j,195.045848+129.980967j,-92.023499- 58.380690j,-15.075786- 67.947704j,52.853294+ 65.614002j,...,-86.897811+178.313427j,52.853294- 65.614002j,-15.075786+ 67.947704j,-92.023499+ 58.380690j,195.045848-129.980967j,-26.882650- 46.734370j,170.077894+133.213583j,134.514413+ 91.679670j,-157.355762- 32.860201j,-193.722814+330.553437j
265,6189.2182- 0.0000j,-39.295321+ 59.713217j,-54.995262- 40.395855j,59.146708- 52.457728j,113.892097- 13.275144j,47.956691+ 1.290990j,125.867234- 4.184008j,-105.574843+ 25.520124j,-59.088627- 7.748515j,-83.361324- 25.596824j,...,-114.967525- 3.611389j,-83.361324+ 25.596824j,-59.088627+ 7.748515j,-105.574843- 25.520124j,125.867234+ 4.184008j,47.956691- 1.290990j,113.892097+ 13.275144j,59.146708+ 52.457728j,-54.995262+ 40.395855j,-39.295321- 59.713217j


## Step 3) Model creation

In [5]:
# (base estimator, hyper-parameter search space); consumers index it
# positionally as model_outline[0] and model_outline[1].
model_outline = (
    RandomForestClassifier(
        n_estimators=100,
        max_features='sqrt',
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        bootstrap=True,
        random_state=42,
    ),
    # Keys carry the "classifier__" prefix so they address the pipeline step
    # named 'classifier' rather than the pipeline itself.
    dict(
        classifier__n_estimators=[100, 200, 300, 400, 500],
        classifier__max_depth=[None, 10, 25, 50],
        classifier__min_samples_split=[2, 5, 10],
        classifier__min_samples_leaf=[1, 2, 4],
        classifier__max_features=[1, 5, 10, 'sqrt', 'log2'],
        classifier__bootstrap=[True, False],
    ),
)

# Accuracy as the model-selection criterion (higher is better).
scorer = make_scorer(accuracy_score, greater_is_better=True)

In [6]:
def fit_model(model_name, model, param_grid, features_train, target_train, scoring, cv=10, n_iter=100, random_state=42, n_jobs=-1):
    """Tune ``model`` with a randomized hyper-parameter search and report the result.

    The estimator is wrapped in a single-step pipeline whose step is named
    ``'classifier'``, so the keys of ``param_grid`` must use the
    ``classifier__`` prefix.

    Parameters
    ----------
    model_name : str
        Label printed in front of the result summary.
    model : estimator
        Estimator placed in the pipeline's ``'classifier'`` step.
    param_grid : dict
        Search space handed to ``RandomizedSearchCV`` as ``param_distributions``.
    features_train, target_train
        Training features and targets passed to ``fit``.
    scoring
        Scoring argument forwarded to the search.
    cv, n_iter, random_state, n_jobs
        Forwarded unchanged to ``RandomizedSearchCV``.

    Returns
    -------
    tuple
        The fitted search object and its ``best_params_``.
    """
    search = RandomizedSearchCV(
        estimator=Pipeline(steps=[('classifier', model)]),
        param_distributions=param_grid,
        scoring=scoring,
        cv=cv,
        n_iter=n_iter,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    search.fit(features_train, target_train)

    # The summary line is always labelled "accuracy", whatever scorer was passed in.
    print(f'{model_name}:\naccuracy : {search.best_score_:.2f}')
    print('Best Parameters :', search.best_params_)
    print('\n')

    return search, search.best_params_

# Tune the random forest on the PCA-reduced training fold.
model_pca, best_params_pca = fit_model(
    model_name="RandomForestClassifier",
    model=model_outline[0],
    param_grid=model_outline[1],
    features_train=train_set_pca,
    target_train=trainlabels,
    scoring=scorer,
)



RandomForestClassifier:
accuracy : 0.88
Best Parameters : {'classifier__n_estimators': 400, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 50, 'classifier__bootstrap': False}




## Step 4) Evaluation

In [7]:
# Predict the held-out test fold with the fitted search object and report
# the share of correctly predicted weekdays.
prediction_pca = model_pca.predict(test_set_pca)
acc_pca = accuracy_score(y_true=testlabels, y_pred=prediction_pca)
print(f"Accuracy (PCA): {acc_pca:.2f}")

Accuracy (PCA): 0.82
