In [None]:
import time
from pathlib import Path
from sys import getsizeof

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

%matplotlib inline

In [None]:

# --- Slice configuration ---------------------------------------------------
# Only one batch of rows is embedded further down; currentIndex picks which
# batch, and startIndex/endIndex are the resulting positional row bounds.
batchSize = 32000
currentIndex = 0
startIndex = batchSize * currentIndex
endIndex = batchSize + startIndex

# --- Load and clean --------------------------------------------------------
# '?' entries in the CSV are parsed as NaN, and every row containing at least
# one NaN is discarded before any feature is selected.
dataset = pd.read_csv(
    '../Datasets/Household_Electric_Power_Consumption/household_power_consumption.csv',
    na_values='?',
).dropna()

# --- Feature matrix --------------------------------------------------------
# The seven numeric measurement columns, cast to float32.
measurementColumns = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]
features = dataset.loc[:, measurementColumns].astype(np.float32)

# Wall-clock reference taken before the heavy work; it is not read in this cell.
start = time.time()

# Step 1: standardise the full feature matrix. In this cell `scaledFeatures`
# is only used to obtain the number of feature columns.
scaler = StandardScaler()
scaledFeatures = scaler.fit_transform(features)

# Step 2: error metrics are computed on a single slice of rows only, once per
# target dimensionality (1 .. number of features).
rootMeanSquaredValues = []
meanAbsoluteErrorValues = []
rSquaredErrorValues = []
compressionRatio = []
dimensions = range(1, scaledFeatures.shape[1] + 1)
featureSlice = features[startIndex:endIndex].copy()
# NOTE: this re-fits `scaler` on the slice, so after this line `scaler` holds
# the slice statistics rather than the full-dataset ones (unchanged behaviour).
datasetSlice = scaler.fit_transform(featureSlice)

# t-SNE has no inverse_transform, so the reconstruction is approximated: every
# row is rebuilt as the mean of the original (scaled) rows of its nearest
# neighbours in the embedding. The row itself is excluded, so the error
# reflects how well embedding neighbourhoods preserve the original values.
reconstructionNeighbors = 5

for d in tqdm(dimensions):
    # Fix: pass the current dimensionality `d`, not the whole `dimensions` range.
    # The default Barnes-Hut solver only supports fewer than 4 components, so
    # higher dimensionalities must use the exact solver. The exact solver is
    # quadratic in the number of rows; lower batchSize if it is impractical.
    tsne = TSNE(n_components=d,
                method='barnes_hut' if d < 4 else 'exact',
                perplexity=30,
                max_iter=3000,
                random_state=42)
    datasetReduced = tsne.fit_transform(datasetSlice).astype(np.float32)

    # kneighbors() without a query matrix returns the neighbours of the fitted
    # points themselves and does not count a point as its own neighbour.
    neighborIndex = NearestNeighbors(n_neighbors=reconstructionNeighbors).fit(datasetReduced)
    neighborIds = neighborIndex.kneighbors(return_distance=False)
    # (rows, neighbours, features) -> average over the neighbour axis.
    datasetReconstructed = datasetSlice[neighborIds].mean(axis=1).astype(np.float32)

    rootMeanSquaredValues.append(root_mean_squared_error(
        datasetSlice,
        datasetReconstructed))
    meanAbsoluteErrorValues.append(mean_absolute_error(
        datasetSlice,
        datasetReconstructed))
    rSquaredErrorValues.append(r2_score(
        datasetSlice,
        datasetReconstructed))
    # nbytes is the size of the array data buffer; sys.getsizeof measures the
    # Python object instead (header included, buffer excluded for views).
    compressionRatio.append(datasetReduced.nbytes / datasetSlice.nbytes)

Details = pd.DataFrame({
    'Root Mean Squared Values' : rootMeanSquaredValues,
    'Mean Absolute Values' : meanAbsoluteErrorValues,
    'R Squared Error Values' : rSquaredErrorValues,
    'Compression Ratio' : compressionRatio
    })
# Make the in-memory index equal to the number of components (1-based). The
# index is not written to the CSV; row order carries the dimensionality there.
Details.index += 1

# These are t-SNE metrics, so they go to their own file instead of overwriting
# the 'Details K-PCA.csv' results (the file is opened with mode='w').
detailsPath = Path('./Results/household_power_consumption/Details t-SNE.csv')
detailsPath.parent.mkdir(parents=True, exist_ok=True)
Details.to_csv(detailsPath, mode='w', index=False)
