# Reducing input data with SVD

In [None]:
%reload_ext autoreload
%autoreload 2
import os
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go

# Make the directory two levels above the working directory importable.
module_path = os.path.abspath('./../..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Load the sensor table, then turn the textual "leak_amount" column into
# floats by removing the "LPS" marker from every value.
main_df = pd.read_csv("simulated_sensor_data_8_cols.csv")
leak_amount_text = main_df["leak_amount"].str.replace("LPS", "")
main_df["leak_amount"] = leak_amount_text.astype(float)

In [None]:
def prepare_data_for_svd(df):
    """Return the sensor, encoded-node and leak-amount columns of ``df``.

    ``df`` is first passed through ``encode_df_column`` with its default
    arguments (which writes the encoded column into ``df`` itself); the
    fitted encoder it returns is not needed here and is discarded. The
    result is then narrowed to a fixed list of columns.

    Args:
        df: frame holding the sensor columns, ``node_with_leak`` and
            ``leak_amount``.

    Returns:
        A frame with the seven sensor columns followed by
        ``encoded_node_with_leak`` and ``leak_amount``.
    """
    encoded_df, _ = encode_df_column(df)

    sensor_columns = ['Sensor1', 'Sensor2', 'Sensor3', 'Sensor4', 'J-Apollo',
                      'J-RN2', 'J-RN1']
    target_columns = ['encoded_node_with_leak', 'leak_amount']

    return encoded_df[sensor_columns + target_columns]

def encode_df_column(df, col_to_encode="node_with_leak", new_col_name="encoded_node_with_leak"):
    """Label-encode one column of ``df`` and store the codes in a new column.

    Args:
        df: frame to encode; it is modified in place.
        col_to_encode: name of the column whose values are encoded.
        new_col_name: name of the column that receives the integer codes.

    Returns:
        A ``(df, encoder)`` tuple: the same ``df`` object that was passed in
        (now carrying ``new_col_name``) and the fitted ``LabelEncoder``.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[col_to_encode])
    # NOTE: this assignment writes into the caller's frame, not a copy.
    df[new_col_name] = encoded_values

    return df, encoder
    
# Build the reduced frame that the remaining cells work on, then show it.
prepared_df = prepare_data_for_svd(df=main_df)
display(prepared_df)

In [None]:
# Sensor readings plus the encoded leak node (the leak amount is left out).
csv_columns = ['Sensor1', 'Sensor2', 'Sensor3', 'Sensor4', 'J-Apollo',
               'J-RN2', 'J-RN1', "encoded_node_with_leak"]

# Round to 4 decimals before de-duplicating, so rows that differ only past
# that precision collapse into a single row.
rounded_df = prepared_df[csv_columns].round(4)
prepared_csv_df = rounded_df.drop_duplicates()
display(prepared_csv_df)

## Effect of leak on the pressures of the sensors
The charts below indicate that the sensor pressures are linearly affected by the increase in leakage.

In [None]:
# Rows belonging to the single leak node whose encoded label is 118.
one_node_df = prepared_df[prepared_df["encoded_node_with_leak"] == 118]
display(one_node_df)

# One chart per sensor column: leak amount on x, that column's values on y.
for col_name in ('Sensor1', 'Sensor2', 'Sensor3', 'Sensor4', 'J-Apollo',
                 'J-RN2', 'J-RN1'):
    leak_trace = go.Scatter(x=one_node_df["leak_amount"], y=one_node_df[col_name])
    fig_avg_g = go.Figure(data=[leak_trace])
    fig_avg_g.show()

In [None]:
def plot_leak_correlation_for_every_node(df, max_nodes=10):
    """Show one chart per leak node with the readings of every sensor.

    For each selected node a figure is built with one marker trace per
    sensor column: the x value is the sensor name, the y values are that
    sensor's readings for the node's rows, and the hover text is the
    corresponding leak amount.

    Args:
        df: frame containing the sensor columns, ``encoded_node_with_leak``
            and ``leak_amount``.
        max_nodes: how many distinct nodes (in order of first appearance)
            to chart. Defaults to 10; pass ``None`` to chart every node.

    Returns:
        None. Figures are displayed via ``fig.show()``.
    """
    sensor_arr = ['Sensor1', 'Sensor2', 'Sensor3', 'Sensor4', 'J-Apollo', 'J-RN2', 'J-RN1']

    node_ids = df["encoded_node_with_leak"].unique()[:max_nodes]
    for node_id in node_ids:
        print(f"Chart for effects of leak on node {node_id}")

        # The node's rows do not depend on the sensor, so filter once per
        # node instead of once per trace.
        node_df = df[df["encoded_node_with_leak"] == node_id]

        fig = go.Figure()
        for column in sensor_arr:
            fig.add_trace(go.Scattergl(
                x=[column] * len(node_df),
                y=node_df[column],
                text=node_df["leak_amount"],
                mode='markers',
                name=column))
        fig.update_layout(
            title=f"Leak effect on '{node_id}'",
            xaxis_title="Sensor effected",
            yaxis_title="Pressure",
        )
        fig.show()
# plot_leak_correlation_for_every_node(prepared_df)

## Finding unique rows -> optimal leak step

In [None]:
# gauss elimination
from scipy.linalg import lu
import sympy 

# Work on the rows of a single leak node (encoded label 100).
# (An LU-based variant on the full array via scipy.linalg.lu was tried here
# and is left disabled.)
node_100_mask = prepared_df["encoded_node_with_leak"] == 100
one_node_df_1 = prepared_df[node_100_mask]
display(one_node_df_1)

one_node_prep_arr = one_node_df_1.to_numpy()
display(one_node_prep_arr, len(one_node_prep_arr))

# Row-reduce the transpose: rref() returns the reduced matrix and the pivot
# column indices, and pivot columns of the transpose correspond to rows of
# the original array.
transposed_matrix = sympy.Matrix(one_node_prep_arr).T
_, inds = transposed_matrix.rref()

In [None]:
# Count, over all leak nodes, how many rows of each node are pivot rows of
# the row reduction (i.e. linearly independent rows), and report the total
# and the per-node average.
unique_rows_count = 0
node_values = prepared_df["encoded_node_with_leak"].unique()
for node_val in node_values:
    print(node_val)
    one_node_arr = prepared_df[prepared_df["encoded_node_with_leak"] == node_val].to_numpy()
    # Reduce this node's own rows (one_node_arr); using a frame left over
    # from another cell would count the same node on every iteration.
    _, inds = sympy.Matrix(one_node_arr).T.rref()
    unique_rows_count += len(inds)

print(f"All rows {unique_rows_count}, average {unique_rows_count/len(node_values)}")

In [None]:
# Show the pivot positions, then the rows of one_node_df_1 at those positions.
# NOTE(review): `inds` is whatever the most recently executed cell assigned;
# confirm it was computed from one_node_df_1 before trusting this selection.
display(inds)
pivot_labels = one_node_df_1.index[list(inds)]
one_node_df_1.loc[pivot_labels]

In [None]:
# Same reduction, this time over every row of prepared_df at once.
full_matrix_t = sympy.Matrix(prepared_df.to_numpy()).T
_, inds = full_matrix_t.rref()

# First five pivot positions and how many pivots there are in total.
print(inds[:5], len(inds))

In [None]:
# Keep only the rows whose leak amount is strictly below 0.6.
low_leak_mask = prepared_df["leak_amount"] < 0.6
testing_df = prepared_df[low_leak_mask]
display(testing_df)

# SVD experiments, currently disabled:
# U, s, V = np.linalg.svd(testing_df.T, full_matrices=False)
# display(U.shape, s.shape, V.shape)
# U, s, V = np.linalg.svd(prepared_df, full_matrices=False)
# display(U.shape, s.shape, V.shape)