In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import datetime

In [2]:
class Time_Series:
    def __init__(self, filename: str):
        self.filename = filename
        self.df = None

    def read_csv_file(self) -> pd.DataFrame:
        """
        Read a CSV file and return a pandas DataFrame.
        """
        with open(self.filename) as file:
            self.df = pd.read_csv(file)
        return self.df

    def drop_duplicates(self) -> pd.DataFrame:
        """
        Drop duplicate rows from the pandas DataFrame and return the resulting DataFrame.
        """
        self.df = self.df.drop_duplicates()
        return self.df

    def cast_timestamp(self) -> pd.DataFrame:
        """
        Cast the 'timestamp' column of the pandas DataFrame to a datetime object and return the resulting DataFrame.
        """
        self.df['timestamp'] = self.df['timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x))
        return self.df

    def sort_by_timestamp(self) -> pd.DataFrame:
        """
        Sort the pandas DataFrame by the 'timestamp' column in ascending order and return the resulting DataFrame.
        """
        self.df = self.df.sort_values(by=['timestamp'], ascending=True).reset_index().drop(columns=['index'])
        return self.df
    
    def data_preparation_train(self):
        return self.df[:int(self.df.shape[0]*0.85)]

    def data_preparation_test(self):
        return self.df[int(self.df.shape[0]*0.85):]
    
    def scale_data(self,df) -> pd.DataFrame:
        """
        Scale the 'temperatura' column of the pandas DataFrame and return the resulting DataFrame.
        """
        scaler = StandardScaler()
        self.df["temperatura"] = scaler.fit_transform(self.df["temperatura"].values.reshape(-1, 1))
        return df  

    def split_train_test_data_train(self, df):
        """
        Split the pandas DataFrame into train/test sets and return X_train, X_test, y_train, y_test as numpy arrays.
        """
        df_prepared = self.data_preparation_train()
        X_train, X_test, y_train, y_test = train_test_split(
        df_prepared["temperatura"].values, df_prepared["fallo"].values, test_size=0.2, random_state=42, shuffle=False)
        X_train_reshaped = np.reshape(X_train, (X_train.shape[0], 1, 1))
        X_test_reshaped = np.reshape(X_test, (X_test.shape[0], 1, 1))
        return X_train_reshaped, X_test_reshaped, y_train, y_test
 

    def build_model(self) -> Sequential:
        """
        Build an LSTM model and return the model.
        """
        model = Sequential()
        model.add(LSTM(1, input_shape=(1,1)))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam')
        return model

    def fit_model(self):
        model = self.build_model()
        model.fit(X_train_reshaped, y_train, epochs=20, batch_size=1)
        return model
    
    def evaluate_model(self):
        test_loss, test_acc = model.evaluate(X_test_reshaped, y_test)
        print(f'Test loss: {test_loss}, Test accuracy: {test_acc}')
  
    def predictions(self, df):
        X = np.reshape(np.array(df["temperatura"]), (df.shape[0], 1, 1))
        predictions = model.predict(X)
        return predictions



In [3]:
time_series = Time_Series("historicos series temporales.csv")

In [4]:
time_series.read_csv_file()

Unnamed: 0,timestamp,fallo,temperatura
0,1.552490e+09,0,0.633453
1,1.552490e+09,0,0.633453
2,1.552491e+09,0,0.633453
3,1.552492e+09,0,0.633453
4,1.552492e+09,0,0.633453
...,...,...,...
12245,1.558598e+09,0,0.619355
12246,1.558598e+09,0,0.619355
12247,1.558598e+09,0,0.619355
12248,1.558599e+09,0,0.619355


In [5]:
time_series.drop_duplicates()

Unnamed: 0,timestamp,fallo,temperatura
0,1.552490e+09,0,0.633453
1,1.552490e+09,0,0.633453
2,1.552491e+09,0,0.633453
3,1.552492e+09,0,0.633453
4,1.552492e+09,0,0.633453
...,...,...,...
8270,1.559094e+09,0,-0.151447
8271,1.559094e+09,0,-0.201771
8272,1.559094e+09,0,-0.252094
8273,1.559095e+09,0,-0.252094


In [6]:
time_series.cast_timestamp()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['timestamp'] = self.df['timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x))


Unnamed: 0,timestamp,fallo,temperatura
0,2019-03-13 16:10:00,0,0.633453
1,2019-03-13 16:15:00,0,0.633453
2,2019-03-13 16:35:00,0,0.633453
3,2019-03-13 16:40:00,0,0.633453
4,2019-03-13 16:45:00,0,0.633453
...,...,...,...
8270,2019-05-29 03:35:00,0,-0.151447
8271,2019-05-29 03:40:00,0,-0.201771
8272,2019-05-29 03:45:00,0,-0.252094
8273,2019-05-29 03:50:00,0,-0.252094


In [7]:
time_series.sort_by_timestamp()

Unnamed: 0,timestamp,fallo,temperatura
0,2019-03-13 16:10:00,0,0.633453
1,2019-03-13 16:15:00,0,0.633453
2,2019-03-13 16:35:00,0,0.633453
3,2019-03-13 16:40:00,0,0.633453
4,2019-03-13 16:45:00,0,0.633453
...,...,...,...
8270,2019-05-29 03:35:00,0,-0.151447
8271,2019-05-29 03:40:00,0,-0.201771
8272,2019-05-29 03:45:00,0,-0.252094
8273,2019-05-29 03:50:00,0,-0.252094


In [8]:
time_series.data_preparation_train()

Unnamed: 0,timestamp,fallo,temperatura
0,2019-03-13 16:10:00,0,0.633453
1,2019-03-13 16:15:00,0,0.633453
2,2019-03-13 16:35:00,0,0.633453
3,2019-03-13 16:40:00,0,0.633453
4,2019-03-13 16:45:00,0,0.633453
...,...,...,...
7028,2019-05-15 02:00:00,0,0.166204
7029,2019-05-15 02:05:00,0,0.166204
7030,2019-05-15 02:10:00,0,0.150845
7031,2019-05-15 02:15:00,0,0.150845


In [9]:
time_series.data_preparation_test()

Unnamed: 0,timestamp,fallo,temperatura
7033,2019-05-15 02:25:00,0,0.150845
7034,2019-05-15 02:30:00,0,0.150845
7035,2019-05-15 02:35:00,0,0.150845
7036,2019-05-15 02:45:00,0,0.150845
7037,2019-05-15 02:50:00,0,0.150845
...,...,...,...
8270,2019-05-29 03:35:00,0,-0.151447
8271,2019-05-29 03:40:00,0,-0.201771
8272,2019-05-29 03:45:00,0,-0.252094
8273,2019-05-29 03:50:00,0,-0.252094


In [10]:
df_train = time_series.scale_data(time_series.data_preparation_train())
df_train

Unnamed: 0,timestamp,fallo,temperatura
0,2019-03-13 16:10:00,0,0.633453
1,2019-03-13 16:15:00,0,0.633453
2,2019-03-13 16:35:00,0,0.633453
3,2019-03-13 16:40:00,0,0.633453
4,2019-03-13 16:45:00,0,0.633453
...,...,...,...
7028,2019-05-15 02:00:00,0,0.166204
7029,2019-05-15 02:05:00,0,0.166204
7030,2019-05-15 02:10:00,0,0.150845
7031,2019-05-15 02:15:00,0,0.150845


In [23]:
df_test = time_series.scale_data(time_series.data_preparation_test())
df_test

Unnamed: 0,timestamp,fallo,temperatura
7033,2019-05-15 02:25:00,0,0.131378
7034,2019-05-15 02:30:00,0,0.131378
7035,2019-05-15 02:35:00,0,0.131378
7036,2019-05-15 02:45:00,0,0.131378
7037,2019-05-15 02:50:00,0,0.131378
...,...,...,...
8270,2019-05-29 03:35:00,0,-0.327153
8271,2019-05-29 03:40:00,0,-0.403487
8272,2019-05-29 03:45:00,0,-0.479820
8273,2019-05-29 03:50:00,0,-0.479820


In [27]:
df_test.describe(),df_train.describe()

(        fallo  temperatura
 count  1242.0  1242.000000
 mean      0.0     0.392725
 std       0.0     0.642185
 min       0.0    -2.876357
 25%       0.0     0.054512
 50%       0.0     0.474592
 75%       0.0     0.730342
 max       0.0     2.208452,
              fallo  temperatura
 count  7033.000000  7033.000000
 mean      0.137352     0.018510
 std       0.344244     0.682561
 min       0.000000    -4.245247
 25%       0.000000    -0.201771
 50%       0.000000     0.261121
 75%       0.000000     0.480146
 max       1.000000     1.758311)

In [12]:
X_train_reshaped, X_test_reshaped, y_train, y_test = time_series.split_train_test_data_train(df_train)

In [13]:
X_train_reshaped.shape[0], X_test_reshaped.shape[0]

(5626, 1407)

In [14]:
model = time_series.build_model()

In [15]:
time_series.fit_model()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.engine.sequential.Sequential at 0x2a81be93be0>

In [16]:
predictions= time_series.predictions(df_test)

In [17]:
pd.DataFrame(predictions).describe()

Unnamed: 0,0
count,1242.0
mean,0.50008
std,0.000832
min,0.490991
25%,0.500055
50%,0.500333
75%,0.500417
max,0.50048


In [18]:
predictions.shape, df_test.shape

((1242, 1), (1242, 3))

In [28]:
np.mean(predictions)

0.5000817

In [34]:
y_pred

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [32]:
threshold = np.mean(predictions)
y_pred = np.where(predictions < threshold, 0, 1)

In [33]:
f1_score(df_test['fallo'], y_pred)

0.0