In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Load the raw sensor log. The file is read as space-delimited with no header
# row, so column names are supplied explicitly. Malformed lines are skipped
# rather than raising.
data = pd.read_csv(
    'ds1.csv',
    sep=' ',
    header=None,
    names=['Date', 'Time', 'HighLevelSensorID', 'LowLevelSensorID', 'SensorValue', 'Label'],
    on_bad_lines='skip',
)

In [3]:
# Display the loaded frame as the cell output
data

Unnamed: 0,Date,Time,HighLevelSensorID,LowLevelSensorID,SensorValue,Label
0,2009-07-17,15:49:51.750001,Kitchen,Kitchen,ON,Other_Activity
1,2009-07-17,15:49:52.406001,Kitchen,Kitchen,ON,Other_Activity
2,2009-07-17,15:49:55.421001,Kitchen,Kitchen,OFF,Other_Activity
3,2009-07-17,15:49:56.406001,Kitchen,Kitchen,OFF,Other_Activity
4,2009-07-17,15:49:58.031001,GuestRoom,Hall,ON,Other_Activity
...,...,...,...,...,...,...
99995,2009-08-20,12:16:04.093001,Kitchen,Kitchen,ON,Cook
99996,2009-08-20,12:16:08.031001,Kitchen,Kitchen,OFF,Cook
99997,2009-08-20,12:16:10.687001,Kitchen,Kitchen,ON,Cook
99998,2009-08-20,12:16:14.625001,Kitchen,Kitchen,OFF,Cook


In [4]:
# Count missing values per column
data.isnull().sum()

Date                 0
Time                 0
HighLevelSensorID    0
LowLevelSensorID     0
SensorValue          0
Label                0
dtype: int64

In [5]:
def process_datetime(df):
    """Parse the textual ``Date``/``Time`` columns of ``df`` and add time features.

    The frame is modified in place and also returned, so the result can be
    displayed or chained.

    Columns written:
        DateTime    -- timestamp parsed from ``Date + ' ' + Time``
        Date        -- ``Date`` parsed with ``pd.to_datetime``
        Time        -- ``datetime.time`` objects parsed with ``%H:%M:%S.%f``
        Hour, Minute, Second, Microsecond -- components of ``DateTime``

    Calling the function on a frame it has already processed is a no-op.
    Without the guard below, a second call (for example re-running the
    notebook cell) raises ``TypeError``, because the parsed ``Date`` column
    can no longer be concatenated with a string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``Date`` and ``Time``. ``Time`` must match
        ``%H:%M:%S.%f`` (i.e. include fractional seconds).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Only short-circuit in the state where the string concatenation below is
    # guaranteed to fail, so first-run behaviour is unchanged.
    already_processed = (
        'DateTime' in df.columns
        and pd.api.types.is_datetime64_any_dtype(df['Date'])
    )
    if already_processed:
        return df

    # Build the combined timestamp first, while Date and Time are still text.
    df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
    df['Date'] = pd.to_datetime(df['Date'])
    df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S.%f').dt.time

    # All component columns are read from the combined timestamp.
    components = (
        ('Hour', 'hour'),
        ('Minute', 'minute'),
        ('Second', 'second'),
        ('Microsecond', 'microsecond'),
    )
    for column, attribute in components:
        df[column] = getattr(df['DateTime'].dt, attribute)
    return df

In [6]:
# Adds the datetime feature columns to `data` in place; the returned frame is
# shown as the cell output.
process_datetime(data)

Unnamed: 0,Date,Time,HighLevelSensorID,LowLevelSensorID,SensorValue,Label,DateTime,Hour,Minute,Second,Microsecond
0,2009-07-17,15:49:51.750001,Kitchen,Kitchen,ON,Other_Activity,2009-07-17 15:49:51.750001,15,49,51,750001
1,2009-07-17,15:49:52.406001,Kitchen,Kitchen,ON,Other_Activity,2009-07-17 15:49:52.406001,15,49,52,406001
2,2009-07-17,15:49:55.421001,Kitchen,Kitchen,OFF,Other_Activity,2009-07-17 15:49:55.421001,15,49,55,421001
3,2009-07-17,15:49:56.406001,Kitchen,Kitchen,OFF,Other_Activity,2009-07-17 15:49:56.406001,15,49,56,406001
4,2009-07-17,15:49:58.031001,GuestRoom,Hall,ON,Other_Activity,2009-07-17 15:49:58.031001,15,49,58,31001
...,...,...,...,...,...,...,...,...,...,...,...
99995,2009-08-20,12:16:04.093001,Kitchen,Kitchen,ON,Cook,2009-08-20 12:16:04.093001,12,16,4,93001
99996,2009-08-20,12:16:08.031001,Kitchen,Kitchen,OFF,Cook,2009-08-20 12:16:08.031001,12,16,8,31001
99997,2009-08-20,12:16:10.687001,Kitchen,Kitchen,ON,Cook,2009-08-20 12:16:10.687001,12,16,10,687001
99998,2009-08-20,12:16:14.625001,Kitchen,Kitchen,OFF,Cook,2009-08-20 12:16:14.625001,12,16,14,625001


In [7]:
# Check column dtypes and non-null counts after the datetime processing
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   Date               100000 non-null  datetime64[ns]
 1   Time               100000 non-null  object        
 2   HighLevelSensorID  100000 non-null  object        
 3   LowLevelSensorID   100000 non-null  object        
 4   SensorValue        100000 non-null  object        
 5   Label              100000 non-null  object        
 6   DateTime           100000 non-null  datetime64[ns]
 7   Hour               100000 non-null  int64         
 8   Minute             100000 non-null  int64         
 9   Second             100000 non-null  int64         
 10  Microsecond        100000 non-null  int64         
dtypes: datetime64[ns](2), int64(4), object(5)
memory usage: 8.4+ MB


In [9]:
# Preview the first ten rows, including the derived time columns
data.head(10)

Unnamed: 0,Date,Time,HighLevelSensorID,LowLevelSensorID,SensorValue,Label,DateTime,Hour,Minute,Second,Microsecond
0,2009-07-17,15:49:51.750001,Kitchen,Kitchen,ON,Other_Activity,2009-07-17 15:49:51.750001,15,49,51,750001
1,2009-07-17,15:49:52.406001,Kitchen,Kitchen,ON,Other_Activity,2009-07-17 15:49:52.406001,15,49,52,406001
2,2009-07-17,15:49:55.421001,Kitchen,Kitchen,OFF,Other_Activity,2009-07-17 15:49:55.421001,15,49,55,421001
3,2009-07-17,15:49:56.406001,Kitchen,Kitchen,OFF,Other_Activity,2009-07-17 15:49:56.406001,15,49,56,406001
4,2009-07-17,15:49:58.031001,GuestRoom,Hall,ON,Other_Activity,2009-07-17 15:49:58.031001,15,49,58,31001
5,2009-07-17,15:50:02.031001,GuestRoom,Hall,OFF,Other_Activity,2009-07-17 15:50:02.031001,15,50,2,31001
6,2009-07-17,15:50:06.328001,LivingRoom,LivingRoom,ON,Other_Activity,2009-07-17 15:50:06.328001,15,50,6,328001
7,2009-07-17,15:50:10.609001,LivingRoom,LivingRoom,OFF,Other_Activity,2009-07-17 15:50:10.609001,15,50,10,609001
8,2009-07-17,15:50:15.500001,LivingRoom,LivingRoom,ON,Other_Activity,2009-07-17 15:50:15.500001,15,50,15,500001
9,2009-07-17,15:50:19.140001,GuestRoom,Hall,ON,Other_Activity,2009-07-17 15:50:19.140001,15,50,19,140001


In [15]:
def create_sequences(df, sequence_length):
    """Split ``df`` into consecutive, non-overlapping windows of rows.

    Windows start at positions ``0, sequence_length, 2 * sequence_length, ...``
    and each contains exactly ``sequence_length`` rows. Trailing rows that do
    not fill a complete window are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; rows are taken by position, in their existing order.
    sequence_length : int
        Number of rows per window. Must be positive.

    Returns
    -------
    list of pandas.DataFrame
        Independent copies of each window (the original index labels are
        kept). Empty when ``df`` has fewer than ``sequence_length`` rows.

    Raises
    ------
    ValueError
        If ``sequence_length`` is zero or negative. A negative value would
        otherwise silently produce an empty list.
    """
    if sequence_length <= 0:
        raise ValueError(
            f"sequence_length must be a positive integer, got {sequence_length!r}"
        )

    last_start = len(df) - sequence_length
    return [
        # .copy() so that editing a window never touches the source frame
        df.iloc[start:start + sequence_length].copy()
        for start in range(0, last_start + 1, sequence_length)
    ]

# Choose an appropriate sequence length
sequence_length = 50  # Adjust this based on your needs

# Consecutive windows of the sensor log form the positive class
positive_sequences = create_sequences(data, sequence_length)

# Add target variable (1 for positive cases)
positive_targets = [1] * len(positive_sequences)

# Preview one window (only the last expression of a cell is displayed)
positive_sequences[2]

Unnamed: 0,Date,Time,HighLevelSensorID,LowLevelSensorID,SensorValue,Label,DateTime,Hour,Minute,Second,Microsecond
100,2009-07-17,16:12:20.437001,Kitchen,Kitchen,ON,Other_Activity,2009-07-17 16:12:20.437001,16,12,20,437001
101,2009-07-17,16:12:24.312001,Kitchen,Kitchen,OFF,Other_Activity,2009-07-17 16:12:24.312001,16,12,24,312001
102,2009-07-17,16:12:34.218001,GuestRoom,Hall,ON,Other_Activity,2009-07-17 16:12:34.218001,16,12,34,218001
103,2009-07-17,16:12:38.515001,GuestRoom,Hall,OFF,Other_Activity,2009-07-17 16:12:38.515001,16,12,38,515001
104,2009-07-17,16:12:43.828001,DiningRoom,DiningRoom,ON,Other_Activity,2009-07-17 16:12:43.828001,16,12,43,828001
105,2009-07-17,16:12:47.437001,DiningRoom,DiningRoom,OFF,Other_Activity,2009-07-17 16:12:47.437001,16,12,47,437001
106,2009-07-17,16:12:49.703001,DiningRoom,DiningRoom,ON,Other_Activity,2009-07-17 16:12:49.703001,16,12,49,703001
107,2009-07-17,16:12:54.015001,DiningRoom,DiningRoom,OFF,Other_Activity,2009-07-17 16:12:54.015001,16,12,54,15001
108,2009-07-17,16:13:20.765001,DiningRoom,DiningRoom,ON,Other_Activity,2009-07-17 16:13:20.765001,16,13,20,765001
109,2009-07-17,16:13:24.703001,DiningRoom,DiningRoom,OFF,Other_Activity,2009-07-17 16:13:24.703001,16,13,24,703001


In [16]:
def create_negative_sequences(df, sequence_length, num_sequences, random_state=None):
    """Build sequences of randomly sampled rows from ``df``.

    Each sequence is an independent draw of ``sequence_length`` rows sampled
    without replacement, so the rows appear in random order. The index of
    every returned frame is reset to ``0 .. sequence_length - 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample rows from.
    sequence_length : int
        Rows per sequence; ``DataFrame.sample`` raises ``ValueError`` if this
        exceeds ``len(df)``.
    num_sequences : int
        Number of sequences to generate.
    random_state : None, int, numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. ``None`` (the default) keeps the previous
        behaviour of drawing from NumPy's global random state. Any other
        value makes the output reproducible.

    Returns
    -------
    list of pandas.DataFrame
        ``num_sequences`` frames of ``sequence_length`` rows each.
    """
    if random_state is None or isinstance(
        random_state, (np.random.Generator, np.random.RandomState)
    ):
        rng = random_state
    else:
        # One shared generator for all draws: passing the same integer seed to
        # every sample() call would yield identical sequences.
        rng = np.random.default_rng(random_state)

    return [
        df.sample(n=sequence_length, random_state=rng).reset_index(drop=True)
        for _ in range(num_sequences)
    ]

# Seed NumPy's global random state, which DataFrame.sample draws from when no
# random_state is given, so the negative sequences are the same on every run.
np.random.seed(42)

# Create an equal number of negative sequences
negative_sequences = create_negative_sequences(data, sequence_length, len(positive_sequences))

# Add target variable (0 for negative cases)
negative_targets = [0] * len(negative_sequences)

In [17]:
# Positives first, then negatives, in the same order for sequences and
# targets so the two lists stay aligned element by element.
all_sequences = [*positive_sequences, *negative_sequences]
all_targets = [*positive_targets, *negative_targets]

# Convert both lists to numpy arrays
X, y = np.array(all_sequences), np.array(all_targets)

In [29]:
# Inspect the first sequence (the slice keeps the leading axis)
X[:1,:]

array([[[Timestamp('2009-07-17 00:00:00'),
         datetime.time(15, 49, 51, 750001), 'Kitchen', 'Kitchen', 'ON',
         'Other_Activity', Timestamp('2009-07-17 15:49:51.750001'), 15,
         49, 51, 750001],
        [Timestamp('2009-07-17 00:00:00'),
         datetime.time(15, 49, 52, 406001), 'Kitchen', 'Kitchen', 'ON',
         'Other_Activity', Timestamp('2009-07-17 15:49:52.406001'), 15,
         49, 52, 406001],
        [Timestamp('2009-07-17 00:00:00'),
         datetime.time(15, 49, 55, 421001), 'Kitchen', 'Kitchen', 'OFF',
         'Other_Activity', Timestamp('2009-07-17 15:49:55.421001'), 15,
         49, 55, 421001],
        [Timestamp('2009-07-17 00:00:00'),
         datetime.time(15, 49, 56, 406001), 'Kitchen', 'Kitchen', 'OFF',
         'Other_Activity', Timestamp('2009-07-17 15:49:56.406001'), 15,
         49, 56, 406001],
        [Timestamp('2009-07-17 00:00:00'),
         datetime.time(15, 49, 58, 31001), 'GuestRoom', 'Hall', 'ON',
         'Other_Activity', Timest

In [33]:
# Dimensions of the stacked array
X.shape

(4000, 50, 11)

In [42]:
# First event of the sequence at index 600
X[600][0]

array([Timestamp('2009-07-27 00:00:00'),
       datetime.time(14, 21, 20, 187001), 'Kitchen', 'Kitchen', 'ON',
       'Cook', Timestamp('2009-07-27 14:21:20.187001'), 14, 21, 20,
       187001], dtype=object)

In [39]:
# Look up the target stored for the first sequence
target_value = y[0]
print(f"The target value for X[0] is: {target_value}")

The target value for X[0] is: 1


In [46]:
# Look up the target of the sequence that contains a given data point.
sequence_index =  3000 # Position of that sequence in X; depends on how X was assembled
target_value = y[sequence_index]
print(f"The target value for the sequence containing this data point is: {target_value}")

The target value for the sequence containing this data point is: 0


In [60]:
# Map every sequence index to its (sequence, target) pair
combined_data_dict = dict(enumerate(zip(X, y)))

# Inspect the three entries around index 2000
for i in (1999, 2000, 2001):
    sequence, target = combined_data_dict[i]
    print(
        f"Sequence {i}:",
        f"  Shape: {sequence.shape}",
        f"  First event: {sequence[0]}",
        f"  Target: {target}",
        "",
        sep="\n",
    )

Sequence 1999:
  Shape: (50, 11)
  First event: [Timestamp('2009-08-20 00:00:00') datetime.time(12, 12, 49, 750001)
 'Kitchen' 'Kitchen' 'ON' 'Cook' Timestamp('2009-08-20 12:12:49.750001')
 12 12 49 750001]
  Target: 1

Sequence 2000:
  Shape: (50, 11)
  First event: [Timestamp('2009-07-23 00:00:00') datetime.time(6, 58, 53, 531001)
 'Kitchen' 'Kitchen' 'OFF' 'Cook' Timestamp('2009-07-23 06:58:53.531001')
 6 58 53 531001]
  Target: 0

Sequence 2001:
  Shape: (50, 11)
  First event: [Timestamp('2009-07-19 00:00:00') datetime.time(16, 47, 50, 546001)
 'Kitchen' 'Kitchen' 'ON' 'Cook' Timestamp('2009-07-19 16:47:50.546001')
 16 47 50 546001]
  Target: 0



In [61]:
# Count how many sequences carry each target value; position in `counts` is
# the target value itself.
counts = np.bincount(y)
for value, count in enumerate(counts):
    print(f"{value}: {count}")

0: 2000
1: 2000


In [62]:
# train_test_split is already imported in the notebook's import cell, so it is
# not imported again here.

# First, let's create lists of sequences and targets
sequences = [seq for seq, _ in combined_data_dict.values()]
targets = [target for _, target in combined_data_dict.values()]

# Convert to numpy arrays if they aren't already
X = np.array(sequences)
y = np.array(targets)

# Split settings, named once instead of being repeated as literals
HOLDOUT_FRACTION = 0.2  # share held out at each of the two splitting stages
SPLIT_SEED = 42         # fixed seed so the partition is reproducible

# First, split into train+val and test sets (80% train+val, 20% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED, stratify=y
)

# Then split train+val into train and validation sets (80% train, 20% validation),
# which gives 64% / 16% / 20% of all sequences overall
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED, stratify=y_train_val,
)

split_arrays = {
    "Training": (X_train, y_train),
    "Validation": (X_val, y_val),
    "Test": (X_test, y_test),
}

# Print the shapes of the resulting sets
for split_name, (X_split, y_split) in split_arrays.items():
    print(f"{split_name} set shape: {X_split.shape}, Target shape: {y_split.shape}")

# Print class distribution in each set
print("\nClass distribution:")
for split_name, (X_split, y_split) in split_arrays.items():
    print(f"{split_name} set: {np.bincount(y_split)}")

# Create new combined data dictionaries for each set
train_dict = dict(enumerate(zip(X_train, y_train)))
val_dict = dict(enumerate(zip(X_val, y_val)))
test_dict = dict(enumerate(zip(X_test, y_test)))

# Print a sample (the first entry) from each set
split_dicts = (("training", train_dict), ("validation", val_dict), ("test", test_dict))
for split_name, split_dict in split_dicts:
    seq, target = next(iter(split_dict.values()))
    print(f"\nSample from {split_name} set:")
    print(f"Sequence shape: {seq.shape}, Target: {target}")
    print("First event:", seq[0])

Training set shape: (2560, 50, 11), Target shape: (2560,)
Validation set shape: (640, 50, 11), Target shape: (640,)
Test set shape: (800, 50, 11), Target shape: (800,)

Class distribution:
Training set: [1280 1280]
Validation set: [320 320]
Test set: [400 400]

Sample from training set:
Sequence shape: (50, 11), Target: 1
First event: [Timestamp('2009-07-25 00:00:00') datetime.time(13, 1, 46, 375001)
 'Kitchen' 'Kitchen' 'ON' 'Cook' Timestamp('2009-07-25 13:01:46.375001')
 13 1 46 375001]

Sample from validation set:
Sequence shape: (50, 11), Target: 0
First event: [Timestamp('2009-08-03 00:00:00') datetime.time(20, 18, 12, 1)
 'DiningRoom' 'DiningRoom' 'ON' 'Other_Activity'
 Timestamp('2009-08-03 20:18:12.000001') 20 18 12 1]

Sample from test set:
Sequence shape: (50, 11), Target: 1
First event: [Timestamp('2009-08-16 00:00:00') datetime.time(4, 43, 44, 218001)
 'Bedroom' 'Bed' 'OFF' 'Sleep' Timestamp('2009-08-16 04:43:44.218001') 4
 43 44 218001]
