In [1]:
import pandas as pd
import time

from sklearn.preprocessing import StandardScaler

In [2]:
from utils import train_val_split
from utils import train_datapath, test_datapath

targets_for_test_df = pd.read_csv('data/targets_for_test.csv', index_col=0)

# Majority Guessing

In [3]:
train_df = pd.read_csv(train_datapath)

In [4]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Assuming 'target' is the column to predict and the rest are features
X = train_df.drop(columns=['target'])
y = train_df['target']

# Split the data into training and validation sets
X_train = X.iloc[:len(X)*4//5]
y_train = y.iloc[:len(y)*4//5]
X_val= X.iloc[len(X)*4//5:]
y_val = y.iloc[len(y)*4//5:]

# Check if there are more Ones or Zeroes on train_df[target]
majority_class = train_df['target'].value_counts().idxmax()

y_pred = [majority_class] * len(y_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro}')



Validation Accuracy: 0.531152824108102
Validation F1 Macro Score: 0.3468973284345402


In [5]:
del train_df
test_df = pd.read_csv(test_datapath)

In [6]:
# Evaluate the X_test predictions using the targets_for_test_df

# Ensure the test data is preprocessed in the same way as the training data

X_test = test_df.drop(columns=['row_id'])

# Make predictions on the test data
test_prediction = [majority_class] * len(X_test)
# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, test_prediction[:len(targets_for_test_df)])	
print(f'Test Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, test_prediction[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score: {f1_macro:.5f}')

Test Accuracy: 0.58005
Test F1 Macro Score: 0.36711


In [7]:
del test_df

# Random Prediction

In [8]:
test_df = pd.read_csv(test_datapath)

In [9]:
import numpy as np

# Generate random predictions for the test data
np.random.seed(42)  # For reproducibility
random_predictions = np.random.choice([0, 1], size=len(test_df))

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, random_predictions[:len(targets_for_test_df)])
print(f'Test Accuracy (Random): {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, random_predictions[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score (Random): {f1_macro:.5f}')

Test Accuracy (Random): 0.50025
Test F1 Macro Score (Random): 0.49700


In [10]:

# Create a new DataFrame for the submission
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'target': random_predictions
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")


Predictions saved to submission.csv


# Current Prediction

In [11]:

# Generate random predictions for the test data
shift_prediction = targets_for_test_df.shift(1).fillna(0)
# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, shift_prediction[:len(targets_for_test_df)])
print(f'Test Accuracy (shifted): {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, shift_prediction[:len(targets_for_test_df)], average='macro')
print(f'Test F1 Macro Score (shifted): {f1_macro:.5f}')

Test Accuracy (shifted): 0.50648
Test F1 Macro Score (shifted): 0.49350


# Random with seasonal trend as probability

## Minute of day

In [12]:
train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)

In [13]:

# Create copies of train_df and test_df
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()

# Convert timestamp to datetime
train_df_copy['datetime'] = pd.to_datetime(train_df_copy['timestamp'], unit='s')
test_df_copy['datetime'] = pd.to_datetime(test_df_copy['timestamp'], unit='s')

test_df_copy['time'] = test_df_copy['datetime'].dt.time
train_df_copy['time'] = train_df_copy['datetime'].dt.time

# Group by minute of the day and calculate mean for each group
train_grouped_by_minute = train_df_copy.groupby('time').mean()
test_grouped_by_minute = test_df_copy.groupby('time').mean()
labels = train_grouped_by_minute.index

# Group by minute_of_day and calculate the mean target value for each group
mean_target_by_minute = train_grouped_by_minute['target']
mean_target_by_minute.index = labels.astype(str)
mean_target_by_minute.to_csv('data/mean_target_by_minute.csv', header=True)


In [14]:

minute_seasonality = []
predictions = []
for idx, hour in enumerate(test_df_copy['time'].astype(str)):
    prob1 = mean_target_by_minute.loc[hour]
    prob0 = 1 - prob1
    minute_seasonality.append(prob1)
    predictions.append(np.random.choice([0, 1], p=[prob0, prob1]))    


In [15]:
# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, predictions[1:])
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, predictions[1:], average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')

Validation Accuracy: 0.50365
Validation F1 Macro Score: 0.49820


## Hour of week seasonality

In [16]:
# get hour from the index of test_target_3h_df
pd.to_datetime(targets_for_test_df.index)



DatetimeIndex(['2022-05-17 19:59:00', '2022-05-17 20:00:00',
               '2022-05-17 20:01:00', '2022-05-17 20:02:00',
               '2022-05-17 20:03:00', '2022-05-17 20:04:00',
               '2022-05-17 20:05:00', '2022-05-17 20:06:00',
               '2022-05-17 20:07:00', '2022-05-17 20:08:00',
               ...
               '2024-02-08 12:05:00', '2024-02-08 12:06:00',
               '2024-02-08 12:07:00', '2024-02-08 12:08:00',
               '2024-02-08 12:09:00', '2024-02-08 12:10:00',
               '2024-02-08 12:11:00', '2024-02-08 12:12:00',
               '2024-02-08 12:13:00', '2024-02-08 12:14:00'],
              dtype='datetime64[ns]', name='timestamp', length=909616, freq=None)

In [17]:
# Resample the data to 3-hour intervals and sum the target
target_3h_df = train_df_copy.copy()
target_3h_df.set_index('datetime', inplace=True)
target_3h_df['3h_group'] = target_3h_df.index.to_series().dt.floor('3h').dt.hour


# Add columns for day of the week and hour of the day
target_3h_df['day_of_week'] = target_3h_df.index.dayofweek
target_3h_df['hour'] = target_3h_df.index.hour
target_3h_df['week_day_hour'] = target_3h_df['day_of_week'].astype(str).str.zfill(2) + '-' + target_3h_df['3h_group'].astype(str).str.zfill(2) + ':00'  
grouped_target_3h = target_3h_df.groupby('week_day_hour')['target'].mean()
target_3h_df.to_csv('data/target_3h_df.csv')

# Resample the data to 3-hour intervals and sum the target
test_target_3h_df = pd.DataFrame()
test_target_3h_df['target'] = targets_for_test_df.copy()
test_target_3h_df['datetime'] = pd.to_datetime(test_target_3h_df.index)
test_target_3h_df.set_index('datetime', inplace=True)

test_target_3h_df['hour'] = test_target_3h_df.index.hour
test_target_3h_df['3h_group'] = test_target_3h_df.index.to_series().dt.floor('3h').dt.hour

# Add columns for day of the week and hour of the day
test_target_3h_df['day_of_week'] = test_target_3h_df.index.dayofweek
test_target_3h_df['week_day_hour'] = test_target_3h_df['day_of_week'].astype(str).str.zfill(2) + '-' + test_target_3h_df['3h_group'].astype(str).str.zfill(2) + ':00'  

test_target_3h_df.to_csv('data/test_target_3h_df.csv')

In [18]:

weekly_seasonality = []
seasonal_predictions = []
for idx, hour in enumerate(test_target_3h_df['week_day_hour']):
    prob1 = grouped_target_3h.loc[hour]
    prob0 = 1 - prob1
    
    weekly_seasonality.append(prob1)
    seasonal_predictions.append(int(np.random.choice([0, 1], p=[prob0, prob1])))


# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, seasonal_predictions)
print(f'Test Accuracy (shifted): {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, seasonal_predictions, average='macro')
print(f'Test F1 Macro Score (shifted): {f1_macro:.5f}')

Test Accuracy (shifted): 0.50346
Test F1 Macro Score (shifted): 0.49796


## Ensamble them

In [33]:
# just so we see what are the variables 
minute_seasonality = minute_seasonality
weekly_seasonality = weekly_seasonality
row_id_timestamp = test_df[['row_id', 'timestamp']].set_index('timestamp')
row_id_timestamp.index = pd.to_datetime(row_id_timestamp.index, unit='s')

seasonal_predictions2 = []

for i in range(len(targets_for_test_df)):
    mean_probability1 = np.array(minute_seasonality[i], weekly_seasonality[i]).mean()
    mean_probability0 = 1 - mean_probability1
    seasonal_predictions2.append((i,np.random.choice([0, 1], p=[mean_probability0, mean_probability1])))
    

# Calculate accuracy
accuracy = accuracy_score(targets_for_test_df, seasonal_predictions2)
print(f'Test Accuracy (shifted): {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(targets_for_test_df, seasonal_predictions2, average='macro')
print(f'Test F1 Macro Score (shifted): {f1_macro:.5f}')

ValueError: Classification metrics can't handle a mix of binary and multiclass-multioutput targets

In [None]:
pd.DataFrame(seasonal_predictions2).to_csv('submission/mean_seasonal_predictions.csv')

In [None]:
row_id_timestamp = test_df[['row_id', 'timestamp']].set_index('timestamp')
row_id_timestamp.index = pd.to_datetime(row_id_timestamp.index, unit='s')
row_id_timestamp.loc[pandastest_df.iloc[0]['timestamp']]

KeyError: 1652817480.0

In [30]:
seasonal_predictions2

[1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
