In [None]:
from utils.preprocess_dataset import preprocess_dataframe
from tqdm import tqdm
import pandas as pd
import os

In [None]:
# Build the training frame by preprocessing every CSV in the training directory
# and concatenating the per-file results into one DataFrame.
train_dir = "cleaned_data/train_data"

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated frame (and everything displayed from it) reproducible.
train_csv_names = sorted(name for name in os.listdir(train_dir) if name.endswith(".csv"))

# The per-file frames live only inside the list passed to concat, so no
# temporary list has to be deleted afterwards.
train_data = pd.concat(
    [preprocess_dataframe(f"{train_dir}/{name}", "train") for name in tqdm(train_csv_names)],
    ignore_index=True,
)

In [None]:
# Preview the first rows of the concatenated training frame.
train_data.head()

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Accumulate, per EventType class, how often each "unique common word" occurs.
unique_words_eventtype_0 = Counter()
unique_words_eventtype_1 = Counter()

for event_label, class_counter in ((0, unique_words_eventtype_0), (1, unique_words_eventtype_1)):
    class_rows = train_data[train_data["EventType"] == event_label]
    for word_counts in class_rows["Unique Common Words"]:
        # Each cell is converted with dict(), so it is expected to be a mapping
        # or an iterable of (word, count) pairs.
        class_counter.update(dict(word_counts))

# One bar-chart panel per class, stacked vertically.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

panel_specs = (
    (axes[0], unique_words_eventtype_0, 'Unique Common Words for Non-Critical Minutes'),
    (axes[1], unique_words_eventtype_1, 'Unique Common Words for Critical Minutes'),
)
for panel, class_counter, panel_title in panel_specs:
    panel.bar(class_counter.keys(), class_counter.values())
    panel.set_title(panel_title)
    panel.set_xlabel('Words')
    panel.set_ylabel('Frequency')
    # Rotate the word labels so they do not overlap on the x axis.
    panel.tick_params(axis='x', rotation=90)

plt.tight_layout()
plt.show()

In [None]:
# Sum of all word counts accumulated from the EventType == 0 rows.
# Counter.total() requires Python 3.10 or newer.
unique_words_eventtype_0.total()

In [None]:
# Sum of all word counts accumulated from the EventType == 1 rows.
# Counter.total() requires Python 3.10 or newer.
unique_words_eventtype_1.total()

In [None]:
def _total_word_frequency(word_counts):
    """Return the sum of the counts in one "Unique Common Words" cell."""
    return sum(dict(word_counts).values())


# Per-row total frequency of the unique common words.
train_data["Frequency of Unique Common Words"] = train_data["Unique Common Words"].apply(_total_word_frequency)

In [None]:
# Preview the training frame again, now including the frequency column.
train_data.head()

In [None]:
# Split the frequency column by class once, then summarise each class.
frequency_column = "Frequency of Unique Common Words"
non_critical_frequencies = train_data[train_data["EventType"] == 0][frequency_column]
critical_frequencies = train_data[train_data["EventType"] == 1][frequency_column]

mean_event_0_freq = non_critical_frequencies.mean()
mean_event_1_freq = critical_frequencies.mean()
# Series.std() is the sample standard deviation (ddof=1) by default.
std_event_0_freq = non_critical_frequencies.std()
std_event_1_freq = critical_frequencies.std()

print(f"Mean Frequency of Unique Common Words for Non-Critical Minutes: {mean_event_0_freq} +/- {std_event_0_freq}")
print(f"Mean Frequency of Unique Common Words for Critical Minutes: {mean_event_1_freq} +/- {std_event_1_freq}")

In [None]:
# Decision threshold between the two class means: each mean is weighted by the
# *other* class's variance, so the cut sits closer to the tighter class.
variance_event_0 = std_event_0_freq**2
variance_event_1 = std_event_1_freq**2

weighted_mean_sum = mean_event_0_freq * variance_event_1 + mean_event_1_freq * variance_event_0
threshold = weighted_mean_sum / (variance_event_1 + variance_event_0)
threshold

In [None]:
import numpy as np

In [None]:
# Quadratic a*x^2 + b*x + c = 0 obtained from
#     (x - mean0)^2 / std0^2 = (x - mean1)^2 / std1^2,
# i.e. the points that are equally many standard deviations from both class means.
# NOTE(review): the log-normalisation term 2*ln(std0/std1) of the exact Gaussian
# density intersection is not part of `c` — confirm this omission is intended.
a = (1 / std_event_0_freq**2) - (1 / std_event_1_freq**2)
b = -2 * (mean_event_0_freq / std_event_0_freq**2 - mean_event_1_freq / std_event_1_freq**2)
c = (mean_event_0_freq**2 / std_event_0_freq**2) - (mean_event_1_freq**2 / std_event_1_freq**2)

coefficients = [a, b, c]
candidate_roots = np.roots(coefficients)

# The quadratic generally has two roots and np.roots makes no promise about
# their order, so taking index 0 can return the root that lies outside both
# class means. The discriminant is (mean0 - mean1)^2 / (std0^2 * std1^2) >= 0,
# so any imaginary part is numerical noise and only the real parts are used.
real_roots = np.real(candidate_roots)

# The root between the two means is always the one closest to their midpoint
# (the other root lies outside the interval spanned by the means).
means_midpoint = (mean_event_0_freq + mean_event_1_freq) / 2
gaussian_threshold = float(real_roots[np.argmin(np.abs(real_roots - means_midpoint))])
gaussian_threshold

In [None]:
# Build the evaluation frame by preprocessing every CSV in the evaluation
# directory and concatenating the per-file results into one DataFrame.
eval_dir = "cleaned_data/eval_data"

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated frame (and of any submission written from it) reproducible.
eval_csv_names = sorted(name for name in os.listdir(eval_dir) if name.endswith(".csv"))

# Only `eval_data` is kept; the per-file frames are not bound to any name.
eval_data = pd.concat(
    [preprocess_dataframe(f"{eval_dir}/{name}", "eval") for name in tqdm(eval_csv_names)],
    ignore_index=True,
)

In [None]:
eval_dataframes["Frequency of Unique Common Words"] = eval_dataframes["Unique Common Words"].apply(lambda x: sum(dict(x).values()))

In [None]:
eval_dataframes.head()

In [None]:
eval_dataframes["Simple_Model_Prediction"] = eval_dataframes["Frequency of Unique Common Words"].apply(lambda x: 1.0 if x > threshold else 0.0)
eval_dataframes["Gaussian_Model_Prediction"] = eval_dataframes["Frequency of Unique Common Words"].apply(lambda x: 1.0 if x > gaussian_threshold else 0.0)

In [None]:
# submission = eval_data[["ID", "Simple_Model_Prediction"]]
# submission.rename(columns={"Simple_Model_Prediction": "EventType"}, inplace=True)
# submission.to_csv("model_output/submissions/sub_3/submission_simple.csv", index=False)

In [None]:
# submission = eval_data[["ID", "Gaussian_Model_Prediction"]]
# submission.rename(columns={"Gaussian_Model_Prediction": "EventType"}, inplace=True)
# submission.to_csv("model_output/submissions/sub_3/submission_gaussian.csv", index=False)

# Representing the periods with TF-IDF metric and simple word embeddings

In [None]:
from utils.preprocess_dataset import pre_processing_feature_extraction
from tqdm import tqdm
import pandas as pd
import os

In [None]:
# Build the training frame for the TF-IDF / embedding approach by running the
# feature-extraction preprocessing on every CSV in the directory.
train_dir = "cleaned_data/final_approach/train_data"

# os.listdir returns entries in arbitrary order. Row order feeds directly into
# the seeded train/test split later on, so sort the file names to make the
# split (and the reported accuracy) reproducible across machines and runs.
train_csv_names = sorted(name for name in os.listdir(train_dir) if name.endswith(".csv"))

# Only `train_data` is kept; the per-file frames are not bound to any name.
train_data = pd.concat(
    [pre_processing_feature_extraction(f"{train_dir}/{name}", "train") for name in tqdm(train_csv_names)],
    ignore_index=True,
)

In [None]:
# Preview the first 20 rows of the feature-extraction training frame.
train_data.head(20)

In [None]:
import gensim.downloader as api

# Load the "glove-twitter-200" pretrained vectors through gensim's downloader API.
# NOTE(review): presumably 200-dimensional GloVe vectors trained on tweets, and
# downloaded/cached on first use — confirm against the gensim-data catalogue.
embedding_model = api.load("glove-twitter-200")

In [None]:
import numpy as np
def get_weighted_embedding(top_words, top_word_scores, model):
    """Return the score-weighted sum of the embeddings of `top_words`.

    Words and scores are paired positionally; pairing stops at the shorter of
    the two sequences. Words that are not contained in `model` are skipped.

    Args:
        top_words: iterable of words to look up in `model`.
        top_word_scores: iterable of numeric weights, one per word.
        model: object supporting `word in model`, `model[word]` (returning a
            vector) and a `vector_size` attribute.

    Returns:
        The element-wise sum of `model[word] * score` over all known words, or
        a zero vector of length `model.vector_size` when no word is known.
    """
    scaled_vectors = []
    for token, weight in zip(top_words, top_word_scores):
        if token not in model:
            # Out-of-vocabulary words contribute nothing.
            continue
        scaled_vectors.append(model[token] * weight)

    if not scaled_vectors:
        return np.zeros(model.vector_size)
    return np.sum(scaled_vectors, axis=0)

In [None]:
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()


def _embed_train_row(row):
    """Weighted embedding of one row's TopWords using its TopWordScores."""
    return get_weighted_embedding(row["TopWords"], row["TopWordScores"], embedding_model)


# One embedding vector per row, computed with a progress bar.
train_data["Weighted_Embedding"] = train_data.progress_apply(_embed_train_row, axis=1)

In [None]:
# Preview the training frame, now including the Weighted_Embedding column.
train_data.head()

In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
# Feature matrix: stack the per-row embedding vectors into one 2-D array.
embedding_vectors = train_data["Weighted_Embedding"].values
X = np.stack(embedding_vectors)

# Target vector: the EventType label of each row.
event_labels = train_data["EventType"]
y = event_labels.values

In [None]:
# Low-dimensional PCA projections of the embeddings, used for plotting.
# random_state is pinned because PCA's automatically chosen solver can be the
# randomized one on large inputs, which would make the projections (and the
# plots) differ between runs; 42 matches the seed used elsewhere in the notebook.
repr_2d_pca = PCA(n_components=2, random_state=42).fit_transform(X)
repr_3d_pca = PCA(n_components=3, random_state=42).fit_transform(X)

In [None]:
# 2-D PCA scatter, coloured by EventType label.
first_component, second_component = repr_2d_pca.T
plt.scatter(first_component, second_component, c=y, cmap="viridis")

In [None]:
# 3-D PCA scatter, coloured by EventType label.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
pc_x, pc_y, pc_z = repr_3d_pca.T
ax.scatter(pc_x, pc_y, pc_z, c=y, cmap="viridis")

In [None]:
# Hold out 30% of the rows as a test set; random_state fixes the shuffle so the
# split is repeatable for a given row order of X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Logistic-regression model fitted on the training split.
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Gradient-boosted trees fitted on the same split.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent xgboost
# releases — confirm against the installed version before removing it.
xgb_clf = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)

# Held-out accuracy: XGBoost first, then logistic regression.
xgb_test_accuracy = accuracy_score(y_test, y_pred_xgb)
logreg_test_accuracy = accuracy_score(y_test, y_pred)
print("XGBoost Test set: ", xgb_test_accuracy)
print("Test set: ", logreg_test_accuracy)

In [None]:
# Refit both models on the complete training data (no hold-out) before
# predicting on the evaluation set.
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X, y)

xgb_clf = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X, y)

In [None]:
# Build the evaluation frame for the embedding approach by running the
# feature-extraction preprocessing on every CSV in the directory.
# NOTE(review): the training data for this approach is read from
# "cleaned_data/final_approach/train_data", while evaluation data is read from
# "cleaned_data/eval_data" — confirm this is the intended directory.
eval_dir = "cleaned_data/eval_data"

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated frame (and of the submission file) reproducible.
eval_csv_names = sorted(name for name in os.listdir(eval_dir) if name.endswith(".csv"))

# Only `eval_data` is kept; the per-file frames are not bound to any name.
eval_data = pd.concat(
    [pre_processing_feature_extraction(f"{eval_dir}/{name}", "eval") for name in tqdm(eval_csv_names)],
    ignore_index=True,
)

In [None]:
def _embed_eval_row(row):
    """Weighted embedding of one row's TopWords using its TopWordScores."""
    return get_weighted_embedding(row["TopWords"], row["TopWordScores"], embedding_model)


# `progress_apply` exists only because tqdm.pandas() was called earlier in the notebook.
eval_data["Weighted_Embedding"] = eval_data.progress_apply(_embed_eval_row, axis=1)

In [None]:
# Stack the per-row embeddings into a 2-D feature matrix and predict with the
# XGBoost model.
eval_features = np.stack(eval_data["Weighted_Embedding"].values)
predictions = xgb_clf.predict(eval_features)

In [None]:
# Start the submission frame from the ID column. Take an explicit copy so that
# adding columns to `submission` afterwards operates on an independent frame
# and does not trigger pandas' SettingWithCopyWarning.
submission = eval_data[["ID"]].copy()

In [None]:
# Attach the model predictions as the EventType column (row-aligned with the IDs).
submission["EventType"] = predictions

In [None]:
# Cast the labels to float.
# NOTE(review): presumably the submission format expects 0.0 / 1.0 — confirm.
submission["EventType"] = submission["EventType"].astype(float)

In [None]:
# Write the submission file. The target directory is created first because
# to_csv does not create missing parent directories and would otherwise fail
# on a fresh checkout.
submission_path = "model_output/submissions/sub_4/submission_xgb.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)