In [1]:
import pandas as pd
import os
import torch
import numpy as np
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import LongformerForSequenceClassification, LongformerTokenizerFast
from transformers import LongformerTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.utils.data import DataLoader
from datasets import Dataset
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
import time
from sklearn.model_selection import train_test_split
import random
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the train / validation / test splits (in that order) from the Kaggle input dataset.
df_train, df_valid, df_test = (
    pd.read_json(split_path)
    for split_path in (
        '/kaggle/input/action-final-seed0/240307_action_text_train_f3.json',
        '/kaggle/input/action-final-seed0/240307_action_text_valid_f3.json',
        '/kaggle/input/action-final-seed0/240307_action_text_test_f3.json',
    )
)

In [3]:
# Regexes used to pull run lengths and quoted action names out of a text
pattern = r'(\d+) times consecutively'
action_pattern = r"the action '([^']+)'"


def extract_consecutive_actions(text):
    """Return every run length found in ``text`` as a list of ints.

    A run length is the group captured by ``pattern``: the digits directly
    preceding the phrase " times consecutively". Values are returned in the
    order they occur; the list is empty when nothing matches.
    """
    return [int(run_length) for run_length in re.findall(pattern, text)]


def extract_unique_actions(text):
    """Return the number of distinct action names found in ``text``.

    An action name is the group captured by ``action_pattern``: the characters
    between the single quotes that follow the words "the action".
    """
    return len({action_name for action_name in re.findall(action_pattern, text)})

# Summarise a list of run lengths as its maximum, minimum and (rounded) median
def calculate_stats(counts):
    """Return max / min / median of ``counts`` as a dict.

    The result always has the keys 'max_consecutive', 'min_consecutive' and
    'med_consecutive'. An empty ``counts`` yields 0 for all three. The median
    is computed with ``np.median`` and then rounded and cast to ``int``.
    """
    if not counts:
        # Nothing to summarise: every statistic falls back to zero
        return dict.fromkeys(
            ('max_consecutive', 'min_consecutive', 'med_consecutive'), 0
        )

    median_value = np.median(counts)
    return {
        'max_consecutive': max(counts),
        'min_consecutive': min(counts),
        'med_consecutive': int(round(median_value)),
    }

# Derive the run-length list, the unique-action count and the run-length
# summary dict for every row of every split
for frame in (df_train, df_valid, df_test):
    frame['consecutive_counts'] = frame['text'].apply(extract_consecutive_actions)
    frame['unique_action_count'] = frame['text'].apply(extract_unique_actions)
    frame['consecutive_stats'] = frame['consecutive_counts'].apply(calculate_stats)

# Unpack each entry of the summary dict into a column of its own
for frame in (df_train, df_valid, df_test):
    for stat_key in ('max_consecutive', 'min_consecutive', 'med_consecutive'):
        # Bind stat_key as a default so the lambda does not depend on the loop variable
        frame[stat_key] = frame['consecutive_stats'].apply(
            lambda stats, key=stat_key: stats[key]
        )

# Build the summary sentences that are placed in front of each sample's text
def create_prefix_text(row):
    """Render a row's engineered features as an English prefix string.

    ``row`` is indexed with the keys 'unique_action_count', 'max_consecutive',
    'min_consecutive' and 'med_consecutive'. The returned string ends with a
    single space so the original text can be appended to it directly.
    """
    unique_count = row['unique_action_count']
    longest_run, shortest_run, median_run = (
        row['max_consecutive'], row['min_consecutive'], row['med_consecutive']
    )
    return (
        f"The count of unique action type is {unique_count}. "
        f"Maximum, minimum, and median number of consecutive actions are "
        f"{longest_run}, {shortest_run}, and {median_run} respectively. "
    )

# Prepend the feature summary sentences to every sample's text, split by split
for frame in (df_train, df_valid, df_test):
    frame['text_with_prefix'] = frame.apply(
        lambda row: create_prefix_text(row) + row['text'], axis=1
    )

# All three splits stacked row-wise
df = pd.concat([df_train, df_valid, df_test])

# Inputs for the tabular classifiers: the four engineered features, with the
# `restrict` column as the target
feature_columns = ['max_consecutive', 'min_consecutive', 'med_consecutive', 'unique_action_count']

X_train = df_train.loc[:, feature_columns]
y_train = df_train['restrict']

X_test = df_test.loc[:, feature_columns]
y_test = df_test['restrict']

In [4]:
# Seed Python's built-in `random` module. Only `random` is seeded here; the
# estimators in the later cells are given `random_state=0` directly.
seed = 0
random.seed(seed)

def print_score(test, pred):
    """Print the confusion matrix and four summary metrics for a prediction.

    ``test`` holds the reference labels and ``pred`` the predicted labels.
    Printed, in order: confusion matrix, accuracy, precision, recall and F1.
    Each metric is computed immediately before it is printed. Nothing is
    returned.
    """
    matrix = confusion_matrix(test, pred)
    print(matrix)

    accuracy = accuracy_score(test, pred)
    print(f'Accuracy : {accuracy}')

    precision = precision_score(test, pred)
    print(f'Precision : {precision}')

    recall = recall_score(test, pred)
    print(f'Recall : {recall}')

    f1 = f1_score(test, pred)
    print(f'F1 : {f1}')
    
# Fixed hyper-parameters for the XGBoost classifier
# (presumably the outcome of an earlier search that is not part of this notebook — TODO confirm)
best_xg = dict(
    eta=0.29828541838560074,
    gamma=7.01925453051008,
    max_depth=11,
    min_child_weight=4,
    n_estimators=370,
)

# Fixed hyper-parameters for the random forest classifier
best_rf = dict(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=6,
    min_samples_split=13,
    n_estimators=200,
)

In [5]:
# Start the timer. perf_counter() is a monotonic clock, so the measured
# interval cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Fit XGBoost on the engineered features and predict the test split
model = xgb.XGBClassifier(**best_xg, random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Stop the timer
end_time = time.perf_counter()
# Elapsed time covers both fitting and predicting on the test split
total_computing_time = end_time - start_time
print(f"Total computing Time: {total_computing_time:.2f} seconds")

print_score(y_test, pred)

Total computing Time: 0.18 seconds
[[ 813  915]
 [ 277 1451]]
Accuracy : 0.6550925925925926
Precision : 0.6132713440405748
Recall : 0.8396990740740741
F1 : 0.7088422081094284


In [6]:
# Start the timer. perf_counter() is a monotonic clock, so the measured
# interval cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Fit the random forest on the engineered features and predict the test split
model = RandomForestClassifier(**best_rf, random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Stop the timer
end_time = time.perf_counter()
# Elapsed time covers both fitting and predicting on the test split
total_computing_time = end_time - start_time
print(f"Total computing Time: {total_computing_time:.2f} seconds")

print_score(y_test, pred)

Total computing Time: 0.79 seconds
[[ 753  975]
 [ 233 1495]]
Accuracy : 0.6504629629629629
Precision : 0.6052631578947368
Recall : 0.8651620370370371
F1 : 0.7122439256788947
