# IHLT Project

---

This notebook runs all the experiments: it extracts the features and performs cross-validation on the models. For the analysis and results, please see the notebook "Results and Analysis".

The current Jupyter notebook uses Python 3.10.12, the same version as in Colab.

## 1. Data Preparation

### 1.1 Import Libraries

In [None]:
# basic
import os
import pandas as pd
import numpy as np

# our scripts
from scripts.data_loader import load_data
from scripts.feature_extraction import FeatureExtractor
from scripts.experiments import run_experiment

### 1.2 Load Data

In [None]:
# Root folder that both dataset splits are read from.
data_dir = 'data'

# Load the two splits through the project loader (train first, then test).
train_data, test_data = (
    load_data(data_dir, dataset_type=split) for split in ('train', 'test')
)

# Quick sanity check on the size of each split.
print(f"Number of training samples: {len(train_data)}")
print(f"Number of test samples: {len(test_data)}")


## 2. Feature Extraction

To avoid recalculation, we extract all features at once and then filter before training!

In [None]:
# Single FeatureExtractor instance, reused below for both the train and the test split.
extractor = FeatureExtractor()

In [None]:
# Extract the features for the training split; the result is later saved with to_csv.
train_df  = extractor.extract_features_sequential(train_data)

In [None]:
# Extract the features for the test split with the same extractor instance.
test_df = extractor.extract_features_sequential(test_data)

In [None]:
# CSV files the extracted feature tables are written to and reloaded from in the next two cells.
# NOTE(review): the Section 3 cell reassigns these names to 'results/train_features.csv' /
# 'results/test_features.csv', so the files written here are not the ones the experiments read
# -- confirm which set is intended.
train_csv_path = 'results/train_features_HOPIUM.csv'
test_csv_path = 'results/test_features_HOPIUM.csv'

In [None]:
# Save the extracted feature tables so the extraction does not have to be recomputed.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder of each path exists first (a bare filename maps to the current dir).
for csv_path in (train_csv_path, test_csv_path):
    os.makedirs(os.path.dirname(csv_path) or '.', exist_ok=True)

train_df.to_csv(train_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

In [None]:
# Reload the feature tables from the CSV paths set above, replacing the in-memory
# train_df / test_df (lets this cell be run without redoing the extraction cells).
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

## 3. Feature Selection & Model Training

In [None]:
# --- Inputs -----------------------------------------------------------------
# Feature tables consumed by the experiments below.
# NOTE(review): these are not the '*_HOPIUM.csv' files written in Section 2 --
# confirm which set of feature files is the intended input.
train_csv_path = 'results/train_features.csv'
test_csv_path = 'results/test_features.csv'

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# --- Light preprocessing ----------------------------------------------------
clip_threshold = 10 # IMPORTANT (explained in the Results_and_Analysis notebook)

# Same treatment for both tables: missing values become 0, then every numeric
# column is capped from above at clip_threshold (non-numeric columns untouched).
for frame in (train_df, test_df):
    frame.fillna(0, inplace=True)
    numeric_part = frame.select_dtypes(include=[np.number])
    frame.update(numeric_part.clip(upper=clip_threshold))

# --- Feature families ---------------------------------------------------------
# A feature's family is encoded in the prefix of its column name.
family_prefixes = {
    'lexical': 'lex_',
    'syntactic': 'syn_',
    'semantic': 'sem_',
    'stylistic': 'sty_',
}
columns_by_family = {
    family: [col for col in train_df.columns if col.startswith(prefix)]
    for family, prefix in family_prefixes.items()
}

lexical_features_columns = columns_by_family['lexical']
syntactic_features_columns = columns_by_family['syntactic']
semantic_features_columns = columns_by_family['semantic']
stylistic_features_columns = columns_by_family['stylistic']

# One experiment per family, plus a combined set.
# NOTE(review): 'combined' does not include the stylistic columns -- confirm
# that this omission is intentional.
feature_sets = dict(columns_by_family)
feature_sets['combined'] = (
    lexical_features_columns
    + syntactic_features_columns
    + semantic_features_columns
)

# --- Experiments --------------------------------------------------------------
model_save_path = 'models'
os.makedirs(model_save_path, exist_ok=True)

results_rows = []

for feature_set_name, feature_columns in feature_sets.items():
    print("=" * 80)
    print(f"Running experiment for feature set: {feature_set_name}")

    # The test table is handed over as a copy on every iteration
    # (presumably so run_experiment cannot alter the shared frame -- verify).
    results = run_experiment(
        train_df, test_df.copy(), feature_columns, feature_set_name, model_save_path
    )

    # results['all_models_results'] maps model_name -> metrics dict;
    # flatten it into one row per (feature set, model) pair.
    for model_name, model_metrics in results['all_models_results'].items():
        results_rows.append({
            'Feature_Set': feature_set_name,
            'Model_Name': model_name,
            'Best_Params': str(model_metrics['best_params']),
            'CV_Pearson': model_metrics['best_cv_score'],
            'Test_Pearson': model_metrics['test_pearson'],
            'Test_RMSE': model_metrics['test_rmse'],
            'Test_MAE': model_metrics['test_mae'],
            'Test_R2': model_metrics['test_r2'],
            'Selected_Features': ', '.join(model_metrics['selected_features']),
            # One extra 'Pearson_<dataset>' column per entry of correlations_per_dataset.
            **{
                f'Pearson_{dataset}': corr
                for dataset, corr in model_metrics['correlations_per_dataset'].items()
            },
        })

# --- Persist the summary table ------------------------------------------------
results_df = pd.DataFrame(results_rows)

results_csv_path = 'results/model_results.csv'
os.makedirs('results', exist_ok=True)
results_df.to_csv(results_csv_path, index=False)
print(f"Results saved to {results_csv_path}")