In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
fpath = 'data/lcp_single_trial.tsv'
data = pd.read_csv(fpath, sep='\t')
data.head()

Unnamed: 0,id,subcorpus,sentence,token,complexity
0,3QI9WAYOGQB8GQIR4MDIEF0D2RLS67,bible,They will not hurt nor destroy in all my holy ...,sea,0.0
1,3T8DUCXY0N6WD9X4RTLK8UN1U929TF,bible,"that sends ambassadors by the sea, even in ves...",sea,0.102941
2,3I7KR83SNADXAQ7HXK7S7305BYB9KD,bible,"and they entered into the boat, and were going...",sea,0.109375
3,3BO3NEOQM0HK9ERYPN0GQIWCPC4IAQ,bible,"Joseph laid up grain as the sand of the sea, v...",sea,0.160714
4,3Y3CZJSZ9KT0W7I0KE38WZHHKSW5RH,bible,There will be a highway for the remnant that i...,land,0.0


In [18]:
# Load the data
file_path = 'data/lcp_single_trial.tsv'
data = pd.read_csv(file_path, sep='\t')

# Preprocessing: Lowercase the sentences and tokens
data['sentence'] = data['sentence'].str.lower()
data['token'] = data['token'].str.lower()

# Feature Extraction
# 1. Token Length
data['token_length'] = data['token'].apply(len)

# 2. Token Frequency in the Corpus
token_counts = data['token'].value_counts()
data['token_frequency'] = data['token'].map(token_counts)

# 3. TF-IDF Vectorization for sentences
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['sentence'])


# Combine TF-IDF features and additional features
additional_features = data[['token_length', 'token_frequency']].values
features = np.hstack([tfidf_matrix.toarray(), additional_features])


# Verify the last few columns of combined features
print("Last few columns of combined features (should be additional features):")
print(features[:1])

Last few columns of combined features (should be additional features):
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.   