In [17]:
import pandas as pd
import numpy as np
import textstat
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
import gensim.downloader as api
import gensim
import nltk
from nltk.tokenize import word_tokenize

In [2]:
file_path = '../data/train.csv'
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [3]:
df.shape

(2834, 6)

# Phase 1: Statistical Features

In [None]:
def get_statistical_features(text):
    # a. basic count
    word_count = len(text.split())
    sentence_count = textstat.sentence_count(text)

    # b. length statistics
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
    avg_word_length = sum(len(word) for word in text.split()
                          ) / word_count if word_count > 0 else 0

    # c. textstat
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    gunning_fog = textstat.gunning_fog(text)

    # d. lexical variation
    lexical_diversity = textstat.lexicon_count(
        text, removepunct=True) / word_count if word_count > 0 else 0

    return [
        word_count, sentence_count, avg_sentence_length, avg_word_length,
        flesch_reading_ease, flesch_kincaid_grade, gunning_fog, lexical_diversity
    ]

In [None]:
feature_names = [
    'word_count', 'sentence_count', 'avg_sentence_length', 'avg_word_length',
    'flesch_reading_ease', 'flesch_kincaid_grade', 'gunning_fog', 'lexical_diversity'
]

features_df = pd.DataFrame(df['excerpt'].apply(
    get_statistical_features).tolist(), columns=feature_names)

features_df.head()

Unnamed: 0,word_count,sentence_count,avg_sentence_length,avg_word_length,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,lexical_diversity
0,179,11,16.272727,4.547486,79.251143,6.247984,8.743728,1.0
1,169,14,12.071429,4.550296,78.945814,5.246851,6.958749,1.0
2,166,12,13.833333,4.475904,78.125492,5.798976,7.702008,1.0
3,164,5,32.8,4.54878,70.372268,11.592244,13.607805,1.0
4,147,5,29.4,3.92517,79.157265,9.522259,12.848435,1.0


## baseline model

In [None]:
X = features_df
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

(2267, 8)
(567, 8)


In [None]:
baseline_model = RandomForestRegressor(
    n_estimators=100, random_state=42, n_jobs=-1)
baseline_model.fit(X_train, y_train)

predictions = baseline_model.predict(X_test)

baseline_rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f"\n--- baseline model evaluation ---")
print(f"RMSE: {baseline_rmse:.4f}")


--- baseline model evaluation ---
RMSE: 0.8884


# Phase 2: Add TF-IDF Vectors

In [11]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

y_train = train_df['target']
y_test = test_df['target']

print(train_df.shape)
print(test_df.shape)

(2267, 6)
(567, 6)


## statistical feature from phase 1

In [None]:
X_train_stats = pd.DataFrame(train_df['excerpt'].apply(
    get_statistical_features).tolist(), columns=feature_names)
X_test_stats = pd.DataFrame(test_df['excerpt'].apply(
    get_statistical_features).tolist(), columns=feature_names)

## tfidf_vectorizer

In [None]:
# initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000,
                                   stop_words='english',
                                   ngram_range=(1, 2))

# Fit TF-IDF on the training data and then Transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['excerpt'])

# Only Transform on the test data
X_test_tfidf = tfidf_vectorizer.transform(test_df['excerpt'])

## merge

In [15]:
# convert the statistical features (dense) into a sparse matrix
X_train_stats_sparse = csr_matrix(X_train_stats)
X_test_stats_sparse = csr_matrix(X_test_stats)

# use hstack (horizontal stacking) to concatenate the features together
X_train_combined = hstack([X_train_stats_sparse, X_train_tfidf])
X_test_combined = hstack([X_test_stats_sparse, X_test_tfidf])

X_train_combined.shape

(2267, 1008)

## second model

In [None]:
model_phase2 = RandomForestRegressor(
    n_estimators=100, random_state=42, n_jobs=-1)
model_phase2.fit(X_train_combined, y_train)

predictions_phase2 = model_phase2.predict(X_test_combined)
rmse_phase2 = np.sqrt(mean_squared_error(y_test, predictions_phase2))

print("\n--- model evaluation ---")
print(f"Phase 1 (Only Statistical Features) RMSE: {baseline_rmse:.4f}")
print(f"Phase 2 (Statistics + TF-IDF) RMSE: {rmse_phase2:.4f}")


--- model evaluation ---
Phase 1 (Only Statistical Features) RMSE: 0.8884
Phase 2 (Statistics + TF-IDF) RMSE: 0.7891


# Phase 3: Add Bigram Word2Vec Embedding

In [18]:
# download a pre-trained model
wv_model = api.load('glove-wiki-gigaword-100')
print("Gensim 'glove-wiki-gigaword-100' model loaded successfully.")

Gensim 'glove-wiki-gigaword-100' model loaded successfully.


In [21]:
# import NLTK for word segmentation
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/vscode/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## train the Bigram detector

In [None]:
# tokenizing training text for bigram detection
tokenized_train_text = [word_tokenize(text.lower())
                        for text in train_df['excerpt']]

In [None]:
# train the Bigram (Phrases) model
bigram_model = gensim.models.Phrases(
    tokenized_train_text, min_count=5, threshold=10)

In [24]:
# train the Phraser model
bigram_phraser = gensim.models.phrases.Phraser(bigram_model)

In [27]:
test_sentence = "When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape. The floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches. At each end of the room, on the wall, hung a beautiful bear-skin rug. These rugs were for prizes, one for the girls and one for the boys. And this was the game. The girls were gathered at one end of the room and the boys at the other, and one end was called the North Pole, and the other the South Pole. Each player was given a small flag which they were to plant on reaching the Pole. This would have been an easy matter, but each traveller was obliged to wear snowshoes.".split()
print(f"Original: {test_sentence}")
print(f"With Bigrams: {bigram_phraser[test_sentence]}")

Original: ['When', 'the', 'young', 'people', 'returned', 'to', 'the', 'ballroom,', 'it', 'presented', 'a', 'decidedly', 'changed', 'appearance.', 'Instead', 'of', 'an', 'interior', 'scene,', 'it', 'was', 'a', 'winter', 'landscape.', 'The', 'floor', 'was', 'covered', 'with', 'snow-white', 'canvas,', 'not', 'laid', 'on', 'smoothly,', 'but', 'rumpled', 'over', 'bumps', 'and', 'hillocks,', 'like', 'a', 'real', 'snow', 'field.', 'The', 'numerous', 'palms', 'and', 'evergreens', 'that', 'had', 'decorated', 'the', 'room,', 'were', 'powdered', 'with', 'flour', 'and', 'strewn', 'with', 'tufts', 'of', 'cotton,', 'like', 'snow.', 'Also', 'diamond', 'dust', 'had', 'been', 'lightly', 'sprinkled', 'on', 'them,', 'and', 'glittering', 'crystal', 'icicles', 'hung', 'from', 'the', 'branches.', 'At', 'each', 'end', 'of', 'the', 'room,', 'on', 'the', 'wall,', 'hung', 'a', 'beautiful', 'bear-skin', 'rug.', 'These', 'rugs', 'were', 'for', 'prizes,', 'one', 'for', 'the', 'girls', 'and', 'one', 'for', 'the', '

## document_vector

In [None]:
def get_document_vector(text, wv_model, phraser, vector_dim=100):
    # tokenize and apply bigram phrase generator
    tokens = phraser[word_tokenize(text.lower())]

    # obtain the vector for each word
    vectors = []
    for word in tokens:
        if word in wv_model:
            vectors.append(wv_model[word])

    if not vectors:
        return np.zeros(vector_dim)

    # aggregation: calculate the average of all word vectors
    return np.mean(vectors, axis=0)

In [None]:
# creating document vectors for train and test sets
X_train_w2v = np.array([get_document_vector(
    text, wv_model, bigram_phraser) for text in train_df['excerpt']])
X_test_w2v = np.array([get_document_vector(
    text, wv_model, bigram_phraser) for text in test_df['excerpt']])

print(X_train_w2v.shape)
print(X_test_w2v.shape)

(2267, 100)
(567, 100)


## merge

In [31]:
# convert the new W2V features (dense) into a sparse matrix
X_train_w2v_sparse = csr_matrix(X_train_w2v)
X_test_w2v_sparse = csr_matrix(X_test_w2v)

X_train_final = hstack([X_train_combined, X_train_w2v_sparse])
X_test_final = hstack([X_test_combined, X_test_w2v_sparse])

print(X_train_final.shape)
print(X_test_final.shape)

(2267, 1108)
(567, 1108)


## third model

In [None]:
model_phase3 = RandomForestRegressor(
    n_estimators=100, random_state=42, n_jobs=-1)
model_phase3.fit(X_train_final, y_train)

predictions_phase3 = model_phase3.predict(X_test_final)
rmse_phase3 = np.sqrt(mean_squared_error(y_test, predictions_phase3))

print("\n--- model evaluation ---")
print(f"Phase 1 (Only Statistical Features) RMSE: {baseline_rmse:.4f}")
print(f"Phase 2 (Statistics + TF-IDF) RMSE: {rmse_phase2:.4f}")
print(f"Phase 3 (Stats + TF-IDF + Bigram W2V) RMSE: {rmse_phase3:.4f}")


--- model evaluation ---
Phase 1 (Only Statistical Features) RMSE: 0.8884
Phase 2 (Statistics + TF-IDF) RMSE: 0.7891
Phase 3 (Stats + TF-IDF + Bigram W2V) RMSE: 0.6997


# Analysis

In [33]:
results_df = test_df.copy()
results_df['predicted_target'] = predictions_phase3
results_df['error'] = results_df['predicted_target'] - results_df['target']
results_df['abs_error'] = np.abs(results_df['error'])

# identify the example with the poorest performance (the largest error)
print("--- The 5 worst examples (where the model made the most errors) ---")
print(results_df.sort_values(by='abs_error', ascending=False).head(5))

# Identify the best example (the one with the smallest error)
print("\n--- The top 5 best-performing examples (with the most accurate models) ---")
print(results_df.sort_values(by='abs_error', ascending=True).head(5))

--- The 5 worst examples (where the model made the most errors) ---
             id                                          url_legal    license  \
1637  8662922c7                                                NaN        NaN   
1642  ee7d40251                                                NaN        NaN   
2226  513bd77b3                                                NaN        NaN   
1770  968c283cd                                                NaN        NaN   
2731  76fe1a630  https://freekidsbooks.org/wp-content/uploads/2...  CC BY 4.0   

                                                excerpt    target  \
1637  A first-class boat will be of about the follow... -3.236543   
1642  We have frequent inquiries as to the best mean... -3.585369   
2226  The Goban was the master of sixteen trades. Th... -1.819263   
1770  "The principal generators of incrustation in b... -3.373600   
2731  Helen Adams Keller was born on 27th June 1880,...  0.646549   

      standard_error  predicte