# 1. Config

In [1]:
import pandas as pd

# Raise the per-cell character limit used when DataFrames are displayed, so the
# long free-text entries in this notebook are readable in previews.
pd.options.display.max_colwidth = 500

In [2]:
# Load the combined, labelled export. The 'Unnamed: 0' column visible in the
# preview is presumably an index written out with the CSV — TODO confirm.
raw_df = pd.read_csv("data/exports/combined_heartfelt_data.csv")

# Report the number of rows, then preview the first three.
print(raw_df.shape[0])
raw_df.head(n=3)

131


Unnamed: 0,text,is_heartfelt
0,I would love for our school to be considered for some gardening equipment. We lost a member of staff in February who had been at our school for 30 years and she was heavily involved with teaching the children in our inner city school about the environment and gardening. Since her death we have had countless children asking if we can use a piece of small ground in the playground and use it to her memory and plant seeds and make it a happy place. Mrs.Upham used to run a gardening club but she ...,True
1,"Our year 1 children have been inspired by a book called Omar, the Bees and Me, where two children from different backgrounds bond over their interest in bees and decide to make a 'bee corridor' between their school and the local park. They send out envelopes of wildflower seeds to every house and building along the route and by the time the late spring comes, the whole neighborhood is alive with flowers and insects. Our year 1 classes would love to do something like this in our community. We...",True
2,"I am the SEN and Pastoral Lead in my school, Derryhale Primary School, Co Armagh N Ireland. We recently lost a valued member of our PTA team, a lovely mummy of two children, a little girl in Y2, and a little boy in Y4. This lovely lady was just 31 years old and lost her fight for life to cancer. Budget is tight in school. We are a small rural school of 76 pupils and everyone has been effected by this loss. All the children in our care worry now that they will loose a family member and pasto...",True


# 2. Data Preprocessing

## 2.1 Normalising Text

In [3]:
import string

In [4]:
# Work on an independent copy so raw_df keeps the untouched source text.
norm_text = raw_df.copy(deep=True)

In [5]:
def normalise_text(text):
    """Lower-case a string, delete ASCII punctuation and collapse whitespace.

    Punctuation characters (``string.punctuation``) are removed outright, not
    replaced by a space, so words joined only by punctuation are fused
    (e.g. "Mrs.Upham" becomes "mrsupham").

    Parameters
    ----------
    text : object
        Value to normalise. Only ``str`` instances are processed.

    Returns
    -------
    str or None
        The normalised string, or ``None`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Non-string values (e.g. missing cells) are mapped to None.
        return None

    punctuation_stripper = str.maketrans('', '', string.punctuation)
    lowered = text.lower().translate(punctuation_stripper)

    # split() with no separator treats any whitespace run as a single delimiter.
    words = lowered.strip().split()
    return ' '.join(words)

In [6]:
# Normalise every entry of the 'text' column and write the result back in place.
normalised_text = norm_text['text'].map(normalise_text)
norm_text['text'] = normalised_text

In [7]:
# Spot-check the normalised text on the first three rows.
norm_text.head(n=3)

Unnamed: 0,text,is_heartfelt
0,i would love for our school to be considered for some gardening equipment we lost a member of staff in february who had been at our school for 30 years and she was heavily involved with teaching the children in our inner city school about the environment and gardening since her death we have had countless children asking if we can use a piece of small ground in the playground and use it to her memory and plant seeds and make it a happy place mrsupham used to run a gardening club but she dona...,True
1,our year 1 children have been inspired by a book called omar the bees and me where two children from different backgrounds bond over their interest in bees and decide to make a bee corridor between their school and the local park they send out envelopes of wildflower seeds to every house and building along the route and by the time the late spring comes the whole neighborhood is alive with flowers and insects our year 1 classes would love to do something like this in our community well need ...,True
2,i am the sen and pastoral lead in my school derryhale primary school co armagh n ireland we recently lost a valued member of our pta team a lovely mummy of two children a little girl in y2 and a little boy in y4 this lovely lady was just 31 years old and lost her fight for life to cancer budget is tight in school we are a small rural school of 76 pupils and everyone has been effected by this loss all the children in our care worry now that they will loose a family member and pastorally i am ...,True


## 2.2 Stopword Removal and Lemmatisation

In [8]:
import spacy
import spacy_cleaner
from spacy_cleaner.processing import mutators, removers

# spaCy language model that the cleaning pipeline is built on.
nlp = spacy.load("en_core_web_md")

In [9]:
# Token processors handed to the cleaner, in this order:
# stop-word removal, then lemma substitution.
token_processors = (
    removers.remove_stopword_token,
    mutators.mutate_lemma_token,
)
pipeline = spacy_cleaner.Cleaner(nlp, *token_processors)

In [10]:
def clean_text_with_pipeline(text):
    """Run ``text`` through the notebook-level ``pipeline`` cleaner and return its output.

    NOTE(review): elsewhere in this notebook ``pipeline.clean`` is called with a
    list of strings; whether a single bare string is accepted is not shown
    here — confirm before reusing this helper.
    """
    return pipeline.clean(text)

In [11]:
# Separate frame for the cleaning stage, so norm_text stays as the normalised-only version.
preprocessed_df = norm_text.copy(deep=True)

In [13]:
# Clean all normalised texts in one batch call and store the result next to the input.
normalised_texts = preprocessed_df['text'].tolist()
preprocessed_df['clean_text'] = pipeline.clean(normalised_texts)

Cleaning Progress: 100%|██████████████████████████████████████████| 131/131 [00:00<00:00, 313.08it/s]


In [15]:
# Compare 'text' and 'clean_text' side by side on the first three rows.
preprocessed_df.head(n=3)

Unnamed: 0,text,is_heartfelt,clean_text
0,i would love for our school to be considered for some gardening equipment we lost a member of staff in february who had been at our school for 30 years and she was heavily involved with teaching the children in our inner city school about the environment and gardening since her death we have had countless children asking if we can use a piece of small ground in the playground and use it to her memory and plant seeds and make it a happy place mrsupham used to run a gardening club but she dona...,True,love school consider gardening equipment lose member staff february school 30 year heavily involved teach child inner city school environment gardening death countless child ask use piece small ground playground use memory plant seed happy place mrsupham run gardening club donate thing 50 child volunteer help positive outcome sad loss
1,our year 1 children have been inspired by a book called omar the bees and me where two children from different backgrounds bond over their interest in bees and decide to make a bee corridor between their school and the local park they send out envelopes of wildflower seeds to every house and building along the route and by the time the late spring comes the whole neighborhood is alive with flowers and insects our year 1 classes would love to do something like this in our community well need ...,True,year 1 child inspire book call omar bee child different background bond interest bee decide bee corridor school local park send envelope wildflower seed house building route time late spring come neighborhood alive flower insect year 1 class love like community need buy lot wildflower seed produce leaflet promote idea locally handdeliver envelope local area hope turn local area beefriendly community
2,i am the sen and pastoral lead in my school derryhale primary school co armagh n ireland we recently lost a valued member of our pta team a lovely mummy of two children a little girl in y2 and a little boy in y4 this lovely lady was just 31 years old and lost her fight for life to cancer budget is tight in school we are a small rural school of 76 pupils and everyone has been effected by this loss all the children in our care worry now that they will loose a family member and pastorally i am ...,True,sen pastoral lead school derryhale primary school co armagh n ireland recently lose value member pta team lovely mummy child little girl y2 little boy y4 lovely lady 31 year old lose fight life cancer budget tight school small rural school 76 pupil effect loss child care worry loose family member pastorally support child well resource twinkl support daddy granny child 10 day prior mummy die granda die little child lose beloved mummy granda 10 day love treat child school school trip christmas...


In [16]:
# Keep only the three modelling columns (this drops 'Unnamed: 0'), in this order.
preprocessed_df = preprocessed_df.loc[:, ['text', 'clean_text', 'is_heartfelt']]

preprocessed_df.columns

Index(['text', 'clean_text', 'is_heartfelt'], dtype='object')

In [17]:
# Final modelling frame, decoupled from the preprocessing frame.
df = preprocessed_df.copy(deep=True)

In [18]:
# Count missing values per column before modelling.
df.isnull().sum()

text            0
clean_text      0
is_heartfelt    0
dtype: int64

# 3. Model Training

In [47]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [44]:
# 1. Features and target
# NOTE(review): training uses the normalised 'text' column; the 'clean_text'
# column is not used in this cell — confirm that is intended.
X = df['text']
y = df['is_heartfelt']

# 2. Split out a hold-out set (20% of rows, stratified on the label)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 3. Build a TF–IDF + Logistic Regression pipeline
tfidf_step = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=2)
clf_step = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',  # if classes are skewed
    random_state=42,
)
baseline_pipe = Pipeline(steps=[('tfidf', tfidf_step), ('clf', clf_step)])



In [45]:
# 4. Quick cross-validation (5 stratified, shuffled folds over the training split)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(baseline_pipe, X_train, y_train, scoring='f1', cv=cv)
mean_f1, std_f1 = cv_scores.mean(), cv_scores.std()
print(f'Baseline CV F1: {mean_f1:.3f} ± {std_f1:.3f}')

# 5. Fit on the full training split and evaluate on the held-out test set
baseline_pipe.fit(X_train, y_train)
y_pred = baseline_pipe.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

Baseline CV F1: 0.916 ± 0.046
              precision    recall  f1-score   support

       False       1.00      0.91      0.95        11
        True       0.94      1.00      0.97        16

    accuracy                           0.96        27
   macro avg       0.97      0.95      0.96        27
weighted avg       0.97      0.96      0.96        27



In [46]:
# Example: inspect errors
# Build the misclassification mask once and reuse it for texts, labels and predictions.
is_wrong = y_test != y_pred
X_err, y_err_true, y_err_pred = X_test[is_wrong], y_test[is_wrong], y_pred[is_wrong]

for text, true, pred in zip(X_err, y_err_true, y_err_pred):
    print(f"> Real={true}, Pred={pred}\n{text}\n---")


> Real=False, Pred=True
i would look to buy supplies for the whole school to use for science i know that some experiments use a lot of materials and would want to buy good kits for circuits i would also look into buying things to improve the learning and teaching of computing science in the school
---


In [None]:
import joblib

# One shared location for the saved pipeline, so the save and the reload
# cannot point at different files.
MODEL_PATH = '../src/models/heartfelt_pipeline.joblib'

# Fit on the training split so the object written to disk is a fitted pipeline.
baseline_pipe.fit(X_train, y_train)

joblib.dump(baseline_pipe, MODEL_PATH)

# Load it back (the same call works later from a new script/notebook).
# NOTE: joblib.load unpickles the file — only load artefacts from a trusted source.
loaded_pipe = joblib.load(MODEL_PATH)

# Confirm it still works: the reloaded pipeline must reproduce the
# in-memory pipeline's predictions on the hold-out set.
y_pred = loaded_pipe.predict(X_test)
assert (y_pred == baseline_pipe.predict(X_test)).all(), 'reloaded pipeline predictions differ'
