In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shl-dataset/dataset/sample_submission.csv
/kaggle/input/shl-dataset/dataset/train.csv
/kaggle/input/shl-dataset/dataset/test.csv
/kaggle/input/shl-dataset/dataset/audios_test/audio_885.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_698.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_1176.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_1215.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_66.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_386.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_1026.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_330.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_72.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_858.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_107.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_820.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_300.wav
/kaggle/input/shl-dataset/dataset/audios_test/audio_435.wav
/kaggle/input/sh

In [30]:
# installing libraries
!pip install -q openai-whisper nltk scikit-learn tqdm


In [31]:
import os
import numpy as np
import pandas as pd
import whisper
import nltk

from tqdm import tqdm
from nltk import word_tokenize, pos_tag, sent_tokenize
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import pearsonr


In [32]:
import nltk

# Required for Python 3.12+
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')



[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [33]:
def load_data(base_path):
    train = pd.read_csv(f"{base_path}/train.csv")
    test = pd.read_csv(f"{base_path}/test.csv")
    return train, test


DATA_PATH = "/kaggle/input/shl-dataset/dataset"
train_df, test_df = load_data(DATA_PATH)

In [34]:
def load_asr_model():
    return whisper.load_model("base")

asr_model = load_asr_model()

In [35]:
def lexical_features(tokens):
    return {
        "total_words": len(tokens),
        "avg_word_len": np.mean([len(w) for w in tokens]) if tokens else 0,
        "type_token_ratio": len(set(tokens)) / len(tokens) if tokens else 0
    }


In [36]:
def syntactic_features(pos_tags):
    return {
        "num_nouns": sum(1 for _, t in pos_tags if t.startswith("NN")),
        "num_verbs": sum(1 for _, t in pos_tags if t.startswith("VB")),
        "num_adjs":  sum(1 for _, t in pos_tags if t.startswith("JJ"))
    }

In [37]:
def sentence_features(sentences, total_words):
    return {
        "total_sentences": len(sentences),
        "avg_sentence_len": total_words / len(sentences) if sentences else 0
    }

In [38]:
def fluency_features(segments):
    durations = [(s["end"] - s["start"]) for s in segments]
    return {
        "num_segments": len(segments),
        "avg_segment_duration": np.mean(durations) if durations else 0
    }

In [39]:
FILLERS = {"uh", "um", "erm", "like", "you know"}

def disfluency_features(tokens):
    return {
        "filler_count": sum(1 for w in tokens if w in FILLERS)
    }


In [40]:
def extract_features(audio_path, model):
    result = model.transcribe(audio_path, fp16=False)
    text = result["text"].lower().strip()

    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    sentences = sent_tokenize(text)
    segments = result.get("segments", [])

    features = {}
    features.update(lexical_features(tokens))
    features.update(syntactic_features(pos_tags))
    features.update(sentence_features(sentences, len(tokens)))
    features.update(fluency_features(segments))
    features.update(disfluency_features(tokens))

    return features


In [41]:
def build_feature_dataframe(df, audio_dir, model, label_col=None):
    rows = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        path = f"{audio_dir}/{row['filename']}"
        try:
            feats = extract_features(path, model)
            if label_col:
                feats["label"] = row[label_col]
            rows.append(feats)
        except Exception as e:
            print(f"❌ Failed: {row['filename']}")

    return pd.DataFrame(rows)


In [42]:
train_features = build_feature_dataframe(
    train_df,
    f"{DATA_PATH}/audios_train",
    asr_model,
    label_col="label"
)

X = train_features.drop(columns=["label"])
y = train_features["label"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

model.fit(X_train, y_train)


100%|██████████| 444/444 [1:43:38<00:00, 14.01s/it]  


In [43]:
val_preds = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, val_preds))
pearson, _ = pearsonr(y_val, val_preds)

print(f"RMSE    : {rmse:.4f}")
print(f"Pearson : {pearson:.4f}")


RMSE    : 1.1093
Pearson : 0.3591


In [44]:
test_features = build_feature_dataframe(
    test_df,
    f"{DATA_PATH}/audios_test",
    asr_model
)

test_preds = model.predict(test_features)

submission = pd.DataFrame({
    "filename": test_df["filename"],
    "label": test_preds
})

submission.to_csv("submission.csv", index=False)
submission.head()


100%|██████████| 195/195 [37:42<00:00, 11.60s/it] 


Unnamed: 0,filename,label
0,audio_706.wav,4.099803
1,audio_800.wav,2.676393
2,audio_68.wav,3.114733
3,audio_1267.wav,2.912426
4,audio_683.wav,2.361249


In [49]:
submission.head(194)

Unnamed: 0,filename,label
0,audio_706.wav,4.099803
1,audio_800.wav,2.676393
2,audio_68.wav,3.114733
3,audio_1267.wav,2.912426
4,audio_683.wav,2.361249
...,...,...
189,audio_1178.wav,4.693350
190,audio_135.wav,3.712650
191,audio_512.wav,4.530542
192,audio_529.wav,3.778846
