In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

shl_intern_hiring_assessment_2025_path = kagglehub.competition_download('shl-intern-hiring-assessment-2025')

print('Data source import complete.')


In [None]:
!pip uninstall -y numpy
!pip install numpy==1.26.4


# Problem Statement
# GRAMMAR SCORING ENGINE FOR SPOKEN DATA
       
The objective of this project is to develop a Grammar Scoring Engine that predicts a
continuous grammar score (0–5) from spoken audio samples. Each audio file is 45–60
seconds long, and the target labels are Mean Opinion Scores (MOS) based on a defined
grammar rubric.

This solution converts speech to text using an ASR model and evaluates grammatical
quality using linguistic and syntactic features, followed by regression-based modeling.

# Data Loading & Exploration
* Training samples: 409
* Test samples: 197
* Each audio file: 45–60 seconds
* Labels: Grammar MOS scores (0–5)

In [None]:
import pandas as pd
import os

# Both label tables live under <competition root>/dataset/csvs.
csv_dir = os.path.join(shl_intern_hiring_assessment_2025_path, "dataset", "csvs")
train_csv_path = os.path.join(csv_dir, "train.csv")
test_csv_path = os.path.join(csv_dir, "test.csv")

train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

# Quick look at the first rows of each table.
print("train Data ", train_df.head(10))
print("Test Data", test_df.head())

# Check column

In [None]:
train_df.columns

In [None]:
train_df['label'].describe()

# Visualisation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns



# Distribution of the target grammar scores in the training set.
fig, ax = plt.subplots()
ax.hist(train_df['label'], bins=10)
ax.set_xlabel("Grammar Score")
ax.set_ylabel("Count")
ax.set_title("Grammar Score Distribution")
plt.show()

# Define Constant

In [None]:
import os
# Correct audio directory (dataset folder uses 'audios', not 'audio_files')
audio_dir =  os.path.join(shl_intern_hiring_assessment_2025_path , "dataset", "audios", "train")
File_name_col = "filename"
label_col = "label"

# Verify Audio Path

In [None]:
# Build a robust filename from the CSV value
raw_name = train_df.iloc[0].get(File_name_col, '')
raw_name = str(raw_name).strip()
if raw_name == '' or raw_name.lower() == 'nan':
    raise ValueError(f'Filename column is empty for first row: {raw_name}')
sample_file = raw_name if raw_name.lower().endswith('.wav') else raw_name + '.wav'
audio_path = os.path.join(audio_dir, sample_file)

# Diagnostics: show resolved paths and directory contents
print('Audio dir (raw):', audio_dir)
print('Audio dir (abs):', os.path.abspath(audio_dir))
try:
    files = sorted(os.listdir(audio_dir))
    print('Number of files in audio_dir:', len(files))
    print('First 40 files:', files[:40])
    print('sample_file in dir?:', sample_file in files)
except FileNotFoundError as e:
    print('Directory not found:', e)

print(f'CSV filename value: {raw_name}')
print('Audio filename used:', sample_file)
print('Full path:', audio_path)
print('Exists:', os.path.exists(audio_path))


 # Check all audio  Validation

In [None]:
# Find CSV rows whose audio file is absent from audio_dir. Names are
# whitespace-trimmed and get a '.wav' suffix when they lack one; blank or
# 'nan' entries are skipped rather than reported as missing.
csv_count = len(train_df)
stripped_names = [str(entry).strip() for entry in train_df['filename']]
wav_names = [
    nm if nm.lower().endswith('.wav') else nm + '.wav'
    for nm in stripped_names
    if nm != '' and nm.lower() != 'nan'
]
missing = [
    wav for wav in wav_names
    if not os.path.exists(os.path.join(audio_dir, wav))
]

# Directory listing for the summary; a missing directory counts as empty.
try:
    files = sorted(os.listdir(audio_dir))
except FileNotFoundError:
    files = []

print('CSV rows:', csv_count)
print('Files in audio_dir:', len(files))
print('Missing files (count):', len(missing))
# Cell output: at most the first 50 missing filenames.
missing[:50]


# preprocessing & Pipeline Architecture

### Pipeline Overview:
Audio -> Preprocessing -> Speech-to-text -> Text Cleaning -> Grammar Feature Extraction -> Regression Model -> Score (0–5)

### Audio Preprocessing:
* Resampling to 16 kHz
* Silence trimming
* Padding/truncation to a fixed 60-second length (loudness normalization is not yet implemented in `preprocesss_audio`)
### Speech-to-text:
* OpenAI Whisper ASR
* English Only Transcription
### Feature Engineering:
* Grammar error counts
* Sentence statistics
* POS ratios
* Syntactic tree depth


## Load Required Libraries

In [None]:
import numpy as np
import librosa

## Audio Preprocessing Function

In [None]:


def preprocesss_audio(audio_path, target_sr=16000, max_duration=60):
    """Load a clip, trim edge silence and force it to a fixed length.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        target_sr: Sample rate requested from ``librosa.load``.
        max_duration: Output length in seconds; may be fractional.

    Returns:
        The waveform from ``librosa.load`` after ``librosa.effects.trim``
        (``top_db=20``), truncated or zero-padded at the end to
        ``int(sr * max_duration)`` samples, where ``sr`` is the rate returned
        by ``librosa.load``. No amplitude/loudness normalization is applied.
    """
    y, sr = librosa.load(audio_path, sr=target_sr)
    y, _ = librosa.effects.trim(y, top_db=20)

    # Fix the length. The int() cast keeps the slice/pad size valid when
    # max_duration is fractional (sr * 2.5 would otherwise be a float).
    max_len = int(sr * max_duration)
    if len(y) > max_len:
        y = y[:max_len]
    else:
        y = np.pad(y, (0, max_len - len(y)))

    return y

## Test Preprocessing on One File

In [None]:
sample_wav =  train_df.iloc[0]['filename'].strip()
sample = sample_wav if sample_wav.lower().endswith('.wav') else sample_wav + '.wav'
Audio_path =  os.path.join(audio_dir, sample)

y = preprocesss_audio(audio_path=Audio_path)
print("Processed duration (sec):", len(y)/16000)

# Listen sample preprocess audio

In [None]:
import IPython.display as ipd


print("Processed audio:")
ipd.Audio(y, rate=16000)

## Install Whisper

In [None]:
!pip install -U openai-whisper




## Load Whisper Model

In [None]:
import whisper

whisper_model = whisper.load_model("base")



## Transcript Function

In [None]:
def transcribe_audio(audio_path):
    """Transcribe one audio file with the module-level ``whisper_model``.

    Calls ``whisper_model.transcribe`` with ``language="en"`` and
    ``fp16=False`` and returns the ``"text"`` entry of its result.
    """
    decode_options = {"language": "en", "fp16": False}
    return whisper_model.transcribe(audio_path, **decode_options)["text"]

## Test on one audio file

In [None]:
sample_file = train_df.iloc[0]['filename']
sample = sample_file if sample_file.lower().endswith('.wav') else sample_file + '.wav'
audio_path = os.path.join(audio_dir, sample)

text = transcribe_audio(audio_path)

print("TRANSCRIBED TEXT:\n")
print(text)

## Clean Transcribe Text function

In [None]:
import re
def clean_text(text):
    """Lower-case ``text``, collapse each whitespace run to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text.lower())
    return collapsed.strip()

### Test

In [None]:
cleaned_text = clean_text(text)
print(cleaned_text)


## Transcribe Full Train Set

In [None]:
from tqdm import tqdm

# Transcribe every training clip and store the cleaned text.
# Filenames are stringified and stripped exactly like the audio-validation
# cell does, so a padded or non-string CSV value resolves to the same path
# that was validated. Iterating the column (instead of .loc[i] over
# range(len)) also avoids depending on a default 0..n-1 index.
train_transcripts = []
for raw_name in tqdm(train_df["filename"], desc="Transcribing train audio"):
    file_name = str(raw_name).strip()
    file = file_name if file_name.lower().endswith('.wav') else file_name + '.wav'
    audio_path = os.path.join(audio_dir, file)

    text = transcribe_audio(audio_path)
    train_transcripts.append(clean_text(text))

# Single column assignment instead of one .loc write per row.
train_df["transcript"] = train_transcripts


### Save the transcribe

In [None]:
train_df.to_csv("train_with_transcripts.csv", index=False)


### For Check

In [None]:
train_df[["filename", "label", "transcript"]].head()

## Grammar Feature Engineering

We extract explainable linguistic features such as:
* Word count
* Sentence count
* Grammar errors per sentence
* POS ratios (noun, verb, adjective, adverb)
* Average dependency tree depth

### Install Required Libraries

In [None]:
!pip install language-tool-python spacy
!python -m spacy download en_core_web_sm


### Load Tools

In [None]:
import spacy
import language_tool_python

# spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# LanguageTool client for US English that sends its checks to a remote server.
# NOTE(review): this is the public endpoint; it may throttle bulk requests —
# confirm it can handle checking every transcript.
languagetool_options = {"remote_server": 'https://api.languagetool.org'}
tool = language_tool_python.LanguageTool('en-US', **languagetool_options)


### Feature Extraction Function

In [None]:

def extract_grammar_features(text):
    """Compute explainable grammar features for one transcript.

    Uses the module-level ``nlp`` (spaCy pipeline) and ``tool`` (LanguageTool
    client).

    Args:
        text: Transcript string. Any non-``str`` value (e.g. a NaN read back
            from a CSV) is treated as an empty transcript.

    Returns:
        dict with the keys ``word_count``, ``sentence_count``,
        ``avg_sentence_length``, ``grammar_error_count``,
        ``grammar_error_per_sentence``, ``avg_error_length``,
        ``incomplete_sentence_ratio``, ``pos_noun_ratio``, ``pos_verb_ratio``,
        ``pos_adj_ratio``, ``pos_adv_ratio`` and ``avg_parse_tree_depth``.
        Ratios fall back to 0 when their denominator is 0.
    """
    if not isinstance(text, str):
        text = ""

    features = {}

    # ---------- BASIC STATS ----------
    word_count = len(text.split())  # whitespace-delimited words
    features["word_count"] = word_count

    doc = nlp(text)
    sentences = list(doc.sents)
    sentence_count = len(sentences)
    features["sentence_count"] = sentence_count

    features["avg_sentence_length"] = (
        word_count / sentence_count if sentence_count > 0 else 0
    )

    # ---------- GRAMMAR ERROR FEATURES ----------
    # Skip the LanguageTool call for blank text: there is nothing to check.
    matches = tool.check(text) if text.strip() else []
    error_count = len(matches)
    features["grammar_error_count"] = error_count

    features["grammar_error_per_sentence"] = (
        error_count / sentence_count if sentence_count > 0 else 0
    )

    # "Error length" here is the character length of the first suggested
    # replacement, or 1 when a match offers none. Only the lookup failures we
    # expect are caught, so real problems (and Ctrl-C) still surface.
    error_lengths = []
    for m in matches:
        try:
            suggestion_len = len(m.replacements[0]) if m.replacements else 1
        except (AttributeError, IndexError, TypeError):
            suggestion_len = 1
        error_lengths.append(suggestion_len)

    features["avg_error_length"] = (
        np.mean(error_lengths) if error_lengths else 0
    )

    # ---------- SENTENCE COMPLETENESS ----------
    # A sentence counts as complete when it has a verbal element and a
    # subject. AUX is accepted alongside VERB so copular sentences such as
    # "she is happy" are not flagged as incomplete. The tokens of the already
    # parsed sentence span are reused instead of re-running the pipeline.
    incomplete_sentences = 0
    for sent in sentences:
        has_verb = any(tok.pos_ in ("VERB", "AUX") for tok in sent)
        has_subject = any(tok.dep_ in ("nsubj", "nsubjpass") for tok in sent)
        if not has_verb or not has_subject:
            incomplete_sentences += 1

    features["incomplete_sentence_ratio"] = (
        incomplete_sentences / sentence_count if sentence_count > 0 else 0
    )

    # ---------- POS RATIOS ----------
    # Counts are over spaCy tokens; the denominator is the whitespace word
    # count computed above.
    pos_counts = {}
    for tok in doc:
        pos_counts[tok.pos_] = pos_counts.get(tok.pos_, 0) + 1

    for pos in ["NOUN", "VERB", "ADJ", "ADV"]:
        features[f"pos_{pos.lower()}_ratio"] = (
            pos_counts.get(pos, 0) / word_count if word_count > 0 else 0
        )

    # ---------- SYNTACTIC COMPLEXITY ----------
    def tree_depth(token):
        """Depth of the dependency subtree under ``token`` (a leaf has depth 1)."""
        # Explicit stack instead of recursion, so an unusually deep parse
        # cannot hit Python's recursion limit.
        deepest = 0
        pending = [(token, 1)]
        while pending:
            node, depth = pending.pop()
            deepest = max(deepest, depth)
            pending.extend((child, depth + 1) for child in node.children)
        return deepest

    depths = [tree_depth(sent.root) for sent in sentences] if sentences else [0]
    features["avg_parse_tree_depth"] = np.mean(depths)

    return features

### Test Feature Extraction on One Sample

In [None]:
sample_text = train_df.iloc[0]["transcript"]
extract_grammar_features(sample_text)


### Extract Features for Entire Training Set

In [None]:
from tqdm import tqdm

# One feature dict per training transcript, with its target attached.
feature_rows = []
labelled_transcripts = zip(train_df["transcript"], train_df["label"])

for transcript, target in tqdm(labelled_transcripts, total=len(train_df)):
    row = extract_grammar_features(transcript)
    row["label"] = target
    feature_rows.append(row)

features_df = pd.DataFrame(feature_rows)
features_df.head()

### Handle Missing

In [None]:
features_df =  features_df.fillna(0)

### Prepare Data  & Labels

In [None]:
# preprare featrue target
x =  features_df.drop(columns=["label"])
y = features_df["label"]

### Check shape

In [None]:
print(x.shape, y.shape)


## K-fold Cross-Validation Setup

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

kf =  KFold(n_splits=5, shuffle=True, random_state=42)

## install  xgboost

In [None]:
!pip install xgboost


### Train  with Cross Validation

In [None]:
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
import numpy as np

# Per-fold metric history.
mae_scores, rmse_scores, pearson_scores = [], [], []

# Hyper-parameters shared by every fold's model.
xgb_params = dict(
    n_estimators=800,
    max_depth=7,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    objective="reg:squarederror",
)

for fold, (train_idx, val_idx) in enumerate(kf.split(x)):
    print(f"\nFold {fold+1}")

    X_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = x.iloc[val_idx], y.iloc[val_idx]

    # Standardise the target with statistics from the training fold only.
    scaler = StandardScaler()
    y_train_column = y_train.values.reshape(-1, 1)
    y_train_scaled = scaler.fit_transform(y_train_column).ravel()

    # Fresh model per fold, trained on the standardised target.
    xgb = XGBRegressor(**xgb_params)
    xgb.fit(X_train, y_train_scaled)

    # Predict in standardised space, map back to score units, then clamp to
    # the valid 0-5 grammar range.
    preds_scaled = xgb.predict(X_val)
    preds = scaler.inverse_transform(preds_scaled.reshape(-1, 1)).ravel()
    preds = np.clip(preds, 0, 5)

    # Fold metrics.
    mae = mean_absolute_error(y_val, preds)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    pearson = pearsonr(y_val, preds)[0]

    for history, value in (
        (mae_scores, mae),
        (rmse_scores, rmse),
        (pearson_scores, pearson),
    ):
        history.append(value)

    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"Pearson: {pearson:.4f}")


In [None]:
print("\n===== CROSS-VALIDATION RESULTS =====")
print("MAE scores:", mae_scores)
print("Mean MAE:", np.mean(mae_scores))

print("\nRMSE scores:", rmse_scores)
print("Mean RMSE:", np.mean(rmse_scores))

print("\nPearson scores:", pearson_scores)
print("Mean Pearson:", np.mean(pearson_scores))


In [None]:

# y_val and preds are whatever the cross-validation loop left behind, i.e.
# the validation split of the final fold only, so the figure is titled
# accordingly rather than implying it shows every fold.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_val, preds, alpha=0.6)
ax.set_xlabel("Actual Grammar Score")
ax.set_ylabel("Predicted Grammar Score")
ax.set_title("Prediction vs Actual (last CV fold)")
ax.plot([0, 5], [0, 5], 'r--')  # perfect-prediction diagonal
plt.show()


### Model Evaluation

The model was evaluated using 5-fold cross-validation. Since the competition
leaderboard is based on RMSE and Pearson correlation, both metrics were computed
for each fold. Target normalization was applied to improve ranking consistency,
which significantly improved Pearson correlation.

- Mean RMSE reflects absolute prediction accuracy
- Mean Pearson correlation reflects ranking alignment with true grammar scores


In [None]:
import pandas as pd

# Rank features by the importances stored on `xgb`, which at this point is
# the model left over from the last cross-validation fold.
importance = pd.Series(xgb.feature_importances_, index=x.columns)
importance = importance.sort_values(ascending=False)

top_importance = importance.head(10)
top_importance.plot(kind='barh')
plt.title("Top Feature Importances")
plt.show()


## check test data

In [None]:
test_df.head()

## Transcribe Test Audio

In [None]:
from tqdm import tqdm
import os

TEST_AUDIO_DIR = os.path.join(shl_intern_hiring_assessment_2025_path, "dataset", "audios", "test")

# Transcribe every test clip and store the cleaned text.
# Filenames are stringified and stripped before the '.wav' suffix check, the
# same normalisation the training-audio validation cell applies, so a padded
# or non-string CSV value still resolves to the right file. Iterating the
# column avoids depending on a default 0..n-1 index.
test_transcripts = []
for raw_name in tqdm(test_df["filename"], desc="Transcribing test audio"):
    file_name = str(raw_name).strip()
    file = file_name if file_name.lower().endswith('.wav') else file_name + '.wav'
    audio_path = os.path.join(TEST_AUDIO_DIR, file)

    text = transcribe_audio(audio_path)
    test_transcripts.append(clean_text(text))

# Single column assignment instead of one .loc write per row.
test_df["transcript"] = test_transcripts


### Save it

In [None]:
test_df.to_csv("test_with_transcripts.csv", index=False)


## Extract Grammar Features for the Test Set

In [None]:
# One feature dict per test transcript; gaps in the resulting table become 0.
test_feature_rows = [
    extract_grammar_features(transcript)
    for transcript in tqdm(test_df["transcript"], total=len(test_df))
]

X_test = pd.DataFrame(test_feature_rows).fillna(0)


# Train Final  Model  on Full training Data

In [None]:
# Train the submission model with the same recipe that produced the
# cross-validation metrics: identical XGBoost hyper-parameters and a
# standardised target. Previously this cell used a different configuration
# (400 trees, depth 5, no target scaling), so the reported CV scores did not
# describe the model that generated the submission.
#
# TransformedTargetRegressor fits the scaler on the labels during fit() and
# inverts it inside predict(), so final_model.predict() still returns scores
# in the original label units.
from sklearn.compose import TransformedTargetRegressor

final_model = TransformedTargetRegressor(
    regressor=XGBRegressor(
        n_estimators=800,
        max_depth=7,
        learning_rate=0.03,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
        objective="reg:squarederror",
    ),
    transformer=StandardScaler(),
)

X_train_full = features_df.drop(columns=["label"])
y_train_full = features_df["label"]

final_model.fit(X_train_full, y_train_full)


## Predict Grammar Scores for the Test Set

In [None]:
test_predictions = final_model.predict(X_test)


# To set valid range

In [None]:
test_predictions = test_predictions.clip(0, 5)


# Create Submission file

In [None]:
submission = pd.DataFrame({
    "filename": test_df["filename"],
    "label": test_predictions.round(2)
})

submission.head()


In [None]:
submission.to_csv("submission.csv", index=False)


## Final Result Summary
* Model: XGBoost Regressor
* Evaluation metrics: RMSE, Pearson correlation
* Mean  Cross-Validation RMSE : 0.7333702368823092
* Pearson Correlation:  0.34329159431492196

## Conclusion & Future Work

This notebook presents a complete pipeline for spoken grammar scoring.
Future improvements include:
- Using larger ASR models
- Incorporating language model perplexity
- Model ensembling