<a href="https://colab.research.google.com/github/nipunagarwal9636/Grammar-Scoring-engine/blob/main/Grammar_Scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate with Kaggle before downloading data: some Kaggle data sources
# are private, so run this cell first in order to import them.
import kagglehub
kagglehub.login()


In [None]:
# Run this cell to import the Kaggle data sources; it can be deleted afterwards.
# NOTE: this notebook environment differs from Kaggle's Python environment, so
# libraries used by the notebook may be missing.

# The value returned by competition_download is kept in a variable
# (presumably the local location of the downloaded files — confirm in the kagglehub docs).
shl_intern_hiring_assessment_2025_path = kagglehub.competition_download('shl-intern-hiring-assessment-2025')

print('Data source import complete.')


In [None]:
!pip install -q lightgbm librosa sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import os, pandas as pd

# Location of the competition data when running on Kaggle itself.
KAGGLE_DATA_DIR = '/kaggle/input/shl-intern-hiring-assessment-2025/dataset'

# Outside Kaggle (e.g. Colab) the /kaggle/input tree does not exist. In that
# case fall back to the location returned by the kagglehub download cell, if
# that cell was run. globals().get() keeps this cell working when the download
# cell has been deleted.
_download_root = globals().get('shl_intern_hiring_assessment_2025_path')
if os.path.isdir(KAGGLE_DATA_DIR) or not _download_root:
    DATA_DIR = KAGGLE_DATA_DIR
else:
    # NOTE(review): assumes the download mirrors Kaggle's layout with a
    # 'dataset' sub-folder; otherwise the download root itself is used.
    _candidate = os.path.join(str(_download_root), 'dataset')
    DATA_DIR = _candidate if os.path.isdir(_candidate) else str(_download_root)

# Directory where the submission file is written; created if missing.
OUTPUT_DIR = '/kaggle/working'
os.makedirs(OUTPUT_DIR, exist_ok=True)
print('DATA_DIR =', DATA_DIR)

DATA_DIR = /kaggle/input/shl-intern-hiring-assessment-2025/dataset


In [None]:
from pathlib import Path

# Discover every CSV below DATA_DIR (searched recursively).
csvs = list(Path(DATA_DIR).rglob('*.csv'))
print('Found CSVs:', [p.name for p in csvs])

# Choose the train/test files by name. The order in which rglob yields files is
# not guaranteed, so relying on csvs[0]/csvs[1] could silently swap the two.
csv_by_stem = {p.stem.lower(): p for p in csvs}
train_csv = csv_by_stem.get('train')
test_csv = csv_by_stem.get('test')

# Fallback for differently named files: take the remaining CSVs in discovery
# order (train first, then test), as the positional logic did before.
unnamed_csvs = [p for p in csvs if p not in (train_csv, test_csv)]
if train_csv is None and unnamed_csvs:
    train_csv = unnamed_csvs.pop(0)
if test_csv is None and unnamed_csvs:
    test_csv = unnamed_csvs.pop(0)

# Fail with a clear message instead of crashing later on `None.head()`.
if train_csv is None:
    raise FileNotFoundError(f'No CSV files found under {DATA_DIR!r}')

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv) if test_csv is not None else None
train_df.head()

Found CSVs: ['train.csv', 'test.csv']


Unnamed: 0,filename,label
0,audio_173,3.0
1,audio_138,3.0
2,audio_127,2.0
3,audio_95,2.0
4,audio_73,3.5


In [None]:
# Every training column stored with object dtype is treated as a text column.
# NOTE(review): in the displayed data the only such column is 'filename', which
# looks like an audio file identifier rather than free text — confirm intent.
text_cols = [name for name, dtype in train_df.dtypes.items() if dtype == 'object']

# The label column is the first known candidate name present in the training
# frame; it stays None when none of the candidates exist.
label_col = None
for candidate in ['target','score','label','y','grammar_score']:
    if candidate in train_df.columns:
        label_col = candidate
        break

print('Text cols:', text_cols)
print('Label col:', label_col)

Text cols: ['filename']
Label col: label


In [None]:
import numpy as np

def add_text_feats(df, text_col):
    """Add character-count and word-count columns for one text column.

    Missing values are treated as empty strings and all values are converted
    to ``str`` before counting.

    Args:
        df: DataFrame containing ``text_col``; it is modified in place.
        text_col: Name of the column to measure.

    Returns:
        The same DataFrame object, with ``<text_col>_len`` (number of
        characters) and ``<text_col>_words`` (number of whitespace-separated
        tokens) added.
    """
    cleaned = df[text_col].fillna('').astype(str)
    char_counts = cleaned.str.len()
    token_counts = cleaned.str.split().apply(len)
    df[f'{text_col}_len'] = char_counts
    df[f'{text_col}_words'] = token_counts
    return df

# Add the length / word-count columns for each text column. The training frame
# always gets them; the test frame only when it exists and has that column.
for tc in text_cols:
    train_df = add_text_feats(train_df, tc)
    if test_df is None or tc not in test_df.columns:
        continue
    test_df = add_text_feats(test_df, tc)

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to turn each text column into numeric features.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# Only the first N_EMB components of each embedding vector are kept as features.
N_EMB = 4

for tc in text_cols:
    # astype(str) guards against non-string values in an object column, matching
    # the cleaning done in add_text_feats.
    tr_emb = embed_model.encode(
        train_df[tc].fillna('').astype(str).tolist(), show_progress_bar=True
    )[:, :N_EMB]
    for i in range(N_EMB):
        train_df[f'{tc}_emb_{i}'] = tr_emb[:, i]

    # Skip the test frame when it is absent or lacks this column (same guard as
    # the text-feature cell); previously a missing column raised KeyError here.
    if test_df is not None and tc in test_df.columns:
        te_emb = embed_model.encode(
            test_df[tc].fillna('').astype(str).tolist(), show_progress_bar=True
        )[:, :N_EMB]
        for i in range(N_EMB):
            test_df[f'{tc}_emb_{i}'] = te_emb[:, i]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
import librosa, numpy as np
from tqdm import tqdm

# Register `progress_apply` on pandas objects. Without this call the
# progress_apply lines below raise AttributeError.
tqdm.pandas()

# Name of the column holding audio paths: first candidate found in train_df,
# or None when none of them is present.
# NOTE(review): in the displayed data the only identifier column is 'filename',
# which is not among these candidates, so the audio step is skipped — confirm
# whether that is intended.
audio_col = next((c for c in ['audio_path','file_path','wav'] if c in train_df.columns), None)

def extract_audio_feats(path):
    """Compute simple audio features for one file.

    Args:
        path: Audio file path relative to DATA_DIR.

    Returns:
        pd.Series with 'duration' (from librosa.get_duration) and 'rms'
        (mean of librosa.feature.rms). Both are NaN when the file cannot be
        loaded or processed.
    """
    try:
        y, sr = librosa.load(Path(DATA_DIR)/path, sr=16000)
        return pd.Series({'duration': librosa.get_duration(y=y, sr=sr), 'rms': librosa.feature.rms(y=y).mean()})
    except Exception:
        # Best effort: a broken or missing file yields NaN features. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return pd.Series({'duration': np.nan, 'rms': np.nan})

if audio_col:
    train_df = pd.concat([train_df, train_df[audio_col].progress_apply(extract_audio_feats)], axis=1)
    if test_df is not None:
        test_df = pd.concat([test_df, test_df[audio_col].progress_apply(extract_audio_feats)], axis=1)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

# Fail early with a readable message; otherwise train_df[None] raises an
# opaque "KeyError: None".
if label_col is None:
    raise ValueError('No label column found in train_df; cannot train.')

# Feature columns are selected by name fragment: text length / word counts,
# embedding components, and the audio features (when they were computed).
feat_cols = [c for c in train_df.columns if any(x in c for x in ['_len','_words','_emb_','duration','rms'])]
X = train_df[feat_cols].fillna(0)
y = train_df[label_col]
X_test = test_df[feat_cols].fillna(0) if test_df is not None else None

# One model per fold is kept for test-time averaging; `oof` collects the
# out-of-fold prediction of every training row.
models, oof = [], np.zeros(len(X))
kf = KFold(n_splits=3, shuffle=True, random_state=42)

for fold, (tr, val) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}")
    # random_state makes the model explicitly reproducible across runs.
    m = lgb.LGBMRegressor(n_estimators=300, learning_rate=0.1, num_leaves=31, random_state=42)
    m.fit(
        X.iloc[tr], y.iloc[tr],
        eval_set=[(X.iloc[val], y.iloc[val])],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=30), lgb.log_evaluation(50)]
    )
    oof[val] = m.predict(X.iloc[val])
    models.append(m)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
print("OOF RMSE:", np.sqrt(mean_squared_error(y, oof)))


Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000091 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 370
[LightGBM] [Info] Number of data points in the train set: 272, number of used features: 5
[LightGBM] [Info] Start training from score 2.893382
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.764614	valid_0's l2: 0.584634
Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 374
[LightGBM] [Info] Number of data points in the train set: 273, number of used features: 5
[LightGBM] [Info] Start training from score 2.906593
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.801917	valid_0's l2: 0.643071
Fold 3
[LightG

In [None]:

import os
import pandas as pd
import numpy as np

if X_test is None:
    print("❌ No test set found; skipping submission.")
else:
    # Average the predictions of all fold models.
    fold_preds = [m.predict(X_test) for m in models]
    preds = np.mean(fold_preds, axis=0)

    # Pick the identifier column: 'filename' when present, otherwise the first
    # column whose name contains 'file', otherwise the first column.
    if 'filename' in test_df.columns:
        id_col = 'filename'
    else:
        possible_ids = [c for c in test_df.columns if 'file' in c.lower()]
        id_col = possible_ids[0] if possible_ids else test_df.columns[0]
        print(f"⚠️ 'filename' column not found; using '{id_col}' as ID.")

    # The submission always exposes the identifier under the name 'filename'.
    submission = pd.DataFrame({'filename': test_df[id_col], 'label': preds})

    sub_path = os.path.join(OUTPUT_DIR, "submission.csv")
    submission.to_csv(sub_path, index=False)
    print(f"✅ submission.csv saved successfully at: {sub_path}")
    print(submission.head())


✅ submission.csv saved successfully at: /kaggle/working/submission.csv
    filename     label
0  audio_141  2.901584
1  audio_114  2.886334
2   audio_17  2.931496
3   audio_76  2.928373
4  audio_156  2.893446


In [None]:
# Read the submission back as a sanity check. The path is built from
# OUTPUT_DIR instead of being hard-coded, so it follows the configured output
# directory if that is changed.
data = pd.read_csv(os.path.join(OUTPUT_DIR, "submission.csv"))
data.head()

Unnamed: 0,filename,label
0,audio_141,2.901584
1,audio_114,2.886334
2,audio_17,2.931496
3,audio_76,2.928373
4,audio_156,2.893446
