# Grammar Scoring Engine for Voice Samples 

### 📌 Project Goal
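The goal is to build a model that evaluates the grammatical quality of spoken English by analyzing voice samples. Each sample is transcribed with Whisper, the transcript is checked with LanguageTool, and the number of reported grammar issues is mapped to a score. The target is a continuous score between 0 and 5; the rule-based scorer in this notebook predicts it in whole-number steps.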

---

# 📦 Install Required Libraries

In [1]:
!pip install -q openai-whisper
!pip install -q language-tool-python
!pip install -q tqdm


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.1/253.1 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.7/54.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.3/54.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h

# 📚 Import Libraries

In [2]:
import os
import pandas as pd
import whisper
import language_tool_python
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

# 📁 Define Paths

In [3]:
# Every competition file lives under one Kaggle input directory.
DATASET_DIR = "/kaggle/input/shl-intern-hiring-assessment/dataset"

# CSV tables listing the audio file names (the train table also carries labels).
TRAIN_CSV_PATH = f"{DATASET_DIR}/train.csv"
TEST_CSV_PATH = f"{DATASET_DIR}/test.csv"

# Folders that hold the audio files named in each table's `filename` column.
TRAIN_AUDIO_FOLDER = f"{DATASET_DIR}/audios_train"
TEST_AUDIO_FOLDER = f"{DATASET_DIR}/audios_test"

# 🧠 Load Models

In [4]:
# Which Whisper checkpoint to load for speech-to-text.
WHISPER_MODEL_NAME = "tiny"
# Which language variant LanguageTool should check transcripts against.
GRAMMAR_LANGUAGE = 'en-US'

# Announce each load first: both steps may download files on a fresh machine.
print("🔄 Loading Whisper ASR model...")
asr_model = whisper.load_model(WHISPER_MODEL_NAME)


print("🔍 Loading LanguageTool...")
grammar_tool = language_tool_python.LanguageTool(GRAMMAR_LANGUAGE)


🔄 Loading Whisper ASR model...


100%|█████████████████████████████████████| 72.1M/72.1M [00:03<00:00, 20.8MiB/s]
  checkpoint = torch.load(fp, map_location=device)


🔍 Loading LanguageTool...


Downloading LanguageTool 6.5: 100%|██████████| 248M/248M [00:03<00:00, 65.6MB/s]


# 🧪 Grammar Scoring Function (0 to 5 scale)

In [5]:

def get_grammar_score(file_path, empty_score=0, asr=None, checker=None):
    """Transcribe one audio file and map its grammar-issue count to a 0-5 score.

    The audio is transcribed, the transcript is run through the grammar
    checker, and one point is deducted from 5 for every three reported issues
    (0-2 issues -> 5, 3-5 -> 4, ...), never going below 0.

    Args:
        file_path: Path of the audio file to transcribe.
        empty_score: Score returned when there is no usable transcript, i.e.
            transcription raised or produced only whitespace. An empty string
            has no grammar issues, so without this guard a failed or silent
            sample would receive the maximum score of 5.
        asr: Object with a ``transcribe(path)`` method returning a mapping
            with a ``'text'`` entry. Defaults to the notebook-level
            ``asr_model``.
        checker: Object with a ``check(text)`` method returning a sized
            collection of issues. Defaults to the notebook-level
            ``grammar_tool``.

    Returns:
        A ``(text, score)`` tuple: the transcript (``""`` if transcription
        failed) and the score.
    """
    # Fall back to the models loaded earlier in the notebook. The globals are
    # only looked up when no override is given, so the function can be run
    # with stand-ins without loading the real models.
    asr = asr_model if asr is None else asr
    checker = grammar_tool if checker is None else checker

    try:
        text = asr.transcribe(file_path)['text']
    except Exception as e:
        # Best effort: one unreadable file must not abort a long scoring run.
        # Name the file so the failure can be traced afterwards.
        print(f"❌ Error transcribing {file_path}: {e}")
        text = ""

    # No words means no evidence of grammatical speech; do not let the
    # "zero issues found" case below turn that into a perfect score.
    if not text.strip():
        return text, empty_score

    errors = len(checker.check(text))
    score = max(0, 5 - (errors // 3))
    return text, score


# 📊 TRAINING DATA: Score audios_train and evaluate against train.csv labels

In [6]:
# Load the labelled table; `filename` names each audio file and `label` is the
# reference score used for the evaluation at the end of this cell.
train_df = pd.read_csv(TRAIN_CSV_PATH)

print("🔁 Scoring training samples...\n")
# Score every file once, keeping the (transcript, score) pairs in row order.
train_results = [
    get_grammar_score(os.path.join(TRAIN_AUDIO_FOLDER, audio_name))
    for audio_name in tqdm(train_df['filename'])
]
train_texts = [transcript for transcript, _grade in train_results]
train_preds = [grade for _transcript, grade in train_results]

# Attach the results to the table, aligned with the rows that were scored.
train_df['transcription'] = train_texts
train_df['predicted_score'] = train_preds

# 📉 Optional Evaluation
mse = mean_squared_error(train_df['label'], train_df['predicted_score'])
print(f"\n📉 Mean Squared Error (MSE) on train data: {mse:.4f}")

🔁 Scoring training samples...



100%|██████████| 444/444 [1:06:06<00:00,  8.93s/it]


📉 Mean Squared Error (MSE) on train data: 4.1284





# 🚀 TESTING: Predict grammar scores for audios_test

In [7]:
# Load the table of audio files to predict; its `filename` column names each file.
test_df = pd.read_csv(TEST_CSV_PATH)

print("🧪 Scoring test samples...\n")
# Score every file once, keeping the (transcript, score) pairs in row order.
test_results = [
    get_grammar_score(os.path.join(TEST_AUDIO_FOLDER, audio_name))
    for audio_name in tqdm(test_df['filename'])
]
test_texts = [transcript for transcript, _grade in test_results]
test_scores = [grade for _transcript, grade in test_results]

# Attach the results to the table, aligned with the rows that were scored.
test_df['transcription'] = test_texts
test_df['predicted_score'] = test_scores

🧪 Scoring test samples...



100%|██████████| 195/195 [27:30<00:00,  8.47s/it]


# 📤 Create submission.csv


In [8]:
# Keep the file name and the prediction, publishing the prediction as `label`.
submission_df = (
    test_df
    .loc[:, ['filename', 'predicted_score']]
    .rename(columns={'predicted_score': 'label'})
)
# index=False keeps the DataFrame's row index out of the written file.
submission_df.to_csv("submission.csv", index=False)
print("✅ Submission file saved as submission.csv")

✅ Submission file saved as submission.csv


In [9]:
import pandas as pd

# Read the file back from disk so the preview shows what was actually written,
# then display its first five rows as the cell output.
submission_df = pd.read_csv(filepath_or_buffer="submission.csv")
submission_df.head(n=5)


Unnamed: 0,filename,label
0,audio_706.wav,0
1,audio_800.wav,4
2,audio_68.wav,4
3,audio_1267.wav,5
4,audio_683.wav,5


In [10]:
import pandas as pd

# Plain-text preview of the saved submission. Any failure while reading or
# printing is reported as a message instead of a traceback.
try:
    submission_df = pd.read_csv(filepath_or_buffer="submission.csv")
    print("📄 submission.csv preview:")
    print(submission_df.head(n=5))
except Exception as read_error:
    print("⚠️ Error reading submission.csv:", read_error)


📄 submission.csv preview:
         filename  label
0   audio_706.wav      0
1   audio_800.wav      4
2    audio_68.wav      4
3  audio_1267.wav      5
4   audio_683.wav      5
