In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_name = os.path.join(root, name)
        print(full_name)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**INSTALLING DEPENDENCIES**

In [None]:
# General packages
!pip install numpy pandas matplotlib scipy

# Audio processing
!pip install librosa

# Whisper (for transcription)
!pip install git+https://github.com/openai/whisper.git 
!pip install torch  # Required by Whisper

# Hugging Face Transformers (for BERT)
!pip install transformers

# LanguageTool for grammar checking
!pip install language-tool-python

# Scikit-learn (for ML model)
!pip install scikit-learn

!pip install textstat
!pip install umap-learn

**CREATING TRANSCRIPTS**

In [None]:
import os
import numpy as np
import pandas as pd
import whisper

# Datasets loaded
train_df = pd.read_csv('/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv')
test_df = pd.read_csv('/kaggle/input/shl-intern-hiring-assessment/dataset/test.csv')

# Define audio directories
TRAIN_AUDIO_DIR = '/kaggle/input/shl-intern-hiring-assessment/dataset/audios_train'
TEST_AUDIO_DIR = '/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test'

# Define output transcription directories
OUTPUT_TRAIN_TXT_DIR = '/kaggle/working/train_transcriptions'
OUTPUT_TEST_TXT_DIR = '/kaggle/working/test_transcriptions'
os.makedirs(OUTPUT_TRAIN_TXT_DIR, exist_ok=True)
os.makedirs(OUTPUT_TEST_TXT_DIR, exist_ok=True)

# Load Whisper model
model = whisper.load_model("large")


def transcribe_files(filenames, audio_dir, output_dir, overwrite=False):
    """Transcribe each audio file with the loaded Whisper model and save it as text.

    Args:
        filenames: Iterable of audio file names, relative to ``audio_dir``.
        audio_dir: Directory containing the audio files.
        output_dir: Directory that receives one ``<stem>.txt`` file per audio file.
        overwrite: When False (default), files that already have a transcript in
            ``output_dir`` are skipped so an interrupted or repeated run does not
            redo the expensive transcription. Pass True to regenerate everything.
    """
    for filename in filenames:
        txt_filename = os.path.splitext(filename)[0] + ".txt"
        txt_path = os.path.join(output_dir, txt_filename)
        if not overwrite and os.path.exists(txt_path):
            continue

        audio = whisper.load_audio(os.path.join(audio_dir, filename))
        result = model.transcribe(audio)
        transcription = result['text'].strip()

        # Explicit UTF-8 so the transcript bytes do not depend on the locale.
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(transcription)


# Transcribe training audio files
print("Transcribing training audio files...")
transcribe_files(train_df['filename'].tolist(), TRAIN_AUDIO_DIR, OUTPUT_TRAIN_TXT_DIR)

# Transcribe test audio files
print("Transcribing test audio files...")
transcribe_files(test_df['filename'].tolist(), TEST_AUDIO_DIR, OUTPUT_TEST_TXT_DIR)

print("✅ All transcriptions saved:")
print(f" - Train: {OUTPUT_TRAIN_TXT_DIR}")
print(f" - Test:  {OUTPUT_TEST_TXT_DIR}")

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import torch
import matplotlib.pyplot as plt
from transformers import RobertaTokenizer, RobertaModel
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
import optuna
from language_tool_python import LanguageTool
from tqdm import tqdm

# Paths
TRAIN_CSV = '/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv'
TEST_CSV = '/kaggle/input/shl-intern-hiring-assessment/dataset/test.csv'
TRAIN_AUDIO_DIR = '/kaggle/input/shl-intern-hiring-assessment/dataset/audios_train'
TEST_AUDIO_DIR = '/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test'


def _resolve_transcript_dir(dataset_dir, working_dir):
    """Return ``dataset_dir``, or ``working_dir`` when only the latter exists.

    The transcripts are read from an attached dataset by default. If that
    dataset is not attached but the transcription cell of this notebook has
    written its output under /kaggle/working, use that output instead so the
    notebook still runs top to bottom. When neither directory exists the
    original dataset path is returned, so the eventual error names it.
    """
    if not os.path.isdir(dataset_dir) and os.path.isdir(working_dir):
        return working_dir
    return dataset_dir


TRAIN_TRANSCRIPT_DIR = _resolve_transcript_dir(
    '/kaggle/input/shl-dataset/train_transcriptions',
    '/kaggle/working/train_transcriptions',
)
TEST_TRANSCRIPT_DIR = _resolve_transcript_dir(
    '/kaggle/input/shl-dataset/test_transcriptions',
    '/kaggle/working/test_transcriptions',
)

# Load data
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Models
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# .eval() puts the model in evaluation mode; it is only used for inference here.
roberta_model = RobertaModel.from_pretrained("roberta-base").to(device).eval()
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

tool = LanguageTool("en-US")

def get_text(path):
    """Read a transcript file and return its contents without surrounding whitespace.

    Args:
        path: Path of the text file to read.

    Returns:
        The file contents as a string, stripped of leading/trailing whitespace.
    """
    # Decode explicitly as UTF-8 so non-ASCII characters in a transcript are
    # read the same way regardless of the platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().strip()

def get_roberta_embedding(text):
    """Embed `text` with the module-level RoBERTa model.

    The text is tokenized (truncated to at most 512 tokens), run through
    `roberta_model` without gradient tracking, and the hidden state at
    sequence position 0 is returned as a flat NumPy array.
    """
    encoded = roberta_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        model_output = roberta_model(**encoded)
    first_position = model_output.last_hidden_state[:, 0, :]
    return first_position.cpu().numpy().flatten()

def get_sentence_embedding(text):
    """Encode `text` with the module-level `sentence_model` and return the result."""
    embedding = sentence_model.encode(text)
    return embedding

def get_grammar_features(text):
    """Count LanguageTool matches for `text` by category.

    Returns a list `[grammar, typo, style]` where each entry is the number of
    matches whose `category` string contains 'GRAMMAR', 'TYPO' or 'STYLE'
    respectively. A match is counted once for every substring it contains.
    """
    counts = {'GRAMMAR': 0, 'TYPO': 0, 'STYLE': 0}
    for match in tool.check(text):
        category = match.category
        for key in counts:
            if key in category:
                counts[key] += 1
    return [counts['GRAMMAR'], counts['TYPO'], counts['STYLE']]

def get_audio_features(audio_path, text):
    """Compute speaking rate and pause count for one recording.

    Args:
        audio_path: Path of the audio file, loaded with `librosa.load`.
        text: Transcript of the recording; its whitespace-separated word
            count is used for the speaking rate.

    Returns:
        `[speaking_rate, pauses]`: words per minute (0 when the duration is 0)
        and the number of intervals returned by `librosa.effects.split`
        minus one, floored at 0.
    """
    signal, sample_rate = librosa.load(audio_path)
    seconds = librosa.get_duration(y=signal, sr=sample_rate)
    n_words = len(text.split())

    if seconds:
        words_per_minute = n_words / (seconds / 60)
    else:
        words_per_minute = 0

    intervals = librosa.effects.split(signal)
    n_gaps = len(intervals) - 1
    return [words_per_minute, n_gaps if n_gaps > 0 else 0]

def extract_features(df, transcript_dir, audio_dir, tfidf_model=None, is_train=True):
    """Build the feature matrix (and labels) for every row of `df`.

    Per-row features, concatenated in this order: RoBERTa embedding, sentence
    embedding, three grammar counts, two audio features, and the mean number
    of words per '.'-separated sentence. TF-IDF features of the transcripts
    are appended as extra columns.

    Args:
        df: DataFrame with a 'filename' column, plus a 'label' column when
            `is_train` is True.
        transcript_dir: Directory holding one `<stem>.txt` transcript per audio file.
        audio_dir: Directory holding the audio files named in `df['filename']`.
        tfidf_model: Fitted TfidfVectorizer to reuse. When None and `is_train`
            is True, a new vectorizer (max 300 features) is fitted on the
            transcripts of `df`.
        is_train: Whether to collect labels and allow fitting a new vectorizer.

    Returns:
        `(features, labels, tfidf_model)`: the feature matrix with NaN/inf
        replaced by `np.nan_to_num`, an array of labels (None when
        `is_train` is False), and the vectorizer that was used.

    Raises:
        ValueError: If `is_train` is False and no `tfidf_model` is given.
    """
    # Fail before the slow per-file loop rather than after it.
    if tfidf_model is None and not is_train:
        raise ValueError("tfidf_model must be provided when is_train=False")

    features = []
    tfidf_texts = []
    labels = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        fname = row['filename']
        # Derive the transcript name from the stem, independent of the audio extension.
        text_path = os.path.join(transcript_dir, os.path.splitext(fname)[0] + '.txt')
        audio_path = os.path.join(audio_dir, fname)

        text = get_text(text_path)
        tfidf_texts.append(text)

        roberta_emb = get_roberta_embedding(text)
        sentence_emb = get_sentence_embedding(text)
        grammar_feats = get_grammar_features(text)
        audio_feats = get_audio_features(audio_path, text)

        # np.mean([]) is NaN (with a RuntimeWarning), so handle a transcript
        # without any sentence explicitly.
        sentence_lengths = [len(s.split()) for s in text.split('.') if s.strip()]
        sentence_len = float(np.mean(sentence_lengths)) if sentence_lengths else 0.0

        feat = np.concatenate([roberta_emb, sentence_emb, grammar_feats, audio_feats, [sentence_len]])
        features.append(feat)

        if is_train:
            labels.append(row['label'])

    if tfidf_model is None:
        tfidf_model = TfidfVectorizer(max_features=300)
        tfidf_model.fit(tfidf_texts)

    tfidf_feats = tfidf_model.transform(tfidf_texts).toarray()
    all_feats = np.hstack([features, tfidf_feats])

    return np.nan_to_num(all_feats), (np.array(labels) if is_train else None), tfidf_model

# Extract Features
# The TF-IDF vectorizer is fitted on the training transcripts and reused for the test set.
X_train, y_train, tfidf_model = extract_features(
    df=train_df,
    transcript_dir=TRAIN_TRANSCRIPT_DIR,
    audio_dir=TRAIN_AUDIO_DIR,
)
X_test, _, _ = extract_features(
    df=test_df,
    transcript_dir=TEST_TRANSCRIPT_DIR,
    audio_dir=TEST_AUDIO_DIR,
    tfidf_model=tfidf_model,
    is_train=False,
)

# Optuna Objective
def objective(trial):
    """Optuna objective: validation MSE of an LGBMRegressor with sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error on a fixed 20% hold-out split of the module-level
        `X_train` / `y_train` (split seeded with random_state=42, so every
        trial is scored on the same validation rows).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.2, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # LightGBM only applies `subsample` when bagging is enabled through
        # subsample_freq > 0 (the default is 0). It is registered as a
        # single-valued trial parameter so it also appears in
        # study.best_params and reaches any model built from them.
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
    }
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    model = LGBMRegressor(**params)
    model.fit(X_tr, y_tr)
    preds = model.predict(X_val)
    return mean_squared_error(y_val, preds)

# Run Optuna
# Seed the sampler so the hyperparameter search is repeatable on a fresh kernel.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

print("Best params:", study.best_params)

# Train final model on the full training set with the best hyperparameters
final_model = LGBMRegressor(**study.best_params)
final_model.fit(X_train, y_train)

# Predict and Save
preds = final_model.predict(X_test)
# NOTE(review): clipping to [0, 5] presumably matches the label scale — confirm.
preds = np.clip(preds, 0, 5)

# Single source of truth for the output path so the message always matches
# the file that is actually written.
SUBMISSION_PATH = 'testing.csv'
submission = pd.DataFrame({'filename': test_df['filename'], 'label': preds})
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Final submission saved as '{SUBMISSION_PATH}'")