## Objective

The objective of this machine learning project is to develop a robust Hindi-to-English translation model using deep learning techniques. The primary goal is to improve the translation quality for real-time news articles and informational content sourced from various Indian news websites. The model aims to handle the nuances of the Hindi language effectively and produce translations that maintain the contextual and cultural accuracy of the original text.

In [1]:
import multiprocessing
import os
import re
import string
import warnings

import emoji
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle
from stanza import Pipeline
from transformers import MarianMTModel, MarianTokenizer, GenerationConfig

In [2]:
# Show every row and column when displaying DataFrames; cap cell text at 200 characters
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200
# Silence all warnings, then also register the message-specific filter
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", message="Some non-default generation parameters")

In [3]:
# Read the web-scraped CSV file
df = pd.read_csv("./language.csv")
# Shuffle with a fixed seed so the rows kept later (and every split derived
# from them) are the same after a kernel restart.
df = shuffle(df, random_state=42).reset_index(drop=True)
df.head(n=10)

Unnamed: 0,Reference,Hindi,English
0,https://www.bhaskar.com/local/rajasthan/sikar/,बूथ चलें अभियान; बैनर से दे रहे मतदान जागरूकता का संदेश,Booths run campaign;The message of voting awareness is being given by banner
1,https://navbharattimes.indiatimes.com/business/articlelist/2279786.cms,"ईरान-इजरायल तनाव से नया रेकॉर्ड बनाने की तैयारी में सोना, जानिए कहां तक पहुंच सकती है कीमत","Iran-Israeli is preparing to make a new record with stress, know where the price can reach the price"
2,https://ndtv.in/faith#pfrom=home-khabar_nav,"नवरात्रि के तीसरे दिन मां चंद्रघंटा की इस विधि से करें पूजा, मंत्र से लेकर आरती जानें यहां","On the third day of Navratri, do worship with this method of Maa Chandraghanta, know from mantra to Aarti here"
3,https://navbharattimes.indiatimes.com/apna-bazaar/fashion/articlelist/80476266.cms,": रुपये से कम में खरीदें ये टॉप क्वालिटी वाली , मिलेगा ताबड़तोड़ डिस्काउंट का ऑफर",": Buy this top quality for less than Rs, you will get a rampant discount offer"
4,https://navbharattimes.indiatimes.com/state/uttarakhand/articlelist/21236621.cms,"मरघट, पनघट सब यहीं... जोशीमठ छोड़ कहीं और नहीं जाएंगे, हाथ में हल-फावड़े लेकर सड़कों पर उतरे पुश्‍तैनी लोग","Marghat, Panaghat all here ... Joshimath will not leave anywhere else, the people who came out on the streets with plow in their hands"
5,https://www.bhaskar.com/local/rajasthan/,"करणीमाता मंदिर में श्रद्धालुओं की भीड़ उमड़ी, जाम लगा तो घंटे तक दुपहिया वाहन प्रतापबंध पर रोके","Crowds of devotees gathered in Karnimata temple, two two -wheelers stopped at the glory for hours"
6,https://www.amarujala.com/privacy-policy,निजी जानकारी हासिल करना और उसे अपडेट करना,Get personal information and update it
7,https://www.prabhatkhabar.com/,"भगता परब: बोकारो में पीठ छिदवाकर व अंगारों पर चलकर दिखायी आस्था, फीट की ऊंचाई पर झूले भगतिया","Bhagata Parab: Faith shown in Bokaro by piercing back and walking on coals, swing at the height of feet"
8,https://www.prabhatkhabar.com/state/jharkhand/deoghar/cyber-crime-mumbai-police-attacked-in-deoghar-to-save-cyber-criminal-two-soldiers-injured-vehicle-damaged-grj,"झारखंड के देवघर में साइबर क्रिमिनल को बचाने के लिए मुंबई पुलिस पर हमला, दो जवान घायल, वाहन क्षतिग्रस्त","Mumbai police attacked, two soldiers injured, vehicle damaged to save cyber criminal in Deoghar, Jharkhand"
9,https://www.punjabkesari.in/business/property,पिछले दो चुनावी वर्षों में मकानों की बिक्री ने नए रिकॉर्ड बनाए,Sale of houses in the last two election years set new records


In [4]:
# Keep only the first 4000 instances. Take an explicit copy so that columns
# added later (e.g. 'Hindi_clean') are written to an independent frame rather
# than to a slice of the full DataFrame.
df = df[:4000].copy()

In [5]:
# Check the shape of the DataFrame as (rows, columns)
df.shape

(4000, 3)

In [6]:
# Count missing values in each column of the DataFrame
df.isna().sum()

Reference    0
Hindi        0
English      0
dtype: int64

In [7]:
# Common Hindi function words that are dropped during preprocessing.
# Referenced by the preprocess_text definitions in this notebook.
hindi_stopwords = [
    'है', 'में', 'यह', 'वह', 'के', 'हो', 'को', 'पर', 'इस', 'साथ', 'जो', 'कर', 'था', 'द्वारा', 'होता', 'लिए',
    'आप', 'आपको', 'आपका', 'इसे', 'वे', 'उनके', 'बारे', 'तक', 'इन', 'उस', 'अत', 'अब', 'कहा', 'गया', 'जा', 'रहे',
    'उनका', 'इसका', 'रहा', 'जैसे', 'सब', 'किस', 'जिस', 'जिसे', 'किसी', 'किन', 'उसका', 'जिन', 'यदि', 'हुआ', 'जब',
    'कहीं', 'कौन', 'कौनसा', 'इत्यादि', 'यहाँ', 'वहाँ'
]

def preprocess_text(text):
    """Return ``text`` with Hindi stopwords removed.

    The text is tokenized with ``nltk.word_tokenize``; tokens found in
    ``hindi_stopwords`` are dropped and the rest are re-joined with single
    spaces.

    Note: a later cell defines another ``preprocess_text`` that replaces this
    one once it has been run.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token in hindi_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

* `hindi_stopwords` is a list of common Hindi stopwords—words that are typically filtered out before processing natural-language data because they are frequent and carry little meaning on their own.

In [8]:
# Resources shared by every preprocess_text call. They are built lazily on the
# first call; re-running this cell resets the cache, so edits to
# `hindi_stopwords` are picked up again.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (combined stopword set, lemmatizer) pair, building it once."""
    if not _PREPROCESS_CACHE:
        eng_stop_words = set(stopwords.words('english'))
        _PREPROCESS_CACHE['stopwords'] = eng_stop_words.union(hindi_stopwords)
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stopwords'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Clean one piece of text and return it as a space-joined token string.

    Steps, in order: lowercase, replace URLs with a space, tokenize with
    ``word_tokenize``, drop English and Hindi stopwords, lemmatize each
    remaining token with ``WordNetLemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # The stopword set and lemmatizer do not depend on `text`; building them on
    # every call (once per DataFrame row) is wasted work, so they are cached.
    combined_stopwords, lemmatizer = _preprocess_resources()

    # Convert text to lowercase
    text = text.lower()

    # Replace URLs with a space
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Filter out stopwords, then lemmatize what is left
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in combined_stopwords
    )

# Build the cleaned Hindi column by running every row through preprocess_text
df['Hindi_clean'] = df['Hindi'].map(preprocess_text)

KeyboardInterrupt: 

* Lowercasing: Converts all characters in the text to lowercase to ensure uniformity.
* URL Replacement: Uses a regular expression to find and replace URLs with a space, removing hyperlinks from the text.
* Tokenization: Breaks the text into individual words or tokens using NLTK's `word_tokenize`, which is imported from `nltk.tokenize` in the first cell.
* Stop Words Removal: Removes stopwords from the tokens. Stopwords are commonly used words (like "and", "the", etc.) that are often filtered out before processing text. The English stopwords come from NLTK's `stopwords` corpus, and the Hindi stopwords come from the `hindi_stopwords` list defined above.
* Lemmatization: Applies lemmatization to the tokens to reduce them to their base or root form. This part of the code uses the WordNetLemmatizer from the NLTK library.

In [None]:
# Display the first rows after cleaning 'Hindi' (includes the new 'Hindi_clean' column)
df.head()

## Feature Extraction

#### Count vectorization:
A straightforward method used in text processing to convert text data into a numerical format, which is essential for machine learning models that require numeric input. Here's a simple explanation of how it works:

Vocabulary Creation: Count vectorization starts by building a vocabulary of all the unique words in the entire set of documents (text data) you have. Each word in the vocabulary is assigned a unique index.

Count Calculation: For each document, the method counts how many times each word from the vocabulary appears. These counts are then organized into a vector (a list of numbers) where each position in the vector corresponds to a word in the vocabulary, and the value at that position is the count of that word in the document.

Document Representation: As a result, each document is transformed into a vector of numbers. These vectors will all be of the same length—the size of the vocabulary. Words that appear in a document have counts greater than zero in the vector at their corresponding index, and words that do not appear have a count of zero.

For more information, see the [scikit-learn `CountVectorizer` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html).

In [None]:
# Initialize with a maximum feature number limit.
# The default token_pattern (r"(?u)\b\w\w+\b") relies on \w, which does not
# match Devanagari combining vowel signs (matras), so Hindi words would be cut
# into fragments. Match runs of word characters plus the Devanagari block
# (and zero-width joiners) instead, leaving out the danda punctuation marks
# U+0964 / U+0965. Tokens still need at least two characters, as by default.
vectorizer_bow = CountVectorizer(
    max_features=1000,
    token_pattern=r"[\w\u0900-\u0963\u0966-\u097F\u200C\u200D]{2,}",
)

# Fit and transform the data
bow_features = vectorizer_bow.fit_transform(df['Hindi_clean'])

# One row per text, one column per vocabulary term
bow_df = pd.DataFrame(bow_features.toarray(), columns=vectorizer_bow.get_feature_names_out())

* Setting a Limit (max_features=1000): We tell the tool to only consider the most frequent 1000 words from all the text we give it. This means if there are more than 1000 unique words across all the texts, only the top 1000 by frequency will be used.

Transforming Text into Numbers (fit_transform):

* Fit: This part of the process is where the tool learns which words are the most common across all the provided Hindi texts. Think of it like making a list of top words.
* Transform: After learning, the tool then goes through each text and counts how many times these top words appear. Each text is transformed into a list of numbers, where each number represents how many times a specific word from the top words list appears in the text.
* Creating a Data Table (DataFrame): The lists of numbers for each text are organized into a table where each column represents one of the top words, and each row represents one of the texts. The numbers in the table tell us the count of each word for each text.

## Word2Vec

Training Data: Word2Vec requires a large amount of text data to learn from. This text data helps the model understand the context in which words appear.

Concept of Context: The model learns by looking at the words that frequently appear around a given word. For instance, in the sentence "I love eating apples", if the model is focusing on the word "eating", the surrounding words like "love" and "apples" give clues about its meaning.

Creating Vectors: Through training, Word2Vec assigns each word in the vocabulary a vector (a list of numbers). These vectors are created in such a way that they capture relationships and patterns among words in the text data. Words with similar meanings end up having vectors that are close to each other in the vector space.



In [None]:
# Word2Vec takes each sentence as a list of whitespace-separated tokens
sentences = list(map(str.split, df['Hindi_clean']))
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Convert sentences to vectors
def get_vector(sentence):
    """Return the mean Word2Vec vector of the in-vocabulary words in ``sentence``.

    Args:
        sentence: Whitespace-separated tokens as a single string.

    Returns:
        A 1-D array of length ``word2vec_model.wv.vector_size``. If the
        sentence is empty or none of its words are in the vocabulary, a zero
        vector is returned so every sentence yields a vector of the same shape.
    """
    keyed_vectors = word2vec_model.wv
    found = [keyed_vectors[word] for word in sentence.split() if word in keyed_vectors]
    if not found:
        # Previously this divided by len(words): an empty sentence raised
        # ZeroDivisionError and an all-unknown sentence produced the scalar 0.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    # Average over the vectors actually found, so unknown words do not shrink
    # the result towards zero.
    return np.mean(found, axis=0)

# One averaged vector per cleaned sentence, stacked into an array
X_word2vec = np.array(list(map(get_vector, df['Hindi_clean'])))

In [None]:
# MarianMTModel, MarianTokenizer and GenerationConfig are already imported in
# the first cell, so they are not re-imported here.

# Load your model and tokenizer
model_name = "Helsinki-NLP/opus-mt-hi-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Define a GenerationConfig with your custom parameters
# NOTE(review): none of the generate() calls in this notebook pass `gen_config`;
# it is only referenced again in the save cell — confirm whether it should
# also be used for generation.
gen_config = GenerationConfig(
    max_length=512,
    num_beams=6,
    bad_words_ids=[[61126]],
    forced_eos_token_id=0
)

In [None]:
# Load the saved model and tokenizer, if a saved copy exists.
# `save_directory` must be defined here: it is otherwise only assigned in the
# final save cell, so a fresh Restart & Run All would fail with NameError.
# When nothing has been saved yet, keep the pretrained checkpoint loaded above.
save_directory = "./saved_model"
if os.path.isdir(save_directory):
    model = MarianMTModel.from_pretrained(save_directory)
    tokenizer = MarianTokenizer.from_pretrained(save_directory)

def translate(text):
    """Translate ``text`` using the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: The input passed to the tokenizer.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    # Encode the text input
    encoded_hindi_text = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inputs must be on the same device as the model; a later cell moves
    # `model` to CUDA when available, after which CPU inputs would fail.
    encoded_hindi_text = {
        name: tensor.to(model.device) for name, tensor in encoded_hindi_text.items()
    }

    # Generate translation ids
    translated_ids = model.generate(**encoded_hindi_text)

    # Decode the translated text and return it
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    return translated_text

In [None]:
from sklearn.model_selection import train_test_split

# Define X and Y
# NOTE(review): the inputs are the stopword-filtered 'Hindi_clean' strings, not
# the raw 'Hindi' sentences — confirm this is what the translation model
# should be trained on.
X = df["Hindi_clean"]
Y = df["English"]

# Splitting the data into train, validation, and test sets (70% / 15% / 15%)
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)

# Confirm the splits
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

# Preparing data for model training
train_encodings = tokenizer(X_train.tolist(), return_tensors='pt', padding=True, truncation=True)
val_encodings = tokenizer(X_val.tolist(), return_tensors='pt', padding=True, truncation=True)


def _encode_targets(texts):
    """Tokenize English reference sentences as decoder targets.

    `text_target` makes the tokenizer encode the sentences as targets rather
    than as source-language input. Padding positions are replaced by -100 so
    the loss ignores them.
    """
    labels = tokenizer(text_target=texts, return_tensors='pt', padding=True, truncation=True)['input_ids']
    labels[labels == tokenizer.pad_token_id] = -100
    return labels


train_labels = _encode_targets(Y_train.tolist())
val_labels = _encode_targets(Y_val.tolist())

# Define the dataset class for PyTorch
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with label ids.

    Args:
        encodings: Mapping from field name to an indexable batch (e.g. the
            tokenizer output), where ``encodings[key][i]`` is example ``i``.
        labels: Indexable batch of label ids, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` triggers a UserWarning, so tensors
        are copied with ``clone().detach()``; anything else (lists, numbers)
        still goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

# Create Dataset objects
# Wrap the tokenized inputs and label ids for the train and validation splits.
train_dataset = TranslationDataset(train_encodings, train_labels)
val_dataset = TranslationDataset(val_encodings, val_labels)

# Training setup
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Hyperparameters for fine-tuning; outputs are written under ./results.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=8,
    predict_with_generate=True
)

# The trainer fine-tunes the notebook-level `model` on train_dataset and
# evaluates on val_dataset. The test split is not used here.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

In [None]:
def translate_batch(texts, model, tokenizer, device, max_length=128):
    """Translate a list of texts in one forward pass.

    Args:
        texts: Input strings to translate.
        model: Object exposing ``generate(**inputs)``.
        tokenizer: Callable tokenizer that also provides ``decode``.
        device: Device the tokenized inputs are moved to before generation.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        A list with one decoded string per generated sequence, special tokens
        removed.
    """
    batch = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # Move every tokenizer output onto the target device
    inputs_on_device = {}
    for field_name, tensor in batch.items():
        inputs_on_device[field_name] = tensor.to(device)

    with torch.no_grad():
        generated = model.generate(**inputs_on_device)

    # Decode on the CPU, one sequence at a time
    decoded = []
    for sequence in generated.cpu():
        decoded.append(tokenizer.decode(sequence, skip_special_tokens=True))
    return decoded

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Switch to evaluation mode so dropout is disabled while generating
model.eval()

# Fixed seed so the same five example rows are shown on every run
sample_df = df.sample(5, random_state=42)
# Batch processing
batch_size = 1  # Adjust based on your system's memory
translations = []
for i in range(0, len(sample_df), batch_size):
    batch_texts = sample_df['Hindi_clean'].iloc[i:i+batch_size].tolist()
    batch_translations = translate_batch(batch_texts, model, tokenizer, device)
    translations.extend(batch_translations)


# Attach the model output next to the source and reference columns
sample_df['predicted_translation'] = translations
sample_df

In [None]:
# Save the model, tokenizer, and generation configuration
save_directory = "./saved_model"
model.save_pretrained(save_directory)
# `gen_config` is not a documented parameter of save_pretrained(), so passing
# it as a keyword did not store the custom generation settings. Write the
# GenerationConfig explicitly, after the model so its file is the one kept.
gen_config.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)