In [13]:
import os
import subprocess
import sys

# Install through the interpreter that backs this kernel (`sys.executable -m pip`)
# rather than whichever `pip` happens to be first on PATH, and raise
# CalledProcessError if the installation fails instead of silently returning a
# non-zero status. On success check_call returns 0.
subprocess.check_call([sys.executable, "-m", "pip", "install", "import-ipynb"])

0

In [9]:
from deep_translator import GoogleTranslator
import re
import string
import inflect
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from symspellpy import SymSpell, Verbosity
from langdetect import detect
from nltk.corpus import wordnet

In [2]:
def deep_translate(text):
    """Translate ``text`` into English, letting the translator detect the source language.

    Args:
        text: The text to translate.

    Returns:
        The result of ``GoogleTranslator(source='auto', target='en').translate(text)``,
        or the sentinel string ``"Invalid or Empty Text"`` when ``text`` is not a
        string or consists only of whitespace.

    Raises:
        ValueError: If the translator call fails for any reason. The original
            exception is attached as ``__cause__`` so the full traceback stays
            available for debugging.
    """
    # Validation is kept outside the try block so only translator failures are
    # wrapped in ValueError.
    if not isinstance(text, str) or not text.strip():
        return "Invalid or Empty Text"

    try:
        return GoogleTranslator(source='auto', target='en').translate(text)
    except Exception as e:
        # Chain explicitly so the root cause is preserved on the new exception.
        raise ValueError(f"Deep Translator Error: {e}") from e

In [None]:


# Shared NLP resources, created once and reused by the helper functions below.

# SymSpell spell checker: the frequency dictionary stores the term in column 0
# and its count in column 1. A falsy result from load_dictionary is reported
# as a missing dictionary file.
dictionary_path = "../Assets/frequency_dictionary_en_82_765.txt"
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
_dictionary_loaded = sym_spell.load_dictionary(
    dictionary_path,
    term_index=0,
    count_index=1,
)
if not _dictionary_loaded:
    raise FileNotFoundError(f"SymSpell dictionary file not found at {dictionary_path}")

# English-language helpers: number speller, lemmatizer, stemmer and stopword set.
inflect_engine = inflect.engine()
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words("english"))

def detect_language(text):
    """Detect the language of ``text`` on a best-effort basis.

    Args:
        text: The text passed to ``langdetect.detect``.

    Returns:
        Whatever ``detect(text)`` returns, or the string ``"unknown"`` if the
        detector raises an ordinary exception.
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately broad so detection stays best-effort, but no longer a
        # bare ``except``: KeyboardInterrupt and SystemExit must still be able
        # to stop a long-running batch job.
        return "unknown"

def convert_numbers(text):
    """Replace each standalone run of digits in ``text`` with its spelled-out form.

    A run of digits is only replaced when it is delimited by word boundaries;
    the matched digits are handed to ``inflect_engine.number_to_words``.
    """
    def _spell_out(match):
        digits = match.group()
        return inflect_engine.number_to_words(digits)

    return re.sub(r'\b\d+\b', _spell_out, text)

def correct_spelling(word):
    """Return the first SymSpell suggestion for ``word``, or ``word`` itself if there is none.

    The lookup uses ``Verbosity.CLOSEST`` with a maximum edit distance of 2.
    """
    candidates = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
    if not candidates:
        return word
    best_match = candidates[0]
    return best_match.term

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to noun.
    """
    tagged_tokens = nltk.pos_tag([word])
    _token, treebank_tag = tagged_tokens[0][0], tagged_tokens[0][1]
    initial = treebank_tag[0].upper()

    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    fallback = wordnet.NOUN
    if initial in wordnet_by_initial:
        return wordnet_by_initial[initial]
    return fallback

def preprocess_text(text, remove_stopwords=False, use_stemming=False, use_lemmatization=True):
    """Normalize raw text into a cleaned, space-joined string of tokens.

    Steps, in order: strip HTML tags, lowercase, spell out standalone digit
    runs, delete ASCII punctuation, tokenize, spell-correct every token, then
    optionally remove stopwords, stem and lemmatize.

    Args:
        text: The raw text. Any non-string value (None, NaN, numbers, lists,
            ...) is treated as missing.
        remove_stopwords: If True, drop tokens found in ``stop_words``.
        use_stemming: If True, stem every token with ``stemmer``.
        use_lemmatization: If True, lemmatize every token using the POS
            returned by ``get_wordnet_pos``. Runs after stemming when both
            flags are enabled.

    Returns:
        The processed tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    # Every missing-value marker (None, float NaN, pd.NA, ...) is a non-string,
    # so the isinstance check alone covers them. This avoids depending on
    # pandas being imported before this function is called, and avoids the
    # ambiguous-truth-value error pd.isna triggers for list/array inputs.
    if not isinstance(text, str):
        return ""

    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = text.lower()  # Convert to lowercase

    # Digits are spelled out before punctuation is stripped.
    text = convert_numbers(text)

    # Punctuation is deleted outright, not replaced by a space.
    text = text.translate(str.maketrans("", "", string.punctuation))

    words = word_tokenize(text)

    words = [correct_spelling(word) for word in words]  # Correct spelling

    if remove_stopwords:
        words = [word for word in words if word not in stop_words]  # Remove stopwords

    if use_stemming:
        words = [stemmer.stem(word) for word in words]  # Stemming

    if use_lemmatization:
        words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]  # Lemmatization

    return " ".join(words)

In [10]:
import import_ipynb
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer



# def translate_text(text):
#     return deep_translate(text)

# def preprocess_single_text(text, remove_stopwords=False, use_stemming=False, use_lemmatization=True):
#     text = preprocess_text(text, remove_stopwords, use_stemming, use_lemmatization)
#     return text

# def feature_extract_single_text(text, vectorizer):
#     text_tfidf = vectorizer.transform([text])
#     return text_tfidf

# def predict_single_text(text, vectorizer, model, remove_stopwords=False, use_stemming=False, use_lemmatization=True):
#     translated_text = translate_text(text)
#     preprocessed_text = preprocess_single_text(translated_text, remove_stopwords, use_stemming, use_lemmatization)
#     text_tfidf = feature_extract_single_text(preprocessed_text, vectorizer)
#     prediction = model.predict(text_tfidf)
#     return prediction

# def run_single_text_pipeline(text):
#     vectorizer = joblib.load('vectorizer.pkl')  
#     model = joblib.load('model.pkl')  

#     prediction = predict_single_text(text, vectorizer, model)

#     print("✅ Prediction completed successfully!")
#     return prediction

# # Example usage:
# text = "Enter the text here for prediction"
# prediction = run_single_text_pipeline(text)
# print(f"Prediction: {prediction}")
