# In [None]:
import re
import unicodedata
from typing import List, Dict, Set, Tuple
from collections import defaultdict

# Optional dependencies
try:
    from langdetect import detect, detect_langs
    LANGDETECT_AVAILABLE = True
except ImportError:
    LANGDETECT_AVAILABLE = False

try:
    import spacy
    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False

class EnglishInSpanishDetector:
    def __init__(self):
        """Build the lookup tables and try to load the optional spaCy models.

        Every vocabulary entry is stored in lower case because the detectors
        lower-case a token before looking it up; an upper-case entry would
        never match.
        """
        # Common English words that appear in Spanish conversations.
        # Multi-word entries ('customer service', 'thank you', 'excuse me')
        # cannot match the single-token lookups; they are found through the
        # phrase list in detect_english_phrases instead.
        self.common_english_words = {
            # Technology and business terms
            'computer', 'laptop', 'email', 'internet', 'website', 'online', 'app',
            'smartphone', 'tablet', 'software', 'hardware', 'wifi', 'bluetooth',
            'facebook', 'instagram', 'whatsapp', 'google', 'amazon', 'netflix',

            # Customer service terms
            'customer service', 'manager', 'supervisor', 'representative', 'agent',
            'account', 'balance', 'payment', 'credit', 'debit', 'transaction',
            'statement', 'invoice', 'receipt', 'refund', 'discount', 'promotion',

            # Common expressions
            'okay', 'ok', 'yes', 'no', 'please', 'thank you', 'thanks', 'sorry',
            'excuse me', 'hello', 'hi', 'bye', 'goodbye', 'welcome', 'perfect',

            # Time and numbers
            'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
            'january', 'february', 'march', 'april', 'may', 'june', 'july',
            'august', 'september', 'october', 'november', 'december',

            # Locations and addresses
            'street', 'avenue', 'boulevard', 'road', 'drive', 'lane', 'court',
            'apartment', 'suite', 'floor', 'building', 'office', 'mall', 'center',

            # Common verbs and adjectives
            'update', 'upgrade', 'download', 'upload', 'login', 'logout', 'reset',
            'cancel', 'confirm', 'submit', 'apply', 'approve', 'reject', 'pending',
            'available', 'unavailable', 'expired', 'valid', 'invalid', 'premium',

            # Measurements and quantities
            'dollar', 'dollars', 'cent', 'cents', 'percent', 'percentage',
            'gallon', 'gallons', 'mile', 'miles', 'inch', 'inches', 'foot', 'feet'
        }

        # English function words (articles, prepositions, etc.)
        # The pronoun is stored as 'i', not 'I': lookups use the lower-cased token.
        self.english_function_words = {
            'the', 'a', 'an', 'and', 'or', 'but', 'if', 'when', 'where', 'how',
            'what', 'who', 'which', 'that', 'this', 'these', 'those', 'all',
            'some', 'any', 'every', 'each', 'both', 'either', 'neither',
            'in', 'on', 'at', 'by', 'for', 'with', 'without', 'to', 'from',
            'up', 'down', 'over', 'under', 'above', 'below', 'between', 'among',
            'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
            'my', 'your', 'his', 'its', 'our', 'their', 'mine', 'yours', 'ours',
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
            'do', 'does', 'did', 'will', 'would', 'can', 'could', 'should', 'must'
        }

        # English patterns
        self.english_patterns = [
            r'\b[a-z]+ing\b',        # -ing endings (running, walking)
            r'\b[a-z]+ed\b',         # -ed endings (worked, played)
            r'\b[a-z]+ly\b',         # -ly adverbs (quickly, slowly)
            r'\b[a-z]+tion\b',       # -tion endings (information, station)
            r'\b[a-z]+ness\b',       # -ness endings (business, fitness)
            r'\bth[a-z]+\b',         # words starting with 'th' (the, this, that)
            r'\b[a-z]*ough[a-z]*\b', # words with 'ough' (through, tough)
            r'\b[a-z]*ph[a-z]*\b',   # words with 'ph' (phone, graph)
        ]

        # English names common in customer service
        self.english_names = {
            'john', 'james', 'robert', 'michael', 'william', 'david', 'richard',
            'joseph', 'thomas', 'christopher', 'charles', 'daniel', 'matthew',
            'anthony', 'donald', 'steven', 'paul', 'andrew', 'joshua', 'kenneth',
            'mary', 'patricia', 'jennifer', 'linda', 'elizabeth', 'barbara',
            'susan', 'jessica', 'sarah', 'karen', 'nancy', 'lisa', 'betty',
            'helen', 'sandra', 'donna', 'carol', 'ruth', 'sharon', 'michelle',
            'smith', 'johnson', 'williams', 'brown', 'jones', 'garcia', 'miller',
            'davis', 'rodriguez', 'martinez', 'hernandez', 'lopez', 'gonzalez'
        }

        # Speaker patterns (same as Spanish detector)
        self.speaker_patterns = [
            r'(Agent|Customer|Agente|Cliente):\s*',
            r'(A|C|AG|CU):\s*',
            r'\[.*?\]:\s*',
            r'\d+:\d+:\d+\s*-?\s*'
        ]

        # spaCy is optional: the models stay None and the flag stays False
        # unless both pipelines load.
        self.nlp_en = None
        self.nlp_es = None
        self.spacy_available = False

        if SPACY_AVAILABLE:
            try:
                self.nlp_en = spacy.load("en_core_web_sm")
                self.nlp_es = spacy.load("es_core_news_sm")
                self.spacy_available = True
            except OSError:
                # spacy.load raises OSError when a model package is missing.
                self.spacy_available = False
    
    def clean_transcript_text(self, text: str) -> str:
        """Clean transcript text by removing speaker indicators and timestamps"""
        cleaned = text
        
        for pattern in self.speaker_patterns:
            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
        
        cleaned = re.sub(r'[\[\(]\d{1,2}:\d{2}:\d{2}[\]\)]', '', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        
        return cleaned
    
    def detect_english_words(self, text: str) -> List[Dict]:
        """Detect English words using dictionary lookup"""
        matches = []
        words = re.finditer(r'\b\w+\b', text)
        
        all_english_words = self.common_english_words | self.english_function_words
        
        for match in words:
            word = match.group().lower()
            if word in all_english_words:
                start, end = match.span()
                matches.append({
                    'text': match.group(),
                    'start': start,
                    'end': end,
                    'method': 'english_dictionary'
                })
        
        return matches
    
    def detect_english_patterns(self, text: str) -> List[Dict]:
        """Detect English words using morphological patterns"""
        matches = []
        
        for pattern in self.english_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                word = match.group()
                start, end = match.span()
                
                # Filter out Spanish words that might match English patterns
                if not self.looks_spanish(word.lower()):
                    matches.append({
                        'text': word,
                        'start': start,
                        'end': end,
                        'method': 'english_pattern'
                    })
        
        return matches
    
    def looks_spanish(self, word: str) -> bool:
        """Check if a word looks Spanish (to avoid false positives)"""
        # Spanish indicators
        spanish_chars = {'ñ', 'á', 'é', 'í', 'ó', 'ú', 'ü'}
        if any(char in word for char in spanish_chars):
            return True
        
        # Common Spanish endings
        spanish_endings = ['ción', 'sión', 'idad', 'mente', 'ando', 'iendo', 'ería']
        if any(word.endswith(ending) for ending in spanish_endings):
            return True
        
        return False
    
    def detect_english_names(self, text: str) -> List[Dict]:
        """Detect English names"""
        matches = []
        words = re.finditer(r'\b[A-Z][a-z]+\b', text)
        
        for match in words:
            word = match.group().lower()
            if word in self.english_names:
                start, end = match.span()
                matches.append({
                    'text': match.group(),
                    'start': start,
                    'end': end,
                    'method': 'english_name'
                })
        
        return matches
    
    def detect_english_phrases(self, text: str) -> List[Dict]:
        """Detect common English phrases"""
        phrases = []
        text_lower = text.lower()
        
        english_phrases = [
            'thank you', 'excuse me', 'customer service', 'social security',
            'credit card', 'bank account', 'phone number', 'zip code',
            'how much', 'how many', 'what time', 'right now', 'of course',
            'no problem', 'you\'re welcome', 'have a good day', 'take care',
            'let me know', 'make sure', 'find out', 'check out', 'sign up',
            'log in', 'log out', 'call back', 'hang up', 'pick up'
        ]
        
        for phrase in english_phrases:
            pattern = r'\b' + re.escape(phrase) + r'\b'
            for match in re.finditer(pattern, text_lower):
                start, end = match.span()
                original_phrase = text[start:end]
                phrases.append({
                    'text': original_phrase,
                    'start': start,
                    'end': end,
                    'method': 'english_phrase'
                })
        
        return phrases
    
    def detect_english_in_spanish_context(self, text: str) -> List[Dict]:
        """Find vocabulary hits inside sentences that langdetect rates as Spanish.

        The text is cut into sentences at runs of ``.``, ``!`` or ``?``.
        Sentences shorter than 10 characters after stripping are skipped.
        A sentence counts as Spanish context when ``detect_langs`` reports
        ``es`` with a probability above 0.6; inside such a sentence every
        token found in ``common_english_words`` or ``english_function_words``
        (compared in lower case) is reported.

        Args:
            text: Text to scan; offsets in the result refer to this string.

        Returns:
            One dict per hit with keys ``text``, ``start``, ``end``,
            ``method`` (``'english_in_spanish_context'``),
            ``spanish_sentence`` and ``confidence`` (the probability reported
            for ``es``).  Empty when langdetect is not installed.
        """
        if not LANGDETECT_AVAILABLE:
            return []

        # Tokens are compared in lower case, so normalise the vocabulary too.
        english_vocab = {
            entry.lower()
            for entry in self.common_english_words | self.english_function_words
        }
        results = []

        # Iterate over sentence spans instead of re.split + str.find so that
        # offsets are exact even when the same sentence occurs more than once.
        for segment in re.finditer(r'[^.!?]+', text):
            raw = segment.group()
            sentence = raw.strip()
            if len(sentence) < 10:
                continue

            # Offset of the stripped sentence inside the original text.
            sent_start = segment.start() + (len(raw) - len(raw.lstrip()))

            try:
                detected_langs = detect_langs(sentence)
            except Exception:
                # Best effort: a sentence langdetect cannot classify (for
                # example one without letters) is simply skipped.
                continue

            spanish_prob = next(
                (info.prob for info in detected_langs
                 if info.lang == 'es' and info.prob > 0.6),
                None,
            )
            if spanish_prob is None:
                continue

            # Find English words in this Spanish sentence
            for word_match in re.finditer(r'\b\w+\b', sentence):
                if word_match.group().lower() not in english_vocab:
                    continue

                results.append({
                    'text': word_match.group(),
                    'start': sent_start + word_match.start(),
                    'end': sent_start + word_match.end(),
                    'method': 'english_in_spanish_context',
                    'spanish_sentence': sentence,
                    'confidence': spanish_prob
                })

        return results
    
    def spacy_english_detection(self, text: str) -> List[Dict]:
        """Use spaCy to detect English linguistic features"""
        if not self.spacy_available or not self.nlp_en:
            return []
        
        english_features = []
        
        try:
            doc = self.nlp_en(text)
            
            for token in doc:
                # Skip punctuation and spaces
                if not token.is_alpha:
                    continue
                
                # English linguistic indicators
                is_english = False
                feature_type = None
                
                # English POS patterns
                if token.pos_ in ['VERB'] and token.text.endswith(('ing', 'ed')):
                    is_english = True
                    feature_type = 'english_verb_form'
                elif token.pos_ == 'ADV' and token.text.endswith('ly'):
                    is_english = True
                    feature_type = 'english_adverb'
                elif token.pos_ == 'DET' and token.text.lower() in ['the', 'a', 'an']:
                    is_english = True
                    feature_type = 'english_article'
                elif token.text.lower() in self.english_function_words:
                    is_english = True
                    feature_type = 'english_function_word'
                
                if is_english:
                    english_features.append({
                        'text': token.text,
                        'start': token.idx,
                        'end': token.idx + len(token.text),
                        'method': 'spacy_english_linguistic',
                        'feature_type': feature_type,
                        'pos': token.pos_
                    })
        
        except:
            pass
        
        return english_features
    
    def analyze_spanish_text(self, text: str) -> Dict:
        """Main method to analyze Spanish text for English content"""
        cleaned_text = self.clean_transcript_text(text)
        
        results = {
            'original_text': text,
            'cleaned_text': cleaned_text,
            'english_words': [],
            'english_patterns': [],
            'english_names': [],
            'english_phrases': [],
            'english_in_spanish_context': [],
            'spacy_english_features': [],
            'summary': defaultdict(int),
            'spacy_status': self.spacy_available
        }
        
        # Detect English words
        english_words = self.detect_english_words(cleaned_text)
        results['english_words'] = english_words
        results['summary']['english_words'] = len(english_words)
        
        # Detect English patterns
        english_patterns = self.detect_english_patterns(cleaned_text)
        results['english_patterns'] = english_patterns
        results['summary']['english_patterns'] = len(english_patterns)
        
        # Detect English names
        english_names = self.detect_english_names(cleaned_text)
        results['english_names'] = english_names
        results['summary']['english_names'] = len(english_names)
        
        # Detect English phrases
        english_phrases = self.detect_english_phrases(cleaned_text)
        results['english_phrases'] = english_phrases
        results['summary']['english_phrases'] = len(english_phrases)
        
        # Detect English in Spanish context
        english_in_context = self.detect_english_in_spanish_context(cleaned_text)
        results['english_in_spanish_context'] = english_in_context
        results['summary']['english_in_spanish_context'] = len(english_in_context)
        
        # spaCy analysis
        if self.spacy_available:
            spacy_features = self.spacy_english_detection(cleaned_text)
            results['spacy_english_features'] = spacy_features
            results['summary']['spacy_english_features'] = len(spacy_features)
        
        return results
    
    def get_english_content_summary(self, text: str) -> Dict:
        """Get summary of English content in Spanish text"""
        analysis = self.analyze_spanish_text(text)
        
        # Collect all English content
        all_english = []
        
        for category in ['english_words', 'english_patterns', 'english_names', 
                        'english_phrases', 'english_in_spanish_context']:
            all_english.extend([item['text'] for item in analysis[category]])
        
        unique_english = list(set([item.lower() for item in all_english]))
        
        summary = {
            'total_english_items': len(all_english),
            'unique_english_items': len(unique_english),
            'english_content': sorted(unique_english),
            'code_switching_detected': len(analysis['english_in_spanish_context']) > 0,
            'predominant_language': 'mixed' if len(all_english) > 10 else 'spanish',
            'english_categories': {
                'business_terms': any(word in unique_english 
                                    for word in ['account', 'payment', 'service', 'manager']),
                'technology_terms': any(word in unique_english 
                                      for word in ['email', 'computer', 'online', 'app']),
                'greetings': any(word in unique_english 
                               for word in ['hello', 'hi', 'thank you', 'thanks']),
                'english_names': len(analysis['english_names']) > 0
            }
        }
        
        return summary

# Example usage
def test_english_in_spanish():
    """Demo driver: analyse a few mixed Spanish/English samples and print the findings."""
    detector = EnglishInSpanishDetector()

    # Example Spanish texts with English mixed in
    samples = (
        # Technology context
        "Necesito ayuda con mi email. No puedo hacer login en la website.",

        # Customer service context
        "Hola, quiero hablar con el manager, por favor. Tengo problema con mi account.",

        # Mixed conversation
        "Mi nombre es María García. I need help con mi credit card. Thank you.",

        # Business context
        "Trabajo en marketing y uso mucho software para analytics. Es muy helpful.",

        # Daily conversation
        "Voy al mall con mis friends. Después vamos a eat en el restaurant.",
    )

    rule = '=' * 60

    for number, sample in enumerate(samples, start=1):
        print(f"\n{rule}")
        print(f"SPANISH TEXT {number}:")
        print(f"{rule}")
        print(f"Original: {sample}")

        # High-level summary of the English content
        overview = detector.get_english_content_summary(sample)
        flags = overview['english_categories']

        print(f"\nENGLISH CONTENT SUMMARY:")
        print(f"- Total English items: {overview['total_english_items']}")
        print(f"- Unique English content: {overview['english_content']}")
        print(f"- Code-switching detected: {overview['code_switching_detected']}")
        print(f"- Contains business terms: {flags['business_terms']}")
        print(f"- Contains technology terms: {flags['technology_terms']}")

        # Per-detector details
        details = detector.analyze_spanish_text(sample)

        phrase_hits = details['english_phrases']
        if phrase_hits:
            print(f"\nENGLISH PHRASES FOUND:")
            for hit in phrase_hits:
                print(f"  - '{hit['text']}'")

        context_hits = details['english_in_spanish_context']
        if context_hits:
            print(f"\nENGLISH IN SPANISH CONTEXT:")
            for hit in context_hits:
                print(f"  - '{hit['text']}' in: '{hit['spanish_sentence']}'")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    test_english_in_spanish()