### Text analysis 

This notebook performs text analysis on a plain-text file and lets you explore the results interactively.

#### Word analysis:

* Frequency analysis of verbs, nouns, and adjectives
* Identification of longest words by part of speech
* Exploration of words within specific frequency ranges
* Calculation of word statistics (total, unique, mean/median frequency)

#### Noun Phrase analysis:

* Extraction and counting of frequent noun phrases
* Identification of least common noun phrases above a minimum frequency
* Exploration of noun phrases within a specific frequency range
* Detection of longest noun phrases

#### Word cloud visualization:

* Generation of customizable word clouds for verbs, nouns, and adjectives
* Ability to choose color scheme, maximum number of words, and background color

#### Use cases:
* Terminology extraction
* Vocabulary research
* Content analysis

#### Recommended: familiarity with
- Python basics (variables, functions)
- Environment management
- Directory structure
- Package management
- Git for version control

In [None]:
import numpy as np # numerical computing library for array operations and mathematical functions
from collections import Counter # to count occurrences of items 
import spacy # library for text processing and linguistic analysis
from wordcloud import WordCloud # library for creating word cloud visualizations
import matplotlib.pyplot as plt # plotting library for creating static, animated, and interactive visualizations
from nltk.corpus import stopwords # Natural Language Toolkit's collection of text corpora, including stopwords
import re # regular expressions for pattern matching in text

In [None]:
class TextPreprocessor:
    """Load raw text and run it through spaCy in paragraph-aligned chunks.

    Attributes:
        nlp: The spaCy pipeline (``en_core_web_sm``) applied to every chunk.
        stop_words: NLTK's English stopword list, stored as a set.
        chunk_size: Soft limit, in characters of paragraph text, on how much
            is gathered into one chunk before it is handed to ``nlp``.
    """

    def __init__(self, chunk_size=100000):
        self.nlp = spacy.load('en_core_web_sm')
        self.stop_words = set(stopwords.words('english'))
        self.chunk_size = chunk_size

    def load_text(self, file_path):
        """Read a UTF-8 text file and return its contents.

        Args:
            file_path: Path of the file to read.

        Returns:
            The file contents, or an empty string (after printing an error
            message) when the file does not exist.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print(f"Error: The file {file_path} was not found.")
            return ""

    def split_paragraphs(self, text):
        """Split text on blank lines and return the non-empty, stripped paragraphs."""
        return [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]

    def process_text(self, text):
        """Process text in chunks to avoid memory issues.

        Paragraphs are accumulated until adding the next one would push the
        running paragraph length past ``chunk_size``; the accumulated
        paragraphs are then joined with blank lines and passed to ``nlp``.
        Paragraphs are never split, so a single paragraph longer than
        ``chunk_size`` becomes a chunk of its own.

        Args:
            text: The full text to process.

        Returns:
            A list with one ``nlp`` result per chunk, in document order.
            Empty when the text contains no paragraphs.
        """
        paragraphs = self.split_paragraphs(text)
        chunks = []
        current_chunk = []
        current_length = 0

        for para in paragraphs:
            # Flush only when something is pending. Without this check, an
            # oversized paragraph arriving while the buffer is empty (e.g. the
            # very first paragraph) would emit a spurious empty document.
            if current_chunk and current_length + len(para) > self.chunk_size:
                chunks.append(self.nlp('\n\n'.join(current_chunk)))
                current_chunk = []
                current_length = 0
            current_chunk.append(para)
            current_length += len(para)

        # Process whatever is left after the last paragraph.
        if current_chunk:
            chunks.append(self.nlp('\n\n'.join(current_chunk)))

        return chunks

class FrequencyAnalyzer:
    """Counts verb, noun and adjective lemmas in processed documents."""

    def __init__(self, preprocessor):
        # Kept for its ``stop_words`` set, consulted while counting.
        self.preprocessor = preprocessor

    def get_pos_frequencies(self, doc_chunks):
        """Tally lemma frequencies per part of speech.

        Args:
            doc_chunks: Either a list of processed documents or a single one.

        Returns:
            A ``(verbs, nouns, adjectives)`` tuple of ``FrequencyDist`` objects.
        """
        verb_counts, noun_counts, adj_counts = Counter(), Counter(), Counter()

        # Normalise the argument so one loop serves both call styles.
        docs = doc_chunks if isinstance(doc_chunks, list) else [doc_chunks]
        for doc in docs:
            self._process_doc_tokens(doc, verb_counts, noun_counts, adj_counts)

        return (
            FrequencyDist(verb_counts),
            FrequencyDist(noun_counts),
            FrequencyDist(adj_counts),
        )

    def _process_doc_tokens(self, doc, verbs, nouns, adjectives):
        """Add the lemmas of one document to the given counters (in place).

        Tokens are skipped when they are stop words, are not purely
        alphabetic, or are at most two characters long, and when their
        lower-cased lemma is in the preprocessor's stopword set.
        """
        counter_for_pos = {"VERB": verbs, "NOUN": nouns, "ADJ": adjectives}

        for token in doc:
            if token.is_stop or not token.is_alpha or len(token.text) <= 2:
                continue

            lemma = token.lemma_.lower()
            if lemma in self.preprocessor.stop_words:
                continue

            target = counter_for_pos.get(token.pos_)
            # Compare against None: an empty Counter is falsy but still valid.
            if target is not None:
                target[lemma] += 1

class FrequencyDist:
    """Read-only view over a counter with frequency-oriented queries.

    Attributes:
        counter: The underlying mapping of item (word or phrase) to count.
    """

    def __init__(self, counter):
        self.counter = counter
        # Sorted once up front: highest count first, ties in alphabetical order.
        self._sorted_items = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    def most_common(self, n=None):
        """Return the ``n`` most frequent ``(item, count)`` pairs (all if ``n`` is None)."""
        if n is None:
            return self._sorted_items
        return self._sorted_items[:n]

    def least_common(self, n=None, min_freq=1):
        """Return the ``n`` least frequent pairs whose count is at least ``min_freq``.

        Results are ordered from lowest count upwards; all matching pairs are
        returned when ``n`` is None.
        """
        filtered = [(w, c) for w, c in self._sorted_items if c >= min_freq]
        if n is None:
            return filtered[::-1]
        return filtered[::-1][:n]

    def get_range(self, start=None, end=None):
        """Return pairs whose count lies in ``[start, end]``.

        Either bound may be None to leave that side open. Results keep the
        most-frequent-first ordering.
        """
        items = [(w, c) for w, c in self._sorted_items
                 if (start is None or c >= start) and
                    (end is None or c <= end)]
        return items

    def get_stats(self):
        """Calculate summary statistics over the counts.

        Returns:
            A dict with ``total_occurrences``, ``unique_words``,
            ``mean_frequency``, ``median_frequency``, ``max_frequency`` and
            ``min_frequency``. For an empty distribution every value is zero.
        """
        frequencies = list(self.counter.values())
        if not frequencies:
            # max()/min() raise ValueError on an empty list and np.mean yields
            # nan with a warning, so report an all-zero summary instead.
            return {
                'total_occurrences': 0,
                'unique_words': 0,
                'mean_frequency': 0.0,
                'median_frequency': 0.0,
                'max_frequency': 0,
                'min_frequency': 0
            }
        return {
            'total_occurrences': sum(frequencies),
            'unique_words': len(self.counter),
            'mean_frequency': np.mean(frequencies),
            'median_frequency': np.median(frequencies),
            'max_frequency': max(frequencies),
            'min_frequency': min(frequencies)
        }

    def get_longest(self, n=10):
        """Return the ``n`` longest items by character length.

        Items of equal length are ordered by descending count.
        """
        sorted_by_length = sorted(self.counter.items(),
                                  key=lambda x: (len(x[0]), x[1]),
                                  reverse=True)
        return sorted_by_length[:n]

    def items(self):
        """Return the ``(item, count)`` pairs of the underlying counter."""
        return self.counter.items()

    def values(self):
        """Return the counts of the underlying counter."""
        return self.counter.values()

    def keys(self):
        """Return the items (words or phrases) of the underlying counter."""
        return self.counter.keys()

    def __getitem__(self, key):
        """Allow dictionary-style access to an item's count."""
        return self.counter[key]

class PhraseAnalyzer:
    """Counts multi-word noun phrases taken from each document's ``noun_chunks``."""

    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def get_noun_phrases(self, doc_chunks):
        """Collect normalised noun phrases and how often each occurs.

        Each noun chunk is reduced to the lower-cased lemmas of its tokens
        that are alphabetic and not stop words; only results that still
        contain at least two words are counted.

        Args:
            doc_chunks: Iterable of processed documents exposing ``noun_chunks``.

        Returns:
            A ``FrequencyDist`` over the normalised phrases.
        """
        phrase_counts = Counter()

        for doc in doc_chunks:
            for noun_chunk in doc.noun_chunks:
                kept_lemmas = []
                for tok in noun_chunk:
                    if tok.is_stop or not tok.is_alpha:
                        continue
                    kept_lemmas.append(tok.lemma_.lower())

                phrase = ' '.join(kept_lemmas)
                # Single words (and chunks filtered down to nothing) are not phrases.
                if len(phrase.split()) < 2:
                    continue
                phrase_counts[phrase] += 1

        return FrequencyDist(phrase_counts)

class TextAnalyzer:
    """Facade that wires the preprocessor to the word and phrase analyzers."""

    def __init__(self, chunk_size=100000):
        # One preprocessor instance is shared by both analyzers.
        shared_preprocessor = TextPreprocessor(chunk_size=chunk_size)
        self.preprocessor = shared_preprocessor
        self.frequency_analyzer = FrequencyAnalyzer(shared_preprocessor)
        self.phrase_analyzer = PhraseAnalyzer(shared_preprocessor)

    def analyze(self, file_path):
        """Run the whole pipeline on one file, printing progress as it goes.

        Args:
            file_path: Path of the text file to analyse.

        Returns:
            A ``(verbs, nouns, adjectives, noun_phrases)`` tuple holding the
            results of the frequency analyzer followed by the phrase analyzer.
        """
        print("Loading and preprocessing text...")
        raw_text = self.preprocessor.load_text(file_path)
        docs = self.preprocessor.process_text(raw_text)

        print("Analyzing word frequencies...")
        verb_dist, noun_dist, adj_dist = (
            self.frequency_analyzer.get_pos_frequencies(docs)
        )

        print("Analyzing noun phrases...")
        phrase_dist = self.phrase_analyzer.get_noun_phrases(docs)

        return verb_dist, noun_dist, adj_dist, phrase_dist

class WordCloudGenerator:
    """Renders word clouds from frequency distributions, optionally interactively.

    Attributes:
        colormap_options: Colour scheme names offered to the user, in menu order.
    """

    def __init__(self):
        self.colormap_options = ['viridis', 'Blues', 'Reds', 'Greens',
                                 'Purples', 'plasma', 'inferno']

    def create_wordcloud(self, freq_dist, title,
                         colormap='viridis',
                         width=800,
                         height=400,
                         max_words=100,
                         background_color='white'):
        """Create and display a word cloud.

        Args:
            freq_dist: Object whose ``counter`` attribute maps words to counts.
            title: Title shown above the figure.
            colormap: Colour scheme name passed to ``WordCloud``.
            width: Width passed to ``WordCloud``.
            height: Height passed to ``WordCloud``.
            max_words: Maximum number of words to draw.
            background_color: Background colour of the image.

        Returns:
            None. When the distribution is empty a message is printed and
            nothing is drawn.
        """
        if not freq_dist.counter:
            # WordCloud refuses to build from an empty frequency mapping, so
            # report it instead of letting the exception escape to the menu.
            print(f"No words available for '{title}'; nothing to draw.")
            return

        wordcloud = WordCloud(
            width=width,
            height=height,
            background_color=background_color,
            max_words=max_words,
            colormap=colormap
        ).generate_from_frequencies(freq_dist.counter)

        plt.figure(figsize=(15, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(title)
        plt.show()

    def resolve_colormap(self, choice):
        """Translate a 1-based menu answer into a colour scheme name.

        Args:
            choice: The raw answer, expected to be a number from 1 to
                ``len(self.colormap_options)``.

        Returns:
            The selected colour scheme, or ``'viridis'`` (after printing a
            notice) when the answer is not a number in that range.
        """
        try:
            index = int(choice) - 1
        except (TypeError, ValueError):
            index = -1
        # Explicit range check: a plain list lookup would accept 0 and
        # negative answers by indexing from the end of the list.
        if 0 <= index < len(self.colormap_options):
            return self.colormap_options[index]
        print("Invalid choice, using default (viridis)")
        return 'viridis'

    def resolve_max_words(self, raw):
        """Translate the answer to the "maximum number of words" prompt.

        Args:
            raw: The raw answer string.

        Returns:
            The requested positive integer. A blank answer selects the
            default of 100 silently; anything else that is not a positive
            integer selects 100 after printing a notice.
        """
        text = raw.strip()
        if not text:
            return 100
        try:
            value = int(text)
        except ValueError:
            value = 0
        if value > 0:
            return value
        print("Invalid input, using default (100)")
        return 100

    def interactive_wordcloud(self, freq_dist, pos_name):
        """Ask the user for display settings, then draw the word cloud.

        Args:
            freq_dist: Distribution to visualise.
            pos_name: Label used in the prompts and the figure title.
        """
        print(f"\nCreating word cloud for {pos_name}")
        print("\nAvailable color schemes:")
        for i, cmap in enumerate(self.colormap_options, 1):
            print(f"{i}. {cmap}")

        colormap = self.resolve_colormap(
            input("\nChoose color scheme (number): "))
        max_words = self.resolve_max_words(
            input("Maximum number of words (default 100): "))

        bg_color = input("Background color (white/black, default white): ").strip().lower()
        if bg_color not in ['white', 'black']:
            bg_color = 'white'

        self.create_wordcloud(
            freq_dist,
            f"{pos_name} Word Cloud",
            colormap=colormap,
            max_words=max_words,
            background_color=bg_color
        )

# Answers accepted by the word and noun-phrase sub-menus.
_SUB_MENU_CHOICES = ('1', '2', '3', '4')


def _ask_count(unit):
    """Prompt for a positive count of ``unit`` ("words" or "phrases").

    Returns the count, or None after printing an error message.
    """
    try:
        n = int(input(f"How many {unit}? "))
    except ValueError:
        print("Please enter a valid number")
        return None
    if n <= 0:
        print("Please enter a positive number")
        return None
    return n


def _ask_count_and_min_freq(unit):
    """Prompt for a positive count and a non-negative minimum frequency.

    Returns ``(n, min_freq)``, or None after printing an error message.
    """
    try:
        n = int(input(f"How many {unit}? "))
        min_freq = int(input("Minimum frequency? "))
    except ValueError:
        print("Please enter valid numbers")
        return None
    if n <= 0 or min_freq < 0:
        print("Please enter valid numbers (n > 0, min_freq >= 0)")
        return None
    return n, min_freq


def _ask_frequency_range():
    """Prompt for an inclusive frequency range.

    Returns ``(start, end)``, or None after printing an error message.
    """
    try:
        start = int(input("Start frequency: "))
        end = int(input("End frequency: "))
    except ValueError:
        print("Please enter valid numbers")
        return None
    if start < 0 or end < start:
        print("Please enter valid frequencies (start >= 0, end >= start)")
        return None
    return start, end


def _ask_pos(pos_table):
    """Prompt for a part of speech; return its ``(name, distribution)`` or None."""
    pos = input("Which POS? (v/n/a): ").lower()
    if pos not in pos_table:
        print("Please enter a valid POS (v for verbs, n for nouns, a for adjectives)")
        return None
    return pos_table[pos]


def _run_frequency_option(option, freq_dist, label, heading, unit):
    """Carry out one sub-menu option against a distribution.

    Args:
        option: Sub-menu answer, one of '1' to '4'.
        freq_dist: Distribution to query.
        label: Lower-case plural used mid-sentence (e.g. "verbs").
        heading: The same label capitalised for the start of a line.
        unit: Noun used in the "How many ...?" prompt.

    Returns:
        True when results were printed, False when the user's input was
        rejected (an error message has already been shown).
    """
    if option == '1':
        n = _ask_count(unit)
        if n is None:
            return False
        print(f"\nMost common {label}:")
        for item, count in freq_dist.most_common(n):
            print(f"{item}: {count}")

    elif option == '2':
        answer = _ask_count_and_min_freq(unit)
        if answer is None:
            return False
        n, min_freq = answer
        print(f"\nLeast common {label} (min freq: {min_freq}):")
        for item, count in freq_dist.least_common(n, min_freq):
            print(f"{item}: {count}")

    elif option == '3':
        answer = _ask_frequency_range()
        if answer is None:
            return False
        start, end = answer
        print(f"\n{heading} with frequency between {start} and {end}:")
        for item, count in freq_dist.get_range(start, end):
            print(f"{item}: {count}")

    elif option == '4':
        n = _ask_count(unit)
        if n is None:
            return False
        print(f"\nLongest {label}:")
        for item, count in freq_dist.get_longest(n):
            print(f"{item} ({len(item)} chars): {count} occurrences")

    return True


def _print_stats(pos_type, freq_dist):
    """Print the summary statistics of one part-of-speech distribution."""
    print(f"\n{pos_type} Statistics:")
    print("-" * 50)
    if not freq_dist.keys():
        # Statistics such as max/min are undefined without data; say so
        # rather than letting the whole menu loop crash.
        print(f"No {pos_type.lower()} found")
        return
    stats = freq_dist.get_stats()
    print(f"Total {pos_type.lower()}: {stats['total_occurrences']}")
    print(f"Unique {pos_type.lower()}: {stats['unique_words']}")
    print(f"Average frequency: {stats['mean_frequency']:.2f}")
    print(f"Median frequency: {stats['median_frequency']:.2f}")
    print(f"Most frequent: {stats['max_frequency']}")
    print(f"Least frequent: {stats['min_frequency']}")


def interactive_analysis(verbs, nouns, adjectives, noun_phrases):
    """Menu loop for exploring frequencies and creating word clouds.

    Repeatedly shows the main menu until the user chooses Exit. Invalid
    answers print a hint and return straight to the main menu.

    Args:
        verbs: Frequency distribution of verb lemmas.
        nouns: Frequency distribution of noun lemmas.
        adjectives: Frequency distribution of adjective lemmas.
        noun_phrases: Frequency distribution of noun phrases.
    """
    wordcloud_gen = WordCloudGenerator()
    # Menu letter -> (display name, distribution); order also drives the
    # statistics report.
    pos_table = {
        'v': ('Verbs', verbs),
        'n': ('Nouns', nouns),
        'a': ('Adjectives', adjectives),
    }

    while True:
        print("\nAnalysis Options:")
        print("1. Word analysis")
        print("2. Noun Phrase analysis")
        print("3. Create word cloud")
        print("4. Show word statistics")
        print("5. Exit")

        choice = input("\nEnter your choice (1-5): ")
        if choice not in ['1', '2', '3', '4', '5']:
            print("Please enter a valid option (1-5)")
            continue

        if choice == '5':
            break

        if choice == '1':
            print("\nWord Analysis Options:")
            print("1. Most common words")
            print("2. Least common words")
            print("3. Show words in frequency range")
            print("4. Show longest words")

            word_choice = input("\nEnter your choice (1-4): ")
            if word_choice not in _SUB_MENU_CHOICES:
                print("Please enter a valid option (1-4)")
                continue

            selected = _ask_pos(pos_table)
            if selected is None:
                continue
            pos_name, freq_dist = selected

            if not _run_frequency_option(word_choice, freq_dist,
                                         pos_name.lower(), pos_name, "words"):
                continue

        elif choice == '2':
            print("\nNoun Phrase Analysis Options:")
            print("1. Most common noun phrases")
            print("2. Least common noun phrases")
            print("3. Show frequency range")
            print("4. Show longest phrases")

            phrase_choice = input("\nEnter your choice (1-4): ")
            if phrase_choice not in _SUB_MENU_CHOICES:
                print("Please enter a valid option (1-4)")
                continue

            if not _run_frequency_option(phrase_choice, noun_phrases,
                                         "noun phrases", "Noun phrases",
                                         "phrases"):
                continue

        elif choice == '3':
            selected = _ask_pos(pos_table)
            if selected is None:
                continue
            pos_name, freq_dist = selected
            wordcloud_gen.interactive_wordcloud(freq_dist, pos_name)

        elif choice == '4':
            for pos_type, freq_dist in pos_table.values():
                _print_stats(pos_type, freq_dist)

        input("\nPress Enter to continue...")
if __name__ == "__main__":
    # Build the pipeline; chunk_size is forwarded to the TextPreprocessor.
    analyzer = TextAnalyzer(chunk_size=100000)
    # Analyse the sample text. The four distributions are kept as
    # module-level names.
    verbs, nouns, adjectives, noun_phrases = analyzer.analyze('input_data/ulysses.txt')
    # Start the menu loop; it keeps prompting until the user chooses Exit.
    interactive_analysis(verbs, nouns, adjectives, noun_phrases)