In [12]:
import csv
import re
from collections import Counter

def analyze_cluster_file(file_path):
    """Print descriptive statistics about the queries in a tab-separated file.

    The file is read as UTF-8 TSV. The first row is skipped as a header and
    the query text is taken from the second column of each remaining row.
    Rows with fewer than two columns, or whose query is blank after
    stripping, are ignored.

    The report written to stdout covers: total query count, how many queries
    end with '?', how many start with an English W-question word, how many
    contain a digit, average length in characters and words, the most common
    first words / first two words, the most common character lengths, counts
    for a fixed list of two-word question openers, the first words and a few
    samples of queries that do not start with a W-question word, and the
    distribution of final characters for queries not ending in '?'.

    A file with no usable queries produces the same report with zero counts
    and 0.00% shares instead of raising.

    Args:
        file_path: Path of the TSV file to analyze.

    Returns:
        None. All results are printed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    total_queries = 0
    start_word_1gram = Counter()
    start_word_2gram = Counter()
    end_question_count = 0
    length_distribution = Counter()
    total_length_chars = 0
    total_length_words = 0
    queries_with_numbers = 0
    w_question_starts = 0

    # Additional counters for queries that are not W-questions and for the
    # final character of queries that do not end in '?'.
    non_w_start_counter = Counter()
    non_w_queries = []
    end_char_counter = Counter()
    no_end_punctuation = 0

    # Only this many sample openers are printed, so no more are stored.
    max_samples = 10

    # W-question words
    w_question_words = {"what", "when", "where", "why", "who", "how", "which", "whose", "whom"}

    # Two-word question openers of interest; the list keeps the print order,
    # the frozenset is used for membership tests inside the loop.
    frequent_patterns = [
        "what is", "how to", "how does", "how do", "why does", "can i",
        "does it", "what are", "why is", "why do"
    ]
    frequent_pattern_set = frozenset(frequent_patterns)
    pattern_counter = Counter()

    number_pattern = re.compile(r'\d+')

    def share(count):
        """Return count / total_queries, or 0.0 when nothing was counted."""
        return count / total_queries if total_queries else 0.0

    # newline='' hands line-ending handling to the csv module, as its
    # documentation recommends for file objects passed to csv.reader.
    # NOTE(review): default csv quoting is in effect; if queries may start
    # with a double quote, confirm that is the intended parsing.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

        for row in reader:
            if len(row) < 2:
                continue

            query = row[1].strip()
            if not query:
                continue

            total_queries += 1
            # query is non-empty after strip(), so split() yields >= 1 word.
            words = query.lower().split()
            total_length_chars += len(query)
            total_length_words += len(words)
            length_distribution[len(query)] += 1

            if number_pattern.search(query):
                queries_with_numbers += 1

            last_char = query[-1]
            if last_char == '?':
                end_question_count += 1
            elif last_char.isalnum():
                no_end_punctuation += 1
            else:
                end_char_counter[last_char] += 1

            start_word = words[0]
            start_word_1gram[start_word] += 1

            if start_word in w_question_words:
                w_question_starts += 1
            else:
                non_w_start_counter[start_word] += 1
                if len(non_w_queries) < max_samples:
                    non_w_queries.append(' '.join(words[:5]))

            if len(words) > 1:
                first_two = f"{words[0]} {words[1]}"
                start_word_2gram[first_two] += 1
                if first_two in frequent_pattern_set:
                    pattern_counter[first_two] += 1

    top_1grams = start_word_1gram.most_common(10)
    top_2grams = start_word_2gram.most_common(10)
    top_lengths = length_distribution.most_common(10)
    avg_chars = share(total_length_chars)
    avg_words = share(total_length_words)
    non_w_total = total_queries - w_question_starts

    # Output
    print(f"Total queries: {total_queries}")
    print(f"Queries ending with '?': {end_question_count}")
    print(f"Queries starting with W-question words: {w_question_starts} ({share(w_question_starts):.2%})")
    print(f"Queries not starting with W-question words: {non_w_total} ({share(non_w_total):.2%})")
    print(f"Queries containing numbers: {queries_with_numbers} ({share(queries_with_numbers):.2%})")
    print(f"\nAverage query length: {avg_chars:.2f} characters, {avg_words:.2f} words")

    print("\nTop 10 most common 1-word start phrases:")
    for word, count in top_1grams:
        print(f"  {word}: {count}x")

    print("\nTop 10 most common 2-word start phrases:")
    for phrase, count in top_2grams:
        print(f"  {phrase}: {count}x")

    print("\nTop 10 most common query lengths:")
    for length, count in top_lengths:
        print(f"  Length {length}: {count} queries")

    print(f"\nMost common query length: {top_lengths[0][0] if top_lengths else 'N/A'}")

    print("\nMost frequent question patterns (of interest):")
    for phrase in frequent_patterns:
        print(f"  {phrase}: {pattern_counter[phrase]}x")

    print("\nTop 10 start words for non-W queries:")
    for word, count in non_w_start_counter.most_common(10):
        print(f"  {word}: {count}x")

    print("\nSample non-W-query patterns (first 5 words):")
    for example in non_w_queries[:max_samples]:
        print(f"  {example}...")

    print("\nQuery-Endzeichen (außer '?'):")
    for char, count in end_char_counter.most_common(10):
        # repr() makes whitespace-like or otherwise invisible characters readable.
        readable = repr(char)
        print(f"  {readable}: {count}x")

    print(f"\nQueries ohne jegliches Satzzeichen am Ende: {no_end_punctuation} ({share(no_end_punctuation):.2%})")

# Notebook driver: run the query report on cluster3.tsv.
file_path = 'cluster3.tsv'
analyze_cluster_file(file_path=file_path)


Total queries: 13401
Queries ending with '?': 3880
Queries starting with W-question words: 9731 (72.61%)
Queries not starting with W-question words: 3670 (27.39%)
Queries containing numbers: 2983 (22.26%)

Average query length: 79.05 characters, 11.30 words

Top 10 most common 1-word start phrases:
  how: 5212x
  what: 2970x
  why: 1090x
  is: 406x
  does: 394x
  do: 379x
  can: 351x
  are: 294x
  when: 244x
  who: 124x

Top 10 most common 2-word start phrases:
  what is: 1459x
  how to: 1301x
  how does: 820x
  what are: 514x
  how do: 507x
  how can: 243x
  what situations: 154x
  do not: 153x
  why is: 148x
  why do: 135x

Top 10 most common query lengths:
  Length 48: 262 queries
  Length 94: 233 queries
  Length 46: 215 queries
  Length 79: 213 queries
  Length 96: 205 queries
  Length 91: 188 queries
  Length 104: 186 queries
  Length 88: 185 queries
  Length 108: 182 queries
  Length 106: 181 queries

Most common query length: 48

Most frequent question patterns (of interest):
 