In [26]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
# Fetch the NLTK resources requested by this notebook: the 'punkt' tokenizer
# data and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/ihoyeol/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ihoyeol/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [51]:
# Text cleaning function
def clean_text(text):
    """Normalize ``text`` for token-level analysis.

    The input is lowercased, every character that is not an ASCII letter,
    a digit, or whitespace is deleted, and runs of whitespace are collapsed
    to single spaces (leading/trailing whitespace is dropped).

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    # Deleting (not replacing) symbols means "odd-numbered" becomes "oddnumbered".
    symbols_removed = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    return " ".join(symbols_removed.split())

# Sentence complexity analysis (operates on the original, uncleaned text)
def analyze_sentence_complexity(original_text):
    """Return the average number of words per sentence.

    Args:
        original_text: The raw text. Pass the uncleaned text rather than the
            output of ``clean_text``, which strips the punctuation marks.

    Returns:
        The mean word count per sentence, or 0 when no sentences are found.
    """
    sentences = sent_tokenize(original_text)
    if not sentences:
        return 0

    # word_tokenize emits punctuation marks (",", ".", "%", "'") as separate
    # tokens; count only tokens containing a letter or digit so punctuation
    # does not inflate the per-sentence word count.
    word_counts = [
        sum(
            1
            for token in word_tokenize(sentence)
            if any(ch.isalnum() for ch in token)
        )
        for sentence in sentences
    ]
    return sum(word_counts) / len(sentences)

def analyze_repetition_and_print_frequencies_sorted(original_text):
    """Measure how much of the text is made up of repeated words.

    Tokens are lowercased before counting so that "The" and "the" are the
    same word, and punctuation-only tokens are ignored so that "," or "."
    are not reported as repeated words. Despite the function name, nothing
    is printed here; the caller is responsible for displaying the result.

    Args:
        original_text: The raw text to analyze.

    Returns:
        A ``(repetition_ratio, repeated_words_sorted)`` tuple where
        ``repetition_ratio`` is the fraction (0..1) of word tokens that belong
        to a word occurring more than once (0 when the text has no words), and
        ``repeated_words_sorted`` is a list of ``(word, count)`` pairs for
        those words, sorted by count in descending order.
    """
    words = [
        token.lower()
        for token in word_tokenize(original_text)
        if any(ch.isalnum() for ch in token)
    ]
    if not words:
        return 0, []

    word_freq = Counter(words)

    # Keep only words seen more than once, most frequent first.
    repeated_words_sorted = sorted(
        ((word, freq) for word, freq in word_freq.items() if freq > 1),
        key=lambda item: item[1],
        reverse=True,
    )

    repetition_ratio = sum(freq for _, freq in repeated_words_sorted) / len(words)

    return repetition_ratio, repeated_words_sorted

# Tokenization and stopword removal
def tokenize_and_remove_stopwords(text):
    """Tokenize ``text`` and drop English stopwords.

    Args:
        text: The text to tokenize (in this notebook, the output of
            ``clean_text``).

    Returns:
        A list of the tokens that are not English stopwords, in their
        original order and original casing.
    """
    stop_words = set(stopwords.words('english'))
    # Compare case-insensitively: the stopword list is lowercase, so a
    # capitalised stopword such as "The" would otherwise slip through.
    return [
        word
        for word in word_tokenize(text)
        if word.lower() not in stop_words
    ]

In [52]:
# Example text used as input for the analysis cells below
example_text = "Another insight to reduced car zones brings Paris' incident with smog. Paris' officials created a system that would in fact lower smog rates. On Monday, the motorists with evennumbered license plates numbers would be ordered to leave their cars at home, or they would suffer a fine. Same rule would occur on Tuesday, except motorists with oddnumbered license plates were targeted with fines. Congestion, or traffic, was reduced by 60% after five days of intense smog. Diesel fuel played a huge part in this pollution, having the fact that 67% of vehicles in France are of Diesel fuel. The impact of the clearing of smog, resided in banning the Tuesday rule of odd license plates."




In [55]:
# Clean the text, then compute sentence complexity, the repeated-word ratio and
# frequencies, and the stopword-filtered tokens.
# The complexity and repetition metrics are computed on the original text;
# only the token list is built from the cleaned text.
cleaned_text = clean_text(example_text)
sentence_complexity = analyze_sentence_complexity(example_text)
repetition_ratio, repeated_words_sorted = analyze_repetition_and_print_frequencies_sorted(example_text)
tokens = tokenize_and_remove_stopwords(cleaned_text)

In [61]:
# Print the results
print("원본 텍스트:", example_text)
print("\n토큰:", tokens)
print(f"\n문장 복잡성 (평균 단어 수/문장): {sentence_complexity:.2f}")
# repetition_ratio is a 0..1 fraction; scale it to a percentage for display.
print(f"반복 단어 비율 (%): {repetition_ratio * 100:.2f}%")
print("\n많이 반복된 단어 순서대로:")
# One line per repeated word, in the order given by repeated_words_sorted.
for word, freq in repeated_words_sorted:
    print(f"단어 '{word}': {freq}번 반복")


원본 텍스트: Another insight to reduced car zones brings Paris' incident with smog. Paris' officials created a system that would in fact lower smog rates. On Monday, the motorists with evennumbered license plates numbers would be ordered to leave their cars at home, or they would suffer a fine. Same rule would occur on Tuesday, except motorists with oddnumbered license plates were targeted with fines. Congestion, or traffic, was reduced by 60% after five days of intense smog. Diesel fuel played a huge part in this pollution, having the fact that 67% of vehicles in France are of Diesel fuel. The impact of the clearing of smog, resided in banning the Tuesday rule of odd license plates.

토큰: ['another', 'insight', 'reduced', 'car', 'zones', 'brings', 'paris', 'incident', 'smog', 'paris', 'officials', 'created', 'system', 'would', 'fact', 'lower', 'smog', 'rates', 'monday', 'motorists', 'evennumbered', 'license', 'plates', 'numbers', 'would', 'ordered', 'leave', 'cars', 'home', 'would', 'suffer