In [8]:
import nltk

# Download the three NLTK data packages this notebook asks for. Each call logs
# its progress to the cell output, and the final call is left as a bare
# expression so its return value is what the cell displays.
# NOTE(review): 'punkt' / 'punkt_tab' are presumably the tokenizer models used
# by word_tokenize, and 'stopwords' the corpus read by stopwords.words() in a
# later cell — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import plotly.express as px  # plotly.express 임포트
import pandas as pd          # plotly에 데이터를 전달하기 위해 pandas 사용

In [4]:

# --- 2. Initialise the preprocessing tools (unchanged from the previous step) ---
# Both objects are module-level so the pipeline function can reuse them on
# every call instead of rebuilding them.
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))  # set -> fast membership checks


In [5]:
# --- 3. Preprocessing pipeline from step 1 (unchanged) ---
def preprocess_text_pipeline(text):
  """Turn raw text into a list of stemmed content tokens.

  The text is lower-cased and split with ``word_tokenize``. A token is kept
  only when it is purely alphabetic (``str.isalpha``) and is not in the
  module-level ``stop_words`` set; every kept token is passed through the
  module-level ``stemmer``.

  Args:
    text: The input string to process.

  Returns:
    A list of stemmed tokens, in the order they appear in ``text``.
  """
  return [
    stemmer.stem(word)
    for word in word_tokenize(text.lower())
    # The stop-word check runs on the un-stemmed, lower-cased token.
    if word.isalpha() and word not in stop_words
  ]

In [6]:
# --- 4. Step 2 (revised): word-frequency analysis and Plotly visualisation ---

def analyze_top_words(text, n=10):
  """Return the ``n`` most frequent stemmed tokens in ``text``.

  The text is first run through ``preprocess_text_pipeline``; the resulting
  tokens are counted with ``collections.Counter``.

  Args:
    text: The raw input string.
    n: How many (token, count) pairs to return. Defaults to 10.

  Returns:
    A list of ``(token, count)`` tuples as produced by
    ``Counter.most_common(n)``.
  """
  frequency_table = Counter(preprocess_text_pipeline(text))
  return frequency_table.most_common(n)

In [17]:
def visualize_top_words_plotly(top_words, top_n):
  """Show a Plotly Express bar chart of word frequencies.

  Args:
    top_words: A sequence of ``(word, count)`` pairs, e.g.
      ``[('nlp', 3), ('languag', 3)]``.
    top_n: The number quoted in the chart title. It is used for the title
      text only and does not limit how many rows are plotted.

  Returns:
    None. The figure is rendered via ``fig.show()``.
  """
  # Plotly Express reads columns by name, so wrap the pairs in a DataFrame.
  column_names = ['Word', 'Frequency']
  frequency_frame = pd.DataFrame(top_words, columns=column_names)

  axis_labels = {'Word': 'Words (어간)', 'Frequency': 'Frequency (빈도)'}
  chart_title = f'Top {top_n} Most Frequent Words (Plotly)'

  fig = px.bar(
    frequency_frame,
    x='Word',
    y='Frequency',
    title=chart_title,
    labels=axis_labels,                  # axis captions
    hover_data=['Word', 'Frequency'],    # fields shown in the hover tooltip
  )

  # Order the x-axis categories by descending bar total for readability.
  fig.update_layout(xaxis=dict(categoryorder='total descending'))

  # Render the chart (inline or in a browser tab, depending on the frontend).
  fig.show()

In [12]:

# Short sample passage for the preprocessing pipeline.
# The fragments are joined by implicit string-literal concatenation, so every
# fragment that ends a sentence needs its own trailing space; without it two
# sentences fuse into one token such as "preprocessing.Natural".
# Note: a later cell assigns a longer passage to the same name.
example_text = (
    "Hello, Dr. Smith! How are you today? "
    "NLTK makes Python programming with natural language "
    "fun and easy. We are learning about preprocessing. "
    "Natural language processing (NLP) is a subfield of "
    "linguistics, computer science, and artificial intelligence "
    "concerned with the interactions between computers and human language, "
    "in particular how to program computers to process and analyze "
    "large amounts of natural language data. "
    "Preprocessing is a key step in NLP."
)

In [13]:
# Longer multi-paragraph sample passage about NLP. This assignment replaces
# the value bound to example_text by the previous cell, so this is the text
# the analysis cell below operates on.
example_text = """
Natural Language Processing (NLP) is a fascinating field within artificial intelligence (AI).
NLP focuses on the interaction between computers and human language, and the ultimate goal
is to enable computers to understand, interpret, and generate human languages in a way that
is both meaningful and useful.

The applications of NLP are everywhere. Think about spam filters in your email,
virtual assistants like Siri or Alexa, and the machine translation services we use daily.
These technologies rely heavily on NLP techniques.

To build these applications, data scientists use various methods.
Early approaches relied on rule-based systems, but modern NLP heavily utilizes
machine learning (ML) and, more recently, deep learning.
Deep learning models, particularly transformers, have achieved state-of-the-art results.

Before feeding text data into any machine learning model, preprocessing is a critical step.
This preprocessing pipeline often includes tokenization (splitting text into words),
stopword removal (removing common words like 'the' or 'is'), and stemming or lemmatization
(reducing words to their root form). For example, 'learning', 'learned', and 'learns'
might all be reduced to 'learn'.

This cleaning process ensures that the model focuses on the words that carry the most meaning.
Good data preprocessing leads to better model performance.
We are learning how to build these preprocessing pipelines right now, which is a fundamental
skill for any data scientist working with language. Learning NLP is a rewarding journey.
"""

In [18]:


# Echo the raw input text, followed by a divider line.
print("--- 원본 텍스트 ---", example_text, "-" * 30, sep="\n")

# 1. Run the frequency analysis for the requested number of words.
top_n = 20
top_words = analyze_top_words(example_text, n=top_n)

# Show the (stem, count) pairs, followed by a divider line.
print(f"--- Top {top_n} 빈출 단어 (어간 기준) ---", top_words, "-" * 30, sep="\n")

# 2. Draw the Plotly bar chart.
print("--- 빈도수 시각화 (Plotly) ---")
visualize_top_words_plotly(top_words, top_n)

--- 원본 텍스트 ---

Natural Language Processing (NLP) is a fascinating field within artificial intelligence (AI). 
NLP focuses on the interaction between computers and human language, and the ultimate goal 
is to enable computers to understand, interpret, and generate human languages in a way that 
is both meaningful and useful. 

The applications of NLP are everywhere. Think about spam filters in your email, 
virtual assistants like Siri or Alexa, and the machine translation services we use daily. 
These technologies rely heavily on NLP techniques.

To build these applications, data scientists use various methods. 
Early approaches relied on rule-based systems, but modern NLP heavily utilizes 
machine learning (ML) and, more recently, deep learning. 
Deep learning models, particularly transformers, have achieved state-of-the-art results.

Before feeding text data into any machine learning model, preprocessing is a critical step. 
This preprocessing pipeline often includes tokenization (sp