In [1]:
!pip install scikit-learn nltk pandas



In [2]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# 1. 머신러닝 도구 임포트
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [3]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# (1: 긍정, 0: 부정)
data = {
  'text': [
    "This movie was absolutely fantastic! Loved it.",
    "What a terrible film. I wasted my time.",
    "An amazing piece of art, truly inspiring.",
    "Completely boring and predictable. Do not recommend.",
    "The acting was brilliant, and the plot was engaging.",
    "I fell asleep halfway through. So dull.",
    "A masterpiece! I will watch it again.",
    "Worst movie I have ever seen in my life.",
    "Simply wonderful, a must-see for everyone.",
    "It was okay, not great but not bad either.", # 중립적 (0으로 분류)
    "The story was very weak and the characters were flat.",
    "I was on the edge of my seat the entire time! Excellent."
  ],
  'label': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1]
}
df = pd.DataFrame(data)

print("--- 1. 원본 데이터 ---")
print(df.head())

--- 1. 원본 데이터 ---
                                                text  label
0     This movie was absolutely fantastic! Loved it.      1
1            What a terrible film. I wasted my time.      0
2          An amazing piece of art, truly inspiring.      1
3  Completely boring and predictable. Do not reco...      0
4  The acting was brilliant, and the plot was eng...      1


In [5]:
stemmer = PorterStemmer()
stop_words_en = set(stopwords.words('english'))

In [6]:
def preprocess_text(text):
  # 1. 정규식 + 소문자화
  text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
  # 2. 토큰화
  tokens = word_tokenize(text)
  # 3. 불용어 제거 + 어간 추출
  clean_tokens = [stemmer.stem(w) for w in tokens if w not in stop_words_en and len(w) > 1]
  # 4. TfidfVectorizer를 위해 다시 문자열로 합치기
  return " ".join(clean_tokens)

In [7]:
# 전처리 함수를 모든 'text' 컬럼에 적용
df['processed_text'] = df['text'].apply(preprocess_text)

In [8]:
print("\n--- 2. 전처리 완료된 텍스트 ---")
print(df[['text', 'processed_text']].head())


--- 2. 전처리 완료된 텍스트 ---
                                                text  \
0     This movie was absolutely fantastic! Loved it.   
1            What a terrible film. I wasted my time.   
2          An amazing piece of art, truly inspiring.   
3  Completely boring and predictable. Do not reco...   
4  The acting was brilliant, and the plot was eng...   

                   processed_text  
0       movi absolut fantast love  
1          terribl film wast time  
2      amaz piec art truli inspir  
3  complet bore predict recommend  
4        act brilliant plot engag  


In [9]:
# (전처리된 텍스트를 숫자 행렬로 변환)
vectorizer = TfidfVectorizer()

In [10]:
X_tfidf = vectorizer.fit_transform(df['processed_text'])
y = df['label']

In [11]:
print("\n--- 3. TF-IDF 변환 결과 (Shape) ---")
# (12개의 문서, 48개의 고유 단어로 변환됨)
print(f"변환된 데이터 Shape: {X_tfidf.shape}")


--- 3. TF-IDF 변환 결과 (Shape) ---
변환된 데이터 Shape: (12, 47)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
  X_tfidf, y, test_size=0.25, random_state=42 # 12개 중 3개를 테스트용으로
)

In [13]:
model = LogisticRegression()
model.fit(X_train, y_train)

print("\n--- 4. 모델 학습 완료 ---")


--- 4. 모델 학습 완료 ---


In [14]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Negative(0)', 'Positive(1)'], zero_division=0)

print("\n--- 5. 모델 평가 결과 ---")
print(f"정확도 (Accuracy): {accuracy:.4f}")
print("\n[Classification Report]")
print(report)


--- 5. 모델 평가 결과 ---
정확도 (Accuracy): 0.3333

[Classification Report]
              precision    recall  f1-score   support

 Negative(0)       0.00      0.00      0.00         2
 Positive(1)       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



In [15]:
print("\n--- 6. 새로운 리뷰 예측 테스트 ---")
new_reviews = [
  "This was an amazing movie, truly great!",
  "I hated this film, it was so bad."
]


--- 6. 새로운 리뷰 예측 테스트 ---


In [16]:

for review in new_reviews:
  # 1. 새 리뷰도 동일하게 전처리
  processed_review = preprocess_text(review)
  # 2. 학습된 Vectorizer로 숫자 벡터 변환 (.transform() 사용!)
  vectorized_review = vectorizer.transform([processed_review])
  # 3. 학습된 모델로 예측
  prediction = model.predict(vectorized_review)

  result = '긍정 (Positive)' if prediction[0] == 1 else '부정 (Negative)'
  print(f"리뷰: '{review}'\n  -> 예측 결과: {result}\n")

리뷰: 'This was an amazing movie, truly great!'
  -> 예측 결과: 긍정 (Positive)

리뷰: 'I hated this film, it was so bad.'
  -> 예측 결과: 긍정 (Positive)

