# In [None]:
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the raw records. The file has no header row, so the six column
# names are assigned by hand right after reading.
data = pd.read_csv(
    'adco_data_compcrit.csv',
    header=None,
    encoding='latin-1',
)
data.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Only the label and the text body are used from here on.
data = data[['sentiment', 'text']]

def relabel_sentiment(label):
    """Map a numeric sentiment score to its class name.

    Scores 3 and 4 become 'positive', 2 becomes 'neutral', and 0 and 1
    become 'negative'. Any other value yields 'unknown'.
    """
    # Groups are tried top to bottom; the first group containing the
    # score decides the class.
    score_groups = (
        ((3, 4), 'positive'),
        ((2,), 'neutral'),
        ((0, 1), 'negative'),
    )
    for scores, class_name in score_groups:
        if label in scores:
            return class_name
    return 'unknown'  # value outside the known score range

# Replace the numeric scores with their class names.
data['sentiment'] = data['sentiment'].apply(relabel_sentiment)

# English stop words to filter out (articles, pronouns, prepositions,
# conjunctions, adverbs, ...).
# NOTE(review): this rebinds the imported `stopwords` name to a plain list,
# so the corpus reader is no longer reachable under that name afterwards.
stopwords = stopwords.words('english')

# Each punctuation character from string.punctuation as its own
# one-character string.
punctuations = [*string.punctuation]

def preprocess_text(text):
    """Lower-case *text*, tokenize it, and drop stop words and punctuation.

    Tokens found in the module-level ``stopwords`` or ``punctuations``
    collections are discarded. The remaining tokens are joined with
    single spaces and returned as one string.
    """
    kept_tokens = []
    # Lower-case first, then split the text into word-level tokens.
    for word in word_tokenize(text.lower()):
        if word in stopwords or word in punctuations:
            continue  # stop word or punctuation mark: skip it
        kept_tokens.append(word)
    # Re-assemble the surviving tokens into a sentence-like string.
    return ' '.join(kept_tokens)

# Normalize every document before it is split and vectorized.
data['text'] = data['text'].apply(preprocess_text)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training split only, then reuse that
# fitted vocabulary to encode the held-out split.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Fit the classifier on the training counts (fit returns the estimator
# itself) and predict labels for the held-out split.
model = MultinomialNB().fit(X_train_counts, y_train)
y_pred = model.predict(X_test_counts)

def analyze_sentiments(text, model, vectorizer):
    """Print and return the share of positive and negative sentences in *text*.

    The text is split into sentences, each sentence is encoded with
    *vectorizer* and classified with *model*. Sentences predicted as
    anything other than "positive" or "negative" count toward the total
    but toward neither ratio.

    Args:
        text: Text to analyze.
        model: Fitted classifier; only its ``predict`` method is used.
        vectorizer: Fitted vectorizer; only its ``transform`` method is used.

    Returns:
        A tuple ``(positive_ratio, negative_ratio)``. Both are ``0.0`` when
        *text* contains no sentences.
    """
    from nltk.tokenize import sent_tokenize  # sentence splitter

    sentences = sent_tokenize(text)

    if sentences:
        # NOTE(review): the module-level model is trained on text that went
        # through preprocess_text, while sentences here are vectorized
        # as-is -- confirm this difference is intended.
        sentence_vectors = vectorizer.transform(sentences)
        predictions = model.predict(sentence_vectors)

        total_sentences = len(predictions)
        positive_count = sum(1 for sentiment in predictions if sentiment == "positive")
        negative_count = sum(1 for sentiment in predictions if sentiment == "negative")

        positive_ratio = positive_count / total_sentences
        negative_ratio = negative_count / total_sentences
    else:
        # No sentences to classify. The model must not be called here:
        # scikit-learn estimators such as the MultinomialNB used in this
        # file raise ValueError when asked to predict on zero samples, so
        # the zero-sentence case is handled before any prediction.
        positive_ratio = 0.0
        negative_ratio = 0.0

    print(f"칭찬 문장 비율 (긍정): {positive_ratio:.2f}")
    print(f"비판 문장 비율 (부정): {negative_ratio:.2f}")
    return positive_ratio, negative_ratio