In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib import font_manager, rc
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import nltk
import chardet

%matplotlib inline

# nltk 데이터 다운로드 (첫 실행 시 필요)
nltk.download('vader_lexicon')

# nltk 데이터 다운로드 (첫 실행 시 필요)
nltk.download('stopwords')

# 글꼴 경로 지정
font_path = "c:/Windows/Fonts/malgun.ttf"  # 윈도우에 설치된 맑은 고딕 폰트 경로

# 폰트 이름 얻어오기
font_name = font_manager.FontProperties(fname=font_path).get_name()

# matplotlib의 rc(run command) 기능을 이용하여 글꼴 설정
mpl.rc('font', family=font_name)

# 유니코드에서  음수 부호 설정
mpl.rc('axes', unicode_minus=False)

raw = pd.read_csv('../../../../../datasets/paris_reviews.csv')
df = raw.copy()

df = df[['listing_id','date','comments']]

df = df.rename(columns= {
    'listing_id' : '숙소_id',
    'date': '리뷰날짜',
    'comments' : '리뷰'
})

# 리뷰 결측치 제거
df = df[~df['리뷰'].isnull()]

# 리뷰 내의 <br> 제거
df['리뷰'] = df['리뷰'].str.replace('<br>\s*', ' ', regex=True)
df['리뷰'] = df['리뷰'].str.replace('<br/>\s*', ' ', regex=True)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ciw96\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ciw96\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
len(df)

1793899

In [3]:
# nltk 데이터 다운로드 (첫 실행 시 필요)
nltk.download('vader_lexicon')

# nltk 데이터 다운로드 (첫 실행 시 필요)
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ciw96\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ciw96\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 감성 분석 (2시간 10분 걸림)

In [7]:
# 감성 분석 함수 정의
def analyze_sentiment_vader(comment):
    sid = SentimentIntensityAnalyzer()
    scores = sid.polarity_scores(comment)
    return scores

def analyze_sentiment_textblob(comment):
    analysis = TextBlob(comment)
    return analysis.sentiment.polarity

# VADER 감성 분석 결과 추가
df['vader_sentiment'] = df['리뷰'].apply(lambda x: analyze_sentiment_vader(str(x)))

# TextBlob 감성 분석 결과 추가
df['textblob_sentiment'] = df['리뷰'].apply(lambda x: analyze_sentiment_textblob(str(x)))

# VADER 점수 세부 항목 분리
df['부정'] = df['vader_sentiment'].apply(lambda x: x['neg'])  # 부정적인 감성의 비율
df['중립'] = df['vader_sentiment'].apply(lambda x: x['neu'])  # 중립적인 감성의 비율
df['긍정'] = df['vader_sentiment'].apply(lambda x: x['pos'])  # 긍정적인 감성의 비율
df['전체'] = df['vader_sentiment'].apply(lambda x: x['compound'])  # 전체적인 감성 점수


In [8]:
df

Unnamed: 0,숙소_id,리뷰날짜,리뷰,vader_sentiment,textblob_sentiment,부정,중립,긍정,전체
0,39948,2013-09-20,Aliyah et Philippe m'ont réservé un accueil ex...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000000,0.000,1.000,0.000,0.0000
1,39948,2013-09-29,Aliyah and Philippe are gracious hosts and eve...,"{'neg': 0.0, 'neu': 0.69, 'pos': 0.31, 'compou...",0.325417,0.000,0.690,0.310,0.9847
2,3109,2017-10-28,Tout s'est bien déroulé. Merci bien. PG,"{'neg': 0.2, 'neu': 0.8, 'pos': 0.0, 'compound...",0.000000,0.200,0.800,0.000,-0.1280
3,3109,2017-11-03,Un petit nid fouiller douillet situé dans app...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000000,0.000,1.000,0.000,0.0000
4,3109,2018-07-24,"Appartement spacieux, propre,clair, et calme à...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.000000,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...
1794001,1107872133955829208,2024-03-16,We had the best time staying at this apartment...,"{'neg': 0.0, 'neu': 0.641, 'pos': 0.359, 'comp...",0.575000,0.000,0.641,0.359,0.9854
1794002,1109396868279302811,2024-03-15,On a passé un magnifique séjour dans l'apparte...,"{'neg': 0.0, 'neu': 0.964, 'pos': 0.036, 'comp...",0.500000,0.000,0.964,0.036,0.6784
1794003,1108741370485532713,2024-03-10,Superbe découverte que cet appartement d Hugo ...,"{'neg': 0.025, 'neu': 0.937, 'pos': 0.038, 'co...",0.333333,0.025,0.937,0.038,0.3400
1794004,1109220943409848089,2024-03-14,El apartamento es mejor que en las fotos. Todo...,"{'neg': 0.049, 'neu': 0.951, 'pos': 0.0, 'comp...",0.000000,0.049,0.951,0.000,-0.5983


In [11]:
df.to_csv('../../../../../datasets/paris_reviews_check.csv',index=False)

In [2]:
df = pd.read_csv('../../../../../datasets/paris_reviews_check.csv')

NameError: name 'pd' is not defined

In [3]:
df.tail()

Unnamed: 0,숙소_id,리뷰날짜,리뷰,vader_sentiment,textblob_sentiment,부정,중립,긍정,전체
1793894,1107872133955829208,2024-03-16,We had the best time staying at this apartment...,"{'neg': 0.0, 'neu': 0.641, 'pos': 0.359, 'comp...",0.575,0.0,0.641,0.359,0.9854
1793895,1109396868279302811,2024-03-15,On a passé un magnifique séjour dans l'apparte...,"{'neg': 0.0, 'neu': 0.964, 'pos': 0.036, 'comp...",0.5,0.0,0.964,0.036,0.6784
1793896,1108741370485532713,2024-03-10,Superbe découverte que cet appartement d Hugo ...,"{'neg': 0.025, 'neu': 0.937, 'pos': 0.038, 'co...",0.333333,0.025,0.937,0.038,0.34
1793897,1109220943409848089,2024-03-14,El apartamento es mejor que en las fotos. Todo...,"{'neg': 0.049, 'neu': 0.951, 'pos': 0.0, 'comp...",0.0,0.049,0.951,0.0,-0.5983
1793898,1110936505905289590,2024-03-17,Merci à Fabienne pour sa réactivité et son acc...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,1.0,0.0,0.0


In [18]:
# 리뷰 확인 결과 <br/>도 있음

df['리뷰'] = df['리뷰'].str.replace('<br/>\s*', ' ', regex=True)

# tf-idf
- 키워드 추출
- 여기서 쓸모없을거 같은 단어는 뺌(이야기 해봐야함)

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

# nltk 데이터 다운로드 (첫 실행 시 필요)
nltk.download('stopwords')

# 긍정적 리뷰와 부정적 리뷰로 분류
positive_reviews = df[df['긍정'] > 0.5]['리뷰'].dropna().astype(str)
negative_reviews = df[df['부정'] > 0.5]['리뷰'].dropna().astype(str)

# 영어, 스페인어, 독일어 불용어 목록 결합 및 리스트로 변환
stop_words = list(set(stopwords.words('english')) | set(stopwords.words('spanish')) | set(stopwords.words('german')))

# 추가로 제외할 단어 목록
additional_stop_words = [ 'paris','super','perfect','place', 'stay', 'chicago', 'airbnb', 'would', 'us','great','nice','good','amazing','highly']

# 추가적인 불용어 목록을 결합
stop_words.extend(additional_stop_words)

# TF-IDF 벡터라이저 초기화 (결합된 불용어 목록 사용)
tfidf_vectorizer = TfidfVectorizer(max_features=100, stop_words=stop_words)

# 긍정적 리뷰에 대한 TF-IDF 행렬 계산
tfidf_matrix_positive = tfidf_vectorizer.fit_transform(positive_reviews)
feature_names_positive = tfidf_vectorizer.get_feature_names_out()
tfidf_scores_positive = tfidf_matrix_positive.sum(axis=0).A1
tfidf_df_positive = pd.DataFrame({'word': feature_names_positive, 'tfidf_score': tfidf_scores_positive})
top_tfidf_positive = tfidf_df_positive.nlargest(20, 'tfidf_score')  # 상위 20개 단어

# 부정적 리뷰에 대한 TF-IDF 행렬 계산 (불용어 목록 조정)
try:
    tfidf_matrix_negative = tfidf_vectorizer.fit_transform(negative_reviews)
    feature_names_negative = tfidf_vectorizer.get_feature_names_out()
    tfidf_scores_negative = tfidf_matrix_negative.sum(axis=0).A1
    tfidf_df_negative = pd.DataFrame({'word': feature_names_negative, 'tfidf_score': tfidf_scores_negative})
    top_tfidf_negative = tfidf_df_negative.nlargest(20, 'tfidf_score')  # 상위 20개 단어

    # 결과 출력
    print("긍정적인 리뷰에서 상위 20개 단어:")
    print(top_tfidf_positive)
except ValueError as e:
    print(f"TF-IDF 계산 중 오류 발생: {e}")
    print("부정적 리뷰의 예시:")
    print(negative_reviews.head(10))  # 부정적 리뷰 예시 출력


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ciw96\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


긍정적인 리뷰에서 상위 20개 단어:
             word   tfidf_score
54       location  26429.839923
4       apartment  14396.969952
46           host  13322.278813
18          clean  11043.039179
73      recommend   9215.897348
57         lovely   7879.986231
34      excellent   6612.536985
11      beautiful   6544.436829
44        helpful   6110.645415
21    comfortable   5931.863414
99      wonderful   5859.826265
33     everything   5655.952332
43       friendly   5058.353226
29     definitely   4842.490878
89          thank   4723.278382
72         really   4644.008470
22  communication   4614.064375
30           easy   4590.648941
98           well   4413.870392
41           flat   4138.107528

부정적인 리뷰에서 상위 20개 단어:
             word  tfidf_score
20           dire   137.316942
73           rien   136.355694
63        parfait    91.158704
86           tout    84.784115
40            gut    58.951216
93        wohnung    47.803201
79          schön    43.335559
85            top    41.924254
51    

In [15]:

print("\n부정적인 리뷰에서 상위 20개 단어:")
print(top_tfidf_negative)


부정적인 리뷰에서 상위 20개 단어:
             word  tfidf_score
20           dire   137.316942
73           rien   136.355694
63        parfait    91.158704
86           tout    84.784115
40            gut    58.951216
93        wohnung    47.803201
79          schön    43.335559
85            top    41.924254
51           lage    40.481642
64        perfekt    35.724667
15        comment    30.000000
89     unterkunft    27.744252
76         sauber    26.481748
17     complaints    25.977347
6             bad    25.357507
10           bien    21.362360
99          était    20.668609
49  kommunikation    18.422894
19          danke    17.432585
3      aufenthalt    17.344727


# Lda 분석
- 키워드 분석

In [19]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# 텍스트 데이터 벡터화
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vectorizer.fit_transform(df['리뷰'])

# LDA 모델 생성 및 학습
lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(X)

# 주제와 각 주제의 가장 중요한 단어들 출력
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)

print_top_words(lda, vectorizer.get_feature_names(), 10)


AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

NameError: name 'df' is not defined