In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib import font_manager, rc
import folium
from folium.plugins import MarkerCluster
from folium import FeatureGroup, LayerControl
from folium.features import GeoJson
from shapely.geometry import shape, Point
import json
import re  # 정규 표현식 라이브러리 추가
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import nltk
import chardet

# 글꼴 경로 지정
font_path = "c:/Windows/Fonts/malgun.ttf"  # 윈도우에 설치된 맑은 고딕 폰트 경로

# 폰트 이름 얻어오기
font_name = font_manager.FontProperties(fname=font_path).get_name()

# matplotlib의 rc(run command) 기능을 이용하여 글꼴 설정
mpl.rc('font', family=font_name)

# 유니코드에서  음수 부호 설정
mpl.rc('axes', unicode_minus=False)

df_guest_prefer = pd.read_csv('../../../../../datasets/paris_prefer.csv')
df_non_guest_prefer = pd.read_csv('../../../../../datasets/paris_non_prefer.csv')
review = pd.read_csv('../../../../../datasets/paris_reviews.csv')


In [2]:
review = review.rename(columns={
    'listing_id':'숙소_id',
    'date' : '리뷰날짜',
    'comments' : '리뷰'
})

review = review[['숙소_id', '리뷰날짜', '리뷰']]
print(len(review))
# 전처리: 영어와 공백만 남김
review['review'] = review['리뷰'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', str(x)))

# '리뷰날짜' 컬럼을 datetime 객체로 변환
review['리뷰날짜'] = pd.to_datetime(review['리뷰날짜'])

# 2022년 이후 데이터만 필터Q링
review_after_2022 = review[review['리뷰날짜'] >= '2022-01-01']

# 결과 확인
print(review_after_2022)

1794006
                       숙소_id       리뷰날짜  \
210                   165409 2022-08-07   
211                   165409 2022-08-13   
212                   165409 2022-08-24   
213                   165409 2022-09-07   
214                   165409 2022-09-08   
...                      ...        ...   
1794001  1107872133955829208 2024-03-16   
1794002  1109396868279302811 2024-03-15   
1794003  1108741370485532713 2024-03-10   
1794004  1109220943409848089 2024-03-14   
1794005  1110936505905289590 2024-03-17   

                                                        리뷰  \
210      Very cute apartment in a great location, which...   
211      The place was perfect and matched the discript...   
212      Heel prettig appartement in fijne buurt. Hele ...   
213      Great location, close to many restaurants, rep...   
214      The only decent thing about this flat is the l...   
...                                                    ...   
1794001  We had the best time staying at 

In [3]:
# review_after_2022와 df_guest_prefer 데이터프레임을 '숙소_id'를 기준으로 결합
df_guest_prefer_review = pd.merge(review_after_2022, df_guest_prefer, on='숙소_id', how='inner')

# 결과 확인
print(df_guest_prefer_review)

                      숙소_id       리뷰날짜  \
0                      9952 2022-02-19   
1                      9952 2022-03-30   
2                      9952 2022-04-18   
3                      9952 2022-05-14   
4                      9952 2022-05-25   
...                     ...        ...   
143179  1080946236732945943 2024-02-07   
143180  1080946236732945943 2024-02-10   
143181  1080946236732945943 2024-02-28   
143182  1080946236732945943 2024-03-06   
143183  1080946236732945943 2024-03-10   

                                                       리뷰  \
0       We really enjoyed our stay at Elizabeth's flat...   
1       Fantastic stay! Great communication. Ideal loc...   
2       Hemos pasado unos días fantásticos en París, a...   
3       Place is great location! Very lively and the a...   
4                           Very good place. I recommend.   
...                                                   ...   
143179  Super propre, chaud et cosy avec  vue sur la T...   
143180 

In [4]:
# review_after_2022와 df_guest_prefer 데이터프레임을 '숙소_id'를 기준으로 결합
df_non_guest_prefer_review = pd.merge(review_after_2022, df_non_guest_prefer, on='숙소_id', how='inner')

# 결과 확인
print(df_guest_prefer_review)

                      숙소_id       리뷰날짜  \
0                      9952 2022-02-19   
1                      9952 2022-03-30   
2                      9952 2022-04-18   
3                      9952 2022-05-14   
4                      9952 2022-05-25   
...                     ...        ...   
143179  1080946236732945943 2024-02-07   
143180  1080946236732945943 2024-02-10   
143181  1080946236732945943 2024-02-28   
143182  1080946236732945943 2024-03-06   
143183  1080946236732945943 2024-03-10   

                                                       리뷰  \
0       We really enjoyed our stay at Elizabeth's flat...   
1       Fantastic stay! Great communication. Ideal loc...   
2       Hemos pasado unos días fantásticos en París, a...   
3       Place is great location! Very lively and the a...   
4                           Very good place. I recommend.   
...                                                   ...   
143179  Super propre, chaud et cosy avec  vue sur la T...   
143180 

In [5]:
pip install langdetect





In [6]:
from langdetect import detect, DetectorFactory
import pandas as pd

# langdetect가 일관된 결과를 반환하도록 설정
DetectorFactory.seed = 0
# 리뷰의 언어 감지를 수행하고 영어인 리뷰만 필터링하는 함수 정의
def is_english(text):
    try:
        # 텍스트의 언어 감지
        return detect(text) == 'en'
    except:
        # 언어 감지가 실패하면 False 반환
        return False

# '리뷰' 컬럼에 있는 텍스트가 영어인지 확인하여 새로운 컬럼에 결과 저장
df_non_guest_prefer_review['is_english'] = df_non_guest_prefer_review['리뷰'].apply(is_english)

# 영어 리뷰만 선택
df_non_guest_prefer_review = df_non_guest_prefer_review[df_non_guest_prefer_review['is_english']]

# 결과 확인
print(df_non_guest_prefer_review)


                      숙소_id       리뷰날짜  \
0                    171159 2022-01-10   
6                    171159 2022-02-13   
7                    171159 2022-02-16   
11                   171159 2022-03-19   
12                   171159 2022-03-22   
...                     ...        ...   
132578  1084102702408464568 2024-03-01   
132584  1087934692821802498 2024-02-25   
132589  1087934692821802498 2024-03-09   
132590  1092099839702927754 2024-02-20   
132592  1092099839702927754 2024-03-01   

                                                       리뷰  \
0       very conveniently located near opera and gare ...   
6       Carole’s studio is perfect for a couple who wa...   
7                   Lovely little nest in prime locality.   
11      Lovely space right in one of the best areas of...   
12      overall amazing, cozy but close to the metro a...   
...                                                   ...   
132578  I highly Recommend, Cozy place for staying nic...   
132584 

In [7]:
# '리뷰' 컬럼에 있는 텍스트가 영어인지 확인하여 새로운 컬럼에 결과 저장
df_guest_prefer_review['is_english'] = df_guest_prefer_review['리뷰'].apply(is_english)

# 영어 리뷰만 선택
df_guest_prefer_review = df_guest_prefer_review[df_guest_prefer_review['is_english']]

# 결과 확인
print(df_guest_prefer_review)

                      숙소_id       리뷰날짜  \
0                      9952 2022-02-19   
1                      9952 2022-03-30   
3                      9952 2022-05-14   
4                      9952 2022-05-25   
5                      9952 2022-06-10   
...                     ...        ...   
143161  1080250485815449505 2024-03-16   
143175  1080943945230018972 2024-02-03   
143177  1080943945230018972 2024-03-06   
143178  1080943945230018972 2024-03-12   
143183  1080946236732945943 2024-03-10   

                                                       리뷰  \
0       We really enjoyed our stay at Elizabeth's flat...   
1       Fantastic stay! Great communication. Ideal loc...   
3       Place is great location! Very lively and the a...   
4                           Very good place. I recommend.   
5       Elisabeth is such a caring nice and thoughtful...   
...                                                   ...   
143161  Great small neat apartment with easy access to...   
143175 

In [15]:
pip uninstall scipy gensim


In [None]:
pip install scipy gensim

In [11]:

pip install --upgrade scipy

Note: you may need to restart the kernel to use updated packages.


In [12]:

pip install --upgrade gensim

Note: you may need to restart the kernel to use updated packages.


In [14]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


# 불용어 설정
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 추가 불용어 목록
add_stopwords = ['great', 'good', 'nice', 'would', 'place', 'stay', 'apartment', 'house', 'room', 'host', 'comfortable',
                 'amazing', 'perfect', 'wonderful', 'excellent', 'lovely', 'highly', 'recommend', 'definitely', 'really',
                 'london', 'u', 'everything', 'well', 'home', 'helpful', 'also', 'like', 'location', 'staying', 'could',
                 'need', 'airbnb', 'host', 'room', 'u']

# 전체 불용어 집합
custom_stopwords = stop_words.union(set(add_stopwords))

# 어간 추출기 설정
lemma = WordNetLemmatizer()

# 전처리 함수
def preprocess(data):
    processed_words = []
    for title in data['리뷰']:
        en_words = re.sub(r"[^a-zA-Z]+", " ", str(title))  # 정규표현식 사용, 영문 대소문자 제외한 문자를 공백으로 변경
        en_words_token = word_tokenize(en_words.lower())  # 변환된 단어를 토큰화
        en_words_stop = [word for word in en_words_token if not word in custom_stopwords]  # 불용어 제거
        en_words_lemma = [lemma.lemmatize(word) for word in en_words_stop]  # 어간 추출

        processed_words.append(en_words_lemma)
    return processed_words

# 전처리
processed_reviews_t = preprocess(df_guest_prefer_review)
processed_reviews_f = preprocess(df_non_guest_prefer_review)

ImportError: cannot import name 'triu' from 'scipy.linalg' (c:\Users\ciw96\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\linalg\__init__.py)