# Seoul Bike Text Mining - LDA Analyse using Okt
## Objects
- 구글 플레이 스토어 '따릉이' 어플 리뷰와 ‘서울 자전거 따릉이’ 공식 웹사이트 내에 시민의견수렴 게시판에 작성된 의견 텍스트를 수집
- 오프라인의 이용 경험을 구분 지어 분석하기 위해 앱 리뷰를 온라인 사용 경험 분석
- 시민의견수렴 게시판을 오프라인 사용 경험 분석으로 활용
- LDA 토픽 모델링 통해 텍스트 속에 이용 경험의 문제점을 온라인과 오프라인으로 구별하는 분석을 진행

In [None]:
from collections import Counter
import pandas as pd
import numpy as np
from konlpy.tag import Mecab
from konlpy.tag import Okt
from ckonlpy.tag import Twitter

ctwitter = Twitter()
mecab = Mecab()
twitter = Okt()

## 전처리


In [None]:
from ckonlpy.tag import Postprocessor #전처리 라이브러리
from ckonlpy.tag import Twitter
import warnings
warnings.simplefilter("ignore")

ctwitter = Twitter()
ctwitter.add_dictionary(['따릉이','자출',
                       '대여소','연결거치','거치대','고객센터',
                        '다시','퇴근시간','대여시간',
                        '출근시간','한시간','두시간','임시폐쇄',
                         '자동로그인','UX', '1일권', '일일권','가성비',
                         '상담원','재로그인', '인증번호', '가입','렉', '랙','제로페이','결제','큐알코드','QR','뒤로가기',
                         '대여번호''결제','결재','여러번','불편','개불편함','내세금','전반적', '지도화면',
                        ], 'Noun')
#조사
ctwitter.add_dictionary(['까지','하냐'], 'Josa')
#형용사
ctwitter.add_dictionary(['레알','허접한','오지게' ,'어려워','안되', '열받','안됨', '안와' ,'참어렵네'], 'Adjective')
#부사

ctwitter.add_dictionary(['왜', '좀','진짜','최악' , '참'], 'Adverb', force=True)

ctwitter.add_dictionary(['타고','요청드립니다','부탁드립니다',
                         '요청 드립니다','부탁 드립니다','하기'],'Verb')

#무의미한 형태소 및 단어 제거
stopwords = {'수가','까지','드립','번','수','것', '앞','해주','왜',
             '곳','떄','좀','어요','요','이','감사','이','그','고','제','쪽','더','후','시','거',
'려고','적','저','데','등','역','중','해주시','내','면서','어서','때','뭐','못','마다','더니','해도','해도','다가','어가','개','하라','하나','걸'}


#단어 대치
replace = {'어플리케이션': '어플',
           '앱': '어플',
           '폐쇠':'페쇄',
           '젛은':'좋은',
           '연계':'연동',
           '맵':'지도',
           '인터페이스':'UI',
           '유아이':'UI',
           'ui': 'UI',
           'T머니' : '티머니',
           '렉':'오류',
           '랙':'오류',
           '가입': '회원가입',
          '결재':'결제',
           '버젼':'버전',
           '고객 센터': '고객센터',
           '에러': '오류',
           '버그': '오류',
           '업뎃':'업데이트' ,'업댓':'업데이트','업디이트':'업데이트','리뉴얼':'업데이트','개편':'업데이트','업그레이드':'업데이트',
           '업데이틀':'업데이트',
           '업글' :'업데이트',
           '비번':'비밀번호',
           '머지':'뭐지',
           '자동 로그인':'자동로그인',
           '딜레이':'지연',
           '정거장': '대여소','정류장': '대여소','대여수': '대여소',
           '큐알':'QR','qr':'QR','큐알코드':'QR',
           '알코':'QR',
           '리붓':'재부팅',
           '튜토리얼':'이용설명',
           '이용안내':'이용설명',
           '안되요':'안돼요',
           '카톡': '카카오톡',
           '먹통' :'오류',
           '제하': '결제',
            '카카오': '카카오톡',
            '카카오계정': '카카오톡',
           ('안되고', 'Adjective'): '안되',
           '않':'안되',
           '마니':'많이',
           '허다': '많다',
           '안와': '안되',
           '안': '안되',
           '폰':'휴대폰',
           '지도화면':'지도',
           '안됨':'안되',
           '팅김':'오류',
           '장애':'오류',
           '내세금':'세금',
           '개불편함':'불편',
           '연결거치': '연결'}

postprocessor = Postprocessor(ctwitter, stopwords = stopwords,replace = replace)

# Google Play Store

## 데이터 확인

In [1]:
df = pd.read_csv('sbike_google_sept copy.csv').drop('Unnamed: 0', axis=1)

reviews = df['review_text']
df.head(3)

Unnamed: 0,user_name,date,rating,thumbs_up,review_text
0,띠리띠리띠로리로,2017년 10월 5일,1,125,아니 오늘 서울 가서 자전거 타는데 지금 자전거 빌리는데만 1시간이 걸렸음. 분명히...
1,김재현,2019년 4월 21일,1,93,"아 결재 일일권하고 카드 등록도 T교통카드 밖에 안되고, 결국 자전거도 못빌리고 천..."
2,HyeonCheol Kim,2016년 8월 7일,3,67,앱의 활용도가 부족한거같습니다. 구간구간의 거리나 시간을 알수있는 기능이 좀 상세하...


In [7]:
# 별점 4점 이상 준 유저수 -> 매우 적음
df[df['rating'] >=4].count()

user_name      216
date           216
rating         216
thumbs_up      216
review_text    216
dtype: int64

## 사전 추가
기존 형태소 분석기에 추가되지 않은 단어 들을 추가

In [21]:
ctwitter.add_dictionary(['따릉이','자출',
                       '대여소','거치대','고객센터',
                       '임시폐쇄',
                         '자동로그인','UX', '1일권', '일일권','가성비',
                         '상담원','재로그인', '인증번호', '가입','렉', '랙','제로페이','결제','큐알코드','뒤로가기',
                         '대여번호''결제','결재','여러번','불편','개불편함','내세금','전반적', '지도화면','화',
                        ], 'Noun')

#### 형태소 분석 진행

In [33]:
def twitterWithStemming(text):
    sentences_tag = []
    for sentence in text:
        morph = twitter.pos(sentence, stem=False )
        sentences_tag.append(morph)
    return sentences_tag

In [34]:
def stop_wording(sentences_tag):
    token = []
    token_set={}
    token_table=[]
    cnt = 0
    for row in sentences_tag:
        token=[]

        for word,tag in row:

            if tag in 'Noun':
                token.append(word)
            elif tag in 'Adjective':
                token.append(word)
            elif tag in 'Adverv':
                token.append(word)
            elif tag in 'VerbPrefix':
                token.append(word)
                
            elif tag in 'Verb':
                token.append(word)


        token_table.append(token)
    return token_table

### 분석처리를 위한 파일 합병 엑셀화

In [35]:
df_text = twitterWithStemming(reviews)

In [36]:
b = stop_wording(df_text)

## LDA Analysis
- Gensim Package 사용
- 형태소 분석기 : Okt
- 전체 리뷰, 부정리뷰를 나누어서 분석 진행
- 긍정 리뷰의 경우 별점 4~5점은 준 리뷰의 갯수가 매우 적어 분석을 진행 하지 않음

In [37]:
from gensim import corpora

dictionary  = corpora.Dictionary(b)
corpus = [dictionary.doc2bow(text) for text in b ]


### 전체 리뷰에 대한 LDA 분석

In [38]:
import gensim
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=7, alpha='auto', 
                                      update_every=1, chunksize=20000, passes=10)
lda.print_topics()

[(0,
  '0.036*"앱" + 0.010*"좀" + 0.009*"안" + 0.008*"못" + 0.008*"따릉" + 0.007*"쓰레기" + 0.007*"로그인" + 0.007*"사용" + 0.007*"최악" + 0.006*"자전거"'),
 (1,
  '0.030*"앱" + 0.013*"안" + 0.012*"지도" + 0.009*"잘" + 0.009*"업데이트" + 0.007*"자전거" + 0.007*"진짜" + 0.007*"오류" + 0.007*"따릉" + 0.006*"결제"'),
 (2,
  '0.039*"앱" + 0.013*"자전거" + 0.012*"안" + 0.009*"대여" + 0.009*"업데이트" + 0.008*"결제" + 0.008*"로딩" + 0.007*"여소" + 0.006*"따릉" + 0.006*"시간"'),
 (3,
  '0.039*"로그인" + 0.018*"앱" + 0.014*"가입" + 0.014*"오류" + 0.013*"번호" + 0.012*"회원" + 0.010*"자동" + 0.009*"아이디" + 0.007*"안" + 0.007*"비빌"'),
 (4,
  '0.021*"앱" + 0.019*"결제" + 0.013*"대여" + 0.009*"업데이트" + 0.008*"안" + 0.008*"이용" + 0.008*"잘" + 0.007*"사용" + 0.007*"오류" + 0.007*"자전거"'),
 (5,
  '0.020*"앱" + 0.013*"오류" + 0.011*"왜" + 0.011*"업데이트" + 0.010*"진짜" + 0.009*"안" + 0.006*"지도" + 0.006*"최악" + 0.006*"따릉" + 0.006*"자전거"'),
 (6,
  '0.014*"결제" + 0.013*"왜" + 0.012*"앱" + 0.011*"안" + 0.009*"업데이트" + 0.005*"더" + 0.005*"해서" + 0.005*"사용" + 0.005*"못" + 0.004*"자전거"')]

In [57]:
from gensim.models.coherencemodel import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus)) # a measure of how good the model is. lower the better.
 
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=b, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.3276142666133754

Coherence Score:  0.39175704106359943


In [58]:
#구글 플레이 스토어 7개 토픽 모델링 저장
from gensim.test.utils import datapath

temp_file = 'LDA/google_num7'
lda.save(temp_file)

lda_google7 = gensim.models.ldamodel.LdaModel.load(temp_file)
lda_google7.print_topics()

[(0,
  '0.023*"어플" + 0.019*"진짜" + 0.014*"최악" + 0.013*"로딩" + 0.013*"그냥" + 0.011*"사용" + 0.010*"불편" + 0.007*"제대로" + 0.006*"안되" + 0.006*"돈"'),
 (1,
  '0.030*"어플" + 0.020*"오류" + 0.016*"결제" + 0.013*"안되" + 0.012*"쓰레기" + 0.010*"따릉이" + 0.009*"다시" + 0.008*"업데이트" + 0.008*"시간" + 0.008*"지도"'),
 (2,
  '0.039*"어플" + 0.032*"안되" + 0.020*"결제" + 0.019*"업데이트" + 0.017*"따릉이" + 0.016*"지도" + 0.014*"사용" + 0.012*"대여소" + 0.012*"오류" + 0.009*"불편"'),
 (3,
  '0.048*"어플" + 0.038*"로그인" + 0.021*"안되" + 0.016*"결제" + 0.014*"오류" + 0.013*"자전거" + 0.013*"최악" + 0.011*"진짜" + 0.009*"카카오톡" + 0.009*"불편"'),
 (4,
  '0.057*"어플" + 0.018*"안되" + 0.017*"오류" + 0.015*"자전거" + 0.012*"업데이트" + 0.012*"대여" + 0.011*"버전" + 0.010*"지도" + 0.009*"따릉이" + 0.008*"결제"'),
 (5,
  '0.026*"어플" + 0.026*"회원" + 0.019*"회원가입" + 0.018*"안되" + 0.016*"자전거" + 0.013*"따릉이" + 0.010*"로그인" + 0.009*"비밀번호" + 0.009*"이용" + 0.009*"정말"'),
 (6,
  '0.022*"안되" + 0.018*"대여" + 0.018*"어플" + 0.016*"지도" + 0.014*"대여소" + 0.012*"자전거" + 0.011*"결제" + 0.010*"비밀번호" + 0.009*"오류" + 0.008*"업데이트"')

In [175]:
for d in b:
    bow = dictionary.doc2bow(d)
    t = lda.get_document_topics(bow)

In [176]:
t # 토픽 출현 확률

[(0, 0.023274003),
 (1, 0.022734564),
 (2, 0.90867907),
 (3, 0.022542713),
 (4, 0.0227696)]

In [181]:
import numpy as np
topics_terms = lda.state.get_lambda()
topics_terms_proba = np.apply_along_axis(lambda x: x/x.sum(),1,topics_terms)
probality=[]
for i in range(5):
    
    probality.append(topics_terms[i].sum()/topics_terms.sum())

In [182]:
topics = lda.print_topics()
def get_topics(topic):
    listed = topic.split('+')
    topics=[]
    for i in listed:
        topics.append(i[i.index('*')+1:].replace('"', '').rstrip())
    
    return topics

In [192]:
def get_topics_number(topic):
    listed = topic.split('+')
    topics=[]
    for i in listed:
        topics.append(i[:i.index('*')].replace('"', '').rstrip())
    
    return topics

In [188]:
topics_list=[]

for i in topics:
    topics_list.append(get_topics(i[1]))
topics_list

[['어플', '오류', '결제', '따릉이', '이용', '지도', '업데이트', '안되', '최악', '자전거'],
 ['어플', '안되', '로그인', '업데이트', '자전거', '결제', '불편', '따릉이', '오류', '로딩'],
 ['어플', '로그인', '회원가입', '안되', '회원', '비밀번호', '다시', '자전거', '결제', '업데이트'],
 ['안되', '어플', '대여', '자전거', '오류', '진짜', '최악', '대여소', '지도', '검색'],
 ['어플', '결제', '사용', '안되', '오류', '대여소', '지도', '따릉이', '시간', '자전거']]

In [59]:
probality

[0.09384192,
 0.12790182,
 0.025611972,
 0.06432053,
 0.057708878,
 0.13163634,
 0.056753032,
 0.1933901,
 0.10153924,
 0.14729626]

In [34]:
topics = lda.print_topics()
def get_topics_number(topic):
    listed = topic.split('+')
    topics=[]
    for i in listed:
        topics.append(i[:i.index('*')].replace('"', '').rstrip())
    
    return topics

#### 상위 토픽 10개 항목에 대한 토픽 분포

In [37]:
topics_num_list=[]

for i in topics:
    topics_num_list.append(get_topics_number(i[1]))
# print(topics_num_list)
# pd.DataFrame(topics_num_list).to_excel('Yoon/citizen_lda_num.xlsx')
pd.DataFrame(topics_num_list)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.036,0.023,0.02,0.013,0.012,0.01,0.01,0.01,0.009,0.008
1,0.074,0.035,0.035,0.027,0.019,0.017,0.016,0.015,0.014,0.011
2,0.028,0.019,0.012,0.011,0.01,0.009,0.009,0.009,0.008,0.007
3,0.057,0.036,0.034,0.014,0.011,0.009,0.009,0.008,0.008,0.008
4,0.044,0.043,0.032,0.013,0.011,0.009,0.009,0.008,0.008,0.007
5,0.03,0.026,0.024,0.02,0.009,0.008,0.008,0.007,0.007,0.007
6,0.027,0.024,0.017,0.014,0.013,0.013,0.009,0.006,0.006,0.006
7,0.053,0.025,0.016,0.015,0.013,0.011,0.009,0.008,0.008,0.007
8,0.043,0.036,0.013,0.012,0.01,0.01,0.008,0.008,0.007,0.006
9,0.016,0.016,0.014,0.014,0.013,0.012,0.01,0.009,0.009,0.008


In [202]:
topics_num_list=[]

for i in topics:
    topics_num_list.append(get_topics_number(i[1]))
pd.DataFrame(topics_num_list).to_excel('Yoon/google_lda_num.xlsx')

In [184]:
import pandas as pd
pd.DataFrame(topics_list).append(probality).to_excel('Yoon/google_lda_refined.xlsx')

### 부정적 점수를 준 리뷰

In [416]:
df = pd.read_csv('sbike_google_sept.csv')
bad_reviews = df[df['rating']<3]['review_text']

In [418]:
sentence_tag =do_twitter(bad_reviews)

In [419]:
bad_tokenized = stop_wording(sentence_tag)

In [421]:
dictionary  = corpora.Dictionary(bad_tokenized)
corpus = [dictionary.doc2bow(text) for text in bad_tokenized ]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, update_every=1, chunksize=10000, passes=20)
lda.print_topics()

[(0,
  '0.037*"어플" + 0.022*"결제" + 0.021*"안되" + 0.017*"대여" + 0.016*"자전거" + 0.013*"오류" + 0.010*"사용" + 0.010*"불편" + 0.009*"대여소" + 0.009*"따릉이"'),
 (1,
  '0.062*"어플" + 0.023*"안되" + 0.014*"오류" + 0.014*"결제" + 0.012*"로그인" + 0.011*"따릉이" + 0.010*"회원가입" + 0.010*"회원" + 0.010*"업데이트" + 0.009*"자전거"'),
 (2,
  '0.034*"어플" + 0.027*"로그인" + 0.025*"안되" + 0.021*"지도" + 0.012*"오류" + 0.011*"최악" + 0.009*"대여소" + 0.009*"진짜" + 0.008*"검색" + 0.008*"결제"'),
 (3,
  '0.032*"어플" + 0.018*"안되" + 0.016*"쓰레기" + 0.013*"결제" + 0.012*"진짜" + 0.010*"오류" + 0.009*"로딩" + 0.009*"계속" + 0.009*"최악" + 0.007*"정말"'),
 (4,
  '0.041*"업데이트" + 0.020*"안되" + 0.014*"지도" + 0.014*"대여소" + 0.013*"자전거" + 0.011*"어플" + 0.010*"버전" + 0.009*"최악" + 0.008*"돈" + 0.008*"오류"')]

# 시민의견수렴 게시판

In [431]:
df = pd.read_excel('Seoulbike_complain.xlsx')

text = df['content']

In [438]:
str_text = []
for row in text:
    str_text.append(str(row))

In [439]:
sentence_tag =do_twitter(str_text)

In [440]:
citizen_tokenized = stop_wording(sentence_tag)

In [443]:
dictionary  = corpora.Dictionary(citizen_tokenized)
corpus = [dictionary.doc2bow(text) for text in citizen_tokenized ]
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, update_every=1, chunksize=10000, passes=20)
lda.print_topics()

[(0,
  '0.038*"답변" + 0.025*"설치" + 0.021*"언제" + 0.014*"글" + 0.013*"문의" + 0.013*"공지" + 0.010*"이용" + 0.010*"가능" + 0.009*"대여소" + 0.009*"장애"'),
 (1,
  '0.054*"반납" + 0.028*"대여" + 0.028*"자전거" + 0.021*"시간" + 0.020*"안되" + 0.018*"결제" + 0.014*"처리" + 0.012*"이용" + 0.011*"연결" + 0.010*"추가"'),
 (2,
  '0.057*"자전거" + 0.019*"따릉이" + 0.016*"고장" + 0.011*"이용" + 0.008*"생각" + 0.008*"있는" + 0.008*"신고" + 0.007*"입니다" + 0.007*"경우" + 0.006*"있"'),
 (3,
  '0.017*"자전거" + 0.015*"고장" + 0.015*"대여" + 0.014*"안되" + 0.014*"신고" + 0.010*"로그인" + 0.010*"반납" + 0.009*"어플" + 0.009*"이용" + 0.008*"번호"'),
 (4,
  '0.014*"화면" + 0.012*"비밀번호" + 0.011*"표시" + 0.011*"지도" + 0.011*"버튼" + 0.010*"대여소" + 0.008*"비공개" + 0.008*"위치" + 0.007*"안되" + 0.007*"글"'),
 (5,
  '0.033*"따릉이" + 0.015*"거치대" + 0.013*"대여소" + 0.011*"시간" + 0.010*"한강" + 0.009*"공원" + 0.008*"설치" + 0.008*"이용" + 0.007*"질문" + 0.007*"거리"'),
 (6,
  '0.036*"안장" + 0.025*"따릉이" + 0.021*"단" + 0.020*"높이" + 0.017*"자전거" + 0.011*"마일리지" + 0.011*"이용" + 0.011*"무릎" + 0.010*"환승" + 0.010*"키"'),
 (7,
  '0.030*

In [444]:
lda_result = lda.print_topics()

In [458]:
list(lda_result[1])[1:]

['0.054*"반납" + 0.028*"대여" + 0.028*"자전거" + 0.021*"시간" + 0.020*"안되" + 0.018*"결제" + 0.014*"처리" + 0.012*"이용" + 0.011*"연결" + 0.010*"추가"']

In [27]:
verb_list = []
for sentence in sentences_tag:
    for word, tag in sentence:
        if tag in ['Verb']:
            
            verb_list.append(word)
            
len(adj_list)
counts_adj = Counter(verb_list)
print(counts_adj.most_common(100))

[('할', 191), ('잘', 154), ('들', 119), ('건지', 115), ('하는', 115), ('합니다', 107), ('했는데', 96), ('하면', 78), ('하게', 70), ('되고', 68), ('하기', 68), ('만든', 65), ('뜨고', 62), ('하는데', 59), ('가', 52), ('넘', 50), ('않고', 48), ('하네요', 48), ('참', 48), ('까', 46), ('쓰', 44), ('만들', 44), ('쓸', 42), ('했', 42), ('된', 40), ('서', 39), ('한', 39), ('들이', 38), ('하지', 37), ('되', 35), ('되는', 33), ('해야', 32), ('되는데', 32), ('하세요', 32), ('되지', 31), ('됨', 28), ('대', 27), ('되서', 26), ('떠서', 26), ('만들어', 25), ('지', 25), ('써', 23), ('누르면', 22), ('않아', 22), ('빌릴', 22), ('않음', 21), ('고쳐주세요', 21), ('나서', 21), ('돼요', 20), ('했습니다', 20), ('는', 19), ('됩니다', 19), ('들은', 19), ('되면', 18), ('오고', 18), ('돼서', 18), ('뜨', 18), ('받고', 18), ('하네', 17), ('않네요', 17), ('나오고', 17), ('되네요', 17), ('쳐', 17), ('쓰고', 17), ('째', 16), ('드려요', 16), ('빌리는데', 16), ('하다', 16), ('걸리고', 16), ('만드는', 16), ('찾', 16), ('깔', 15), ('내고', 15), ('하려', 15), ('할수', 15), ('하는거', 15), ('눌러도', 14), ('걸림', 14), ('날리고', 14), ('할거면', 14), ('쓰는', 14), ('와', 14), ('먹고', 1

In [17]:
counts_adj = Counter(adj_list)
print(counts_adj.most_common(100))

[('안되고', 116), ('어떻게', 87), ('입니다', 86), ('있', 77), ('안됨', 71), ('좋아요', 57), ('이런', 53), ('있는', 50), ('좋은데', 48), ('같은', 43), ('좋은', 43), ('없', 43), ('안되서', 39), ('없는', 37), ('없고', 37), ('같아요', 35), ('안되네요', 35), ('없음', 34), ('없네요', 34), ('많고', 34), ('같습니다', 31), ('새로', 29), ('있습니다', 29), ('아닌', 27), ('느리고', 27), ('있는데', 26), ('느려', 25), ('많은', 22), ('안되요', 22), ('안되는', 22), ('좋', 21), ('없다', 20), ('있는지', 20), ('좋겠습니다', 19), ('안됩니다', 17), ('없다고', 16), ('좋겠', 15), ('있고', 14), ('아깝다', 14), ('같네요', 14), ('있게', 14), ('어떤', 14), ('있음', 13), ('굉장히', 13), ('있으면', 13), ('느려요', 13), ('이럴거면', 13), ('이런거', 12), ('많음', 12), ('아까', 12), ('안된다', 12), ('있다', 12), ('좋겠네요', 12), ('없습니다', 11), ('많', 11), ('없네', 11), ('느려서', 11), ('많아요', 10), ('같음', 10), ('없어', 10), ('안되는데', 10), ('많습니다', 10), ('같다', 9), ('좋습니다', 9), ('있다고', 9), ('안되나요', 9), ('없는데', 9), ('안되', 9), ('좋으나', 9), ('같', 9), ('많아서', 9), ('같은데', 9), ('없게', 9), ('있는거', 8), ('없으면', 8), ('어렵고', 8), ('빠른', 8), ('안되면', 8), ('편하게', 8), ('있네요', 8), ('

In [18]:
import pandas as pd
df = pd.DataFrame(counts_adj.most_common(100))
df.to_excel('Yoon/s_bike_google_adj.xlsx')

#### 명사 추출후 상위 10개 항목 정렬

In [8]:
df['twitter(no Stem)'] = df['review_text'].apply(lambda x: twitter.pos(x))
df['twitter(Stem)'] = df['review_text'].apply(lambda x: twitter.pos(x, stem=True))

In [13]:
df['cutomizedTwitter(no Stem)'] = df['review_text'].apply(lambda x: ctwitter.pos(x))
df['cutomizedTwitter(Stem)'] = df['review_text'].apply(lambda x: ctwitter.pos(x, stem=True))

In [15]:
df.head(3)

Unnamed: 0,user_name,date,rating,thumbs_up,review_text,twitter(no Stem),twitter(Stem),cutomizedTwitter,cutomizedTwitter(no Stem),cutomizedTwitter(Stem)
0,hayeon,2019년 9월 29일,1,11,뭐 누르기만 해도 앱이 계속 로그아웃되고 네이버로그인을 해봤자 어차피 여기 가입을 ...,"[(뭐, Noun), (누르기, Noun), (만, Josa), (해도, Noun)...","[(뭐, Noun), (누르기, Noun), (만, Josa), (해도, Noun)...","[뭐, 누르기, 만, 해도, 앱, 이, 계속, 로그아웃, 되고, 네이버, 로그인, ...","[(뭐, Noun), (누르기, Noun), (만, Josa), (해도, Noun)...","[(뭐, Noun), (누르기, Noun), (만, Josa), (해도, Noun)..."
1,신아현,2019년 9월 23일,2,4,1. 로그인 화면에서 자동 로그인 체크하고 재로그인하면 이상한 비빌번호 이 저장되서...,"[(1, Number), (., Punctuation), (로그인, Noun), (...","[(1, Number), (., Punctuation), (로그인, Noun), (...","[1, ., 로그인, 화면, 에서, 자동, 로그인, 체크, 하고, 재로그인, 하면,...","[(1, Number), (., Punctuation), (로그인, Noun), (...","[(1, Number), (., Punctuation), (로그인, Noun), (..."
2,LUIs sy park,2019년 9월 29일,5,1,너무 좋아요 . 정말 유용하게 잘 사용 하고있음 그런데 많은 자전거가 다 어디로 사...,"[(너무, Adverb), (좋아요, Adjective), (., Punctuati...","[(너무, Adverb), (좋다, Adjective), (., Punctuatio...","[너무, 좋아요, ., 정말, 유용, 하게, 잘, 사용, 하고, 있음, 그런데, 많...","[(너무, Adverb), (좋아요, Adjective), (., Punctuati...","[(너무, Adverb), (좋아요, Adjective), (., Punctuati..."


In [18]:
df.to_excel('2020_seoulbike/comparsion_twitter.xlsx')

In [22]:
import pandas as pd

df = pd.DataFrame.from_dict(n_counts, orient='index').reset_index()
df = df.rename(columns={'index':'noun', 0:'count'})
df=df.sort_values(by=['count'], axis=0, ascending=False)


In [23]:
#엑셀로 저장
df.to_excel("Yoon/sbike_google_noun.xlsx")

In [18]:
import pandas as pd
df2 = pd.DataFrame.from_dict(counts_adj, orient='index').reset_index()
df2 = df2.rename(columns={'index':'noun', 0:'count'})
df2 = df2.sort_values(by=['count'], axis=0, ascending=False)
df2.head()
df2.to_excel('./google_adj_count.xlsx')