- Crawler from 
[pixnet](https://www.pixnet.net/), 
[travelyam](https://travel.yam.com/)
- Cleaning raw data 

In [171]:
'''
使用說明:
#  僅供以 pandas.df 對專題用的 MySQL db 進行資料的輸出,輸入 (不得修改SQL內容)
1. 匯入class --> from connect_MySQL import MysqlDataFrame
2. 需要匯入packages --> pymysql, pandas, sqlalchemy.create_engine, logging
3. 建立 MysqlDataFrame 物件 --> df = MysqlDataFrame('user', 'pwd', 'db')
4  print(df) --> 可印出使用說明
5. df.show_info() --> 顯示現有的 databases, tables 等資訊
6. df.get_pandas_df("table名稱") --> 得到 pandas.df
7. df.use_sql_query("自己輸入的SQL語法") --> 得到 pandas.df
8. insert_pandas_df(df, "table名稱") --> 將 pandas.df 填入 MySQL
'''

import pandas as pd
import logging
from sqlalchemy import create_engine


class MysqlDataFrame:
    def __init__(self, user, pwd, db='tfb1031_project', ip='10.2.16.174'):
        self.user = user
        self.pwd = pwd
        self.db = db
        self.__conn_ip = ip
        self.__stopWords = [
            'alter', 'update', 'delete', 'drop', 'insert',
            'table', 'database'
            ]
        self.__sysDatabase = [
            'information_schema', 'performance_schema', 'mysql',
            'sakila', 'sys', 'world'
            ]

    def __repr__(self):
        return '''
        1. Call "show_info()" to get the db & table list
        2. Call "get_pandas_df()" to get a pd.DataFrame from MySQL table
        3. Call "use_sql_query()" to get a pd.DataFrame with customized SQL 
        4. Call "insert_pandas_df()" to insert pd.DataFrame to MySQL 
        '''

    def __create_conn(self):
        try:
            engine = create_engine(
                f'mysql+pymysql://{self.user}:{self.pwd}@{self.__conn_ip}:3306/{self.db}'
                )
            return engine
        except Exception as err:
            print(logging.error(str(err)))

    def show_info(self):
        engine = self.__create_conn()

        # Get db list
        sql_db = 'SHOW databases;'
        query_db = engine.execute(sql_db).fetchall()
        db = [i[0] for j, i in enumerate(query_db) if i[0] not in self.__sysDatabase]
        # Get table list
        sql_table = 'SHOW tables;'
        query_table = engine.execute(sql_table).fetchall()
        table = [i[0] for i in query_table]

        information = f'Use db = {self.db}\nUser = {self.user}\nDatabase list = {db}\nTable list =  {table}'
        return print(information)

    def get_pandas_df(self, table='test'):
        engine = self.__create_conn()
        sql = f'select * from {table};'
        try:
            df = pd.read_sql_query(sql, engine)
            return df
        except Exception as err:
            print(logging.error(str(err)))

    # Use user-defined SQL
    def use_sql_query(self, input_sql):
        engine = self.__create_conn()
        for word in self.__stopWords:
            if word in input_sql:
                return print("Please don't alter the data")
        try:
            df = pd.read_sql_query(input_sql, engine)
            return df
        except Exception as err:
            print(logging.error(str(err)))

    def insert_pandas_df(self, df, table):
        engine = self.__create_conn()
        try:
            df.to_sql(table, engine, if_exists='append', index=0)
        except Exception as err:
            print(logging.error(str(err)))
    def convert_str_to_list(self, df, column):
        import ast
        return df[f'{column}'].apply(lambda x: ast.literal_eval(x))

data_sql = MysqlDataFrame('tfb1031_12', 'qwe123456', ip='10.2.14.12')
data_sql.show_info()

Use db = tfb1031_project
User = tfb1031_12
Database list = ['test', 'tfb1031_project']
Table list =  ['aut_feature', 'bnb', 'bnb_article', 'restaurant']


In [3]:
import pymongo
import pandas as pd
from pandas import json_normalize
import jieba
import jieba.analyse
import re
from datetime import datetime

## Import raw data from mongedb

In [7]:
conn_str = "mongodb://tfb1031:pwd@10.2.16.174/raw_data_for_project"
client = pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=5000)
client.server_info()
db = client.get_database('raw_data_for_project')
collection_pixnet_hotel = db.pixnet_hotel
collection_travelYam_hotel = db.travelYam_hotel

print('pixnet counts =', collection_pixnet_hotel.count_documents({}))
print('travelYam counts =', collection_travelYam_hotel.count_documents({}))

pixnet counts = 8443
travelYam counts = 697


## Transform json to pandas.df

In [8]:
origin_pixnet = json_normalize([json for json in collection_pixnet_hotel.find({})])
origin_travelYam = json_normalize([json for json in collection_travelYam_hotel.find({})])

# make a copy
pixnet = origin_pixnet.copy()
travelYam = origin_travelYam.copy()

In [9]:
#### tmp copy 
pixnet = origin_pixnet.copy()
travelYam = origin_travelYam.copy()

In [10]:
pixnet.head(1)

Unnamed: 0,_id,title,author,member_uniqid,articleUrl,date,tags,hit,reply_count,content,...,poi.address.zipcode,poi.address.country,poi.address.city,poi.address.town,poi.address.street,poi.cover_image_url,poi.rating.avg,poi.rating.count,poi.member_rating,poi.address
0,pixnet_hotel_0,台北西門町住宿~捷絲旅 四星級飯店 只要平價即可享受! 逛完西門町，直接走回,欣旅程，欣生活,2980359e2f52f69eb7,https://zh472.pixnet.net/blog/post/333805750-%...,2021-10-12 05:46:01,"[台北住宿推薦, 台北住宿2021, 西門町住宿, 西門町, 捷絲旅西門町店, 捷絲旅 台北...",10,8,\n\n\n台北美食--- campus cafe 美式校園輕食餐廳 & Triple c...,...,100,TW,台北市,中正區,中華路一段41號5-9樓,https://pic.pimg.tw/zh472/1631501994-423325219...,4.4,5.0,5.0,


In [11]:
travelYam.head(1)

Unnamed: 0,_id,title,author,articleUrl,date,tags,content,imgUrl
0,travelYam_hotel_1,屏東｜三富大酒店：光與影的現代化設計感旅,跟著小毛一起趴趴GO,https://travel.yam.com/Article.aspx?sn=125433,Oct 04.2021,"[屏東, 恆春, 三富大酒店, 飯店]",這家今年開幕的三富大酒店就位於墾丁前哨站恆春小鎮上，整棟建築物藉由光與影之間的設計搭配，...,[https://live.staticflickr.com/65535/514666269...


In [12]:
# Convert ['date'] format to the same --> YYYY-MM-DD
travelYam['date'] = travelYam['date'].apply(lambda x: datetime.strptime(x, '%b %d.%Y').date())
pixnet['date'] = pixnet['date'].apply(lambda x: pd.Timestamp(x).date())

In [13]:
# Combine "pixnet", "traverYam"
print(pixnet.columns)
print(travelYam.columns)

origin_hotel = pd.concat([pixnet, travelYam], ignore_index=1)
hotel = origin_hotel.copy()

Index(['_id', 'title', 'author', 'member_uniqid', 'articleUrl', 'date', 'tags',
       'hit', 'reply_count', 'content', 'imgUrl', 'poi.hash_id', 'poi.name',
       'poi.branch_store_name', 'poi.link', 'poi.address.zipcode',
       'poi.address.country', 'poi.address.city', 'poi.address.town',
       'poi.address.street', 'poi.cover_image_url', 'poi.rating.avg',
       'poi.rating.count', 'poi.member_rating', 'poi.address'],
      dtype='object')
Index(['_id', 'title', 'author', 'articleUrl', 'date', 'tags', 'content',
       'imgUrl'],
      dtype='object')


In [14]:
# Get the columns what I want & drop duplicates
hotel = hotel[['_id', 'title', 'author', 'date', 'articleUrl', 'imgUrl', 'content']]
hotel = hotel.set_index('_id')
hotel = hotel.drop_duplicates('title', keep="first")
print(hotel.shape)
hotel.head(1)

(7575, 6)


Unnamed: 0_level_0,title,author,date,articleUrl,imgUrl,content
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
pixnet_hotel_0,台北西門町住宿~捷絲旅 四星級飯店 只要平價即可享受! 逛完西門町，直接走回,欣旅程，欣生活,2021-10-12,https://zh472.pixnet.net/blog/post/333805750-%...,[https://pic.pimg.tw/zh472/1631501995-11726028...,\n\n\n台北美食--- campus cafe 美式校園輕食餐廳 & Triple c...


## Data cleaning

In [15]:
hotel['title'] = hotel['title'].apply(lambda x: "|".join(re.findall('\w+' , x)))
hotel['title'] = hotel['title'].str.replace('臺北' ,'台北')
hotel['author'] = hotel['author'].apply(lambda x: "".join(re.findall('\w', x)))
hotel[['title', 'author']].head()

Unnamed: 0_level_0,title,author
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
pixnet_hotel_0,台北西門町住宿|捷絲旅|四星級飯店|只要平價即可享受|逛完西門町|直接走回,欣旅程欣生活
pixnet_hotel_1,天成集團住宿趣|天成文旅|華山町|忠孝新生站|華山文創園區|結合藝廊與旅,Laura
pixnet_hotel_2,五倍券|台北住宿優惠攻略|餐飲再放大1|5倍|住宿放大2倍|用在這些飯店,萊恩先生
pixnet_hotel_3,台北住宿推薦|台北住宿推薦|超夢幻夜景飯店|TOP10,維克
pixnet_hotel_4,台北住宿|首都唯客樂飯店|松江南京老牌平價商旅|日式套房|早餐分享,AlliTerry


In [16]:
# Drop the rows with empty "content"
emptyContent_index = [i for i in hotel.loc[(hotel['content'] == '') ,'content'].index]
hotel = hotel.drop(emptyContent_index)
    
hotel['content'] = hotel['content'].apply(lambda x: re.findall('[\u4e00-\u9fa5]+\d*\w+[\u4e00-\u9fa5]', x))
print(hotel.shape)
hotel.head(3)

(7524, 6)


Unnamed: 0_level_0,title,author,date,articleUrl,imgUrl,content
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
pixnet_hotel_0,台北西門町住宿|捷絲旅|四星級飯店|只要平價即可享受|逛完西門町|直接走回,欣旅程欣生活,2021-10-12,https://zh472.pixnet.net/blog/post/333805750-%...,[https://pic.pimg.tw/zh472/1631501995-11726028...,"[台北美食, 美式校園輕食餐廳, 農人餐桌, 親子餐廳, 徹思叔叔咖啡廳, 牛小路壽喜燒, ..."
pixnet_hotel_1,天成集團住宿趣|天成文旅|華山町|忠孝新生站|華山文創園區|結合藝廊與旅,Laura,2021-10-01,https://laurasweet0712.pixnet.net/blog/post/22...,[https://paper-attachments.dropbox.com/s_523CE...,"[天成文旅, 華山町, 位於華山文化創意園區附近, 是一棟充滿故事的建築物, 這棟建築物興建..."
pixnet_hotel_2,五倍券|台北住宿優惠攻略|餐飲再放大1|5倍|住宿放大2倍|用在這些飯店,萊恩先生,2021-09-30,https://ohmygodohoh.pixnet.net/blog/post/33427...,[https://pic.pimg.tw/ohmygodohoh/1632990704-17...,"[五倍券發放, 使用就在下週, 很久沒出國, 只好轉戰好的飯店放鬆CHILL一下, 究竟用在..."


In [17]:
# Get the "hotelName" from title
jieba.load_userdict('./static/jiebaDict_hotel.txt')
hotelNames = pd.read_csv("./static/hotelNames.csv")
hotelNames_list = [i for i in hotelNames.name]

def getHotelName(data):
    jieba_title = jieba.lcut(data)
    hotelName = [i for i in jieba_title if i in hotelNames_list]
    if hotelName != []:
        return "|".join(hotelName)
    else:
        return None

hotel['hotelName'] = hotel['title'].apply(getHotelName)
hotel.head(3)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Tibame\AppData\Local\Temp\jieba.cache
Loading model cost 0.653 seconds.
Prefix dict has been built successfully.


Unnamed: 0_level_0,title,author,date,articleUrl,imgUrl,content,hotelName
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
pixnet_hotel_0,台北西門町住宿|捷絲旅|四星級飯店|只要平價即可享受|逛完西門町|直接走回,欣旅程欣生活,2021-10-12,https://zh472.pixnet.net/blog/post/333805750-%...,[https://pic.pimg.tw/zh472/1631501995-11726028...,"[台北美食, 美式校園輕食餐廳, 農人餐桌, 親子餐廳, 徹思叔叔咖啡廳, 牛小路壽喜燒, ...",捷絲旅
pixnet_hotel_1,天成集團住宿趣|天成文旅|華山町|忠孝新生站|華山文創園區|結合藝廊與旅,Laura,2021-10-01,https://laurasweet0712.pixnet.net/blog/post/22...,[https://paper-attachments.dropbox.com/s_523CE...,"[天成文旅, 華山町, 位於華山文化創意園區附近, 是一棟充滿故事的建築物, 這棟建築物興建...",天成文旅
pixnet_hotel_2,五倍券|台北住宿優惠攻略|餐飲再放大1|5倍|住宿放大2倍|用在這些飯店,萊恩先生,2021-09-30,https://ohmygodohoh.pixnet.net/blog/post/33427...,[https://pic.pimg.tw/ohmygodohoh/1632990704-17...,"[五倍券發放, 使用就在下週, 很久沒出國, 只好轉戰好的飯店放鬆CHILL一下, 究竟用在...",


In [18]:
getHotelName

<function __main__.getHotelName(data)>

In [19]:
# Get the final data, call "bnb" 
bnb_columns = ['文章標題', '飯店民宿名稱', '作者', '內文', '發文日期', '文章連結', '圖片連結']
bnb = pd.DataFrame(columns=bnb_columns)
bnb[['文章標題', '飯店民宿名稱', '作者', '內文', '發文日期', '文章連結', '圖片連結']] \
    = hotel.reset_index()[['title', 'hotelName', 'author', 'content', 'date', 'articleUrl', 'imgUrl']]

# Combine finaldata from team members
# bnb = pd.concat([bnb, other], ignore_index=1)

print(' bnb columns =', bnb.shape[0], '\n', 'bnb rows =', bnb.shape[1])
bnb.head(3)

 bnb columns = 7524 
 bnb rows = 7


Unnamed: 0,文章標題,飯店民宿名稱,作者,內文,發文日期,文章連結,圖片連結
0,台北西門町住宿|捷絲旅|四星級飯店|只要平價即可享受|逛完西門町|直接走回,捷絲旅,欣旅程欣生活,"[台北美食, 美式校園輕食餐廳, 農人餐桌, 親子餐廳, 徹思叔叔咖啡廳, 牛小路壽喜燒, ...",2021-10-12,https://zh472.pixnet.net/blog/post/333805750-%...,[https://pic.pimg.tw/zh472/1631501995-11726028...
1,天成集團住宿趣|天成文旅|華山町|忠孝新生站|華山文創園區|結合藝廊與旅,天成文旅,Laura,"[天成文旅, 華山町, 位於華山文化創意園區附近, 是一棟充滿故事的建築物, 這棟建築物興建...",2021-10-01,https://laurasweet0712.pixnet.net/blog/post/22...,[https://paper-attachments.dropbox.com/s_523CE...
2,五倍券|台北住宿優惠攻略|餐飲再放大1|5倍|住宿放大2倍|用在這些飯店,,萊恩先生,"[五倍券發放, 使用就在下週, 很久沒出國, 只好轉戰好的飯店放鬆CHILL一下, 究竟用在...",2021-09-30,https://ohmygodohoh.pixnet.net/blog/post/33427...,[https://pic.pimg.tw/ohmygodohoh/1632990704-17...


In [20]:
# Create new index to insert MySQLf"
primaryKey_ID = pd.Series([f'bnb_{i}' for i in range(1, bnb.shape[0])])
primaryKey_ID

0          bnb_1
1          bnb_2
2          bnb_3
3          bnb_4
4          bnb_5
          ...   
7518    bnb_7519
7519    bnb_7520
7520    bnb_7521
7521    bnb_7522
7522    bnb_7523
Length: 7523, dtype: object

In [21]:
# Save the result
# bnb.to_csv('./static/bnb.csv', index=0)

In [None]:
bnb.groupby('作者').count().sort_values(by='文章標題', ascending=0)

In [None]:
bnb.groupby('飯店民宿名稱').count().sort_values(by='文章標題', ascending=0)

## ====================== insert ========================

In [172]:
origin = pd.read_csv(r'./data/first_clean/booking_com_ALL10_25_23_06_1027-copy.csv')
data = origin.copy()
data


columns = ['bnb_id', 'bnb_name', 'bnb_url', 'star', 'price',
           'address', 'origin_feature', 'score', 'image_url', 'city',
            'x', 'y']
final_data = pd.DataFrame(columns=columns)
final_data

final_data['bnb_name'] = data['hotel-name'].apply(lambda x:"".join(re.findall('\w+', x)))
final_data['bnb_url'] = data['hotel-url']
final_data['star'] = data['hotel-star-rank']
final_data['price'] = data['room-price']
final_data['address'] = data['hotel-address']
final_data['origin_feature'] = data['hotel-feature-list']
final_data['score'] = data['hotel-review-scores']

for j, i in enumerate(final_data['score']):
    if len(str(i)) > 3 :
        final_data = final_data.drop(j)
    elif len(str(i)) == 0:
        final_data = final_data.drop(j)
final_data['score'] = final_data['score']
final_data['image_url'] = data['hotel-img-list']
final_data['city'] = data['hotel-city ']

for j, i in enumerate(final_data['score']):
    if len(str(i)) > 3 :
        print(i)
        final_data = final_data.drop(j)
        
print(final_data.loc[final_data['score'] == '[]', 'bnb_id'])
final_data = final_data.drop([51, 52, 569, 1166, 1352])
final_data['score'] = final_data['score'].apply(lambda x : float(x))

final_data = final_data.reset_index()
final_data = final_data.drop(columns=['index'])
bnbid = pd.Series(['100'+ str(i).zfill(5) for i in range(1, final_data.shape[0]+1)])
final_data['bnb_id'] = bnbid


print(final_data.shape)
final_data.head(1)

569     NaN
1166    NaN
1352    NaN
Name: bnb_id, dtype: object
(1399, 12)


Unnamed: 0,bnb_id,bnb_name,bnb_url,star,price,address,origin_feature,score,image_url,city,x,y
0,10000001,台北109青旅,https://www.booking.com/hotel/tw/109-hostel-ta...,3,1984,台北中正區博愛路36號3樓,['\n\n\n衛浴\n\n\n\n\n\n\n\n\n\n衛生紙\n\n\n\n\n\n\...,9.0,['https://cf.bstatic.com/xdata/images/xphoto/s...,台北市,,


## =============== hotel ===============

In [182]:
origin = pd.read_csv(r'./data/first_clean/hotel_articles.csv')
hdata = origin.copy()

columns = ['文章標題', '飯店民宿名稱', '作者', '內文', '發文日期', '文章連結', '圖片連結']
final_bnb = pd.DataFrame(columns=columns)

hdata['文章標題'] = hdata['title']
hdata['飯店民宿名稱'] = hdata['hotel_name']
hdata['作者'] = hdata['author']
hdata['內文'] = hdata['article']
hdata['發文日期'] = hdata['time']
hdata['文章連結'] = hdata['article_url']
hdata['圖片連結'] = hdata['imgURL']

hdata = hdata[['文章標題', '飯店民宿名稱', '作者', '內文', '發文日期', '文章連結', '圖片連結']]
final_bnb = pd.concat([bnb, hdata], axis=0)


authorid = pd.Series(['300'+ str(i).zfill(5) for i in range(0, 5700)])
author = final_bnb.groupby('作者', as_index=0).count().drop(0)

author['aut_id'] = authorid
author['author'] = author[['作者']]
author['tag'] = None
author = author[['aut_id', 'author', 'tag']]

# data_sql.insert_pandas_df(author, 'bnb_author')

In [176]:
columns = ['bnb_art_id', 'title', 'content', 'date', 'art_url', 'image_url', 'bnb_aut_id', 'bnb_id']
article = pd.DataFrame(columns=columns)
article['title'] = final_bnb['文章標題']
article['content'] = final_bnb['內文']
article['date'] = final_bnb['發文日期']
article['art_url'] = final_bnb['文章連結']
article['image_url'] = final_bnb['圖片連結']
article['bnb_name'] = final_bnb['飯店民宿名稱']
article['author'] = final_bnb['作者']


article_id = pd.Series(['400'+ str(i).zfill(5) for i in range(1, article.shape[0]+1)])

target_bnb = final_data[['bnb_id', 'bnb_name']]
target_aut = author[['bnb_aut_id', 'author']]


a = pd.merge(article, target_bnb, how='left', on='bnb_name')
final_article = pd.merge(a, target_aut, how='left', on='author')
final_article['bnb_art_id'] = article_id
final_article.head(10)


final_article = final_article[['bnb_art_id', 'title', 'content', 'date', 'art_url', 
                               'image_url', 'bnb_aut_id_y', 'bnb_id_y']]
final_article['aut_id'] = final_article['bnb_aut_id_y']
final_article['bnb_id'] = final_article['bnb_id_y']
final_article = final_article[['bnb_art_id', 'title', 'content', 'date', 'art_url', 
                               'image_url', 'aut_id', 'bnb_id']]

final_article['content'] = final_article['content'].apply(lambda x:str(x))
final_article['image_url'] = final_article['image_url'].apply(lambda x:str(x))
final_article.head()

Unnamed: 0,bnb_art_id,title,content,date,art_url,image_url,aut_id,bnb_id
0,40000001,台北西門町住宿|捷絲旅|四星級飯店|只要平價即可享受|逛完西門町|直接走回,"['台北美食', '美式校園輕食餐廳', '農人餐桌', '親子餐廳', '徹思叔叔咖啡廳'...",2021-10-12,https://zh472.pixnet.net/blog/post/333805750-%...,['https://pic.pimg.tw/zh472/1631501995-1172602...,30005027,
1,40000002,天成集團住宿趣|天成文旅|華山町|忠孝新生站|華山文創園區|結合藝廊與旅,"['天成文旅', '華山町', '位於華山文化創意園區附近', '是一棟充滿故事的建築物',...",2021-10-01,https://laurasweet0712.pixnet.net/blog/post/22...,['https://paper-attachments.dropbox.com/s_523C...,30000558,
2,40000003,五倍券|台北住宿優惠攻略|餐飲再放大1|5倍|住宿放大2倍|用在這些飯店,"['五倍券發放', '使用就在下週', '很久沒出國', '只好轉戰好的飯店放鬆CHILL一...",2021-09-30,https://ohmygodohoh.pixnet.net/blog/post/33427...,['https://pic.pimg.tw/ohmygodohoh/1632990704-1...,30005360,
3,40000004,台北住宿推薦|台北住宿推薦|超夢幻夜景飯店|TOP10,"['台北住宿推薦', '超夢幻市景', '台北是台灣的首都', '是旅遊首選之一', '台北...",2021-09-26,https://mazda770.pixnet.net/blog/post/33421641...,['https://pic.pimg.tw/mazda770/1570324867-2843...,30005250,
4,40000005,台北住宿|首都唯客樂飯店|松江南京老牌平價商旅|日式套房|早餐分享,"['之前住過首都大飯店旗艦館', '很喜歡這家的住宿', '很可惜去年疫情關係停業了', '...",2021-08-20,https://tloveq.pixnet.net/blog/post/47813642,['https://pic.pimg.tw/tloveq/1627810982-275640...,30000062,


In [183]:
# data_sql.insert_pandas_df(final_data, 'bnb')
# data_sql.insert_pandas_df(author, 'author_feature')
# data_sql.insert_pandas_df(final_article, 'bnb_article')