In [9]:
import gensim
import pandas as pd
from gensim import corpora
from gensim import similarities
from pprint import pprint

In [10]:
def calSimilarity(app: str, row: int, num_topics: int = 2, passes: int = 15, random_state=None)->pd.DataFrame:
    """Score every review of ``app`` against each advertised feature in LDA topic space.

    For each rating bucket returned by ``get_sort_by_rating`` (high ratings
    first, then low ratings) an LDA model is trained on that bucket's reviews.
    Every feature line taken from ``kw_db.csv`` is projected into the same
    topic space and compared with every review through a gensim
    ``MatrixSimilarity`` index. One CSV per bucket is written to
    ``crawler_result/{app}/feature_sim_score_{app}_{bucket}.csv``.

    Args:
        app: Folder name / file prefix of the app under ``crawler_result/``.
        row: Row label in ``kw_db.csv`` whose ``functions`` cell holds the
            newline-separated feature list to compare against.
        num_topics: Number of LDA topics (default 2, the original setting).
        passes: Number of LDA training passes (default 15, the original setting).
        random_state: Optional seed forwarded to ``LdaModel``. Training is
            stochastic, so scores change between runs unless a seed is given;
            ``None`` keeps the original unseeded behaviour.

    Returns:
        The score table of the LAST bucket processed (low ratings) with the
        columns ``feature``, ``review`` and ``similarity score``. The
        high-ratings table is only available through its CSV file.
    """
    bucket_names = ['high_ratings', 'low_ratings']

    # The feature list is the same for both buckets, so parse kw_db.csv once
    # instead of re-reading it on every loop iteration.
    kw_df = pd.read_csv('kw_db.csv')
    feature_text = kw_df.loc[row, 'functions']
    # Blank lines carry no tokens and would only add noise rows to the output.
    features = [line for line in feature_text.split('\n') if line.strip()]

    for bucket_name, bucket_df in zip(bucket_names, get_sort_by_rating(app)):
        # Missing reviews are dropped and non-string cells are cast to str so
        # that tokenisation below cannot fail on NaN / numeric values.
        reviews = bucket_df['review'].dropna().astype(str).to_list()

        # Plain lowercase whitespace tokenisation (no stemming or lemmatisation).
        texts = [review.lower().split() for review in reviews]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(tokens) for tokens in texts]

        lda_model = gensim.models.LdaModel(
            corpus,
            num_topics=num_topics,
            id2word=dictionary,
            passes=passes,
            random_state=random_state,
        )

        # Give the index its width explicitly: otherwise gensim has to scan the
        # transformed corpus once more just to infer it, and the inferred width
        # can be smaller than num_topics when a topic is filtered out of every
        # review by the model's minimum-probability cut-off.
        index = similarities.MatrixSimilarity(
            lda_model[corpus], num_features=lda_model.num_topics
        )

        # Collect one record per (feature, review) pair and build the frame once.
        results = []
        for feature in features:
            # NOTE(review): a feature whose words never occur in the reviews
            # presumably yields an empty bag-of-words here, which would explain
            # the repeated 0.707107 scores in the outputs - confirm.
            feature_bow = dictionary.doc2bow(feature.lower().split())
            sims = index[lda_model[feature_bow]]
            results.extend(
                {'feature': feature, 'review': review, 'similarity score': score}
                for review, score in zip(reviews, sims)
            )

        df = pd.DataFrame(results)
        df.to_csv(f'crawler_result/{app}/feature_sim_score_{app}_{bucket_name}.csv')
    return df

def get_sort_by_rating(app: str)->[pd.DataFrame, pd.DataFrame]:
    """Load the pre-split review tables of ``app``.

    Reads the two CSV files ``{app}_high_ratings.csv`` and
    ``{app}_low_ratings.csv`` from ``crawler_result/{app}/``.

    Returns:
        A two-element list: the high-ratings frame first, the low-ratings
        frame second.
    """
    csv_paths = (
        f'crawler_result/{app}/{app}_high_ratings.csv',
        f'crawler_result/{app}/{app}_low_ratings.csv',
    )
    # Order matters: callers index the result as [high, low].
    return [pd.read_csv(csv_path) for csv_path in csv_paths]

In [11]:
# Load the keyword table and display it. calSimilarity reads the feature list
# for each app from the 'functions' column of this same file.
kw_df = pd.read_csv('kw_db.csv')
kw_df

Unnamed: 0.1,Unnamed: 0,functions
0,Duoswim:swim workout app,personal AI coach\nguided workouts\n1000+ swim...
1,swim.com:swim tracker,track your swims and achieve your goals\ndisco...
2,swimup-swimming training,personalized training plans\ndrill&technique v...
3,form swim,"choose own plans or workouts in app,can also l..."
4,myswimpro:#1 swim workout app,MySwimPro Coach unlocks:\nPersonalized swim Tr...
5,garmin connect,記錄距離\n設定泳池大小\n泳姿辨識\n自動休息\n技術訓練日誌
6,fitness,automatically tracks your splits and auto sets...


In [12]:
# Score the 'duoswim' reviews against the features in row 0 of kw_db.csv.
# calSimilarity returns only the last table it builds (the low-ratings bucket).
df = calSimilarity('duoswim', 0)
df

Unnamed: 0,feature,review,similarity score
0,personal AI coach,Positive: The coaching side seems ok and decen...,0.707107
1,guided workouts,Positive: The coaching side seems ok and decen...,0.707107
2,1000+ swim workouts,Positive: The coaching side seems ok and decen...,0.941853
3,technique video library,Positive: The coaching side seems ok and decen...,0.707107
4,apple health,Positive: The coaching side seems ok and decen...,0.941852
5,nutrition,Positive: The coaching side seems ok and decen...,0.707107
6,track your swims,Positive: The coaching side seems ok and decen...,0.918823
7,training statistics overview,Positive: The coaching side seems ok and decen...,0.707107


In [13]:
# Score the 'swim-com' reviews against the features in row 1 of kw_db.csv.
# The displayed frame is the low-ratings table; `df` is overwritten by each call.
df = calSimilarity('swim-com', 1)
df

Unnamed: 0,feature,review,similarity score
0,track your swims and achieve your goals,"Strictly from an App Design standpoint, it bre...",0.994882
1,track your swims and achieve your goals,This app has all the potential of being great ...,0.994882
2,track your swims and achieve your goals,"I want to like this app. It has potential, and...",0.994882
3,track your swims and achieve your goals,The new iPhone app format update is the worst ...,0.648286
4,track your swims and achieve your goals,This app is great when it works. Just after th...,0.994882
...,...,...,...
751,view performace over time,I have used this app with my old Apple Watch S...,0.978096
752,view performace over time,I want to use this app. I love swim.com and us...,0.980954
753,view performace over time,***Update - 02-10-2017\nMore updates have been...,0.978096
754,view performace over time,App.has gone from functional to brick. Says it...,0.988214


In [14]:
# Score the 'swimup' reviews against the features in row 2 of kw_db.csv.
# The displayed frame is the low-ratings table; `df` is overwritten by each call.
df = calSimilarity('swimup', 2)
df

Unnamed: 0,feature,review,similarity score
0,personalized training plans,They put me on 2 levels before champion but th...,0.949136
1,personalized training plans,I love Swimup’s YouTube channel so I was reall...,0.326032
2,personalized training plans,No voice tutorials makes these videos useless.,0.391971
3,drill&technique video library,They put me on 2 levels before champion but th...,0.352409
4,drill&technique video library,I love Swimup’s YouTube channel so I was reall...,0.939935
5,drill&technique video library,No voice tutorials makes these videos useless.,0.961692
6,easy-to -follow workouts,They put me on 2 levels before champion but th...,0.949131
7,easy-to -follow workouts,I love Swimup’s YouTube channel so I was reall...,0.326047
8,easy-to -follow workouts,No voice tutorials makes these videos useless.,0.391986
9,analytics to track your progress,They put me on 2 levels before champion but th...,0.333196


In [15]:
# Score the 'form-swim' reviews against the features in row 3 of kw_db.csv.
# The displayed frame is the low-ratings table; `df` is overwritten by each call.
df = calSimilarity('form-swim', 3)
df

Unnamed: 0,feature,review,similarity score
0,"choose own plans or workouts in app,can also l...",The googles are fine and the workout generatio...,0.515510
1,"choose own plans or workouts in app,can also l...","I like the FORM goggles and what they provide,...",0.515510
2,"choose own plans or workouts in app,can also l...",I was really disappointed to learn the collect...,0.515510
3,"choose own plans or workouts in app,can also l...",Goggles/software do function well and as adver...,0.862287
4,"choose own plans or workouts in app,can also l...",I have these googles and they are pretty neat....,0.515510
...,...,...,...
100,customize your goggles(split times/stroke coun...,My one complaint is the main stats can’t inclu...,0.951017
101,customize your goggles(split times/stroke coun...,New software update broke the goggles,0.980066
102,customize your goggles(split times/stroke coun...,As described in title,0.995979
103,customize your goggles(split times/stroke coun...,It recognizes freestyle but all my breast stro...,0.436337


In [16]:
# Score the 'myswimpro' reviews against the features in row 4 of kw_db.csv.
# The displayed frame is the low-ratings table; `df` is overwritten by each call.
df = calSimilarity('myswimpro', 4)
df

Unnamed: 0,feature,review,similarity score
0,MySwimPro Coach unlocks:,This is the most complicated app for beginners...,0.524105
1,MySwimPro Coach unlocks:,Save your money if you use a different watch.\...,0.524105
2,MySwimPro Coach unlocks:,I loved the idea of this app and really wanted...,0.524105
3,MySwimPro Coach unlocks:,"I loved this app, but for the last week or mor...",0.524105
4,MySwimPro Coach unlocks:,"So this app is amazing when it works well, but...",0.524105
...,...,...,...
5305,Follow friends,Bait and switch. 'Free app' but all it gives y...,0.716937
5306,Follow friends,Do not get this app! It will work for a while...,0.722264
5307,Follow friends,I downloaded the app to see if I could use it ...,0.718325
5308,Follow friends,App is useless. Cant even log in using email. ...,0.722342
