In [2]:
import gensim
import pandas as pd
from gensim import corpora
from gensim import similarities
from pprint import pprint

In [10]:
def calSimilarity(app: str, row: int)->pd.DataFrame:
    name_list = ['high_ratings', 'low_ratings']
    sort_by_rating = get_sort_by_rating(app)
    for i in range(2):
        # reviews_df = pd.read_csv(f'crawler_result/{app}/{app}.csv')
        df = sort_by_rating[i]
        reviews = df['content']  # Update to use 'content' instead of 'review'
        # list of app reviews
        reviews = reviews.to_list()
        # lemmetation
        texts = [[word for word in str(review).lower().split()] for review in reviews if review is not None and not pd.isna(review)]
        # create dict
        dictionary = corpora.Dictionary(texts)
        # bag-of-words
        corpus = [dictionary.doc2bow(text) for text in texts]
        # LDA modeling
        lda_model = gensim.models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)
        # topics
        # pprint(lda_model.print_topics())
        kw_df = pd.read_csv('kw_db.csv')
        feature_df = kw_df['functions'][row]
        # list of features
        features = feature_df.split('\n')
        # use gensim similarities module
        index = similarities.MatrixSimilarity(lda_model[corpus])
        # 創建一個空的列表來存儲所有的結果
        results = []
        # compare the similarity
        for feature in features:
            feature_bow = dictionary.doc2bow(feature.lower().split())
            feature_lda = lda_model[feature_bow]
            sims = index[feature_lda]
            for review, score in zip(reviews, sims):
                # 將每次迭代的結果添加到results列表中
                results.append({'feature': feature, 'review': review, 'similarity score': score})
        # 使用results列表來創建一個DataFrame
        df = pd.DataFrame(results)
        df.to_csv(f'feature_sim_score_{app}_{name_list[i]}.csv')
    return df

def get_sort_by_rating(app: str)->[pd.DataFrame, pd.DataFrame]:
    # Splitting the combined dataset into two based on the score
    combined_data = pd.read_csv(f'result/{app}/{app}_combined.csv')
    # Good reviews with scores 4 and 5
    good_reviews = combined_data[combined_data['score'] >= 4]
    # Bad reviews with scores 1, 2, and 3
    bad_reviews = combined_data[combined_data['score'] <= 3]
    return [good_reviews, bad_reviews] 

In [4]:
kw_df = pd.read_csv('kw_db.csv')
kw_df

Unnamed: 0.1,Unnamed: 0,functions
0,Duoswim:swim workout app,personal AI coach\nguided workouts\n1000+ swim...
1,swim.com:swim tracker,track your swims and achieve your goals\ndisco...
2,swimup-swimming training,personalized training plans\ndrill&technique v...
3,form swim,"choose own plans or workouts in app,can also l..."
4,myswimpro:#1 swim workout app,MySwimPro Coach unlocks:\nPersonalized swim Tr...
5,garmin connect,記錄距離\n設定泳池大小\n泳姿辨識\n自動休息\n技術訓練日誌
6,fitness,automatically tracks your splits and auto sets...


In [15]:
kw_df['functions'][0]

'personal AI coach\nguided workouts\n1000+ swim workouts\ntechnique video library\napple health\nnutrition\ntrack your swims\ntraining statistics overview'

In [11]:
df = calSimilarity('duoswim', 0)
df

IndexError: index 1 is out of bounds for axis 0 with size 1

In [6]:
df = calSimilarity('swim-com', 1)
df

Unnamed: 0,feature,review,similarity score
0,track your swims and achieve your goals,"Strictly from an App Design standpoint, it bre...",0.862993
1,track your swims and achieve your goals,This app has all the potential of being great ...,0.862993
2,track your swims and achieve your goals,"I want to like this app. It has potential, and...",0.862993
3,track your swims and achieve your goals,The new iPhone app format update is the worst ...,0.521507
4,track your swims and achieve your goals,This app is great when it works. Just after th...,0.505217
...,...,...,...
2113,view performace over time,it worked consistently. I bought the Pebble a...,0.940386
2114,view performace over time,"The app will work, it would track my swim on e...",0.381513
2115,view performace over time,"Polar m600 is an Android watch. However, this ...",0.375241
2116,view performace over time,On my pebble steel I get a message to check my...,0.949478


In [7]:
df = calSimilarity('swimup', 2)
df

Unnamed: 0,feature,review,similarity score
0,personalized training plans,They put me on 2 levels before champion but th...,0.940652
1,personalized training plans,I love Swimup’s YouTube channel so I was reall...,0.936594
2,personalized training plans,No voice tutorials makes these videos useless.,0.417919
3,personalized training plans,"The app is decent, I am using it, but the beha...",0.936594
4,personalized training plans,A lot of potential and probably the best of th...,0.350417
5,personalized training plans,Subscription schemes to bleed customers dry ad...,0.940388
6,personalized training plans,The videod keep loading without opening,0.436227
7,personalized training plans,This app did not meet my expectation. I have u...,0.945363
8,personalized training plans,"I make a subscription and app stop working , c...",0.435867
9,personalized training plans,Greate app but It is not available for samsung...,0.398985


In [8]:
df = calSimilarity('form-swim', 3)
df

Unnamed: 0,feature,review,similarity score
0,"choose own plans or workouts in app,can also l...",The googles are fine and the workout generatio...,0.998270
1,"choose own plans or workouts in app,can also l...","I like the FORM goggles and what they provide,...",0.998270
2,"choose own plans or workouts in app,can also l...",I was really disappointed to learn the collect...,0.998270
3,"choose own plans or workouts in app,can also l...",Goggles/software do function well and as adver...,0.998270
4,"choose own plans or workouts in app,can also l...",I have these googles and they are pretty neat....,0.998270
...,...,...,...
215,customize your goggles(split times/stroke coun...,"Tried to download the app multiple times, but ...",0.283592
216,customize your goggles(split times/stroke coun...,Cannot download. Took 2+ hours and 6 attempts ...,0.260046
217,customize your goggles(split times/stroke coun...,I get many ptoblem when try to connect with de...,0.268003
218,customize your goggles(split times/stroke coun...,"Would be 5 star, only.problrm that's a pain fo...",0.915248


In [12]:
df = calSimilarity('myswimpro', 4)
df

Unnamed: 0,feature,review,similarity score
0,MySwimPro Coach unlocks:,This is the most complicated app for beginners...,0.977847
1,MySwimPro Coach unlocks:,Save your money if you use a different watch.\...,0.242199
2,MySwimPro Coach unlocks:,I loved the idea of this app and really wanted...,0.242199
3,MySwimPro Coach unlocks:,"I loved this app, but for the last week or mor...",0.242199
4,MySwimPro Coach unlocks:,"So this app is amazing when it works well, but...",0.242199
...,...,...,...
9985,Follow friends,This is A good Log workout Love it,0.833289
9986,Follow friends,Not useful at all.,0.906696
9987,Follow friends,Expensive,0.945184
9988,Follow friends,Refund me.,0.855220
