In [17]:
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV

In [24]:
reviews = pd.read_csv('playstore preprocessed for keywords.csv ')

<h2>Preprocessing for Playstore Data for LDA<h2>

In [26]:
import pandas as pd
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Download the NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Function to keep only nouns in a sentence
def keep_nouns(sentence):
    tokens = word_tokenize(sentence)
    pos_tags = pos_tag(tokens)
    nouns = [word for word, pos in pos_tags if pos.startswith('NN')]
    return ' '.join(nouns)

# Apply the function to the 'Reviews' column
reviews['Review'] = reviews['Review'].apply(keep_nouns)

# Display the DataFrame
print(reviews)


[nltk_data] Downloading package punkt to /home/mrahm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mrahm/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


                                                 Review  Category  \
0     update Back button Facebook idea months settin...         0   
1     update backup arrow phone i facebook arrow fac...         0   
2     app way times updates app videos times anythin...         0   
3     Edit Cool button issue lot button THE APP garb...         0   
4     ability phone button bottom screen time app ad...         0   
...                                                 ...       ...   
9279                     Bring landscape view streaming         1   
9280                                                way         0   
9281                         Great apps Thanks memories         1   
9282                       notification cant view Thank         1   
9283  Facebook policy freedom expression opposite Tw...         0   

                                      content_processed  
0     ['update', 'facebook', 'idea', 'improves', 'mo...  
1     ['backup', 'arrow', 'phone', 'im', 'facebook',...

In [29]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 17.9 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25h  Downloading pyLDAvis-3.3.0.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 67.1 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25h  Downloading pyLDAvis-3.2.2.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 58.4 MB/s eta 0:00:01
Collecting numexpr
  Downloading numexpr-2.8.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (378 kB)
[K     |████████████████████████████████| 378 kB 59.2 MB/s eta 0:00:

<h2>LDA<h2>

In [30]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora, models
import matplotlib.pyplot as plt


# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Preprocess the text data
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    words = word_tokenize(text.lower())
    return [word for word in words if word.isalpha() and word not in stop_words]

reviews['processed_text'] = reviews['Review'].apply(preprocess_text)

# Create a dictionary and a corpus
dictionary = corpora.Dictionary(reviews['processed_text'])
corpus = [dictionary.doc2bow(text) for text in reviews['processed_text']]

# Build the LDA model
num_topics = 20
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Print the topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

# Visualize the topics
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, 'lda_visualization.html')

# Display the HTML file in your browser
import webbrowser
webbrowser.open('lda_visualization.html')


[nltk_data] Downloading package punkt to /home/mrahm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mrahm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(0, '0.064*"everything" + 0.063*"app" + 0.046*"session" + 0.037*"facebook" + 0.034*"nothing"')
(1, '0.175*"button" + 0.091*"update" + 0.088*"phone" + 0.082*"app" + 0.038*"facebook"')
(2, '0.250*"ads" + 0.073*"feed" + 0.062*"news" + 0.059*"content" + 0.051*"ad"')
(3, '0.077*"people" + 0.067*"facebook" + 0.045*"share" + 0.037*"app" + 0.032*"features"')
(4, '0.381*"app" + 0.078*"time" + 0.042*"updates" + 0.029*"facebook" + 0.020*"update"')
(5, '0.197*"facebook" + 0.089*"page" + 0.048*"issue" + 0.032*"profile" + 0.028*"please"')
(6, '0.104*"freedom" + 0.089*"application" + 0.065*"speech" + 0.047*"expression" + 0.046*"star"')
(7, '0.110*"comments" + 0.107*"post" + 0.061*"comment" + 0.041*"keeps" + 0.031*"keyboard"')
(8, '0.149*"video" + 0.049*"version" + 0.048*"photos" + 0.040*"videos" + 0.036*"screen"')
(9, '0.078*"data" + 0.057*"privacy" + 0.051*"place" + 0.041*"community" + 0.036*"standards"')
(10, '0.174*"posts" + 0.166*"friends" + 0.083*"pages" + 0.047*"family" + 0.035*"groups"')
(11, 

True

In [3]:
!pip install pandas scikit-learn gensim




In [4]:
!pip install gensim



<h3>Generating dictionary of Topics/Keywords for Playstore Data<h3>

In [42]:
data = [
    ('0.064*"everything" + 0.063*"app" + 0.046*"session" + 0.037*"facebook" + 0.034*"nothing"'),
    ('0.175*"button" + 0.091*"update" + 0.088*"phone" + 0.082*"app" + 0.038*"facebook"'),
    ('0.250*"ads" + 0.073*"feed" + 0.062*"news" + 0.059*"content" + 0.051*"ad"'),
    ('0.077*"people" + 0.067*"facebook" + 0.045*"share" + 0.037*"app" + 0.032*"features"'),
    ('0.381*"app" + 0.078*"time" + 0.042*"updates" + 0.029*"facebook" + 0.020*"update"'),
    ('0.197*"facebook" + 0.089*"page" + 0.048*"issue" + 0.032*"profile" + 0.028*"please"'),
    ('0.104*"freedom" + 0.089*"application" + 0.065*"speech" + 0.047*"expression" + 0.046*"star"'),
    ('0.110*"comments" + 0.107*"post" + 0.061*"comment" + 0.041*"keeps" + 0.031*"keyboard"'),
    ('0.149*"video" + 0.049*"version" + 0.048*"photos" + 0.040*"videos" + 0.036*"screen"'),
    ('0.078*"data" + 0.057*"privacy" + 0.051*"place" + 0.041*"community" + 0.036*"standards"'),
    ('0.174*"posts" + 0.166*"friends" + 0.083*"pages" + 0.047*"family" + 0.035*"groups"'),
    ('0.144*"videos" + 0.089*"reels" + 0.070*"video" + 0.053*"option" + 0.037*"browser"'),
    ('0.141*"marketplace" + 0.067*"bar" + 0.040*"stories" + 0.037*"song" + 0.033*"auto"'),
    ('0.109*"problem" + 0.082*"music" + 0.075*"story" + 0.046*"facebook" + 0.039*"picture"'),
    ('0.105*"facebook" + 0.057*"problems" + 0.055*"lot" + 0.049*"apps" + 0.044*"accounts"'),
    ('0.074*"information" + 0.043*"app" + 0.038*"system" + 0.035*"meta" + 0.031*"money"'),
    ('0.151*"notifications" + 0.071*"times" + 0.067*"day" + 0.063*"app" + 0.037*"days"'),
    ('0.222*"account" + 0.072*"facebook" + 0.051*"password" + 0.039*"email" + 0.037*"phone"'),
    ('0.086*"messages" + 0.071*"notification" + 0.054*"message" + 0.053*"messenger" + 0.049*"someone"'),
    ('0.270*"fb" + 0.025*"time" + 0.024*"timeline" + 0.023*"memories" + 0.022*"events"')
]
newlist = []

# Iterate through each line of data
for line in data:
    # Remove quotes, '*', and '+' and split the line into words and weights
    words_weights = line.replace('"', '').replace('*', '').replace('+', '').split()
    print(words_weights)
    
    # Print the words
    for i in range(0, len(words_weights)):  # Start from index 1, considering odd indices as words
        word = words_weights[i - 1]
        newlist.append(word)
        print(word)


['0.064everything', '0.063app', '0.046session', '0.037facebook', '0.034nothing']
0.034nothing
0.064everything
0.063app
0.046session
0.037facebook
['0.175button', '0.091update', '0.088phone', '0.082app', '0.038facebook']
0.038facebook
0.175button
0.091update
0.088phone
0.082app
['0.250ads', '0.073feed', '0.062news', '0.059content', '0.051ad']
0.051ad
0.250ads
0.073feed
0.062news
0.059content
['0.077people', '0.067facebook', '0.045share', '0.037app', '0.032features']
0.032features
0.077people
0.067facebook
0.045share
0.037app
['0.381app', '0.078time', '0.042updates', '0.029facebook', '0.020update']
0.020update
0.381app
0.078time
0.042updates
0.029facebook
['0.197facebook', '0.089page', '0.048issue', '0.032profile', '0.028please']
0.028please
0.197facebook
0.089page
0.048issue
0.032profile
['0.104freedom', '0.089application', '0.065speech', '0.047expression', '0.046star']
0.046star
0.104freedom
0.089application
0.065speech
0.047expression
['0.110comments', '0.107post', '0.061comment', '0.

In [56]:
import re

# Separate numeric values and words
words = []
numbers = []

for item in newlist:
    # Use regular expression to separate numeric and non-numeric parts
    match = re.match(r'([\d.]+)([a-zA-Z]+)', item)
    if match:
        numeric_part = match.group(1)
        words_part = match.group(2)
        
        # Convert numeric part to float
        numeric_value = float(numeric_part)
        numbers.append(numeric_value)
        words.append(words_part)
    else:
        words.append(item)

print("Words:", words)
print("Numbers:", numbers)


Words: ['nothing', 'everything', 'app', 'session', 'facebook', 'facebook', 'button', 'update', 'phone', 'app', 'ad', 'ads', 'feed', 'news', 'content', 'features', 'people', 'facebook', 'share', 'app', 'update', 'app', 'time', 'updates', 'facebook', 'please', 'facebook', 'page', 'issue', 'profile', 'star', 'freedom', 'application', 'speech', 'expression', 'keyboard', 'comments', 'post', 'comment', 'keeps', 'screen', 'video', 'version', 'photos', 'videos', 'standards', 'data', 'privacy', 'place', 'community', 'groups', 'posts', 'friends', 'pages', 'family', 'browser', 'videos', 'reels', 'video', 'option', 'auto', 'marketplace', 'bar', 'stories', 'song', 'picture', 'problem', 'music', 'story', 'facebook', 'accounts', 'facebook', 'problems', 'lot', 'apps', 'money', 'information', 'app', 'system', 'meta', 'days', 'notifications', 'times', 'day', 'app', 'phone', 'account', 'facebook', 'password', 'email', 'someone', 'messages', 'notification', 'message', 'messenger', 'events', 'fb', 'time', 

In [58]:
word_value_dict = {}

for word, value in zip(words, numbers):
    if word in word_value_dict:
        # If the word already exists in the dictionary, add the value
        word_value_dict[word] += value
    else:
        # If the word is not in the dictionary, create a new entry
        word_value_dict[word] = value

print(word_value_dict)

{'nothing': 0.034, 'everything': 0.064, 'app': 0.669, 'session': 0.046, 'facebook': 0.591, 'button': 0.175, 'update': 0.111, 'phone': 0.125, 'ad': 0.051, 'ads': 0.25, 'feed': 0.073, 'news': 0.062, 'content': 0.059, 'features': 0.032, 'people': 0.077, 'share': 0.045, 'time': 0.10300000000000001, 'updates': 0.042, 'please': 0.028, 'page': 0.089, 'issue': 0.048, 'profile': 0.032, 'star': 0.046, 'freedom': 0.104, 'application': 0.089, 'speech': 0.065, 'expression': 0.047, 'keyboard': 0.031, 'comments': 0.11, 'post': 0.107, 'comment': 0.061, 'keeps': 0.041, 'screen': 0.036, 'video': 0.219, 'version': 0.049, 'photos': 0.048, 'videos': 0.184, 'standards': 0.036, 'data': 0.078, 'privacy': 0.057, 'place': 0.051, 'community': 0.041, 'groups': 0.035, 'posts': 0.174, 'friends': 0.166, 'pages': 0.083, 'family': 0.047, 'browser': 0.037, 'reels': 0.089, 'option': 0.053, 'auto': 0.033, 'marketplace': 0.141, 'bar': 0.067, 'stories': 0.04, 'song': 0.037, 'picture': 0.039, 'problem': 0.109, 'music': 0.08

In [59]:
len(word_value_dict)

83

<h3>Generating dictionary of Topics/Keywords for Appstore Data (Similar to previous, calculated on a different file)<h3>

In [61]:
data2 = [
    ('0.072*"facebook" + 0.039*"people" + 0.029*"fact" + 0.025*"speech" + 0.024*"platform"'),
('0.050*"meta" + 0.048*"tiktok" + 0.040*"mark" + 0.040*"data" + 0.036*"money"'),
('0.036*"creators" + 0.030*"dating" + 0.026*"baby" + 0.026*"scams" + 0.018*"june"'),
('0.074*"business" + 0.057*"page" + 0.042*"facebook" + 0.038*"people" + 0.014*"pages"'),
('0.032*"sounds" + 0.026*"volume" + 0.023*"reel" + 0.020*"action" + 0.015*"trafficking"'),
('0.041*"people" + 0.032*"company" + 0.024*"facebook" + 0.020*"app" + 0.019*"place"'),
('0.078*"app" + 0.078*"facebook" + 0.026*"video" + 0.022*"time" + 0.018*"videos"'),
('0.126*"notifications" + 0.070*"group" + 0.053*"groups" + 0.038*"notification" + 0.032*"page"'),
('0.089*"app" + 0.037*"post" + 0.032*"facebook" + 0.031*"comments" + 0.024*"update"'),
('0.087*"facebook" + 0.086*"account" + 0.028*"phone" + 0.024*"app" + 0.021*"help"'),
('0.031*"space" + 0.016*"landscape" + 0.014*"camera" + 0.013*"night" + 0.013*"word"'),
('0.041*"president" + 0.039*"trump" + 0.026*"states" + 0.024*"people" + 0.022*"truth"'),
('0.097*"photos" + 0.074*"photo" + 0.034*"option" + 0.032*"app" + 0.023*"avatar"'),
('0.067*"reels" + 0.028*"lol" + 0.027*"cat" + 0.024*"notes" + 0.019*"watch"'),
('0.298*"fb" + 0.077*"ads" + 0.029*"ad" + 0.013*"friends" + 0.012*"people"'),
('0.087*"facebook" + 0.056*"account" + 0.031*"community" + 0.024*"standards" + 0.023*"years"'),
('0.072*"facebook" + 0.065*"posts" + 0.054*"people" + 0.050*"friends" + 0.038*"app"'),
('0.042*"people" + 0.024*"women" + 0.019*"children" + 0.015*"platform" + 0.013*"guys"'),
('0.047*"folks" + 0.032*"description" + 0.030*"conspiracy" + 0.023*"voice" + 0.019*"theories"'),
('0.190*"marketplace" + 0.055*"items" + 0.036*"item" + 0.024*"sale" + 0.021*"listings"'),
]
newlist2 = []

# Iterate through each line of data
for line in data2:
    # Remove quotes, '*', and '+' and split the line into words and weights
    words_weights = line.replace('"', '').replace('*', '').replace('+', '').split()
    
    # Print the words
    for i in range(0, len(words_weights)):  # Start from index 1, considering odd indices as words
        word = words_weights[i - 1]
        newlist2.append(word)

# Separate numeric values and words
words2 = []
numbers2 = []

for item in newlist2:
    # Use regular expression to separate numeric and non-numeric parts
    match = re.match(r'([\d.]+)([a-zA-Z]+)', item)
    if match:
        numeric_part = match.group(1)
        words_part = match.group(2)
        
        # Convert numeric part to float
        numeric_value = float(numeric_part)
        numbers2.append(numeric_value)
        words2.append(words_part)
    else:
        words2.append(item)

word_value_dict2 = {}

for word, value in zip(words2, numbers2):
    if word in word_value_dict2:
        # If the word already exists in the dictionary, add the value
        word_value_dict2[word] += value
    else:
        # If the word is not in the dictionary, create a new entry
        word_value_dict2[word] = value

print(word_value_dict2)


{'platform': 0.039, 'facebook': 0.49399999999999994, 'people': 0.25, 'fact': 0.029, 'speech': 0.025, 'money': 0.036, 'meta': 0.05, 'tiktok': 0.048, 'mark': 0.04, 'data': 0.04, 'june': 0.018, 'creators': 0.036, 'dating': 0.03, 'baby': 0.026, 'scams': 0.026, 'pages': 0.014, 'business': 0.074, 'page': 0.089, 'trafficking': 0.015, 'sounds': 0.032, 'volume': 0.026, 'reel': 0.023, 'action': 0.02, 'place': 0.019, 'company': 0.032, 'app': 0.28099999999999997, 'videos': 0.018, 'video': 0.026, 'time': 0.022, 'notifications': 0.126, 'group': 0.07, 'groups': 0.053, 'notification': 0.038, 'update': 0.024, 'post': 0.037, 'comments': 0.031, 'help': 0.021, 'account': 0.142, 'phone': 0.028, 'word': 0.013, 'space': 0.031, 'landscape': 0.016, 'camera': 0.014, 'night': 0.013, 'truth': 0.022, 'president': 0.041, 'trump': 0.039, 'states': 0.026, 'avatar': 0.023, 'photos': 0.097, 'photo': 0.074, 'option': 0.034, 'watch': 0.019, 'reels': 0.067, 'lol': 0.028, 'cat': 0.027, 'notes': 0.024, 'fb': 0.298, 'ads': 0

In [69]:
df1 = pd.read_csv('play_store_facebook_reviews_with_sentiment_full.csv')
df1.drop(columns=['Score'], inplace=True)
df1['Category'] = df1['Category'].replace('good', 1)
df1['Category'] = df1['Category'].replace('bad', -1)
df1 = df1[df1['Category'] != 'neutral']

df2 = pd.read_csv('app_store_facebook_reviews_with_sentiment_full.csv')
df2.drop(columns=['Score'], inplace=True)
df2['Category'] = df2['Category'].replace('good', 1)
df2['Category'] = df2['Category'].replace('bad', -1)
df2 = df2[df2['Category'] != 'neutral']
df2.head

<bound method NDFrame.head of                                                  Review Category
0     I have updated to the latest iPhone software, ...       -1
1     I can't believe you guys, it's annoying to not...       -1
2     Firstly, to get the big one out of the way. Se...       -1
3     Absolute garbage. I never used social media be...       -1
4     this app supposed to be a "Open opinion space ...       -1
...                                                 ...      ...
6015  Just last week I could edit my post on my phon...       -1
6016  My mom passed in May and I submitted 2 request...       -1
6017  This app is trashiest. Facebook has no rules t...       -1
6018  This app is trash!! Y’all are so sensitive 🤬 I...       -1
6019  Literally no excuses. This is a gigantic compa...       -1

[5821 rows x 2 columns]>

In [117]:
key1 = list(word_value_dict.keys())
print(len(key1))
key2 = list(word_value_dict2.keys())
print(len(key2))

83
78


In [82]:

# Initialize a dictionary to store labels
label_dict = {}

# Iterate over keys and find corresponding labels
for key in key1:
    # Find rows where the key is present in the 'Review' column
    matching_rows = df1[df1['Review'].str.contains(key, case=False)]
    
    # Get labels for matching rows
    labels = matching_rows['Category'].tolist()
    
    # Update the dictionary with the key and corresponding labels
    label_dict[key] = labels

# Display the label dictionary
print((label_dict['button']))


[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1,

In [88]:
sum1 = []
for key in key1:
    # Assuming label_dict is your dictionary
    total_sum = sum(label_dict[key])
    if(total_sum>=0):
        total_sum = 1
    else:
        total_sum = 0
    sum1.append(total_sum)
    total_sum = 0
print(sum1)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [157]:
print(label_dict['button'])

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1,

In [90]:

# Initialize a dictionary to store labels
label_dict2 = {}

# Iterate over keys and find corresponding labels
for key in key2:
    # Find rows where the key is present in the 'Review' column
    matching_rows = df2[df2['Review'].str.contains(key, case=False)]
    
    # Get labels for matching rows
    labels = matching_rows['Category'].tolist()
    
    # Update the dictionary with the key and corresponding labels
    label_dict2[key] = labels

# Display the label dictionary
sum2 = []
for key in key2:
    # Assuming label_dict is your dictionary
    total_sum = sum(label_dict2[key])
    if(total_sum>=0):
        total_sum = 1
    else:
        total_sum = 0
    sum2.append(total_sum)
    total_sum = 0
print(sum2)


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [104]:
dd = word_value_dict
dd2 = word_value_dict2

<h3>Appending Sentiments to dictionary of Topics/Keywords for Playstore Data (all 0 in this case)<h3>

In [105]:

# Specify the list of keys
keys_to_update = key1

# Specify the new value you want to append (a float in this case)
new_value = 0.0

# Update each key in the dictionary with the new value
for key in keys_to_update:
    current_value = dd.get(key)

    if isinstance(current_value, list):
        current_value.append(new_value)
    else:
        dd[key] = [current_value, new_value] if current_value is not None else [new_value]

print(dd)


{'nothing': [0.034, 0.0, 0.0], 'everything': [0.064, 0.0, 0.0], 'app': [0.669, 0.0, 0.0], 'session': [0.046, 0.0, 0.0], 'facebook': [0.591, 0.0, 0.0], 'button': [0.175, 0.0, 0.0], 'update': [0.111, 0.0, 0.0], 'phone': [0.125, 0.0, 0.0], 'ad': [0.051, 0.0, 0.0], 'ads': [0.25, 0.0, 0.0], 'feed': [0.073, 0.0, 0.0], 'news': [0.062, 0.0, 0.0], 'content': [0.059, 0.0, 0.0], 'features': [0.032, 0.0, 0.0], 'people': [0.077, 0.0, 0.0], 'share': [0.045, 0.0, 0.0], 'time': [0.10300000000000001, 0.0, 0.0], 'updates': [0.042, 0.0, 0.0], 'please': [0.028, 0.0, 0.0], 'page': [0.089, 0.0, 0.0], 'issue': [0.048, 0.0, 0.0], 'profile': [0.032, 0.0, 0.0], 'star': [0.046, 0.0, 0.0], 'freedom': [0.104, 0.0, 0.0], 'application': [0.089, 0.0, 0.0], 'speech': [0.065, 0.0, 0.0], 'expression': [0.047, 0.0, 0.0], 'keyboard': [0.031, 0.0, 0.0], 'comments': [0.11, 0.0, 0.0], 'post': [0.107, 0.0, 0.0], 'comment': [0.061, 0.0, 0.0], 'keeps': [0.041, 0.0, 0.0], 'screen': [0.036, 0.0, 0.0], 'video': [0.219, 0.0, 0.0], 

In [110]:
# List of keys to be deleted
keys_to_delete = ['k', 'e', 'y', '3']

# Delete each key from the dictionary
for key in keys_to_delete:
    if key in dd:
        del dd[key]
        print(f"The key '{key}' has been deleted.")
    else:
        print(f"The key '{key}' does not exist in the dictionary.")

# Print the updated dictionary
print(dd)

The key 'k' has been deleted.
The key 'e' has been deleted.
The key 'y' has been deleted.
The key '3' has been deleted.
{'nothing': [0.034, 0.0, 0.0], 'everything': [0.064, 0.0, 0.0], 'app': [0.669, 0.0, 0.0], 'session': [0.046, 0.0, 0.0], 'facebook': [0.591, 0.0, 0.0], 'button': [0.175, 0.0, 0.0], 'update': [0.111, 0.0, 0.0], 'phone': [0.125, 0.0, 0.0], 'ad': [0.051, 0.0, 0.0], 'ads': [0.25, 0.0, 0.0], 'feed': [0.073, 0.0, 0.0], 'news': [0.062, 0.0, 0.0], 'content': [0.059, 0.0, 0.0], 'features': [0.032, 0.0, 0.0], 'people': [0.077, 0.0, 0.0], 'share': [0.045, 0.0, 0.0], 'time': [0.10300000000000001, 0.0, 0.0], 'updates': [0.042, 0.0, 0.0], 'please': [0.028, 0.0, 0.0], 'page': [0.089, 0.0, 0.0], 'issue': [0.048, 0.0, 0.0], 'profile': [0.032, 0.0, 0.0], 'star': [0.046, 0.0, 0.0], 'freedom': [0.104, 0.0, 0.0], 'application': [0.089, 0.0, 0.0], 'speech': [0.065, 0.0, 0.0], 'expression': [0.047, 0.0, 0.0], 'keyboard': [0.031, 0.0, 0.0], 'comments': [0.11, 0.0, 0.0], 'post': [0.107, 0.0, 0

In [113]:
num_keys = len(dd)
num_keys

83

<h3>Appending Sentiments to dictionary of Topics/Keywords for Appstore Data (also all 0 in this case)<h3>

In [115]:
# Specify the list of keys
keys_to_update = key2

# Specify the new value you want to append (a float in this case)
new_value = 0.0

# Update each key in the dictionary with the new value
for key in keys_to_update:
    current_value = dd2.get(key)

    if isinstance(current_value, list):
        current_value.append(new_value)
    else:
        dd2[key] = [current_value, new_value] if current_value is not None else [new_value]

# List of keys to be deleted
keys_to_delete = ['k', 'e', 'y', '3']

# Delete each key from the dictionary
for key in keys_to_delete:
    if key in dd2:
        del dd2[key]
        print(f"The key '{key}' has been deleted.")
    else:
        print(f"The key '{key}' does not exist in the dictionary.")

# Print the updated dictionary
print(dd2)

The key 'k' does not exist in the dictionary.
The key 'e' does not exist in the dictionary.
The key 'y' does not exist in the dictionary.
The key '3' does not exist in the dictionary.
{'platform': [0.039, 0.0], 'facebook': [0.49399999999999994, 0.0], 'people': [0.25, 0.0], 'fact': [0.029, 0.0], 'speech': [0.025, 0.0], 'money': [0.036, 0.0], 'meta': [0.05, 0.0], 'tiktok': [0.048, 0.0], 'mark': [0.04, 0.0], 'data': [0.04, 0.0], 'june': [0.018, 0.0], 'creators': [0.036, 0.0], 'dating': [0.03, 0.0], 'baby': [0.026, 0.0], 'scams': [0.026, 0.0], 'pages': [0.014, 0.0], 'business': [0.074, 0.0], 'page': [0.089, 0.0], 'trafficking': [0.015, 0.0], 'sounds': [0.032, 0.0], 'volume': [0.026, 0.0], 'reel': [0.023, 0.0], 'action': [0.02, 0.0], 'place': [0.019, 0.0], 'company': [0.032, 0.0], 'app': [0.28099999999999997, 0.0], 'videos': [0.018, 0.0], 'video': [0.026, 0.0], 'time': [0.022, 0.0], 'notifications': [0.126, 0.0], 'group': [0.07, 0.0], 'groups': [0.053, 0.0], 'notification': [0.038, 0.0], 'u

In [116]:
num_keys = len(dd2)
num_keys

78

<h3>Finding the Relevant Keywords from the previous 2 dictionaries with Facebook App features<h3>

In [132]:
facebook_features = ['messenger', 'newsfeed', 'photos', 'song', 'video', 'back', 'button', 'gaming', 'marketplace', 'notification', 'menu', 'reels',
                     'story', 'cover', 'cover photo', 'profile', 'friend', 'event', 'timeline', 'groups', 'status', 'save', 'posts', 'like', 'comments',
                    'share', 'reacts', 'privacy', 'album', 'login', 'signup', 'register', 'pages', 'ad', 'advertisement', 'dating', 'memory', 'message'
                     ]
#  'stories', 'notifications', 'videos', 'community', 'updates', 'update'

In [151]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/mrahm/nltk_data...


True

In [155]:
# from fuzzywuzzy import fuzz

# # Threshold for considering a match
# threshold = 80  # You can adjust this based on your requirements

# # Function to find the closest match
# def find_closest_match(word, dictionary):
#     matches = [(key, fuzz.ratio(word, key)) for key in dictionary]
#     closest_match = max(matches, key=lambda x: x[1])
#     if closest_match[1] >= threshold:
#         return closest_match[0]
#     else:
#         return None

# # Perform word matching
# matched_words = {word: find_closest_match(word, dd) for word in facebook_features}

# matching1 = []
# not_matching1 = []
# # Print matched words
# for word, match in matched_words.items():
#     if match is not None:
#         #print(f"{word} matched with {match} in the dictionary.")
# c    else:
#         #print(f"No match found for {word} in the dictionary.")
#         not_matching1.append(word)

# print(f"Features of the Facebook app from Playstore that is most talked about:{matching1}")
# print('///////////////')
# print(f"Features of the Facebook app from Playstore that is not talked about:{not_matching1}")


from fuzzywuzzy import fuzz
from nltk.stem import WordNetLemmatizer

# Download the WordNet data for lemmatization


# Threshold for considering a match
threshold = 80  # You can adjust this based on your requirements

# Lemmatizer for reducing words to their base forms
lemmatizer = WordNetLemmatizer()

# Function to find the closest match
def find_closest_match(word, dictionary):
    word_base = lemmatizer.lemmatize(word.lower())  # Convert to lowercase and lemmatize
    matches = [(key, fuzz.ratio(word_base, lemmatizer.lemmatize(key.lower()))) for key in dictionary]
    closest_match = max(matches, key=lambda x: x[1])
    if closest_match[1] >= threshold:
        return closest_match[0]
    else:
        return None

# Perform word matching
matched_words = {word: find_closest_match(word, dd) for word in facebook_features}

matching1 = []
not_matching1 = []
# Print matched words
for word, match in matched_words.items():
    if match is not None:
        #print(f"{word} matched with {match} in the dictionary.")
        matching1.append(word)
    else:
        #print(f"No match found for {word} in the dictionary.")
        not_matching1.append(word)
print(f"Features of the Facebook app from Playstore that is most talked about:{matching1}")
print('///////////////')
print(f"Features of the Facebook app from Playstore that is not talked about:{not_matching1}")

Features of the Facebook app from Playstore that is most talked about:['messenger', 'photos', 'song', 'video', 'button', 'marketplace', 'notification', 'reels', 'story', 'profile', 'friend', 'event', 'timeline', 'groups', 'posts', 'comments', 'share', 'privacy', 'pages', 'ad', 'memory', 'message']
///////////////
Features of the Facebook app from Playstore that is not talked about:['newsfeed', 'back', 'gaming', 'menu', 'cover', 'cover photo', 'status', 'save', 'like', 'reacts', 'album', 'login', 'signup', 'register', 'advertisement', 'dating']


In [139]:
from fuzzywuzzy import fuzz

# Threshold for considering a match
threshold = 80  # You can adjust this based on your requirements

# Function to find the closest match
def find_closest_match(word, dictionary):
    matches = [(key, fuzz.ratio(word, key)) for key in dictionary]
    closest_match = max(matches, key=lambda x: x[1])
    if closest_match[1] >= threshold:
        return closest_match[0]
    else:
        return None

# Perform word matching
matched_words = {word: find_closest_match(word, dd2) for word in facebook_features}

matching2 = []
not_matching2 = []
# Print matched words
for word, match in matched_words.items():
    if match is not None:
        #print(f"{word} matched with {match} in the dictionary.")
        matching2.append(word)
    else:
        #print(f"No match found for {word} in the dictionary.")
        not_matching2.append(word)

print(f"Features of the Facebook app from Appstore that is most talked about:{matching2}")
print('///////////////')
print(f"Features of the Facebook app from Appstore that is not talked about:{not_matching2}")

Features of the Facebook app from Appstore that is most talked about:['photos', 'video', 'marketplace', 'notification', 'reels', 'friend', 'groups', 'status', 'posts', 'comments', 'pages', 'ad', 'dating']
///////////////
Features of the Facebook app from Appstore that is not talked about:['messenger', 'newsfeed', 'song', 'back', 'button', 'gaming', 'menu', 'story', 'cover', 'cover photo', 'profile', 'event', 'timeline', 'save', 'like', 'share', 'reacts', 'privacy', 'album', 'login', 'signup', 'register', 'advertisement', 'memory', 'message']


In [140]:
print("The following features are talked about in both of the apps: ")
for word in matching1:
    if word in matching2:
        print(word)

The following features are talked about in both of the apps: 
photos
video
marketplace
notification
reels
friend
groups
posts
comments
pages
ad


In [143]:
from fuzzywuzzy import fuzz

# Threshold for considering a match
threshold = 80  # You can adjust this based on your requirements

# Function to find the closest match for each word in dictionary1 in dictionary2
def find_closest_matches(dict1, dict2):
    matches = {}
    for word1 in dict1:
        closest_match = max([(word2, fuzz.ratio(word1, word2)) for word2 in dict2], key=lambda x: x[1])
        if closest_match[1] >= threshold:
            matches[word1] = closest_match[0]
        else:
            matches[word1] = None
    return matches

# Perform dictionary matching
matched_words = find_closest_matches(dd, dd2)

# Print matched words
for word1, match in matched_words.items():
    if match is not None and match not in matching1 and match not in matching2:
        print(f"Some of the common non-feature matches: {word1}")


Some of the common non-feature matches: app
Some of the common non-feature matches: facebook
Some of the common non-feature matches: update
Some of the common non-feature matches: phone
Some of the common non-feature matches: ads
Some of the common non-feature matches: people
Some of the common non-feature matches: time
Some of the common non-feature matches: updates
Some of the common non-feature matches: page
Some of the common non-feature matches: speech
Some of the common non-feature matches: post
Some of the common non-feature matches: videos
Some of the common non-feature matches: standards
Some of the common non-feature matches: data
Some of the common non-feature matches: place
Some of the common non-feature matches: community
Some of the common non-feature matches: friends
Some of the common non-feature matches: option
Some of the common non-feature matches: stories
Some of the common non-feature matches: accounts
Some of the common non-feature matches: apps
Some of the common