In [None]:
import warnings
from sklearn.exceptions import DataConversionWarning
# Silence scikit-learn's DataConversionWarning for the rest of the session.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used
# further down are all located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK resources used below: the default stop-word lists and the
# WordNet corpus that the lemmatizer relies on.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Read the review CSV from the mounted Drive into a DataFrame.
# (Plain string literal: the path has no placeholders, so no f-prefix is needed.)
path = '/content/drive/MyDrive/Colab Notebooks/output/fkss_pos.csv'
df = pd.read_csv(path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,label,score,rating,review,comment,text
0,positive,0.980728,5,Perfect product!,Good product 👍👍👍👍👍 thanks FlipkartREAD MORE,Good product 👍👍👍👍👍 thanks Flipkart
1,positive,0.983053,5,Awesome,Very nice product go fot it ❤️READ MORE,Very nice product go fot it ❤️
2,positive,0.970506,5,Excellent,What a beautiful sunscreen if sweating too muc...,What a beautiful sunscreen if sweating too muc...
3,positive,0.971611,5,Fabulous!,Good 😊👍🏻READ MORE,Good 😊👍🏻
4,positive,0.879352,5,Fabulous!,Osm flipkart i love ponds all producksREAD MORE,Osm flipkart i love ponds all producks


In [None]:
# Make sure every review is a string. Missing values are replaced by '' first,
# because astype(str) alone would turn them into the literal token 'nan',
# which would then be counted as a word in the corpus.
df["text"] = df["text"].fillna('').astype(str)

# Stop words: NLTK's English list plus a custom set of additional words.
# NOTE(review): filtering runs after lemmatisation, so entries must be in
# lemmatised form ('wa' is presumably the lemmatizer's output for "was").
stop_words = stopwords.words('english')
stop_words.extend([
    'sunscreen', 'video', 'please', 'thank', 'thanks', 'make', 'pls', 'bro',
    'mam', 'review', 'hard', 'hi', 'dr', 'ur', 'waiting', 'wa', 'effort',
    'job', 'princess', 'jomol', 'sent', 'di', 'sir', 'kar', 'sakte', 'kr',
    'skte', 'plz', 'magic', 'best',
])

In [None]:
# Holds the cleaned review strings produced by the preprocessing cell below.
corpus = []

In [None]:
# Clean every review: replace non-letters with spaces, lower-case, split into
# words, lemmatise each word, then drop stop words and re-join.

# Built once, outside the loop: the lemmatizer and a set of the stop words
# (set membership is O(1); the list was previously converted per token).
lemmatizer = WordNetLemmatizer()
stop_word_set = set(stop_words)

# Rebuild the list from scratch so that re-running this cell does not append a
# second copy of every review to an already-filled `corpus`.
corpus = []

# Iterate over the column values directly: df['text'][i] is a label lookup and
# fails when the index is not exactly 0..n-1 (e.g. after filtering rows).
for text in df['text']:
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    corpus.append(' '.join(lemma for lemma in lemmas if lemma not in stop_word_set))

In [None]:
# Fit a TF-IDF vectorizer restricted to unigrams (single words) on the cleaned
# corpus; fit_transform returns a sparse matrix with one row per review.
cv = TfidfVectorizer(ngram_range=(1,1))
corpus1 = cv.fit_transform(corpus)

In [None]:
# Average the TF-IDF scores over all reviews (axis=0), giving a single row
# with one mean score per word.
avg = corpus1.mean(axis=0)

In [None]:
# Wrap the row of mean scores in a DataFrame whose columns are labelled with
# the vectorizer's feature names (the words), to make it easier to read.
avg = pd.DataFrame(avg, columns=cv.get_feature_names_out())

In [None]:
# Transpose so that each word becomes a row (index = word) instead of a column.
avg = avg.T

In [None]:
# Name the single score column (labelled 0 after the transpose) 'score', and
# copy the index (the words) into a regular 'word' column.
avg = avg.rename(columns={0:'score'})
avg['word'] = avg.index

In [None]:
# Rank the words by mean TF-IDF score, highest first, keep only the top 50,
# and display the result.
avg = avg.sort_values('score', ascending=False).head(50)
avg

Unnamed: 0,score,word
good,0.278753,good
product,0.184992,product
nice,0.179029,nice
super,0.030192,super
like,0.027756,like
skin,0.027446,skin
love,0.025213,love
flipkart,0.023154,flipkart
go,0.020995,go
awesome,0.019393,awesome


In [None]:
# Same ranking as for the unigrams, this time over bigrams: fit TF-IDF on word
# pairs, average each bigram's score over all reviews, keep the 50 highest.
cv_bigram = TfidfVectorizer(ngram_range=(2, 2))
corpus1_bigram = cv_bigram.fit_transform(corpus)
avg_bigram = (
    pd.DataFrame(
        corpus1_bigram.mean(axis=0),
        columns=cv_bigram.get_feature_names_out(),
    )
    .T                                       # one row per bigram
    .rename(columns={0: 'score'})
    .assign(word=lambda frame: frame.index)  # bigram text as a regular column
    .sort_values('score', ascending=False)
    .head(50)
)

In [None]:
# Display the top bigrams ranked by mean TF-IDF score.
avg_bigram

Unnamed: 0,score,word
good product,0.129212,good product
nice product,0.063079,nice product
ok good,0.01178,ok good
product love,0.0107,product love
amazing product,0.010559,amazing product
thanku flipkart,0.009206,thanku flipkart
product good,0.009206,product good
awesome product,0.008156,awesome product
good oily,0.007889,good oily
good skin,0.006704,good skin


In [None]:
# Same ranking once more, over trigrams: fit TF-IDF on word triples, average
# each trigram's score over all reviews, keep the 50 highest.
cv_trigram = TfidfVectorizer(ngram_range=(3, 3))
corpus1_trigram = cv_trigram.fit_transform(corpus)
avg_trigram = (
    pd.DataFrame(
        corpus1_trigram.mean(axis=0),
        columns=cv_trigram.get_feature_names_out(),
    )
    .T                                       # one row per trigram
    .rename(columns={0: 'score'})
    .assign(word=lambda frame: frame.index)  # trigram text as a regular column
    .sort_values('score', ascending=False)
    .head(50)
)

In [None]:
# Display the top trigrams ranked by mean TF-IDF score.
avg_trigram

Unnamed: 0,score,word
much really love,0.006289,much really love
like product super,0.006289,like product super
unmatchable go pond,0.006289,unmatchable go pond
good skin dry,0.006289,good skin dry
good product love,0.006289,good product love
good product flipkart,0.006289,good product flipkart
nice good working,0.006289,nice good working
ok good product,0.006289,ok good product
good oily skin,0.006289,good oily skin
nice product love,0.006289,nice product love


In [None]:
# Pull the ranked n-gram strings out of each frame as plain Python lists
# (order follows the score ranking of the frames).
unigrams_list, bigrams_list, trigrams_list = (
    frame['word'].to_list() for frame in (avg, avg_bigram, avg_trigram)
)

In [None]:
# Group unigrams and bigrams: every top unigram becomes a topic, and up to
# five of the top bigrams that contain that word become its subtopics.
def convert(lst):
    """Split every n-gram string in ``lst`` into a list of its words."""
    return [item.split() for item in lst]

bigrams_split = convert(bigrams_list)

# Collect (topic, subtopic) pairs in a plain list and build the frame once;
# concatenating a one-row frame per match inside the loop re-copies the whole
# frame on every iteration.
records = []
for topic in unigrams_list:
    # Bigrams are visited in ranked order, so the first five matches are the
    # five highest-scoring bigrams containing the topic word.
    matching = [words for words in bigrams_split if topic in words]
    for words in matching[:5]:
        records.append({'topic': topic, 'subtopic': ' '.join(words)})

# Explicit columns keep the frame well-formed even when nothing matched.
check = pd.DataFrame(records, columns=['topic', 'subtopic'])

# One row per topic (in order of first appearance), subtopics comma-joined.
one_2 = check.groupby(['topic'], as_index=False, sort=False).agg({'subtopic': ', '.join})

In [None]:
# Display the unigram topics with their comma-joined bigram subtopics.
one_2

Unnamed: 0,topic,subtopic
0,good,"good product, ok good, product good, good oily..."
1,product,"good product, nice product, product love, amaz..."
2,nice,"nice product, super nice, nice nice, nice one,..."
3,super,"super nice, super hai, quality super, product ..."
4,like,"like dis, like product"
5,skin,"good skin, oily skin, skin look, skin dry, typ..."
6,love,"product love, favourite love, really love"
7,flipkart,"thanku flipkart, product flipkart, productthan..."
8,go,"mean go, product go, blowing go, go pond, unma..."
9,awesome,"awesome product, awesome purchasing"


In [None]:
# Group unigrams and trigrams: every top unigram becomes a topic, and up to
# five of the top trigrams that contain that word become its subtopics.
# The trigram strings are split inline, so this cell no longer re-defines a
# second, identical copy of convert().
trigrams_split = [trigram.split() for trigram in trigrams_list]

# Collect (topic, subtopic) pairs in a plain list and build the frame once;
# concatenating a one-row frame per match inside the loop re-copies the whole
# frame on every iteration.
records = []
for topic in unigrams_list:
    # Trigrams are visited in ranked order, so the first five matches are the
    # five highest-scoring trigrams containing the topic word.
    matching = [words for words in trigrams_split if topic in words]
    for words in matching[:5]:
        records.append({'topic': topic, 'subtopic': ' '.join(words)})

# Explicit columns keep the frame well-formed even when nothing matched.
check = pd.DataFrame(records, columns=['topic', 'subtopic'])

# One row per topic (in order of first appearance), subtopics comma-joined.
one_3 = check.groupby(['topic'], as_index=False, sort=False).agg({'subtopic': ', '.join})

In [None]:
# Display the unigram topics with their comma-joined trigram subtopics.
one_3

Unnamed: 0,topic,subtopic
0,good,"good skin dry, good product love, good product..."
1,product,"like product super, good product love, good pr..."
2,nice,"nice good working, nice product love, nice pro..."
3,super,like product super
4,like,"like product super, amazing really like"
5,skin,"good skin dry, good oily skin, boost skin mois..."
6,love,"much really love, good product love, nice prod..."
7,flipkart,"good product flipkart, nice productthanks flip..."
8,go,"unmatchable go pond, mind blowing go, product ..."
9,much,"much really love, much love sun, good much fli..."


In [None]:
# Save the unigram -> bigram grouping to Drive (row index omitted).
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
path = '/content/drive/MyDrive/Colab Notebooks/output/keywords/positive/fkss_pos_uni_bi.csv'
one_2.to_csv(path, index=False)

In [None]:
# Save the unigram -> trigram grouping to Drive (row index omitted).
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
path = '/content/drive/MyDrive/Colab Notebooks/output/keywords/positive/fkss_pos_uni_tri.csv'
one_3.to_csv(path, index=False)

In [None]:
# Save the top unigrams with their scores to Drive. The index is omitted
# because the words are already stored in the 'word' column.
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
path = '/content/drive/MyDrive/Colab Notebooks/output/keywords/positive/fkss_pos_single.csv'
avg.to_csv(path, index=False)