In [None]:
import os
import re

def env_vars():
    """Parse the local ``.env`` file into a dict of ``{name: value}`` strings.

    Each useful line is expected to look like ``NAME=value``. Double quotes
    and the trailing newline are removed from every line (as before). Blank
    lines, ``#`` comment lines and lines without any ``=`` are skipped instead
    of raising ``IndexError``. Only the first ``=`` separates the name from
    the value, so values that themselves contain ``=`` are kept intact.

    Returns:
        dict: variable names mapped to their string values; a later
        duplicate name overwrites an earlier one.

    Raises:
        OSError: if ``.env`` cannot be opened (e.g. ``FileNotFoundError``).
    """
    env_object = {}
    with open('.env', 'r') as f:
        for raw_line in f:
            # Same clean-up as before: drop double quotes and the newline.
            line = re.sub('"|\n', '', raw_line).strip()
            if not line or line.startswith('#'):
                continue
            # partition() splits on the FIRST '=' only, so 'KEY=abc==' keeps
            # its full value instead of being truncated to 'abc'.
            key, separator, value = line.partition('=')
            if not separator:
                # No '=' on the line: not an assignment, nothing to record.
                continue
            env_object[key.strip()] = value.strip()
    return env_object

# Read the .env file once.
ENV_VARS = env_vars()

# Resolve each secret with a single rule: a real environment variable wins,
# otherwise the value from .env is used, otherwise ''. Previously the .env
# values for OPEN_AI_KEY and HUGGINGFACEHUB_API_TOKEN were read and then
# immediately overwritten by os.getenv(..., default=''), so they were lost
# whenever the variable was not already exported in the shell.
OPEN_AI_KEY = os.getenv('OPEN_AI_KEY') or ENV_VARS.get('OPEN_AI_KEY', '')
RAPID_API_KEY = os.getenv('RAPID_API_KEY') or ENV_VARS.get('RAPID_API_KEY', '')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') or ENV_VARS.get('HUGGINGFACEHUB_API_TOKEN', '')

# Publish the resolved values to the process environment.
# NOTE(review): presumably these are the variable names the OpenAI and
# Hugging Face clients used further down look up - confirm.
os.environ['OPENAI_API_KEY'] = OPEN_AI_KEY
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN


In [None]:

#####Overview

#~Get Reviews

#~Process Reviews

#~Lemmatize reviews and remove stopwords

#~Find most common words with spacy ----try tfidf vectorizer and use top 5% of the words

#~Find phrases around most common words ----graph theory 

#~Sample phrases so each common keyword is represented adequately

#~Use GPT to generate labels

#~Assign labels with BERT / hugging face model zero shot learning classification

#~Assign subtopics

In [None]:
#~Get Reviews
#~Process Reviews
import pandas as pd
from datetime import datetime
# Read the reviews file into a DataFrame; its columns are separated by '|'.
reviews_df = pd.read_csv(
    'boxing_glove_reviews.csv',
    sep='|',
)

In [None]:
#~Lemmatize reviews and remove stopwords
import spacy
import re
 
# Load the spaCy "en_core_web_md" pipeline once; the cells below reuse it for
# lemmatisation and as the default `nlp` of find_types_of_words.
spacy_nlp = spacy.load("en_core_web_md")

def remove_stops(doc):
    """Return the ``text`` of every token in ``doc`` that is not a stop word.

    ``doc`` is any iterable of tokens exposing ``text`` and ``is_stop``.
    """
    kept_texts = []
    for tok in doc:
        if tok.is_stop:
            continue
        kept_texts.append(tok.text)
    return kept_texts

def lemmatize(doc):
    """Return the ``lemma_`` of every token in ``doc`` that is not a stop word.

    ``doc`` is any iterable of tokens exposing ``lemma_`` and ``is_stop``.
    """
    lemmas = []
    for tok in doc:
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

# titles = reviews_df['TITLE'].astype(str).to_list()
reviews = reviews_df['REVIEW'].astype(str).to_list()

# For each review: remove every character that is neither a word character
# nor whitespace, run spaCy, keep the lemmas of the non-stop tokens, and glue
# them back together with single spaces.
lemmatized_reviews = [
    ' '.join(lemmatize(spacy_nlp(re.sub(r'[^\w\s]', '', str(raw_review)))))
    for raw_review in reviews
]

In [None]:
#~Find most common words with spacy
from collections import Counter

# 'NOUN', 'ADJ'
def find_types_of_words(text, word_type_list, nlp=spacy_nlp):
    """List the distinct lower-cased words of ``text`` with a wanted POS tag.

    ``text`` is parsed with ``nlp``; a token is kept when its ``pos_`` is in
    ``word_type_list``. The result is de-duplicated, sorted alphabetically,
    and the word 'have' is always left out.
    """
    ignore_words = ['have']
    distinct_words = {tok.text.lower() for tok in nlp(text) if tok.pos_ in word_type_list}
    return [word for word in sorted(distinct_words) if word not in ignore_words]

def most_common_words(text_list, word_type_list, top_n=500):
    """Rank words of the requested POS types by how many texts contain them.

    ``find_types_of_words`` yields each matching word at most once per text,
    so the counts are per-text frequencies rather than raw occurrence counts.

    Args:
        text_list: iterable of text strings to analyse.
        word_type_list: POS tags to keep, e.g. ``['NOUN', 'ADJ', 'VERB']``.
        top_n: how many of the most frequent words to return. Defaults to
            500, the value that used to be hard-coded; ``None`` returns all.

    Returns:
        list of ``(word, count)`` tuples, most frequent first.
    """
    # Update the Counter as we go instead of building one large intermediate
    # list; the resulting counts and tie order are the same.
    word_counts = Counter()
    for text in text_list:
        word_counts.update(find_types_of_words(text, word_type_list))
    return word_counts.most_common(top_n)

# title_top_nouns_adjs_verbs = most_common_words(titles, ['NOUN', 'ADJ', 'VERB'])
# Rank the nouns, adjectives and verbs found in the lemmatised reviews.
review_top_nouns_adjs_verbs = most_common_words(
    lemmatized_reviews,
    ['NOUN', 'ADJ', 'VERB'],
)

In [None]:
#~Sample reviews so each common keyword is represented adequately
import re

# Use the 20 highest-ranked words as the keywords to build phrases around.
review_top_nouns_adjs_verbs_top_twenty = review_top_nouns_adjs_verbs[0:20]
review_top_nouns_adjs_verbs_vals = [x[0] for x in review_top_nouns_adjs_verbs_top_twenty]
# Alternation of the keywords. Each one is escaped so that a token containing
# a regex metacharacter is matched literally instead of corrupting the pattern.
review_top_nouns_adjs_verbs_regex = '(?:' + '|'.join(re.escape(x) for x in review_top_nouns_adjs_verbs_vals) + ')'
# Phrase = up to 50 letters/digits/spaces, a space, a keyword, up to 50 more
# letters/digits/spaces, then one closing space or period.
phrase_around_top_nouns_adjs_verbs_regex = '[ 0-9a-zA-Z]{1,50} ' + review_top_nouns_adjs_verbs_regex + '[ 0-9a-zA-Z]{1,50}[ \\.]{1}'

def phrase_around_top_words(review, regex):
    """Return every non-overlapping match of ``regex`` found in ``review``.

    Args:
        review: the text to search.
        regex: the pattern to look for.

    Returns:
        list: the matches as produced by ``re.findall``. An empty list is
        returned when ``review`` is not a string or ``regex`` is not a valid
        pattern. Earlier versions returned ``['']`` in that case, which
        injected an empty phrase that callers then had to special-case.
    """
    try:
        return re.findall(regex, review)
    except (TypeError, re.error):
        # Best-effort by design, but only for the two expected failures;
        # anything else (e.g. KeyboardInterrupt) now propagates.
        return []

# review_top_nouns_adjs_verbs
starttime = datetime.now()

# Collect every keyword-centred phrase from the first 1500 lemmatised reviews.
_phrases_around_keywords = []
for x in lemmatized_reviews[0:1500]:
    _phrases_around_keywords.extend(phrase_around_top_words(x, phrase_around_top_nouns_adjs_verbs_regex))

# Tag each of the first 1500 phrases with the first keyword found inside it.
phrases_around_keywords = []
for x in _phrases_around_keywords[0:1500]:
    keyword_match = re.search(review_top_nouns_adjs_verbs_regex, x)
    if keyword_match is None:
        # A phrase without any keyword hit (e.g. an empty placeholder string)
        # used to raise IndexError on findall(...)[0]; skip it instead.
        continue
    phrases_around_keywords.append({'keyword': keyword_match.group(0), 'phrase': x})

# Explicit columns keep the groupby below working even when nothing matched.
phrases_around_keywords_df = pd.DataFrame(phrases_around_keywords, columns=['keyword', 'phrase'])

print('Total time to finish:\t', str(datetime.now() - starttime))
# phrases_around_keywords_df

# Keep at most 8 phrases per keyword, then number them "1) ...", "2) ...".
_sampled_phrases = phrases_around_keywords_df.groupby('keyword').head(8)['phrase'].to_list()
sampled_phrases = [str(counter) + ') ' + x for counter, x in enumerate(_sampled_phrases, start=1)]

# Cell output: total number of characters in the numbered phrases.
sum(len(x) for x in sampled_phrases)
# for x in sampled_phrases[0:20]:
#     print(x)
#     print('')


In [None]:
#~Use GPT to generate labels

from langchain.llms import OpenAI
from langchain import PromptTemplate, LLMChain

# Prompt sent to the LLM; {reviews} is replaced by the numbered sample phrases.
# NOTE(review): the wording ("mostly frequently") is kept verbatim so results
# stay comparable with earlier runs.
create_topics_template = """
    Question: Please create 10 features that describe mostly frequently mentioned qualities of the product from the reviews below. Please output the categories as a numbered list separated by newline characters Reviews: {reviews}
    
    Answer: Here are 10 features:
"""
# NOTE(review): text-davinci-003 is a legacy completion model - confirm it is
# still available to this account before re-running.
openai_llm = OpenAI(verbose=True, temperature=.1, model_name="text-davinci-003")
simple_prompt = PromptTemplate(input_variables=["reviews"], template=create_topics_template)
# openai_llm = ChatOpenAI(verbose=True, temperature=.1, model_name="gpt-3.5-turbo")
chain = LLMChain(llm=openai_llm, prompt=simple_prompt)
_features = chain.run(' '.join(sampled_phrases))

# Turn the numbered list into clean labels, one per non-empty line.
# - The "N. " prefix is only removed at the start of a line, so digits inside
#   a label are left alone.
# - Blank lines (a completion usually starts with one) are dropped; they used
#   to end up as empty-string candidate labels for the classifier.
features = []
for line in _features.split('\n'):
    label = re.sub(r'^\s*[0-9]{1,2}\.\s*', '', line).strip()
    if label:
        features.append(label)


In [None]:
#~Assign labels with zero-shot classification (Hugging Face DistilBART-MNLI model)
from transformers import pipeline

# Alternative models that were tried (kept for reference):
# classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# classifier = pipeline("zero-shot-classification", model='cross-encoder/nli-deberta-base')
# Zero-shot classification pipeline; the cells below call it with `features`
# as the candidate labels.
classifier = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-1")

# Spot check: show one raw review (index 30). Raises IndexError when fewer
# than 31 reviews were loaded.
print(reviews[30])

In [None]:
# Time the classifier on the first 20 raw reviews: for each one print how long
# the call took and the first label it returned, then print the overall time.
total_start_time = datetime.now()
for rev in reviews[:20]:
    incremental_start_time = datetime.now()
    prediction = classifier(rev, candidate_labels=features)
    x = prediction['labels'][0]
    incremental_elapsed = datetime.now() - incremental_start_time
    print('time for increment:\t', str(incremental_elapsed))
    print(x)

total_elapsed = datetime.now() - total_start_time
print('time for total:\t', str(total_elapsed))

In [None]:
# total_start_time = datetime.now()
# x_revs = reviews[0:10]
# x = classifier(x_revs, candidate_labels=features[0:10])#['labels'][0]

# print('time for total:\t', str(datetime.now() - total_start_time))

# Print each generated feature label on its own line for a visual check.
for feature in features:
    print(feature)

In [None]:
# Work on an explicit copy of the first 500 rows. head() hands back a slice of
# reviews_df, and adding a column to that slice triggers pandas'
# SettingWithCopyWarning; the copy makes the new column unambiguous.
sample_reviews_df = reviews_df.head(500).copy()
# TOPIC = first label the zero-shot classifier returns for the review text.
# Only the REVIEW column is needed, so map over it instead of building a row
# object per record with apply(axis=1).
sample_reviews_df['TOPIC'] = sample_reviews_df['REVIEW'].map(
    lambda review: classifier(str(review), candidate_labels=features)['labels'][0]
)

In [None]:
# Take the first sampled review and its assigned topic as a hand-check input
# for subtopic_labler below. Raises IndexError if the sample frame is empty.
sample_review = sample_reviews_df['REVIEW'].to_list()[0]
sample_topic = sample_reviews_df['TOPIC'].to_list()[0]

def subtopic_labler(review, topic):
    """Extract the first snippet of ``review`` that leads into ``topic``.

    Both arguments are coerced to ``str`` and compared in lower case. The
    snippet consists of: a space, 1-20 letters/digits/spaces, a space, the
    topic text itself, and up to 20 following characters that are neither a
    space nor a period.

    The topic is matched literally (``re.escape``). Labels containing regex
    metacharacters such as ``(``, ``+`` or ``?`` previously either failed to
    compile (silently yielding ``''``) or matched something other than the
    label text.

    Args:
        review: the review text (any object; ``str()`` is applied).
        topic: the topic label to look for (any object; ``str()`` is applied).

    Returns:
        str: the matched lower-case snippet, including its leading space, or
        ``''`` when the topic does not occur in that shape.
    """
    review = str(review)
    topic = str(topic)

    # Raw strings avoid the invalid '\.' escape; inside a character class a
    # bare '.' is already literal, so '[^ .]' is equivalent to the old class.
    subtopic_regex = r' [ 0-9a-zA-Z]{1,20} ' + re.escape(topic.lower()) + r'[^ .]{0,20}'
    match = re.search(subtopic_regex, review.lower())
    return match.group(0) if match else ''

# Hand-check on the single sample picked above (the result is not stored).
subtopic_labler(sample_review, sample_topic)


def _row_subtopic(row):
    """Run subtopic_labler on one row's REVIEW / TOPIC pair."""
    return subtopic_labler(row['REVIEW'], row['TOPIC'])


sample_reviews_df['SUBTOPIC'] = sample_reviews_df.apply(_row_subtopic, axis=1)

In [None]:
# test = sample_reviews_df['SUBTOPIC'].to_list()
# test = [x for x in test if x!='']
# for x in test:
#     print(x)

# Keep rows that received a non-empty subtopic, take up to three of them per
# topic, and show only the TOPIC / SUBTOPIC columns ordered by topic.
subtopic_rnd = (
    sample_reviews_df
    .loc[lambda frame: frame['SUBTOPIC'] != '']
    .groupby('TOPIC')
    .head(3)[['TOPIC', 'SUBTOPIC']]
)
subtopic_rnd.sort_values(by='TOPIC')

In [None]:
# Alphabetical, comma-separated string of the distinct top-ranked words.
# Built in one expression so `word_list` is only ever a string, not a list
# that is later rebound to a string.
word_list = ', '.join(sorted({x[0] for x in review_top_nouns_adjs_verbs}))

# Prompt asking the model to sort the words into who / when / where / what.
# The category label for actions now reads "what" so that it agrees with the
# JSON keys requested in the same prompt (it used to say "activities" while
# asking for a "what" key).
prompt = """
    I am going to give you a list of words. Please tell me which refer to people ("who"), which refer to dates or times ("when"), which refer to places ("where"), and which describe actions or activities ("what"). Please return your response as a JSON object with who, when, where, and what as keys, and the results as lists for values. If a word does not fit into one of those categories, you can exclude it from the response. Please exclude adjectives from your response. Words: 
"""