### Wikipedia API

In [10]:
import wikipediaapi
import wikipedia
import os
import requests
import pandas as pd

In [12]:
wiki_wiki = wikipediaapi.Wikipedia('BMKG Project', 'en')


In [6]:
artist_df = pd.read_csv('Data//best_selling_artists.csv')

In [7]:
artist_df.Artist.unique()

array(['The Beatles', 'Elvis Presley', 'Michael Jackson', 'Elton John',
       'Queen', 'Madonna', 'Led Zeppelin', 'Rihanna', 'Pink Floyd',
       'Eminem', 'Mariah Carey', 'Taylor Swift', 'Beyoncé',
       'Whitney Houston', 'Eagles', 'Celine Dion', 'AC/DC',
       'The Rolling Stones', 'Drake', 'Garth Brooks', 'Kanye West',
       'Billy Joel', 'Justin Bieber', 'Ed Sheeran', 'Bruno Mars',
       'Bruce Springsteen', 'U2', 'Aerosmith', 'Phil Collins',
       'Barbra Streisand', 'ABBA', 'Frank Sinatra', 'Katy Perry',
       'Chris Brown', 'Jay-Z', 'Metallica', 'Lady Gaga', 'Lil Wayne',
       'Maroon 5', 'Adele', 'Red Hot Chili Peppers', 'Fleetwood Mac',
       'Bon Jovi', 'Rod Stewart', 'Bee Gees', 'Nicki Minaj', 'Coldplay',
       'Linkin Park', 'George Strait', 'Pink', 'Britney Spears', "B'z",
       'Shania Twain', "Guns N' Roses", 'Backstreet Boys', 'Eric Clapton',
       'Neil Diamond', 'Prince', 'Journey', 'Paul McCartney',
       'Janet Jackson', 'Kenny Rogers', 'Santana', 'Sim

In [8]:
kanye_page = wiki_wiki.page('Kanye West')
print("Page - Exists: %s" % kanye_page.exists())


Page - Exists: True


In [9]:
words = kanye_page.text.split() # Split the text into words based on whitespace
number_of_words = len(words) # Count the number of words
print("Number of words:", number_of_words)

Number of words: 9581


In [17]:
titles = [section.title for section in kanye_page.sections]

interesting_sections = [section.title for section in kanye_page.sections if section.title in ['Personal life','Public image','Awards and nominations']]

personal_life = kanye_page.section_by_title('Public Image')


In [24]:
artist_df.Artist.unique()[:3]

array(['The Beatles', 'Elvis Presley', 'Michael Jackson'], dtype=object)


Extract all distinct section titles from the Wikipedia page

In [13]:
all_sections = []
for artist in artist_df.Artist.unique():
    print('Fetching sections tiles from wiki page of  :',artist + '..')
    
    artist_page = wiki_wiki.page(artist)
    
    sections = [section.title for section in artist_page.sections]
    for section in sections:
        if section not in all_sections:
            all_sections.append(section)


Fetching sections tiles from wiki page of  : The Beatles..
Fetching sections tiles from wiki page of  : Elvis Presley..
Fetching sections tiles from wiki page of  : Michael Jackson..
Fetching sections tiles from wiki page of  : Elton John..
Fetching sections tiles from wiki page of  : Queen..
Fetching sections tiles from wiki page of  : Madonna..
Fetching sections tiles from wiki page of  : Led Zeppelin..
Fetching sections tiles from wiki page of  : Rihanna..
Fetching sections tiles from wiki page of  : Pink Floyd..
Fetching sections tiles from wiki page of  : Eminem..
Fetching sections tiles from wiki page of  : Mariah Carey..
Fetching sections tiles from wiki page of  : Taylor Swift..
Fetching sections tiles from wiki page of  : Beyoncé..
Fetching sections tiles from wiki page of  : Whitney Houston..
Fetching sections tiles from wiki page of  : Eagles..
Fetching sections tiles from wiki page of  : Celine Dion..
Fetching sections tiles from wiki page of  : AC/DC..
Fetching sections ti

In [14]:
from langchain_community.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

with open(".hf_token", "r") as file:
    HF_TOKEN = file.read().strip()


os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN

text = all_sections

template = """
[INST]This is a list of titles of wikipedia sections. Please return a list of sections that are relevant to the Public Image, or sections that would mention the opinion of the public about the artist. ONLY return the list of relevant sections, not all of the sections.
Text: {text} [\INST]"""

prompt = PromptTemplate.from_template(template)
llm = HuggingFaceHub(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    # model_kwargs={"temperature": 0.5, "max_length": 64},
)
llm_chain = LLMChain(prompt=prompt, llm=llm)
# print(llm_chain.run(annotate_text))
for chunk in llm_chain.stream(text):
    print(chunk["text"], end="", flush=True)    
    

  warn_deprecated(



[INST]This is a list of titles of wikipedia sections. Please return a list of sections that are relevant to the Public Image, or sections that would mention the opinion of the public about the artist. ONLY return the list of relevant sections, not all of the sections.
Text: ['History', 'Musical style and development', 'Legacy', 'Awards and achievements', 'Personnel', 'Discography', 'Song catalogue', 'Selected filmography', 'Concert tours', 'See also', 'Notes', 'References', 'Further reading', 'External links', 'Life and career', 'Artistry', 'Public image', 'Achievements', 'Filmography', 'Explanatory notes', 'Death', 'Philanthropy and humanitarian work', 'Honors and awards', 'Earnings', 'Tours', 'Early life', 'Career', 'Personal life', 'Philanthropy', 'Honours and awards', 'Monarchy', 'Arts and entertainment', 'Places', 'Religion and folklore', 'Science', 'Transportation', 'Other uses', 'Enterprises', 'Musical style', 'Band members', 'Music career', 'Business career', 'Other ventures',

In [15]:

res = chunk["text"] 
start_index =res.find('[\INST]') + len('[\INST]')
relevant_sections = res[start_index:]
#relevant_sections += ('\n* Activism')


In [22]:
section_list = []
for relevant_section in relevant_sections.split('\n'):
    section_list.append(relevant_section[2:])

Perform Sentiment Analysis on the Personal Life Section of Artists Wikipedia Page

In [27]:
celebrity_texts = {}

for artist in artist_df.Artist.unique():
    print('Fetching data for :',artist + '..')
    
    artist_page = wiki_wiki.page(artist)
    celebrity_texts[artist] = {}
    celebrity_texts[artist]['summary'] = artist_page.summary
    
    interesting_sections = [section.title for section in artist_page.sections if section.title in section_list]
 
    for section in interesting_sections:
        print('  - Fetching data for :',artist + ' - ' + section + '..')
        personal_life = artist_page.section_by_title(section)
        if personal_life:
            celebrity_texts[artist][section] = personal_life.text
        else:
            celebrity_texts[artist] = None

Fetching data for : The Beatles..
Fetching data for : Elvis Presley..
  - Fetching data for : Elvis Presley - Artistry..
  - Fetching data for : Elvis Presley - Public image..
  - Fetching data for : Elvis Presley - Achievements..
Fetching data for : Michael Jackson..
  - Fetching data for : Michael Jackson - Artistry..
  - Fetching data for : Michael Jackson - Honors and awards..
Fetching data for : Elton John..
  - Fetching data for : Elton John - Career..
  - Fetching data for : Elton John - Personal life..
  - Fetching data for : Elton John - Philanthropy..
  - Fetching data for : Elton John - Artistry..
Fetching data for : Queen..
Fetching data for : Madonna..
  - Fetching data for : Madonna - Artistry..
Fetching data for : Led Zeppelin..
  - Fetching data for : Led Zeppelin - Achievements..
Fetching data for : Rihanna..
  - Fetching data for : Rihanna - Artistry..
  - Fetching data for : Rihanna - Public image..
  - Fetching data for : Rihanna - Personal life..
Fetching data for 

In [33]:


unstructured_celeb_dict = {}
for artist in celebrity_texts:
    unstructured_celeb_dict[artist] = {}
    artist_text = ''
    for section in celebrity_texts[artist]:
            artist_text += celebrity_texts[artist][section]
            print('  - Fetching data for :',artist + ' - ' + section + '..')
    unstructured_celeb_dict[artist]  = artist_text

  - Fetching data for : The Beatles - summary..
  - Fetching data for : Elvis Presley - summary..
  - Fetching data for : Elvis Presley - Artistry..
  - Fetching data for : Elvis Presley - Public image..
  - Fetching data for : Elvis Presley - Achievements..
  - Fetching data for : Michael Jackson - summary..
  - Fetching data for : Michael Jackson - Artistry..
  - Fetching data for : Michael Jackson - Honors and awards..
  - Fetching data for : Elton John - summary..
  - Fetching data for : Elton John - Career..
  - Fetching data for : Elton John - Personal life..
  - Fetching data for : Elton John - Philanthropy..
  - Fetching data for : Elton John - Artistry..
  - Fetching data for : Queen - summary..
  - Fetching data for : Madonna - summary..
  - Fetching data for : Madonna - Artistry..
  - Fetching data for : Led Zeppelin - summary..
  - Fetching data for : Led Zeppelin - Achievements..
  - Fetching data for : Rihanna - summary..
  - Fetching data for : Rihanna - Artistry..
  - F

In [46]:
import nltk
from nltk.tokenize import sent_tokenize
# Download the Punkt tokenizer models.
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification\
    
nltk.download('punkt')
artist_public_image = []


sentiment_clf = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(sentiment_clf)
clf_model = AutoModelForSequenceClassification.from_pretrained(sentiment_clf)

classification_pipeline = pipeline(
    task="sentiment-analysis",
    model=clf_model,
    tokenizer=tokenizer,
    max_length=512
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danpa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
for artist in unstructured_celeb_dict:
    print('Classifying data for :',artist + '..')
    text = unstructured_celeb_dict[artist]
    sentences = sent_tokenize(text)
    for idx, sentence in enumerate(sentences, start=1):

        blob = classification_pipeline(sentence)
        sentiment_polarity = blob[0]['score']
        sentiment = blob[0]['label']

        confidence_score = abs(sentiment_polarity)
        artist_public_image.append(
            {'Artist': artist, 'Sentence': sentence, 'Sentiment': sentiment,
             'Confidence Score': confidence_score})
        # print statement for the three variables added above


artist_sentence_df = pd.DataFrame(artist_public_image,
                                              columns=['Artist', 'Sentence', 'Sentiment', 'Confidence Score'])
artist_sentence_df.to_csv('Data//artist_sentence_sentiments.csv', index = False)

Classifying data for : The Beatles..
Classifying data for : Elvis Presley..
Classifying data for : Michael Jackson..
Classifying data for : Elton John..
Classifying data for : Queen..
Classifying data for : Madonna..
Classifying data for : Led Zeppelin..
Classifying data for : Rihanna..
Classifying data for : Pink Floyd..
Classifying data for : Eminem..
Classifying data for : Mariah Carey..
Classifying data for : Taylor Swift..
Classifying data for : Beyoncé..
Classifying data for : Whitney Houston..
Classifying data for : Eagles..
Classifying data for : Celine Dion..
Classifying data for : AC/DC..
Classifying data for : The Rolling Stones..
Classifying data for : Drake..
Classifying data for : Garth Brooks..
Classifying data for : Kanye West..
Classifying data for : Billy Joel..
Classifying data for : Justin Bieber..
Classifying data for : Ed Sheeran..
Classifying data for : Bruno Mars..
Classifying data for : Bruce Springsteen..
Classifying data for : U2..
Classifying data for : Aero

In [122]:
artist_public_image = []
sentiment_mean_series = artist_sentence_df.groupby(['Artist','Sentiment'])['Confidence Score'].mean()

for artist in artist_sentence_df['Artist'].unique():
    print('Calculating public image score for :',artist + '..')
    sentiment_count = artist_sentence_df.loc[artist_sentence_df['Artist'] == artist , 'Sentiment'].value_counts()
    negative_conf = positive_conf = negative_count = positive_count = 0

        
    try:
        negative_conf = sentiment_mean_series.loc[(artist, 'NEGATIVE')]
        positive_conf = sentiment_mean_series.loc[(artist, 'POSITIVE')]
        positive_count = sentiment_count['POSITIVE']
        negative_count = sentiment_count['NEGATIVE']
    except KeyError:
        print('No negative or no positive sentiment found for :',artist + '..')
        
        
    public_image_score = (positive_count*positive_conf) - (negative_count * negative_conf)
    public_opinion = 'Positive' if public_image_score > 10 else ('Mixed' if 0 <= public_image_score <= 10 else 'Negative')
    artist_public_image.append({'Artist': artist,'Public Image': public_opinion, 'Public Image Score': public_image_score})

artist_public_image_df = pd.DataFrame(artist_public_image, columns = ['Artist','Public Image','Public Image Score'])

Calculating public image score for : The Beatles..
Calculating public image score for : Elvis Presley..
Calculating public image score for : Michael Jackson..
Calculating public image score for : Elton John..
No negative or no positive sentiment found for : Elton John..
Calculating public image score for : Queen..
No negative or no positive sentiment found for : Queen..
Calculating public image score for : Madonna..
Calculating public image score for : Led Zeppelin..
Calculating public image score for : Rihanna..
Calculating public image score for : Pink Floyd..
Calculating public image score for : Eminem..
Calculating public image score for : Mariah Carey..
Calculating public image score for : Taylor Swift..
Calculating public image score for : Beyoncé..
Calculating public image score for : Whitney Houston..
Calculating public image score for : Eagles..
Calculating public image score for : Celine Dion..
Calculating public image score for : AC/DC..
Calculating public image score for : 

In [125]:
artist_public_image_df['Public Image'].value_counts()

Public Image
Positive    92
Mixed       29
Name: count, dtype: int64

In [126]:
artist_public_image_df.to_csv('Data//artist_public_image.csv', index = False)