In [1]:
from bs4 import BeautifulSoup
import requests, time, re
import pandas as  pd
import numpy as np
from tqdm import tqdm


# Download the Wikipedia list of Bollywood actors. The timeout keeps the cell
# from hanging forever on a stalled connection, and raise_for_status() stops
# an HTTP error page from being parsed into a silently empty table.
page = requests.get("https://en.wikipedia.org/wiki/List_of_Bollywood_actors", timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'lxml')

# Containers searched for names: <div> elements carrying exactly this class string.
div = soup.find_all("div", {"class": "div-col columns column-width"})

# Collect every anchor inside those containers whose href contains a /wiki/ path.
wiki_href = re.compile("(/wiki/)+([A-Za-z0-9_:()])+")
info = [tag for column in div for tag in column.find_all(href=wiki_href)]

actor = []       # Actor name, taken from the anchor's title attribute
actor_link = []  # Absolute URL of the actor's Wikipedia page
for tag in info:
    title = tag.get('title')
    if title is None:
        # An anchor without a title would add a row with a missing name; skip it.
        continue
    actor.append(title)
    actor_link.append("https://en.wikipedia.org" + tag.get('href'))

# Actor DataFrame: one row per collected anchor, all labelled 'male'.
data_actor = pd.DataFrame({'celebrities': actor, 'sex': ['male'] * len(actor), 'profile_link': actor_link})

In [2]:
# Download the Wikipedia list of Bollywood actresses. The timeout prevents an
# indefinite hang and raise_for_status() surfaces HTTP errors instead of
# parsing an error page.
page = requests.get("https://en.wikipedia.org/wiki/List_of_Bollywood_actresses", timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'lxml')

# The names are read from tables carrying exactly this class string.
div = soup.find_all("table", {"class": "wikitable sortable"})

# Keep only anchors that have no class attribute at all.
info = [tag for table in div for tag in table.find_all('a') if tag.get('class') is None]

actoress = []       # Actress name, taken from the anchor's title attribute
actoress_link = []  # Absolute URL of the actress's Wikipedia page
for tag in info:
    title = tag.get('title')
    href = tag.get('href')
    if title is None or href is None:
        # Anchors without a title or href cannot be turned into a
        # (name, link) pair. Previously a missing title raised TypeError
        # inside re.findall and aborted the whole cell.
        continue
    if re.search(r'[0-9]+', title) or 'film' in title:
        # Titles containing digits or the word 'film' are filtered out,
        # exactly as before.
        continue
    actoress.append(title)
    actoress_link.append("https://en.wikipedia.org" + href)

# Actress DataFrame: one row per kept anchor, all labelled 'female'.
data_actoress = pd.DataFrame({'celebrities': actoress, 'sex': ['female'] * len(actoress), 'profile_link': actoress_link})

In [3]:
# Stack the actor rows on top of the actress rows (row-wise concatenation).
# Each frame keeps its own index labels, so labels repeat in the result.
frames = (data_actor, data_actoress)
data = pd.concat(frames, axis=0)
data.head()

Unnamed: 0,celebrities,sex,profile_link
0,Aamir Khan,male,https://en.wikipedia.org/wiki/Aamir_Khan
1,Aarun Nagar,male,https://en.wikipedia.org/wiki/Aarun_Nagar
2,Abhishek Bachchan,male,https://en.wikipedia.org/wiki/Abhishek_Bachchan
3,Abhay Deol,male,https://en.wikipedia.org/wiki/Abhay_Deol
4,Abhay Mahajan,male,https://en.wikipedia.org/wiki/Abhay_Mahajan


In [4]:
# Primary personality-trait words from the MIT Ideonomy page. The timeout
# prevents an indefinite hang and raise_for_status() surfaces HTTP errors
# instead of building the word lists from an error page.
page = requests.get("http://ideonomy.mit.edu/essays/traits.html", timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'lxml')

li_tag = soup.find_all('li')
# The trait word is taken from the second line of each <li>'s text, lower-cased.
bag_of_trait_words = [(i.get_text().split('\n')[1]).lower() for i in li_tag]  # Bag of words to find trait sentiment

# The sentiment groups are defined purely by position in the scraped list:
# entries before POSITIVE_END are treated as positive, entries up to
# NEUTRAL_END as neutral, and the remainder as negative.
# NOTE(review): these boundaries depend on the page layout at scrape time — confirm if the page changes.
POSITIVE_END = 234
NEUTRAL_END = 346
pos_trait = bag_of_trait_words[:POSITIVE_END]             # Bag of words for positive sentiment
neu_trait = bag_of_trait_words[POSITIVE_END:NEUTRAL_END]  # Bag of words for neutral sentiment
neg_trait = bag_of_trait_words[NEUTRAL_END:]              # Bag of words for negative sentiment


# Per-celebrity accumulators, filled in row order by the scraping loop.
positive_trait = []  # Celebrity positive trait percentage
neutral_trait = []   # Celebrity neutral trait percentage
negative_trait = []  # Celebrity negative trait percentage
img_link = []        # Celebrity image link


def preprocess(text):
    """Normalise raw page text into lower-case, space-separated words.

    The substitutions run in order: strip HTML tags, replace every
    non-letter character with a space, drop single letters that are
    surrounded by whitespace, then collapse whitespace runs into one space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, lower-cased.
    """
    substitutions = (
        (r'<[^>]+>', ''),            # HTML tags
        ('[^a-zA-Z]', ' '),          # punctuation and digits
        (r"\s+[a-zA-Z]\s+", ' '),    # isolated single characters
        (r'\s+', ' '),               # repeated whitespace
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower()


# Compute the percentage of positive / neutral / negative trait words on a profile page
def trait(soup):
    """Score one profile page and append the result to the global trait lists.

    The text of every <p> element is cleaned with ``preprocess`` and reduced
    to the set of distinct words that also occur in ``bag_of_trait_words``.
    Each distinct word is assigned to the first matching group (positive,
    then neutral, then negative). For each group the share of distinct trait
    words is appended, as a whole-number percentage, to ``positive_trait``,
    ``neutral_trait`` and ``negative_trait``.

    When the page contains no trait words at all, NaN is appended to all
    three lists (the shares are undefined). Exactly one value is appended to
    each list per call, so the three lists always stay aligned.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        An empty string; results are delivered through the global lists.
    """
    paragraphs = [p.get_text() for p in soup.find_all("p")]
    # Join with a space so the last word of one paragraph cannot fuse with
    # the first word of the next one.
    text = preprocess(' '.join(paragraphs))  # Data cleaning

    # Sets give constant-time membership tests instead of scanning the lists.
    known_words = set(bag_of_trait_words)
    found_words = {word for word in text.split() if word in known_words}

    if not found_words:
        # No trait vocabulary on this page: the shares would be 0/0.
        positive_trait.append(np.nan)
        neutral_trait.append(np.nan)
        negative_trait.append(np.nan)
        return ''

    pos_words, neu_words, neg_words = set(pos_trait), set(neu_trait), set(neg_trait)
    pos_count = neu_count = neg_count = 0
    for word in found_words:
        if word in pos_words:
            pos_count += 1
        elif word in neu_words:
            neu_count += 1
        elif word in neg_words:
            neg_count += 1

    def share_percent(count):
        # Share of distinct trait words in one group (a plain proportion, not
        # a true Jaccard index). The ratio is rounded to two decimals as
        # before; the outer round removes float artefacts such as
        # 56.99999999999999 left by multiplying by 100.
        return np.round(np.round(count / len(found_words), 2) * 100)

    # Compute every score before appending so the lists cannot get out of step.
    scores = (share_percent(pos_count), share_percent(neu_count), share_percent(neg_count))
    positive_trait.append(scores[0])
    neutral_trait.append(scores[1])
    negative_trait.append(scores[2])
    return ''

# Helpers that record a celebrity's image link in the global img_link list
def _original_image_url(src):
    """Build an absolute image URL from an <img> src, undoing thumbnail paths.

    NOTE(review): assumes thumbnail paths have the form
    ``.../thumb/<dirs>/<file>/<rendition>`` — confirm against live pages.
    Removing the ``/thumb`` marker and the trailing rendition segment leaves
    the path of the underlying file whatever its extension. The previous
    implementation split on '.jpg' and re-appended '.jpg', which produced
    broken URLs for .png, .JPG, .jpeg and .svg images.
    """
    marker = '/thumb/'
    if marker in src:
        head, tail = src.split(marker, 1)
        # Drop the last path segment (the scaled rendition of the file).
        src = head + '/' + tail.rsplit('/', 1)[0]
    return "https:" + src


def img_links(soup):
    """Append the infobox image URL of a profile page to ``img_link``.

    Looks up the first table with class "infobox biography vcard", the first
    anchor with class "image" inside it, and that anchor's first <img>.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup document).

    Returns:
        None. The URL is appended to the global ``img_link`` list.

    Raises:
        IndexError: If the infobox, the image anchor or the <img> is missing.
            Nothing is appended in that case.
    """
    infobox = soup.find_all("table", {"class": "infobox biography vcard"})[0]
    anchor = infobox.find_all("a", {"class": "image"})[0]
    src = anchor.find_all('img')[0].get('src')
    img_link.append(_original_image_url(src))
    

# Visit every profile page and record one image link and one set of trait
# scores per row, so the accumulator lists stay aligned with `data`.
for link in tqdm(data.profile_link):
    time.sleep(2)  # Pause between requests to avoid hammering the server

    try:
        # The timeout stops one stalled connection from freezing the run and
        # raise_for_status() keeps HTTP error pages from being scored.
        page = requests.get(link, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'lxml')
    except Exception:
        # The page could not be fetched or parsed: fill the whole row with
        # NaN instead of aborting the (long) run and misaligning the lists.
        img_link.append(np.nan)
        positive_trait.append(np.nan)
        neutral_trait.append(np.nan)
        negative_trait.append(np.nan)
        continue

    # `except Exception` (rather than a bare except) keeps the best-effort
    # behaviour while still letting Ctrl-C / kernel interrupts through.
    try:
        img_links(soup)
    except Exception:
        img_link.append(np.nan)  # If image link not available, filling with NaN value

    try:
        trait(soup)
    except Exception:
        # If trait data is not available, fill with NaN values
        positive_trait.append(np.nan)
        neutral_trait.append(np.nan)
        negative_trait.append(np.nan)

100%|██████████| 791/791 [33:59<00:00,  2.58s/it]


In [24]:
# Attach the scraped values to the frame, one column per accumulator list
# (assignment is positional, in the order the rows were scraped).
scraped_columns = {
    'positive_trait': positive_trait,
    'neutral_trait': neutral_trait,
    'negative_trait': negative_trait,
    'img_link': img_link,
}
for column_name, values in scraped_columns.items():
    data[column_name] = values

# Move the old index labels into an 'index' column, then save everything.
# The CSV is written before the column selection below, so it still
# contains that 'index' column.
data.reset_index(inplace = True)
data.to_csv('celeb_data.csv')

# Keep only the presentation columns, in display order.
column_order = ['celebrities', 'sex', 'positive_trait', 'neutral_trait',
                'negative_trait', 'img_link', 'profile_link']
data = data[column_order]
record_count = len(data)
print("There are %s records in indian celebrity dataset"%(record_count))
data.head(10)

There are 791 records in indian celebrity dataset


Unnamed: 0,celebrities,sex,positive_trait,neutral_trait,negative_trait,img_link,profile_link
0,A. K. Hangal,male,51.0,31.0,17.0,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/A._K._Hangal
1,Kapil Sharma (comedian),male,0.0,0.0,100.0,,https://en.wikipedia.org/wiki/Kapil_Sharma_(co...
2,Kartik Aaryan,male,50.0,33.0,17.0,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Kartik_Aaryan
3,Karmveer Choudhary,male,57.0,29.0,14.0,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Karmveer_Choudhary
4,Sanjay Mishra (actor),male,100.0,0.0,0.0,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Sanjay_Mishra_(a...
5,Karanvir Bohra,male,75.0,0.0,25.0,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Karanvir_Bohra
6,Karan Wahi,male,,,,,https://en.wikipedia.org/wiki/Karan_Wahi
7,Karan Singh Grover,male,67.0,33.0,0.0,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Karan_Singh_Grover
8,Karan Patel,male,50.0,12.0,38.0,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Karan_Patel
9,Karan Johar,male,33.0,0.0,67.0,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Karan_Johar


'https://en.wikipedia.org/wiki/Kapil_Sharma_(comedian)'