In [156]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from collections import namedtuple
from collections import defaultdict
import os
import pickle
from time import time, sleep
import random
import csv
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

In [2]:
# Define the Song data structure
# Remember: namedtuples require that you plug in 
# every data type.

Song = namedtuple('Song', ['header', 'verified', 'metadata', 'lyrics'])

In [11]:
# path to the chromedriver executable
def start_driver(url):

    """This function will open a Chrome browser at the given url.
    Input: a valid URL as a string.
    Doesn't return anything, but creates a global variable *driver*
    that can be called upon elsewhere."""
    
    chromedriver = "/Applications/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver

    
    # Using the global keyword so that we can call upon the driver
    # further down in the code.
    
    global driver 
    driver = webdriver.Chrome(chromedriver)

    driver.get(url)

In [4]:
# Start by opening the all rap songs page.

all_rap_songs_genius = "https://genius.com/tags/rap/all"
start_driver(all_rap_songs_genius)

In [40]:
# Use Selenium to scroll down to the bottom of the screen
# This only works if you manually focus on the Chrome browser.

for i in range (10):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(1)

In [41]:
# This code snippet is there to grab links to songs
# on the https://genius.com/tags/rap/all page.
# It appears that 1000 is the most search results
# that genius.com will display at one time.


link_selector = '//a[@class= " song_link"]'
top_songs = driver.find_elements_by_xpath(link_selector)


link_list = []

for elem in top_songs:
    link_list.append(elem.get_attribute('href'))

In [44]:
# I save the list of links as a pickle.
# That way, I don't have to do this again.

with open('links.pkl', 'wb') as link_pickle:
    pickle.dump(link_list,link_pickle)

In [3]:
# If I pick up again I can just pull from the pickle

with open('links.pkl', 'rb') as link_pickle:
    link_list = pickle.load(link_pickle)

In [12]:
def scrape_info(list_of_urls):
    
    """This function uses the Chrome driver to visit 
    every page on a list of Genius links,
    and gathers the info I want to analyze.
    Input needs to be:
    a list of links to Genius pages, otherwise it won't work. 
    Output will be:
    a dict of named tuples of the Song format."""

    dict_of_songs = defaultdict(Song)

    for i, url in enumerate(list_of_urls):
        driver.get(url)
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        sleep(.5+2*random.random())

        # Gather and filter metadata

        views_regex = re.compile('views', re.IGNORECASE)
        contrib_regex = re.compile('contributors', re.IGNORECASE)
        tag_regex = re.compile('(\w+,\s\w+)+')

        all_metadata = driver.find_elements_by_class_name('metadata_with_icon')

        filtered_metadata = [metadata.text for metadata in all_metadata
                             if re.search(views_regex, metadata.text)
                             or re.search(contrib_regex, metadata.text)
                             or re.search(tag_regex, metadata.text)]

        # Scrape raw lyrics

        lyrics_string = ''

        lyrics = driver.find_elements_by_class_name('song_body-lyrics')
        for lyric in lyrics:
            lyrics_string += lyric.text

        # Scrape header

        header_string = driver.find_element_by_class_name(
            'header_with_cover_art-primary_info').text

        # Determine whether artist has contributed to their song page

        try:
            ver_art = driver.find_element_by_class_name(
                'song_verified_artists-section')
            verified_bool = True
        except:
            verified_bool = False
            
        # Create Song named tuple and add to dictionary

        dict_of_songs[url] = Song(header=header_string, lyrics=lyrics_string,
                                  verified=verified_bool, metadata=filtered_metadata)
        if (i+1 % 100 == 0):
            time.sleep(320)

    return dict_of_songs

In [19]:
# Pickling the song dict
with open('songs_dict.pkl', 'wb') as song_file:
    pickle.dump(song_dict, song_file)

In [3]:
# Unpickling the song dict
with open('songs_dict.pkl', 'rb') as song_file:
    song_dict = pickle.load(song_file)

In [13]:
def clean_lyrics(raw_lyrics):
    
    # This code snippet removes the header

    header_regex = re.compile('^.+\n')
        
    # This code removes the footer

    footer_regex = re.compile('\d+ Embed.+', re.DOTALL)
    
    # This should remove the little snippets in brackets

    bracket_regex = re.compile(r'\[.+\]')
    
    # This should remove the special characters that creep in
    
    character_regex = re.compile(r'''[\(\)\'\"\!\?]''')
    
    cleaned_lyrics = re.sub(footer_regex, '', raw_lyrics)
    cleaned_lyrics = re.sub(header_regex, '', cleaned_lyrics)
    cleaned_lyrics = re.sub(bracket_regex, '', cleaned_lyrics)
    cleaned_lyrics = re.sub(character_regex, '', cleaned_lyrics)
    
    return cleaned_lyrics.lower()

In [157]:
def list_to_regex(list_of_words):
    
    full_string = list_of_words[0]
    for word in list_of_words[1:]:
        new_word = '|' + word
        full_string += new_word
    return re.compile(full_string)

In [162]:
def load_semantic_fields(path):
    # semantic fields
    fields = []
    with open(path) as csv_file:
        dict_reader = csv.reader(csv_file, delimiter=',')
        for row in dict_reader:
            fields.append(row)
    
    global religion_regex, curse_regex, clothes_regex, cars_regex, foreign_sem_field

    religion_regex = list_to_regex(fields[0])
    curse_regex = list_to_regex(fields[1])
    clothes_regex = list_to_regex(fields[2])
    cars_regex = list_to_regex(fields[3])
    
    # names of tags I want to filter out
    foreign_sem_field = set(fields[4])

load_semantic_fields('semantic_fields.csv')

In [163]:
def process_songs(dict_of_songs):
    
    """This function takes a dictionary of Song named tuples
    and returns a list of dictionaries with just the numerical and categorical info
    for each song which to be used in the dataframe."""

    dict_list = []

    for link in dict_of_songs:
        song = dict_of_songs[link]

        info_dict = {'title': '',
                     'artist': '',
                     'words': 0,
                     'views': 0,
                     'contrib': 0,
                     'tags': '',
                     'religion': 0,
                     'curse': 0,
                     'clothes': 0,
                     'cars': 0,
                     'producer': '',
                     'verified': None}

        if song.verified:
            info_dict['verified'] = 1
        else:
            info_dict['verified'] = 0
        
        lyrics = clean_lyrics(song.lyrics)
        word_list = lyrics.split()
        song_length = len(word_list)
        info_dict['words'] = song_length
        
        info_dict['religion'] = len(re.findall(religion_regex,lyrics)) / song_length
        info_dict['curse'] = len(re.findall(curse_regex,lyrics)) / song_length
        info_dict['clothes'] = len(re.findall(clothes_regex,lyrics)) / song_length
        info_dict['cars'] = len(re.findall(cars_regex,lyrics)) / song_length

        # This should extract the number of views from the metadata

        info_dict['views'] = int(song.metadata[0].split()[0].replace(',', ''))
        
        # This should extract the number of contributors from the metadata

        info_dict['contrib'] = int(song.metadata[1].split()[0].replace(',', ''))
        
        # This should get the tags
        
        try:
            info_dict['tags'] = song.metadata[2]
        except:
            info_dict['tags'] = ''

        # This should get the title from the header

        info_dict['title'] = song.header.split('\n')[0]

        # This should get the artist name from the header

        info_dict['artist'] = song.header.split('\n')[1]

        # This should get the producer name from the header
        # If there is no producer, this should be None.

        producer_regex = re.compile('Produced by (.+)')
        try:
            info_dict['producer'] = re.match(producer_regex, song.header.split('\n')[2]).group(1)
        except:
            info_dict['producer'] = ''
        
        dict_list.append(info_dict)

        
    return dict_list

In [15]:
def foreign_to_nan(tags):
    
    """I use this to filter out songs that are in a foreign language.
    """
    
    if not tags:
        return tags
    for tag in tags.split(', '):
        if tag in foreign_sem_field:
            return np.nan
    return tags


In [245]:
# Run this cell once you've pulled the song_dict to initialize the dataframe.

dl = process_songs(song_dict)

df = pd.DataFrame(dl)

df['tags'] = df.tags.apply(lambda x: foreign_to_nan(x))

df = df.dropna(axis = 0, how = 'any')

df['logviews'] = np.log2(df['views'])


# Removing the very top changes the results significantly

mask = df['views'] > 5000000

df = df.drop(df[mask].index, axis = 0)

In [229]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['cars', 'clothes', 'curse', 'religion', 'verified','contrib']], df['logviews'], test_size=0.33, random_state=20)
m = LinearRegression()
m.fit(X_train, y_train)
m.score(X_train, y_train)

0.23492132538947075

In [230]:
m.coef_

array([-1.48568636e+00,  6.11674316e+00, -1.63021719e-01, -6.22221093e+00,
        1.27696675e-01,  3.31050054e-03])

In [211]:
m.score(X_test,y_test)

0.1918983065680412

In [217]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['cars', 'clothes', 'curse', 'religion', 'verified','contrib']], df['views'], test_size=0.33, random_state=22)
p = PolynomialFeatures()
Xpoly = p.fit_transform(X_train)
Xpolytest = p.transform(X_test)
m = LinearRegression()
m.fit(Xpoly,y_train)
m.score(Xpoly,y_train)

0.2832240049412208

In [218]:
m.score(Xpolytest,y_test)

0.00451711483799766

In [227]:
# 28 songs above 5 million views

len(old_df[mask])

# 26 foreign songs

len(old_df)


28

In [224]:
len(old_df)
old_df['tags'] = old_df.tags.apply(lambda x: foreign_to_nan(x))

old_df = old_df.dropna(axis = 0, how = 'any')

In [50]:
def split_and_validate(X, y):
    
    '''
    For a set of features and target X, y, perform a 80/20 train/val split, 
    fit and validate a linear regression model, and report results
    '''
    
    # perform train/val split
    X_train, X_val, y_train, y_val = \
        train_test_split(X, y, test_size=0.2, random_state=42)
    
    # fit linear regression to training data
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    # score fit model on validation data
    val_score = lr_model.score(X_val, y_val)
    
    # report results
    print('\nValidation R^2 score was:', val_score)
    print('Feature coefficient results: \n')
    for feature, coef in zip(X.columns, lr_model.coef_):
        print(feature, ':', coef) 

In [51]:
split_and_validate(df.loc[:,['cars', 'clothes', 'curse', 'religion', 'verified','contrib']],df.loc[:,['logviews']])


Validation R^2 score was: 0.18248245197482094
Feature coefficient results: 

cars : [ 6.64591842e+00 -3.04214649e+00  9.13226688e-02 -1.02253066e+00
  3.57152584e-02  1.12507275e-03]


In [371]:
cars_df = df[df['cars'] != 0]

In [372]:
curse_df = df[df['curse'] != 0]

In [232]:
# Define the model
lm1 = smf.ols('logviews ~ cars + clothes + curse + religion + verified + contrib', data=df)

# Fit the model
fit1 = lm1.fit()

# Print summary statistics of the model's performance
fit1.summary()

0,1,2,3
Dep. Variable:,logviews,R-squared:,0.225
Model:,OLS,Adj. R-squared:,0.22
Method:,Least Squares,F-statistic:,45.5
Date:,"Thu, 18 Apr 2019",Prob (F-statistic):,5.29e-49
Time:,14:45:41,Log-Likelihood:,-868.82
No. Observations:,946,AIC:,1752.0
Df Residuals:,939,BIC:,1786.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,19.6508,0.048,410.727,0.000,19.557,19.745
cars,6.8875,11.816,0.583,0.560,-16.301,30.076
clothes,4.1706,2.929,1.424,0.155,-1.577,9.918
curse,-0.1967,0.918,-0.214,0.830,-1.998,1.604
religion,-1.2835,3.109,-0.413,0.680,-7.385,4.818
verified,0.1217,0.042,2.910,0.004,0.040,0.204
contrib,0.0032,0.000,15.533,0.000,0.003,0.004

0,1,2,3
Omnibus:,49.518,Durbin-Watson:,0.451
Prob(Omnibus):,0.0,Jarque-Bera (JB):,54.549
Skew:,0.57,Prob(JB):,1.43e-12
Kurtosis:,2.711,Cond. No.,122000.0


In [114]:
"""I realized that there were some songs where
the driver failed to scrape the metadata.
Here's the code I used to fix it:
songs_to_fix = []
for key in song_dict:
    if not song_dict[key].metadata:
        songs_to_fix.append(key)
        
start_driver(songs_to_fix[0])
missing_info = scrape_info(songs_to_fix)
for key in missing_info:
    song_dict[key] = missing_info[key]"""

In [132]:
bible_links = [mark.format(i) for i in range(1,16)]

mark = 'https://www.biblegateway.com/passage/?search=Mark+{}&version=NIV'

full_text = ''

for link in bible_links:

    response = requests.get(link)

    soup = BeautifulSoup(response.text)

    NIV_tags = soup.find_all(class_='version-NIV')

    text_list = [tag.text for tag in NIV_tags]

    bible_text = re.sub('\d+\\xa0','',text_list[0])
    bible_text = re.sub('\[.\]','',bible_text)
    bible_text = re.sub('\\xa0', '', bible_text)
    footnote_regex = re.compile('Footnotes.+',re.DOTALL)
    bible_text = re.sub(footnote_regex,'',bible_text)
    
    full_text += bible_text

full_text = clean_lyrics(full_text)

freq_dict = defaultdict(int)

for word in full_text.split():
    freq_dict[word] += 1

freq_dict

text_analysis = pd.Series(freq_dict)

freq_words = text_analysis[text_analysis > 5]


freq_words = freq_words.sort_values()

freq_words.drop(['the', 'and','to','of', 'he', 'they', 'a', 'in', 'you', 'his', 'with', 'him', 'is', 'will', 'for', 'that', 'not', 'was', 'it', 'on', 'but', 'be', 'were', 'when', 'them', 'i', 'had', 'at', 'one', 'out', 'said'])