# Scraping with Beautiful Soup

In [None]:
# http://www.nytimes.com/1983/04/24/arts/secular-requiems-death-as-public-art.html?pagewanted=all&mcubz=3 

In [79]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [None]:
# Download the Wikipedia page that is scraped in the following cells.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of letting
# an error page be parsed further down.
r = requests.get('https://en.wikipedia.org/wiki/Music_for_the_Requiem_Mass', timeout=30)
r.raise_for_status()
# Preview the first 500 characters of the response body.
print(r.text[0:500])

In [None]:
# Parse the downloaded HTML with the 'html.parser' backend.
soup = BeautifulSoup(markup=r.text, features='html.parser')

In [None]:
# Use the <div> at position 60 of the page as the container to scrape.
# NOTE(review): a positional index depends on the page layout at download time;
# the original cell hinted at class='div-col columns column-width' as a possible
# selector — confirm before relying on either.
all_divs = soup.find_all('div')
results = all_divs[60]
results

In [None]:
# Start the combined list and add the Renaissance entries, each labelled with
# a ', Renaissance' suffix.
# NOTE(review): this cell reads <a> tags while the later period cells read
# <li> tags — confirm that the difference is intended.
full_list = []
renaissance = results.find_all('a')[1:30]
renaissance_entries = [link.text + ', Renaissance' for link in renaissance]
for entry in renaissance_entries:
    print(entry)
full_list.extend(renaissance_entries)

In [None]:
full_list

In [None]:
#df = pd.DataFrame(renaissance, columns=['Name', 'Period'])

In [None]:
# List items 30-41 of the container are taken as the Baroque entries; each is
# labelled with a ', Baroque' suffix, echoed, and added to full_list.
baroque = results.find_all('li')[30:42]
baroque_entries = [item.text + ', Baroque' for item in baroque]
for entry in baroque_entries:
    print(entry)
full_list.extend(baroque_entries)

In [None]:
#full_list

In [None]:
# List items 42-58 of the container are taken as the Classical entries; each is
# labelled with a ', Classical' suffix, echoed, and added to full_list.
classical = results.find_all('li')[42:59]
classical_entries = [item.text + ', Classical' for item in classical]
for entry in classical_entries:
    print(entry)
full_list.extend(classical_entries)

In [None]:
full_list

In [None]:
# List items 59-80 of the container are taken as the Romantic entries; each is
# labelled with a ', Romantic' suffix, echoed, and added to full_list.
romantic = results.find_all('li')[59:81]
romantic_entries = [item.text + ', Romantic' for item in romantic]
for entry in romantic_entries:
    print(entry)
full_list.extend(romantic_entries)

In [None]:
# List items 81-143 of the container are taken as the 20th Century entries;
# each is labelled with a ', 20th Century' suffix, echoed, and added to full_list.
twentiethcentury = results.find_all('li')[81:144]
twentieth_entries = [item.text + ", 20th Century" for item in twentiethcentury]
for entry in twentieth_entries:
    print(entry)
full_list.extend(twentieth_entries)

In [None]:
# List items 144-180 of the container are taken as the 21st Century entries;
# each is labelled with a ', 21st Century' suffix, echoed, and added to full_list.
twentyfirst = results.find_all('li')[144:181]
twentyfirst_entries = [item.text + ", 21st Century" for item in twentyfirst]
for entry in twentyfirst_entries:
    print(entry)
full_list.extend(twentyfirst_entries)

In [None]:
full_list

# Applying a Tabular Structure to Scraped Data

In [None]:
# One row per scraped entry, held in a single 'name' column.
df = pd.DataFrame(full_list, columns=['name'])
# Number of rows in the combined table.
df.shape[0]

In [None]:
# Select the Renaissance rows and count them. The period label was appended to
# each entry as a ', Renaissance' suffix, so match that suffix exactly; a
# substring/regex search over the whole string would also pick up any entry
# whose own text happens to contain the word.
ren = df[df['name'].str.endswith(', Renaissance')]
len(ren)

In [None]:
# Select the Baroque rows and count them. The period label was appended to
# each entry as a ', Baroque' suffix, so match that suffix exactly; a
# substring/regex search over the whole string would also pick up any entry
# whose own text happens to contain the word.
# Note: this rebinds `baroque`, which earlier held the scraped list items.
baroque = df[df['name'].str.endswith(', Baroque')]
len(baroque)

In [None]:
# Select the Classical rows and count them. The period label was appended to
# each entry as a ', Classical' suffix, so match that suffix exactly; a
# substring/regex search over the whole string would also pick up any entry
# whose own text happens to contain the word.
# Note: this rebinds `classical`, which earlier held the scraped list items.
classical = df[df['name'].str.endswith(', Classical')]
len(classical)

In [None]:
# Select the Romantic rows and count them. The period label was appended to
# each entry as a ', Romantic' suffix, so match that suffix exactly; a
# substring/regex search over the whole string would also pick up any entry
# whose own text happens to contain the word.
# Note: this rebinds `romantic`, which earlier held the scraped list items.
romantic = df[df['name'].str.endswith(', Romantic')]
len(romantic)

In [None]:
# Select the 20th Century rows and count them. The period label was appended
# to each entry as a ', 20th Century' suffix, so match that suffix exactly; a
# substring/regex search over the whole string would also pick up any entry
# whose own text happens to contain the phrase.
twenty = df[df['name'].str.endswith(', 20th Century')]
len(twenty)

In [None]:
# Select the 21st Century rows and count them. The period label was appended
# to each entry as a ', 21st Century' suffix, so match that suffix exactly; a
# substring/regex search over the whole string would also pick up any entry
# whose own text happens to contain the phrase.
twentyone = df[df['name'].str.endswith(', 21st Century')]
len(twentyone)

In [None]:
#df.to_csv('created_requiem.csv', index=False,encoding='utf-8')

# Finding patterns in the new dataset (created by cleaning a portion of the requiem data)

In [80]:
# Load the cleaned requiem spreadsheet export from the working directory.
cleaned_csv_path = 'requiem_cleaned - Sheet1.csv'
df2 = pd.read_csv(cleaned_csv_path)

In [81]:
# Replace every missing value in the table with 0. DataFrame.fillna reference:
# http://pandas.pydata.org/pandas-docs/version/0.17.0/generated/pandas.DataFrame.fillna.html
df2 = df2.fillna(value=0)

In [82]:
# Count the rows in each Period.
# NOTE(review): the recorded output lists a 'Portugal' period with one row,
# which looks like a country value that ended up in the Period column —
# confirm against the CSV.
df2['Period'].value_counts()

20th Century    60
Renaissance     35
21st Century    35
Romantic        20
Baroque         17
Classical       17
Portugal         1
Name: Period, dtype: int64

In [83]:
df2.columns

Index(['Name', 'Country', 'Period', 'Introit', 'Kyrie', 'Gradual', 'Tract',
       'Sequence/Dies Irae', 'Offertory', 'Sanctus', 'Benedictus', 'Agnus Dei',
       'Communion/lux aeterna', 'Lacrimosa', 'Pie Jesu', 'Libera Me',
       'In Paradisum', 'Other Texts', 'Commissioned'],
      dtype='object')

In [None]:
# For each period, count how often each 'Other Texts' value occurs, i.e. which
# other texts were incorporated into Requiems per time period.
# Missing values were replaced with 0 when the table was loaded, so any row
# without an 'Other Texts' entry is counted under 0 here.
df2.groupby('Period')['Other Texts'].value_counts()

# Classical Requiems

In [87]:
# Keep only the rows whose Period is 'Classical'.
is_classical = df2['Period'] == 'Classical'
df_classical = df2.loc[is_classical]

In [89]:
len(df_classical)

17

In [101]:
df_classical['In Paradisum'].sum()

1.0

## Romantic

In [102]:
# Keep only the rows whose Period is 'Romantic'.
is_romantic = df2['Period'] == 'Romantic'
df_romantic = df2.loc[is_romantic]

In [103]:
len(df_romantic)

20

In [118]:
df_romantic['In Paradisum'].sum()

1.0

### What Makes Up a 21st Century Requiem? 

In [None]:
# Keep only the rows whose Period is '21st Century' and preview the first two.
is_21st_century = df2['Period'] == '21st Century'
df_21 = df2.loc[is_21st_century]
df_21.head(2)

In [None]:
len(df_21)

In [None]:
# Total each section column for the 21st Century subset in one call. The
# result is a single Series labelled by column name, replacing one copy-pasted
# cell per column. The list is exactly the twelve columns that were summed
# individually before.
section_columns = [
    'Introit', 'Kyrie', 'Gradual', 'Tract', 'Sequence/Dies Irae', 'Offertory',
    'Sanctus', 'Agnus Dei', 'Communion/lux aeterna', 'Pie Jesu', 'Libera Me',
    'In Paradisum',
]
df_21[section_columns].sum()

## What about the 20th Century?

In [None]:
df2

In [None]:
# Keep only the rows whose Period is '20th Century' and preview the first two.
is_20th_century = df2['Period'] == '20th Century'
df_20 = df2.loc[is_20th_century]
df_20.head(2)

In [None]:
len(df_20)

In [None]:
# Total each section column for the 20th Century subset in one call. The
# result is a single Series labelled by column name, replacing one copy-pasted
# cell per column. The list is exactly the twelve columns that were summed
# individually before.
section_columns = [
    'Introit', 'Kyrie', 'Gradual', 'Tract', 'Sequence/Dies Irae', 'Offertory',
    'Sanctus', 'Agnus Dei', 'Communion/lux aeterna', 'Pie Jesu', 'Libera Me',
    'In Paradisum',
]
df_20[section_columns].sum()

In [None]:
df2.columns

In [None]:
df2['Period'].value_counts()

In [None]:
# Show the 20th Century rows. 'Period' is still an ordinary column at this
# point (it only becomes the index in a later cell), so the label lookup
# df2.loc['20th Century'] raises KeyError on a fresh top-to-bottom run.
# Filter on the column instead.
df2[df2['Period'] == '20th Century']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Make 'Period' the row index of df2. The guard keeps the cell safe to re-run:
# once 'Period' has been moved into the index it is no longer a column, and a
# second set_index('Period') call would raise KeyError.
if 'Period' in df2.columns:
    df2.set_index('Period', inplace=True)

In [None]:
# Narrow the table to the twelve section columns and display the result.
selected_columns = [
    'Introit',
    'Kyrie',
    'Gradual',
    'Tract',
    'Sequence/Dies Irae',
    'Offertory',
    'Sanctus',
    'Agnus Dei',
    'Communion/lux aeterna',
    'Pie Jesu',
    'Libera Me',
    'In Paradisum',
]
df2_new = df2[selected_columns]
df2_new

In [None]:
# Pairwise cosine similarity between the rows of df2_new, computed on the
# frame's raw values (one similarity score per pair of rows).
cosine_similarity(df2_new.values)

In [None]:
df2.T