# Project 4: Song Recommender
## Part I: Gathering Data and Model

### 1. Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, balanced_accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree, DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, BaggingRegressor, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.inspection import plot_partial_dependence

#setting the default for display
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/olivialara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/olivialara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 2. Download Data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [2]:
ls drive/MyDrive/Project-4-Team-4/Data/labeled_lyrics_cleaned.csv

drive/MyDrive/Project-4-Team-4/Data/labeled_lyrics_cleaned.csv


In [3]:
model = pd.read_csv('drive/MyDrive/Project-4-Team-4/Data/labeled_lyrics_cleaned.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'drive/MyDrive/Project-4-Team-4/Data/labeled_lyrics_cleaned.csv'

In [7]:
model.head()

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371


### 3. Clean Lyrics

In [8]:
def regex_cleaner(words):
    
    # set token
    my_tokenizer = RegexpTokenizer("[\w']+|\$[\d\.]+")
    
    # tokenize words
    clean_words = my_tokenizer.tokenize(words.lower())
    
    # return words
    return ' '.join(clean_words)

In [9]:
model["clean_lyrics"] = model['seq'].map(regex_cleaner)

### 4. Drop NaNs and Unnecessary Data

In [10]:
model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158353 entries, 0 to 158352
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    158353 non-null  int64  
 1   artist        158353 non-null  object 
 2   seq           158353 non-null  object 
 3   song          158353 non-null  object 
 4   label         158353 non-null  float64
 5   clean_lyrics  158353 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 7.2+ MB


In [11]:
model = model[["artist", "song", "clean_lyrics"]]

In [12]:
model.head(3)

Unnamed: 0,artist,song,clean_lyrics
0,Elijah Blake,Everyday,no no i ain't ever trapped out the bando but o...
1,Elijah Blake,Live Till We Die,the drinks go down and smoke goes up i feel my...
2,Elijah Blake,The Otherside,she don't live on planet earth no more she fou...


In [13]:
model.to_csv('drive/MyDrive/Project-4-Team-4/Data/all_songs_cleaned')

In [30]:
data = pd.read_csv('drive/MyDrive/Project-4-Team-4/Data/all_songs_cleaned')

In [31]:
data.isna().sum()

Unnamed: 0      0
artist          0
song            0
clean_lyrics    1
dtype: int64

In [32]:
data.head()

Unnamed: 0.1,Unnamed: 0,artist,song,clean_lyrics
0,0,Elijah Blake,Everyday,no no i ain't ever trapped out the bando but o...
1,1,Elijah Blake,Live Till We Die,the drinks go down and smoke goes up i feel my...
2,2,Elijah Blake,The Otherside,she don't live on planet earth no more she fou...
3,3,Elijah Blake,Pinot,trippin' off that grigio mobbin' lights low tr...
4,4,Elijah Blake,Shadows & Diamonds,i see a midnight panther so gallant and so bra...


In [33]:
data.dropna(inplace=True)

In [34]:
data.head()

Unnamed: 0.1,Unnamed: 0,artist,song,clean_lyrics
0,0,Elijah Blake,Everyday,no no i ain't ever trapped out the bando but o...
1,1,Elijah Blake,Live Till We Die,the drinks go down and smoke goes up i feel my...
2,2,Elijah Blake,The Otherside,she don't live on planet earth no more she fou...
3,3,Elijah Blake,Pinot,trippin' off that grigio mobbin' lights low tr...
4,4,Elijah Blake,Shadows & Diamonds,i see a midnight panther so gallant and so bra...


### 5. Add Sentiment Scores

In [35]:
pip install vaderSentiment



In [36]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [37]:
def sentiment_scores(sentence):
 
    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(sentence)
 
    return sentiment_dict['compound']

In [38]:
data["vader_valence"] = data['clean_lyrics'].map(sentiment_scores)

In [39]:
data.head()

Unnamed: 0.1,Unnamed: 0,artist,song,clean_lyrics,vader_valence
0,0,Elijah Blake,Everyday,no no i ain't ever trapped out the bando but o...,-0.9303
1,1,Elijah Blake,Live Till We Die,the drinks go down and smoke goes up i feel my...,-0.9984
2,2,Elijah Blake,The Otherside,she don't live on planet earth no more she fou...,-0.9909
3,3,Elijah Blake,Pinot,trippin' off that grigio mobbin' lights low tr...,-0.9686
4,4,Elijah Blake,Shadows & Diamonds,i see a midnight panther so gallant and so bra...,0.9962


In [40]:
data.drop(columns=["Unnamed: 0"], inplace=True)

In [41]:
data.head()

Unnamed: 0,artist,song,clean_lyrics,vader_valence
0,Elijah Blake,Everyday,no no i ain't ever trapped out the bando but o...,-0.9303
1,Elijah Blake,Live Till We Die,the drinks go down and smoke goes up i feel my...,-0.9984
2,Elijah Blake,The Otherside,she don't live on planet earth no more she fou...,-0.9909
3,Elijah Blake,Pinot,trippin' off that grigio mobbin' lights low tr...,-0.9686
4,Elijah Blake,Shadows & Diamonds,i see a midnight panther so gallant and so bra...,0.9962


In [42]:
data.to_csv('drive/MyDrive/Project-4-Team-4/Data/all_songs_cleaned_and_vadered')

In [59]:
data.to_parquet('drive/MyDrive/Project-4-Team-4/Data/all_songs_cleaned_and_vadered.parquet') 

In [44]:
data.head()

Unnamed: 0,artist,song,clean_lyrics,vader_valence
0,Elijah Blake,Everyday,no no i ain't ever trapped out the bando but o...,-0.9303
1,Elijah Blake,Live Till We Die,the drinks go down and smoke goes up i feel my...,-0.9984
2,Elijah Blake,The Otherside,she don't live on planet earth no more she fou...,-0.9909
3,Elijah Blake,Pinot,trippin' off that grigio mobbin' lights low tr...,-0.9686
4,Elijah Blake,Shadows & Diamonds,i see a midnight panther so gallant and so bra...,0.9962


In [45]:
small_data = data.drop(columns="clean_lyrics")

In [46]:
small_data.to_csv('drive/MyDrive/Project-4-Team-4/Data/all_songs_vadered_no_lyrics' )

In [58]:
small_data.to_parquet('drive/MyDrive/Project-4-Team-4/Data/all_songs_vadered_no_lyrics.parquet') 

In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158352 entries, 0 to 158352
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   artist         158352 non-null  object 
 1   song           158352 non-null  object 
 2   clean_lyrics   158352 non-null  object 
 3   vader_valence  158352 non-null  float64
dtypes: float64(1), object(3)
memory usage: 11.0+ MB


### 6. Song Recommender Function

In [4]:
def song_recs (sentence):

  my_tokenizer = RegexpTokenizer("[\w']+|\$[\d\.]+")
    
  clean_words = my_tokenizer.tokenize(sentence.lower())
    
    
  cleaned_sentence =  ' '.join(clean_words)
 
 
  sid_obj = SentimentIntensityAnalyzer()
 
  sentiment_dict = sid_obj.polarity_scores(cleaned_sentence)

  mask= (small_data["vader_valence"]>(sentiment_dict['compound']))
    
  recs = small_data[mask].sort_values(by="vader_valence")[:10]
  
  return recs[["artist", "song"]]



In [64]:
song_recs("A warm sunny day.")

Unnamed: 0,artist,song
151927,Sandy Denny,Take Me Away
42493,Eydie Gorme,The Ladies Who Lunch [From Company]
54738,Balance of Power,Under the Spell
106349,Pulp,Pencil Skirt
7077,H.E.R.,Changes
18619,Rosanne Cash,I Don't Know Why You Don't Want Me
65619,Beach Fossils,Vacation
9230,John's Children,Mustang Ford
125960,Kristin Hersh,Tuesday Night
66925,New World Orchestra,Theme from Friends


In [65]:
song_recs("Black and stiff, but not a bad fit. Will you marry it? It is waterproof, shatterproof, proof Against fire and bombs through the roof. Believe me, they’ll bury you in it.")

Unnamed: 0,artist,song
134505,Slash,The Truth
127484,Bigg Tyme,Still Tippin'
6822,GWAR,Antarctican Drinking Song
14604,Rebecca Martin,These Bones Are Yours Alone
42753,Face to Face,Everything Is Everything
84449,Tinie Tempah,Simply Unstoppable
72935,George Jones,Hundred Proof Memories
110907,Rob Zombie,Meet the Creeper [Pink Pussy Mix]
11446,Michael Schenker,Only You Can Rock Me
14024,Linda Ronstadt,Ramblin' Round


In [66]:
song_recs("Two roads diverged in a yellow wood, And sorry I could not travel both And be one traveler, long I stood And looked down one as far as I could To where it bent in the undergrowth;")

Unnamed: 0,artist,song
158303,Miranda Sex Garden,My Funny Valentine
151614,Snow Patrol,Workwear Shop
51623,Aqueduct,Keep It Together
111547,Then Jerico,Quiet Place (Apathy & Sympathy)
93776,Racer X,Street Lethal
117499,Iron Savior,Deadly Sleep
30409,Generation X,Youth Youth Youth
28547,Days of the New,Shelf in the Room
11440,Michael Schenker,The Storm
52688,Prince,Time
