# This notebook contains the analysis code for the scatterplot comparing the lyrics of Country and R&B/Hip-Hop songs

## Step 1: Load the Data

In [1]:
import pandas as pd

In [2]:
# Directory where the input data are stored, given relative to the
# scripts directory that this notebook lives in.
# NOTE(review): the original author was unsure whether this relative
# path works outside Linux — confirm on other platforms.
in_dir = '../Data/'
# Name of the CSV file (inside in_dir) holding the raw song data.
raw_data_file_csv = 'country_rbhh_blog.csv'
# function to read the raw data
def read_raw_csv(in_dir_f, file_name_f):
    """Read a raw CSV file into a DataFrame, blanking out missing values.

    Parameters
    ----------
    in_dir_f : str or path-like
        Directory that contains the file. A trailing path separator is
        optional.
    file_name_f : str or path-like
        Name of the CSV file inside ``in_dir_f``.

    Returns
    -------
    pandas.DataFrame
        The parsed file with every missing value replaced by ''.
    """
    # Local import so this cell does not depend on another cell's imports.
    import os

    # os.path.join only inserts a separator when one is needed, so both
    # '../Data' and '../Data/' resolve to the same file.
    csv_path = os.path.join(in_dir_f, file_name_f)
    # Replace every missing value with an empty string.
    return pd.read_csv(csv_path).fillna('')
    

In [3]:
# Load the raw CSV into a DataFrame using the reader defined above.
raw_data_df = read_raw_csv(in_dir, raw_data_file_csv)

In [4]:
# Sanity check: number of rows per genre in the raw data.
raw_data_df.genre.value_counts()

R&B/Hip-Hop    2754
Country        2444
Name: genre, dtype: int64

In [5]:
# Peek at the last few rows to see which columns are available.
raw_data_df.tail()

Unnamed: 0,rank,song,artist,genre,year,lyrics,lyrics_clean,lyrics_scrubbed,language
5193,96,Ball If I Want To,DaBaby,R&B/Hip-Hop,2021,Ball If I Want To Lyrics[Intro]\n(If I want to...,(If I want to) (If I want to) (If I want to) ​...,dope bitch birthday ball ball pull foreign car...,en
5194,97,N 2 Deep,Drake Featuring Future,R&B/Hip-Hop,2021,N 2 Deep Lyrics[Part I]\n\n[Verse 1: Drake]\nO...,"Oh yeah, yeah Kept the Galleria open 'til ten ...",yeah yeah galleria open friends spend crazy ba...,en
5195,98,Outside,MO3 & OG Bobby Billions,R&B/Hip-Hop,2021,Outside (Better Days) Lyrics[Intro: OG Bobby B...,"Yeah, yeah They scared to come outside Tell th...",yeah yeah scared boys pray boys stay safe mama...,en
5196,99,WUSYANAME,"Tyler, The Creator Featuring YoungBoy Never Br...",R&B/Hip-Hop,2021,"WUSYANAME Lyrics[Intro: Tyler, The Creator, Ty...","Something real, yeah Baby Oh nah, we don't do ...",real yeah baby nah backpack haha pick shit yea...,en
5197,100,Rags2Riches,Rod Wave Featuring ATR Son Son,R&B/Hip-Hop,2021,"Rags2Riches Lyrics[Intro: Rod Wave]\n(Ayy, Zyp...","(Ayy, Zypitano got that gas) (Six, chill, fool...",ayy zypitano gas chill fool real real yeah uh ...,en


## Step 2: Cleaning (Remove Strange Characters, Unused Columns & Drop Duplicates)

In [6]:
# cleaning function:
def clean_raw_data(input_df):
    """Strip junk text from the lyric columns, drop duplicate songs and unused columns.

    The caller's DataFrame is never modified; all work happens on a copy.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw data. Must contain the columns 'song', 'artist', 'genre',
        'lyrics', 'lyrics_clean' and 'lyrics_scrubbed'. The three lyric
        columns must hold strings (the ``.str`` accessor is used on them).

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'song', 'artist', 'genre',
        'lyrics_clean' and 'lyrics_scrubbed', one row per unique
        (song, artist) pair (first occurrence kept), re-indexed from 0.

    Notes
    -----
    Prints the frame's shape before de-duplication and the shape of the
    returned frame.
    """
    # Local import so this cell does not depend on another cell's imports.
    import re

    text_cols = ['lyrics', 'lyrics_clean', 'lyrics_scrubbed']
    # Character sequences removed verbatim (case-sensitive).
    # NOTE(review): the original also called .str.replace('', ''), which
    # replaces nothing. It may once have targeted an invisible character
    # that was lost from the source; add it to this tuple if so.
    literal_junk = ('\u200E', "Â€~")
    # Phrases removed regardless of letter case.
    boilerplate_phrases = ("repeat chorus", "instrumental")

    # Work on a copy so the raw frame passed in stays intact and the cell
    # can be re-run safely.
    cleaned_df = input_df.copy()
    for col in text_cols:
        col_text = cleaned_df[col]
        for junk in literal_junk:
            # regex=False: plain substring removal on every pandas version.
            col_text = col_text.str.replace(junk, '', regex=False)
        for phrase in boilerplate_phrases:
            # Escaped pattern + regex=True keeps case-insensitive matching
            # working on pandas versions where case= is ignored for
            # non-regex replacement.
            col_text = col_text.str.replace(
                re.escape(phrase), '', case=False, regex=True
            )
        cleaned_df[col] = col_text
    print(cleaned_df.shape)

    # remove duplicates on song and artist
    deduped_df = cleaned_df.drop_duplicates(
        subset=['song', 'artist'], ignore_index=True
    )

    # specify columns to keep; .copy() returns an independent frame rather
    # than a slice of the intermediate one
    cols_to_keep = ['song', 'artist', 'genre', 'lyrics_clean', 'lyrics_scrubbed']
    result_df = deduped_df[cols_to_keep].copy()
    print(result_df.shape)
    return result_df
    
    

In [7]:
# Run the cleaning step; the function prints the frame's shape before
# and after de-duplication and column selection.
clean_df = clean_raw_data(raw_data_df)
# Show the final shape as the cell's output.
clean_df.shape

(5198, 9)
(4620, 5)


(4620, 5)

## Step 3: Analysis with Scattertext
### https://github.com/JasonKessler/scattertext
### https://spacy.io/universe/project/scattertext

In [8]:
import spacy
# Load the spaCy English language model; it is passed to scattertext
# below to parse the lyrics.
# The small model ('en_core_web_sm') is used here.
# For more info on the available models see:
# https://spacy.io/models/en
nlp = spacy.load('en_core_web_sm')

# import scattertext
import scattertext as st

## Scattertext analysis with lyrics scrubbed column

In [9]:
%%time
# Build the scattertext corpus from the cleaned data: 'genre' supplies
# each document's category and 'lyrics_scrubbed' its text, parsed with
# the spaCy model loaded earlier. The recorded run of this cell took
# about 1.5 minutes of wall time.
corpus_scrubbed = st.CorpusFromPandas(clean_df,
                             category_col='genre',
                             text_col='lyrics_scrubbed',
                             nlp=nlp).build()

CPU times: user 1min 30s, sys: 117 ms, total: 1min 30s
Wall time: 1min 31s


## Produce the Scattertext HTML Visualization

In [10]:
# Render the scattertext explorer as an HTML string, contrasting the
# 'R&B/Hip-Hop' category with the remaining documents (labelled 'Country').
# Song titles from the corpus DataFrame are attached as per-document metadata.
# NOTE(review): minimum_term_frequency=50 and pmi_threshold_coefficient=4
# look like term-filtering thresholds — confirm their exact meaning in the
# scattertext documentation before changing them.
html = st.produce_scattertext_explorer(
    corpus_scrubbed,
    category='R&B/Hip-Hop', category_name='R&B/Hip-Hop', not_category_name='Country',
    minimum_term_frequency=50, pmi_threshold_coefficient=4,
    width_in_pixels=1000, metadata=corpus_scrubbed.get_df()['song'],
    transform=st.Scalers.dense_rank
)

# Save the visualization next to the notebook. The context manager makes
# sure the file is flushed and closed, and the explicit UTF-8 encoding
# keeps non-ASCII characters in the lyrics from depending on the
# platform's default encoding.
with open('lyrics_scrubbed_scatterplot.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html)

4865595