In [3]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Import the dataset
# Downloaded from: https://www.kaggle.com/datasets/unanimad/grammy-awards
df = pd.read_csv('the_grammy_awards_winners.csv')
df.head()

Unnamed: 0,year,title,published_at,updated_at,category,nominee,artist,workers,img,winner
0,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Bad Guy,Billie Eilish,"Finneas O'Connell, producer; Rob Kinelski & Fi...",https://www.grammy.com/sites/com/files/styles/...,True
1,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,"Hey, Ma",Bon Iver,"BJ Burton, Brad Cook, Chris Messina & Justin V...",https://www.grammy.com/sites/com/files/styles/...,True
2,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,7 rings,Ariana Grande,"Charles Anderson, Tommy Brown, Michael Foster ...",https://www.grammy.com/sites/com/files/styles/...,True
3,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Hard Place,H.E.R.,"Rodney “Darkchild” Jerkins, producer; Joseph H...",https://www.grammy.com/sites/com/files/styles/...,True
4,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Talk,Khalid,"Disclosure & Denis Kosiak, producers; Ingmar C...",https://www.grammy.com/sites/com/files/styles/...,True


In [5]:
# Get all column names. Can also be accomplished with df.columns but
# format is a bit weird, so used a for-loop and print statements
print('Columns:')
for column in df.columns:
    print(column)

Columns:
year
title
published_at
updated_at
category
nominee
artist
workers
img
winner


# EDA Summary:
## `year`
* Range $[1958, 2019]$.
* More entries for more recent years.
* No `NaN` values.

## `title`
* Redundant info, should be dropped.

## `published_at`
* Redundant info, should be dropped.

## `updated_at`
* Redundant info, should be dropped.

## `category`
* $638$ unique values
* Potentially useful for analysis and data aggregation during preprocessing.

## `nominee`
* Should be kept, probably the most important column for identification.
* 6 `NaN` values, can just drop these rows

## `artist`
* $38%$ of rows are `NaN` values. Can potentially impute with `workers`?
* Should be kept, useful for analysis and data aggregation during preprocessing.

## `workers`
* **A LOT** of `NaN` values but is fine (explanation in EDA).
* Should be kept, can be used for feature engineering during preprocessing (ex: number of workers).

## `img`
* Useless, should be dropped.

## `winner`
* Useless *by itself* (entire column of true values). Needs to be combined with supplemental data sets.

## `year`

In [6]:
# check unique values
df['year'].unique()

array([2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009,
       2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998,
       1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987,
       1986, 1985, 1984, 1983, 1982, 1981, 1980, 1979, 1978, 1977, 1976,
       1975, 1974, 1973, 1972, 1971, 1970, 1969, 1968, 1967, 1966, 1965,
       1964, 1963, 1962, 1961, 1960, 1959, 1958])

In [7]:
# check each unique value's count
df['year'].value_counts().sort_values(ascending=False)

2019    433
2007    111
2008    111
2006    110
2010    109
       ... 
1968     40
1962     39
1960     39
1959     35
1958     28
Name: year, Length: 62, dtype: int64

In [8]:
# check nulls
year_nans = sum(df['year'].isnull())
print("Number of NaN values: {}".format(year_nans))

Number of NaN values: 0


Looks like the data contains information for grammys from 1958 to 2019. From sorting through the data, it appears there are more entries for more recent years, which I guess makes sense. We could consider only keeping rows where `year` $> 2000$.

In addition, `year` contains no `NaN` values.

## `title`

In [9]:
df['title'].unique()

array(['62nd Annual GRAMMY Awards  (2019)',
       '61st Annual GRAMMY Awards  (2018)',
       '60th Annual GRAMMY Awards  (2017)',
       '59th Annual GRAMMY Awards  (2016)',
       '58th Annual GRAMMY Awards  (2015)',
       '57th Annual GRAMMY Awards  (2014)',
       '56th Annual GRAMMY Awards  (2013)',
       '55th Annual GRAMMY Awards  (2012)',
       '54th Annual GRAMMY Awards  (2011)',
       '53rd Annual GRAMMY Awards  (2010)',
       '52nd Annual GRAMMY Awards  (2009)',
       '51st Annual GRAMMY Awards  (2008)',
       '50th Annual GRAMMY Awards  (2007)',
       '49th Annual GRAMMY Awards  (2006)',
       '48th Annual GRAMMY Awards  (2005)',
       '47th Annual GRAMMY Awards  (2004)',
       '46th Annual GRAMMY Awards  (2003)',
       '45th Annual GRAMMY Awards  (2002)',
       '44th Annual GRAMMY Awards  (2001)',
       '43rd Annual GRAMMY Awards  (2000)',
       '42nd Annual GRAMMY Awards  (1999)',
       '41st Annual GRAMMY Awards  (1998)',
       '40th Annual GRAMMY Award

From looking at the unique values, don't think much information can be gathered here. The `year` column already captures the same amount of information. `title` can probably just be dropped.

## `published_at` & `updated_at`

In [10]:
df["published_at"]

0       2020-05-19T05:10:28-07:00
1       2020-05-19T05:10:28-07:00
2       2020-05-19T05:10:28-07:00
3       2020-05-19T05:10:28-07:00
4       2020-05-19T05:10:28-07:00
                  ...            
4805    2017-11-28T00:03:45-08:00
4806    2017-11-28T00:03:45-08:00
4807    2017-11-28T00:03:45-08:00
4808    2017-11-28T00:03:45-08:00
4809    2017-11-28T00:03:45-08:00
Name: published_at, Length: 4810, dtype: object

In [11]:
df['updated_at']

0       2020-05-19T05:10:28-07:00
1       2020-05-19T05:10:28-07:00
2       2020-05-19T05:10:28-07:00
3       2020-05-19T05:10:28-07:00
4       2020-05-19T05:10:28-07:00
                  ...            
4805    2019-09-10T01:11:09-07:00
4806    2019-09-10T01:11:09-07:00
4807    2019-09-10T01:11:09-07:00
4808    2019-09-10T01:11:09-07:00
4809    2019-09-10T01:11:09-07:00
Name: updated_at, Length: 4810, dtype: object

Similar to the `title` column, the `year` column captures the same information in these two columns. I don't think knowing the exact time (day, month, hour, minute) is going to be useful in our analysis. Both columns can be dropped.

## `category`

In [12]:
df['category'].value_counts()

Song Of The Year                                                   70
Record Of The Year                                                 69
Album Of The Year                                                  66
Best Opera Recording                                               64
Best Album Notes                                                   63
                                                                   ..
Best Sacred Performance (Musical)                                   1
Best Jazz Performance - Small Group Or Soloist With Small Group     1
Best Jazz Performance - Large Group Or Soloist With Large Group     1
Best Contemporary Vocal Performance By A Group                      1
Best Classical Performance - Operatic Or Choral                     1
Name: category, Length: 638, dtype: int64

Contains $638$ unique values, which can be interpreted as "Grammy Award Categories" (self-explanatory column name). Definitely could be useful in analysis; being nominated for "Song Of The Year" could be used as a predictor for album sales/success.

## `nominee`

In [13]:
# check unique values and their counts
df['nominee'].value_counts()

Bridge Over Troubled Water    7
Robert Woods                  7
Berlioz: Requiem              7
Steven Epstein                7
David Frost                   6
                             ..
Push The Button               1
The Art Of Romance            1
Devils & Dust                 1
B.Y.O.B.                      1
Virtuoso                      1
Name: nominee, Length: 4131, dtype: int64

Cell above indicates there are repeat nominations. Regardless, should definitely keep this column as it contains information on *what* was nominated; song, album, performance, etc.

In [14]:
# check nulls
nominee_nans = sum(df['nominee'].isnull())
print("Number of NaN values: {}".format(nominee_nans))

Number of NaN values: 6


`nominee` contains $6$ `NaN` values! Taking a closer look at them:

In [15]:
df[df['nominee'].isnull()]

Unnamed: 0,year,title,published_at,updated_at,category,nominee,artist,workers,img,winner
2261,2000,43rd Annual GRAMMY Awards (2000),2017-11-28T00:03:45-08:00,2019-09-10T01:11:09-07:00,"Remixer of the Year, Non-Classical",,,,,True
2359,1999,42nd Annual GRAMMY Awards (1999),2017-11-28T00:03:45-08:00,2019-09-10T01:09:02-07:00,"Remixer Of The Year, Non-Classical",,,,,True
2454,1998,41st Annual GRAMMY Awards (1998),2017-11-28T00:03:45-08:00,2019-09-10T01:08:19-07:00,"Remixer Of The Year, Non-classical",,,,,True
2547,1997,40th Annual GRAMMY Awards (1997),2017-11-28T00:03:45-08:00,2019-09-10T01:07:37-07:00,"Remixer Of The Year, Non-Classical",,,,,True
4525,1965,8th Annual GRAMMY Awards (1965),2017-11-28T00:03:45-08:00,2019-09-10T01:06:59-07:00,Best New Country & Western Artist,,,,,True
4573,1964,7th Annual GRAMMY Awards (1964),2017-11-28T00:03:45-08:00,2019-09-10T01:06:11-07:00,Best New Country & Western Artist Of 1964,,,,,True


Not entirely sure what this means. Does this mean there was no winner for that specific `category`? Did that `category` even exist? Regardless, can probably just drop these rows during preprocessing.

## `artist`

In [16]:
# check unique values and their counts
df['artist'].value_counts()

(Various Artists)                                          66
U2                                                         18
Aretha Franklin                                            16
Bruce Springsteen                                          13
Ella Fitzgerald                                            13
                                                           ..
Rihanna Featuring Jay-Z                                     1
Common Featuring Kanye West                                 1
Lupe Fiasco Featuring Jill Scott                            1
Gerald Levert                                               1
David Seville And The Chipmunks (Ross Bagdasarian, Sr.)     1
Name: artist, Length: 1658, dtype: int64

Should definitely keep. Can be used for analysis, but most importantly could be useful in aggregation during preprocessing.

In [17]:
# check nulls
artist_nans = sum(df['artist'].isnull())
print("Number of NaN values: {}".format(artist_nans))

Number of NaN values: 1840


Compared to the `nominee` column, the `artist` column contains *a lot more* `NaN` values. Taking a closer look at those:

In [18]:
df[df['artist'].isnull()]

Unnamed: 0,year,title,published_at,updated_at,category,nominee,artist,workers,img,winner
16,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Song Of The Year,Bad Guy,,"Billie Eilish O'Connell & Finneas O'Connell, s...",https://www.grammy.com/sites/com/files/styles/...,True
17,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Song Of The Year,Always Remember Us This Way,,"Natalie Hemby, Lady Gaga, Hillary Lindsey & Lo...",https://www.grammy.com/sites/com/files/styles/...,True
18,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Song Of The Year,Bring My Flowers Now,,"Brandi Carlile, Phil Hanseroth, Tim Hanseroth ...",https://www.grammy.com/sites/com/files/styles/...,True
19,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Song Of The Year,Hard Place,,"Ruby Amanfu, Sam Ashworth, D. Arcelious Harris...",https://www.grammy.com/sites/com/files/styles/...,True
20,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Song Of The Year,Lover,,"Taylor Swift, songwriter (Taylor Swift)",https://www.grammy.com/sites/com/files/styles/...,True
...,...,...,...,...,...,...,...,...,...,...
4805,1958,1st Annual GRAMMY Awards (1958),2017-11-28T00:03:45-08:00,2019-09-10T01:11:09-07:00,Best Classical Performance - Instrumentalist (...,Tchaikovsky: Piano Concerto No. 1 In B Flat Mi...,,"Van Cliburn, artist (Symphony Of The Air Orche...",,True
4806,1958,1st Annual GRAMMY Awards (1958),2017-11-28T00:03:45-08:00,2019-09-10T01:11:09-07:00,Best Classical Performance - Instrumentalist (...,Segovia Golden Jubilee,,"Andres Segovia, artist",https://www.grammy.com/sites/com/files/styles/...,True
4807,1958,1st Annual GRAMMY Awards (1958),2017-11-28T00:03:45-08:00,2019-09-10T01:11:09-07:00,Best Classical Performance - Chamber Music (In...,Beethoven: Quartet 130,,"Hollywood String Quartet (Alvin Dinkin, Paul S...",,True
4808,1958,1st Annual GRAMMY Awards (1958),2017-11-28T00:03:45-08:00,2019-09-10T01:11:09-07:00,Best Classical Performance - Vocal Soloist (Wi...,Operatic Recital,,,,True


In [19]:
# Delving into one specific nominee to see why there are NaN values
df[df['nominee']=='Bad Guy']

Unnamed: 0,year,title,published_at,updated_at,category,nominee,artist,workers,img,winner
0,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Bad Guy,Billie Eilish,"Finneas O'Connell, producer; Rob Kinelski & Fi...",https://www.grammy.com/sites/com/files/styles/...,True
16,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Song Of The Year,Bad Guy,,"Billie Eilish O'Connell & Finneas O'Connell, s...",https://www.grammy.com/sites/com/files/styles/...,True
34,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Best Pop Solo Performance,Bad Guy,Billie Eilish,,https://www.grammy.com/sites/com/files/styles/...,True


So apparently in the "Song Of The Year" `category`, `artist` is NaN but for "Record Of The Year", `artist` is filled? From a [google search](https://www.grammy.com/news/whats-difference-grammy-record-year-vs-song-year), arrived at a potential explanation from [grammy.com](https://www.grammy.com/news/whats-difference-grammy-record-year-vs-song-year):

<img src="images/roty_vs_soty.png" alt="roty vs soty explanation"/>

So is it because there was no *single* songwriter for "Bad Guy", so `artist` was filled with `NaN`?

In [20]:
# Compute proportion of NaNs
prop_nans = artist_nans / len(df)
print("Proportion of NaN values: {:.4f}".format(prop_nans))

Proportion of NaN values: 0.3825


Regardless, from the cell above, the proportion of `NaN` values is quite high, so we can't drop every row where `artist` is `NaN`. Also can't drop the entire column because as mentioned previously, could contain useful information for analysis/aggregation. We could maybe impute it with values from the `worker` column?

## `workers`

In [21]:
# Using value_counts() would not be useful here since the column contains a list of names
# Just check for nulls instead
# check nulls
workers_nans = sum(df['workers'].isnull())
print("Number of NaN values: {}".format(workers_nans))

Number of NaN values: 2190


May seem to be worse than the `artist` column, but from taking a closer look:

In [22]:
df[df['workers'].isnull()]

Unnamed: 0,year,title,published_at,updated_at,category,nominee,artist,workers,img,winner
24,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Best New Artist,Billie Eilish,,,,True
25,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Best New Artist,Black Pumas,,,,True
26,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Best New Artist,Lil Nas X,,,,True
27,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Best New Artist,Lizzo,,,,True
28,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Best New Artist,Maggie Rogers,,,,True
...,...,...,...,...,...,...,...,...,...,...
4794,1958,1st Annual GRAMMY Awards (1958),2017-11-28T00:03:45-08:00,2019-09-10T01:11:09-07:00,Best Rhythm & Blues Performance,Tequila,The Champs,,https://www.grammy.com/sites/com/files/styles/...,True
4801,1958,1st Annual GRAMMY Awards (1958),2017-11-28T00:03:45-08:00,2019-09-10T01:11:09-07:00,"Best Sound Track Album, Dramatic Picture Score...",Gigi,Andre Previn,,https://www.grammy.com/sites/com/files/styles/...,True
4802,1958,1st Annual GRAMMY Awards (1958),2017-11-28T00:03:45-08:00,2019-09-10T01:11:09-07:00,"Best Performance, Documentary Or Spoken Word",The Best Of The Stan Freberg Shows,Stan Freberg,,https://www.grammy.com/sites/com/files/styles/...,True
4803,1958,1st Annual GRAMMY Awards (1958),2017-11-28T00:03:45-08:00,2019-09-10T01:11:09-07:00,Best Recording For Children,The Chipmunk Song,David Seville And The Chipmunks (Ross Bagdasar...,,,True


The plethora of `NaN`s make sense since not every single `category` contains a set of `workers`; for example the "Brand New Artist" `category` would have no `workers` since it's just a single person being nominated (which in this case is the `nominee`).

Column could still be kept. We can use regex to split and extract the *number* of workers. Could be a useful feature during analysis.

## `img`
From a quick peruse, the `img` column seems to be useless. From taking a closer look:

In [23]:
df['img']

0       https://www.grammy.com/sites/com/files/styles/...
1       https://www.grammy.com/sites/com/files/styles/...
2       https://www.grammy.com/sites/com/files/styles/...
3       https://www.grammy.com/sites/com/files/styles/...
4       https://www.grammy.com/sites/com/files/styles/...
                              ...                        
4805                                                  NaN
4806    https://www.grammy.com/sites/com/files/styles/...
4807                                                  NaN
4808                                                  NaN
4809                                                  NaN
Name: img, Length: 4810, dtype: object

All the links seems to just redirect to [grammy.com](grammy.com). Below is a screenshot of the current site:

<img src="images/img_redirect.png" alt="img link image"/>

Can probably just drop the `img` column.

## `winner`

In [24]:
df['winner'].value_counts()

True    4810
Name: winner, dtype: int64

Dataset only contains grammy winners... Completely useless column unless combined with another data set.

Supplemental data sets are going to be needed (e.g. Data sets with all nominees)

# UPDATE 11/22
New data set found including not just winners in `the_grammy_awards.csv`

In [25]:
df2 = pd.read_csv('the_grammy_awards.csv')
df2

Unnamed: 0,year,category,nominee,workers,winner
0,1959,Album of the Year,The Music from Peter Gunn.,Henry Mancini,True
1,1959,Best Album Cover,Only the Lonely,Frank Sinatra (art director),True
2,1959,Best Arrangement,The Music From Peter Gunn,Henry Mancini (artist/arranger),True
3,1959,Best Classical Performance - Chamber Music (in...,Beethoven: Quartet 130,"The Hollywood String Quartet, Paul Shure (arti...",True
4,1959,Best Classical Performance - Instrumentalist (...,Segovia Golden Jubilee,Andrés Segovia,True
...,...,...,...,...,...
6318,2019,Record of the Year,Rockstar,"Post Malone (artist), 21 Savage (artist), Loui...",False
6319,2019,Song of the Year,This is America (Childish Gambino),"Ludwig Göransson (songwriter), Young Thug (son...",True
6320,2019,Song of the Year,The Joke (Brandi Carlile),"Brandi Carlile (songwriter), Dave Cobb (songwr...",False
6321,2019,Song of the Year,Boo'd Up (Ella Mai),"Ella Mai (songwriter), Larrance Dopson (songwr...",False


In [42]:
#Import scikit-learn dataset library
from sklearn import datasets
data = df2

# Import train_test_split function
from sklearn.model_selection import train_test_split

X=data[['category', 'year', 'workers']]  # Features
y=data['winner']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

In [41]:
from sklearn.linear_model import LinearRegression

data = df2[['winner','category']]  # load data set
X = data.iloc[:, 0].values.reshape(-1, 1)  # values converts it into a numpy array
Y = data.iloc[:, 1].values.reshape(-1, 1)  # -1 means that calculate the dimension of rows, but have 1 column
linear_regressor = LinearRegression()  # create object for the class
linear_regressor.fit(X, Y)  # perform linear regression
Y_pred = linear_regressor.predict(X)  # make predictions

ValueError: could not convert string to float: 'Album of the Year'

In [36]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

print('Accuracy with descriptive labels is Accuracy: 0.001349527665317139')

NameError: name 'y_pred' is not defined

In [None]:
url = 'https://datahub.berkeley.edu/user/nishkachotai/edit/the_grammy_awards.csv'
df_cleaned = pd.read_csv(url)

In [None]:
#Import scikit-learn dataset library
from sklearn import datasets
data = df_cleaned

# Import train_test_split function
from sklearn.model_selection import train_test_split

X=data[['duration', 'languages','comments', 'month', 'num_speaker']]  # Features
y=data['views']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

In [None]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
# print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

print('Accuracy without descriptive labels is Accuracy: 0.005398110661268556')

In [None]:
# Now you can predict approximate how many views it is.
# helpful link: https://www.datacamp.com/community/tutorials/random-forests-classifier-python
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)