In [1]:
import pandas as pd
import numpy as np
hate_spotify_dataset = pd.read_csv("data/labeled_dataset.csv")

In [2]:
lyrics = hate_spotify_dataset.lyrics

## Feature generation nach Davidson

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re

stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = []
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) lots of whitespace with one instance

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    #parsed_text = parsed_text.code("utf-8", errors='ignore')
    return parsed_text

def tokenize(lyrics):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]*", lyrics.lower())).strip()
    #tokens = re.split("[^a-zA-Z]*", lyrics.lower())
    tokens = [stemmer.stem(t) for t in lyrics.split()]
    return tokens

def basic_tokenize(lyrics):
    """Same as tokenize but without the stemming"""
    lyrics = " ".join(re.split("[^a-zA-Z.,!?]*", lyrics.lower())).strip()
    return lyrics.split()

vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords, #We do better when we keep stopwords
    use_idf=True,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.501
    )

In [4]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(lyrics).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores



In [5]:
#vocab

In [6]:
#Get POS tags for lyrics and save as a string
lyrics_tags = []
for l in lyrics:
    tokens = basic_tokenize(preprocess(l))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    #for i in range(0, len(tokens)):
    tag_str = " ".join(tag_list)
    lyrics_tags.append(tag_str)
        #print(tokens[i],tag_list[i])

In [7]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.501,
    )

In [8]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(lyrics_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

In [9]:
#!pip install vaderSentiment
#!pip install textstat

In [10]:
#Now get other features
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *

sentiment_analyzer = VS()

def other_features(lyrics):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(lyrics)
    
    words = preprocess(lyrics) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(lyrics)
    num_terms = len(lyrics.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    
    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound']]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(lyrics):
    feats=[]
    for l in lyrics:
        feats.append(other_features(l))
    return np.array(feats)

In [11]:
other_features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos","vader neu", "vader compound"]

In [12]:
feats = get_feature_array(lyrics)

In [13]:
#Now join them all up
M = np.concatenate([tfidf,pos,feats],axis=1)

In [14]:
M.shape

(1766, 10543)

In [15]:
#Anmerkung: wir mussten items() statt iteritems() schreiben, weil es das nicht mehr in Python 3 gibt
#Finally get a list of variable names
variables = ['']*len(vocab)
for k,v in vocab.items():
    variables[v] = k

pos_variables = ['']*len(pos_vocab)
for k,v in pos_vocab.items():
    pos_variables[v] = k

feature_names = variables+pos_variables+other_features_names

## Running the model nach Davidson 

In [16]:

X = pd.DataFrame(M)
y = hate_spotify_dataset['label'].astype(int)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [18]:
#Anmerkung: wir haben zusätzlich solver="liblinear" eingefügt, weil sonst L1 nicht unterstützt wird
select = SelectFromModel(LogisticRegression(class_weight='balanced',penalty="l1",C=0.01, solver='liblinear'))
X_ = select.fit_transform(X,y)

In [19]:
model = LinearSVC(class_weight='balanced',C=0.01, penalty='l2', loss='squared_hinge',multi_class='ovr').fit(X_, y)



In [20]:
y_preds = model.predict(X_)

In [21]:
report = classification_report( y, y_preds )

In [22]:
print(report)

              precision    recall  f1-score   support

           0       0.46      0.97      0.62       331
           1       0.95      0.11      0.20       346
           2       0.92      0.87      0.89      1089

    accuracy                           0.74      1766
   macro avg       0.78      0.65      0.57      1766
weighted avg       0.84      0.74      0.71      1766



# Using information from the model to obtain the matrix X_ generically

## Obtaining information about the model nach Davidson

In [23]:
import sys
if sys.version_info[0] >= 3:
    unicode = str

In [24]:
final_features = select.get_support(indices=True) #get indices of features
final_feature_list = [unicode(feature_names[i]) for i in final_features] #Get list of names corresponding to indices

In [25]:
print (final_feature_list)

["'em", "'fore", '(a', '(hell', '(hey)', '(i', '(woah)', '(yeah!)', '(yeah)', '(you', '..', 'afraid', "ain't", 'ali', 'anoth one', 'are,', 'arm', 'around know', 'asham', 'ass', 'at,', 'at?', 'away,', 'ayi', 'ayy,', 'baby,', "baby, i'm", 'back', 'back ass', 'back never', 'bad,', 'bad, bad,', 'ball', 'baller', 'bands,', 'better', 'bitch', 'bitch better', 'bitch,', 'blood,', 'blue', 'boom', 'boom,', 'bop', 'bounc back', 'boy', 'brain', 'break', 'bring', 'buck,', 'bullet', 'bump', 'buy', 'california', 'camera', "can't", "can't keep", 'caus', 'chao', 'chick', 'chill', 'cocaine,', 'come aliv', 'cuz', 'da', "dancin' like", 'dawg', 'day,', 'dear', 'die', 'dig', 'dirti', 'do, do,', 'dolla', 'done', 'doo', 'dope', 'dream', 'drive crazi', 'drop', 'dumb', 'ear', 'east', 'eat', 'em', 'empir', 'ever', 'ever make', 'everi day', "everi day i'm", 'everi littl', 'everybodi', 'ex,', 'face', 'famili', 'feel', 'feel better', 'free', 'fuck', 'fuck)', "fuckin'", 'full', 'funki', 'gangsta', 'gat', 'get floor'

In [26]:
print(len(final_feature_list))

316


### Anmerkung: Wir müssen die jeweils letzten zugehörigen Elemente einer Feature-Gruppe ersetzen 

In [28]:
#Getting names for each class of features
ngram_features = final_feature_list[:final_feature_list.index("you, that'")+1]
pos_features = final_feature_list[final_feature_list.index("you, that'")+1:final_feature_list.index('NN WP')+1]
oth_features = final_feature_list[final_feature_list.index('NN WP')+1:]

## Generating ngram features nach Davidson

In [29]:
new_vocab = {v:i for i, v in enumerate(ngram_features)}
new_vocab_to_index = {}
for k in ngram_features:
    new_vocab_to_index[k] = vocab[k]

In [30]:
#Get indices of text features
ngram_indices = final_features[:len(ngram_features)]

In [31]:
#TODO: Pickle new vectorizer

In [32]:
new_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    min_df=1,
    max_df=1.0,
    vocabulary=new_vocab
    )

In [33]:
import joblib
joblib.dump(new_vectorizer, 'final_tfidf.pkl') 

['final_tfidf.pkl']

In [34]:
tfidf_ = new_vectorizer.fit_transform(lyrics).toarray()



In [36]:
#Verifying that results are the same

In [37]:
tfidf_[1,:]

array([ 5.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  5.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  7.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  7.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  2.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        2.,  0.,  0.,  0.,  1.,  2.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0

In [38]:
tfidf_[1,:].sum()

119.0

In [39]:
tfidf_.shape

(1766, 306)

In [40]:
X_[1,:tfidf_.shape[1]]

array([12.66836503,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  8.9969446 ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        , 13.51785905,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        2.60717547,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        3.65238667,  0.        ,  0.        ,  0.        ,  0.        ,
        4.41602937,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [41]:
X_[1,:tfidf_.shape[1]].sum()

293.2405891929479

In [42]:
X_.shape

(1766, 316)

In [43]:
X_[1,:tfidf_.shape[1]].shape

(306,)

Results are the same if use IDF but the problem is that IDF will be different if we use different data. Instead we have to use the original IDF scores and multiply them by the new matrix.

In [44]:
idf_vals_ = idf_vals[ngram_indices]

In [45]:
idf_vals_.shape

(306,)

In [46]:
#TODO: Pickle idf_vals

joblib.dump(idf_vals_, 'final_idf.pkl')

['final_idf.pkl']

### Zahl ändern! 

In [79]:
(tfidf_[1,:]*idf_vals_) == X_[1,:306] #Got same value as final process array!

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [80]:
tfidf_*idf_vals_ == X_[:,:306]

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [81]:
tfidffinal = tfidf_*idf_vals_

In [82]:
tfidffinal.shape

(1766, 306)

## Generating POS features nach Davidson
This is simpler as we do not need to worry about IDF but it will be slower as we have to compute the POS tags for the new data. Here we can simply use the old POS tags.

In [83]:
new_pos = {v:i for i, v in enumerate(pos_features)}

In [84]:
#TODO: Pickle pos vectorizer
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
new_pos_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    min_df=1,
    max_df=1.0,
    vocabulary=new_pos
    )

In [85]:
joblib.dump(new_pos_vectorizer, 'final_pos.pkl') 

['final_pos.pkl']

In [86]:
pos_ = new_pos_vectorizer.fit_transform(lyrics_tags).toarray()

In [87]:
pos_.shape

(1766, 3)

In [88]:
pos_[1,:]

array([0., 0., 1.])

### Anmerkung: Haben wir hier die richtigen Zahlen eingetragen? 

In [89]:
X_[1,306:309]

array([0.        , 5.76842218, 1.        ])

In [90]:
pos_[:,:] == X_[:,306:309]

array([[ True,  True,  True],
       [ True, False,  True],
       [ True,  True,  True],
       ...,
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [91]:
pos_[:,:].sum()

3152.0

In [92]:
pos_.shape

(1766, 3)

In [93]:
X_[:,306:309].sum()

3925.365896765484

In [94]:
X_.shape

(1766, 316)

## Finally, we can look at the other features

In [95]:
print (other_features_names)

['FKRA', 'FRE', 'num_syllables', 'avg_syl_per_word', 'num_chars', 'num_chars_total', 'num_terms', 'num_words', 'num_unique_words', 'vader neg', 'vader pos', 'vader neu', 'vader compound']


In [96]:
print (oth_features)

['FRE', 'num_syllables', 'num_chars', 'num_chars_total', 'num_terms', 'num_words', 'num_unique_words']


The functions can be modified to only calculate and return necessary fields.

In [97]:
def other_features_(lyrics):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(lyrics)
    
    words = preprocess(lyrics) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(lyrics)
    num_terms = len(lyrics.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    features = [FRE, syllables, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms]#, sentiment['compound']], FKRA is also removed, check the first feature list, you are not using it!
    #features = pandas.DataFrame(features)
    return features

def get_feature_array_(lyrics):
    feats=[]
    for l in lyrics:
        feats.append(other_features_(l))
    return np.array(feats)

In [98]:
feats_ = get_feature_array_(lyrics)

In [99]:
feats_[0,:]

array([-241.25,  411.  , 1731.  , 1731.  ,  341.  ,  341.  ,  199.  ])

In [100]:
feats_

array([[-241.25,  411.  , 1731.  , ...,  341.  ,  341.  ,  199.  ],
       [-621.85,  818.  , 3556.  , ...,  722.  ,  722.  ,  308.  ],
       [-426.5 ,  608.  , 2592.  , ...,  528.  ,  528.  ,  284.  ],
       ...,
       [-435.7 ,  592.  , 2603.  , ...,  542.  ,  542.  ,  291.  ],
       [-679.14,  902.  , 3949.  , ...,  776.  ,  776.  ,  415.  ],
       [-521.59,  682.  , 2810.  , ...,  627.  ,  627.  ,  227.  ]])

In [101]:
feats_.shape

(1766, 7)

In [102]:
X_[0,309:]

array([-241.25,  411.  , 1731.  , 1731.  ,  341.  ,  341.  ,  199.  ])

In [103]:
X_.shape

(1766, 316)

In [104]:
feats_[:,:] == X_[:,309:]

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

## Now that we have put it all together using a simplified process we can assess if these new data return the same answers.

In [105]:
M_ = np.concatenate([tfidffinal, pos_, feats_],axis=1)

In [106]:
M_.shape

(1766, 316)

In [107]:
X__ = pd.DataFrame(M_)

In [108]:
y_preds_ = model.predict(X__)

In [109]:
report = classification_report( y, y_preds_ )

In [110]:
print(report)

              precision    recall  f1-score   support

           0       0.46      0.97      0.62       331
           1       0.93      0.11      0.20       346
           2       0.92      0.87      0.89      1089

    accuracy                           0.74      1766
   macro avg       0.77      0.65      0.57      1766
weighted avg       0.84      0.74      0.71      1766

