# Supplementary Exploration into Text Features (TrainSet)

In [1]:
# imports
import pandas as pd
import numpy as np

from nltk import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif

In [2]:
# Load the training labels and the training feature table from the
# working directory.
y_train = pd.read_csv("train_labels.csv")
train_features = pd.read_csv("train_features.csv")

# Keep an independent copy of the two text columns (positions 1 and 2,
# referred to later in this notebook as 'title' and 'tags') so that edits
# made to X_train_text never touch train_features.
text_columns = train_features.iloc[:, 1:3]
X_train_text = text_columns.copy()

In [3]:
# Add one 0/1 indicator column per genre of interest (folk, metal,
# dance and electronica). Rows whose genre is anything else get 0 in all
# three columns.
#
# The genre name is read positionally from the second column of y_train
# (assumes that column holds the genre string -- TODO confirm against
# train_labels.csv).
#
# The comparison is vectorized instead of writing y_train.loc[i, ...] inside
# an enumerate() loop: the loop used the running position as an index label,
# which is only correct for a default 0..n-1 index and would silently append
# new rows otherwise.
genre_labels = y_train.iloc[:, 1]
for genre in ('folk', 'metal', 'dance and electronica'):
    # True/False per row -> 1/0, stored as int64 like the original columns
    y_train[genre] = (genre_labels == genre).astype('int64')

In [4]:
# Porter stemmer shared by every title
stemmer = PorterStemmer()


def _stem_title(title):
    """Tokenize `title` and return its Porter-stemmed tokens joined by ", "."""
    stemmed_tokens = [stemmer.stem(token) for token in word_tokenize(title)]
    return ", ".join(stemmed_tokens)


# overwrite each title with its stemmed, comma-separated form
X_train_text['title'] = X_train_text['title'].apply(_stem_title)

# single text field per row: stemmed title first, then the tags
stemmed_titles = X_train_text['title']
tags = X_train_text['tags']
X_train_text['textual'] = stemmed_titles + ", " + tags

# only the combined 'textual' column is carried forward
X_train_text = X_train_text.drop(columns=['title', 'tags'])

In [5]:
# Fit a default TfidfVectorizer on the first column of X_train_text (the
# combined 'textual' field once the preceding cell has run) and keep the
# fitted vectorizer so its vocabulary can be looked up later.
vectorizer = TfidfVectorizer()
tfidf_sparse = vectorizer.fit_transform(X_train_text.iloc[:, 0])

# Expand the sparse tf-idf matrix into a dense DataFrame: one row per
# document, one integer-labelled column per vocabulary term.
# NOTE: this rebinds X_train_text from text to numeric tf-idf weights, so the
# cell cannot be re-run on its own -- re-run the preceding cells first.
X_train_text = pd.DataFrame(tfidf_sparse.toarray())

# Folk

In [6]:
# Rank every tf-idf column by its estimated mutual information (MI) with the
# folk indicator and report the 20 highest-scoring terms.
# random_state is fixed so the estimate is repeatable.
scores = mutual_info_classif(X_train_text, y_train['folk'], random_state=42)

# vocabulary terms ordered by their column index, so features[j] is the term
# behind tf-idf column j
features = [term for term, _ in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])]

# column indices ordered from highest to lowest MI
ascending_order = np.argsort(scores)
sorted_scores = ascending_order[::-1]

# indices and terms of the 20 best columns
top20_scores = sorted_scores[:20]
top20_features = [features[col] for col in top20_scores]

print("\nFOLK -- Feature, Mutual Information statistic:")
for rank, (col, term) in enumerate(zip(top20_scores, top20_features), start=1):
    print(f"\n\t{rank}.\t{term}, {round(scores[col],3)}")


FOLK -- Feature, Mutual Information statistic:

	1.	that, 0.019

	2.	vas, 0.017

	3.	and, 0.017

	4.	ketchup, 0.017

	5.	do, 0.017

	6.	there, 0.016

	7.	mi, 0.015

	8.	pero, 0.015

	9.	stranger, 0.014

	10.	the, 0.014

	11.	para, 0.014

	12.	would, 0.014

	13.	kettl, 0.014

	14.	cambiar, 0.014

	15.	crook, 0.014

	16.	to, 0.014

	17.	was, 0.013

	18.	cuando, 0.013

	19.	coz, 0.013

	20.	lost, 0.013


# Metal

In [7]:
# Rank every tf-idf column by its estimated mutual information (MI) with the
# metal indicator and report the 20 highest-scoring terms.
# random_state is fixed so the estimate is repeatable.
scores = mutual_info_classif(X_train_text, y_train['metal'], random_state=42)

# vocabulary terms ordered by their column index, so features[j] is the term
# behind tf-idf column j
features = [term for term, _ in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])]

# column indices ordered from highest to lowest MI
ascending_order = np.argsort(scores)
sorted_scores = ascending_order[::-1]

# indices and terms of the 20 best columns
top20_scores = sorted_scores[:20]
top20_features = [features[col] for col in top20_scores]

print("\nMETAL -- Feature, Mutual Information statistic:")
for rank, (col, term) in enumerate(zip(top20_scores, top20_features), start=1):
    print(f"\n\t{rank}.\t{term}, {round(scores[col],3)}")


METAL -- Feature, Mutual Information statistic:

	1.	know, 0.023

	2.	oh, 0.023

	3.	of, 0.023

	4.	blood, 0.023

	5.	do, 0.021

	6.	love, 0.021

	7.	say, 0.019

	8.	etern, 0.018

	9.	en, 0.017

	10.	fuck, 0.017

	11.	life, 0.017

	12.	got, 0.017

	13.	dead, 0.017

	14.	hate, 0.017

	15.	that, 0.016

	16.	death, 0.016

	17.	if, 0.016

	18.	kill, 0.015

	19.	the, 0.015

	20.	is, 0.015


# Dance and Electronica

In [8]:
# Rank every tf-idf column by its estimated mutual information (MI) with the
# dance and electronica indicator and report the 20 highest-scoring terms.
# random_state is fixed so the estimate is repeatable.
scores = mutual_info_classif(X_train_text, y_train['dance and electronica'], random_state=42)

# vocabulary terms ordered by their column index, so features[j] is the term
# behind tf-idf column j
features = [term for term, _ in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])]

# column indices ordered from highest to lowest MI
ascending_order = np.argsort(scores)
sorted_scores = ascending_order[::-1]

# indices and terms of the 20 best columns
top20_scores = sorted_scores[:20]
top20_features = [features[col] for col in top20_scores]

print("\nDANCE AND ELECTRONICA -- Feature, Mutual Information statistic:")
for rank, (col, term) in enumerate(zip(top20_scores, top20_features), start=1):
    print(f"\n\t{rank}.\t{term}, {round(scores[col],3)}")


DANCE AND ELECTRONICA -- Feature, Mutual Information statistic:

	1.	viel, 0.011

	2.	blur, 0.01

	3.	to, 0.009

	4.	neue, 0.009

	5.	tun, 0.009

	6.	rihanna, 0.008

	7.	extraño, 0.008

	8.	weed, 0.008

	9.	zimbabw, 0.008

	10.	wine, 0.008

	11.	you, 0.008

	12.	pharoah, 0.008

	13.	fill, 0.008

	14.	duppi, 0.008

	15.	id, 0.008

	16.	ere, 0.008

	17.	their, 0.008

	18.	spinach, 0.008

	19.	wit, 0.008

	20.	ventana, 0.007
