In [1]:
import numpy as np 
import pandas as pd # ( pd.read_csv)
from collections import defaultdict
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report,cohen_kappa_score
import os, gensim, copy, pickle, warnings
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
import numpy as np
from gensim import corpora, models
import seaborn as sns
np.random.seed(400)
from imblearn.over_sampling import SMOTE
import pyLDAvis
import pyLDAvis.gensim_models
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/daisy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/daisy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
features = pd.read_csv('data/features.csv')
labels = pd.read_csv('data/labels.csv')
# test = pd.read_csv('data/test.csv')

### Check basic information

### Need to match the trackID to get the label for features, assuming “classic pop and rock” is different from pop

In [3]:
all_data = features.merge(labels, on=['trackID'], how='inner')
print(len(all_data),'combined data length \n', 
      len(features), 'feature length \n', 
      len(labels), 'label length')

8128 combined data length 
 8128 feature length 
 8128 label length


# Step 1: Exploratory Data Analyse and preprocessing

### The tag and title are all strings, need to convert to list and dict

In [4]:
## pre-processing by split the tag and title to list
def combine_title_and_tags(df):
    combine_title_tag_list = []
    for i in range(len(df)):
        df['title'] = df['title'].fillna('')
        df['tags'] = df['tags'].fillna('')
        combine_title_tag_list.append(" ".join([df['title'][i],df['tags'][i]]))   
    df = df.assign(text = pd.Series(combine_title_tag_list))
    df = df.drop(columns = ['tags','title'])
    return df
all_data = combine_title_and_tags(all_data)

### Imbalanced dataset, but not extremely imbalanced

# Step2: Preprocessing and prepare for modelling

In [5]:
## Split to train and test
y = all_data.pop('genre')
X_train, X_valid, y_train, y_valid = train_test_split( all_data, y, test_size=0.4, random_state=15, stratify=y)

In [6]:
print(np.shape(X_train))
X_valid= X_valid.reset_index(drop=True)
y_valid= y_valid.reset_index(drop=True)
X_valid.to_csv('test/X_valid.csv')
y_valid.to_csv('test/y_valid.csv')
X_train.to_csv('test/X_train.csv')
y_train.to_csv('test/y_train.csv')

(4876, 156)


## Check text data

In [7]:
X_train_text2 = pd.DataFrame(X_train.reset_index(drop=True))
y_train_text2 = y_train.reset_index(drop=True)

In [8]:
stop_words = stopwords.words('english')
stop_words.extend(['let','oh','hey', 'hay', 'ya', 'ooh',  'go', 'ai', 'tell', 'still','ah'
                   'ca', 'na', 'say', 'sure', 'yeah', 'tu', 'els', 'might', 'done']) 

def strip_newline(series):
    return [review.replace('\n','') for review in series]

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


def lemmatize_stemming_2(texts):
    all_output =[]
    for j in texts:
         all_output.append([WordNetLemmatizer().lemmatize(i, pos = 'v') for i in j])
    return all_output


def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def get_corpus(df):
    df['text'] = strip_newline(df.text)
    words = list(sent_to_words(df.text))
    words = lemmatize_stemming_2(words) ##new
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=12, no_above=0.5)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

## create vectors
def get_text_vector(df, corpuss, lda_model):
    vecs = []
    for i in range(len(df)):
        top_topics = lda_model.get_document_topics(corpuss[i], minimum_probability=0.0)
        topic_vec = [top_topics[i][1] for i in range(8)]
        topic_vec.extend([len(df.iloc[i].text)]) # length review
        vecs.append(topic_vec)
    return vecs



In [9]:
X_train_text2.columns

Index(['trackID', 'loudness', 'tempo', 'time_signature', 'key', 'mode',
       'duration', 'vect_1', 'vect_2', 'vect_3',
       ...
       'vect_140', 'vect_141', 'vect_142', 'vect_143', 'vect_144', 'vect_145',
       'vect_146', 'vect_147', 'vect_148', 'text'],
      dtype='object', length=156)

In [10]:
train_corpus, train_id2word, bigram_train = get_corpus(X_train_text2)

with open('model/train_corpus.pkl', 'wb') as f:
    pickle.dump(train_corpus, f)
with open('model/train_id2word.pkl', 'wb') as f:
    pickle.dump(train_id2word, f)
with open('model/bigram_train.pkl', 'wb') as f:
    pickle.dump(bigram_train, f)

In [11]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda_model = gensim.models.ldamulticore.LdaMulticore(
                           corpus=train_corpus,
                           num_topics=8,
                           id2word=train_id2word,
                           chunksize=20,
                           workers=7, 
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
    lda_model.save('model/lda_model.model')

In [12]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, train_corpus, train_id2word , mds="mmds", R=30)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


It seems there are some overlaps between topic 1 and 2

There are a few words seems to be stop words but not in the stop words list, added the extra stop words like ['hey', 'hay', 'ya','ooh'] to the existing stop words dictionary

In [13]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda_model2 = gensim.models.ldamulticore.LdaMulticore(
                           corpus=train_corpus,
                           num_topics=16,
                           id2word=train_id2word,
                           chunksize=50,
                           workers=7, 
                           passes=100,
                           eval_every = 1,
                           per_word_topics=False)
    lda_model2.save('model/lda_model2.model')


In [14]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model2, train_corpus, train_id2word , mds="mmds", R=30)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)


### Check topic and key words example

In [15]:
lda_model.print_topics(8,num_words=15)[:12]

[(0,
  '0.025*"que_de" + 0.016*"la_en" + 0.012*"un_el" + 0.012*"que" + 0.009*"corazon" + 0.009*"mi" + 0.008*"cuando" + 0.008*"te_se" + 0.008*"se" + 0.008*"esta" + 0.007*"solo" + 0.007*"el_te" + 0.007*"tus" + 0.006*"mas" + 0.006*"se_mi"'),
 (1,
  '0.014*"get" + 0.006*"know_like" + 0.006*"money" + 0.006*"drink" + 0.005*"street" + 0.005*"buy" + 0.005*"one_get" + 0.005*"like" + 0.005*"come" + 0.005*"man" + 0.005*"stick" + 0.004*"back" + 0.004*"pay" + 0.004*"town" + 0.004*"big"'),
 (2,
  '0.016*"da" + 0.014*"die" + 0.011*"noch" + 0.011*"tri_whi" + 0.011*"style" + 0.010*"art" + 0.010*"hat" + 0.009*"mind_girl" + 0.009*"fun" + 0.008*"vers" + 0.008*"yes_smile" + 0.008*"pair" + 0.007*"talkin" + 0.007*"bust" + 0.007*"explain"'),
 (3,
  '0.012*"blood" + 0.008*"god" + 0.007*"death" + 0.006*"land" + 0.006*"fire" + 0.006*"children" + 0.005*"human" + 0.005*"war" + 0.005*"burn" + 0.005*"peac" + 0.005*"power" + 0.005*"bleed" + 0.005*"forc" + 0.005*"shall" + 0.005*"us"'),
 (4,
  '0.016*"love_know" + 0.01

In [16]:
train_vecs1 = get_text_vector(X_train_text2, train_corpus, lda_model)
train_vecs2 = get_text_vector(X_train_text2, train_corpus, lda_model2)



In [17]:
print(np.shape(train_vecs2 ))
train_vecs = np.concatenate([train_vecs1, train_vecs2], axis=1)
np.shape(train_vecs)

(4876, 9)


(4876, 18)

In [18]:
### to continue
X = np.array(train_vecs)
y = np.array(y_train_text2)

In [19]:
## simulate minority groups
def smote2(X, y):
    X1, y1 = copy.deepcopy(X), copy.deepcopy(y)  # init
    sm = SMOTE(random_state=2)
    X1, y1 = sm.fit_resample(X, y)
    return X1, y1

X2,y2 = smote2(X,y)

### prepare for test text vec

In [20]:
def get_bigram(df):
    df['text'] = strip_newline(df.text)
    words = list(sent_to_words(df.text))
    words = remove_stopwords(words)
    bigram = bigrams(words)
    bigram = [bigram[i] for i in words]
    return bigram

def prepare_text_for_prediction(X_text_df, y_text_df, id2word_dict,lda_model):
    bigram_df = get_bigram(X_text_df)
    df_corpus = [id2word_dict.doc2bow(text) for text in bigram_df]
    df_vecs = get_text_vector(X_text_df, df_corpus, lda_model)
    X_text_array = np.array(df_vecs)
    y_text_array = np.array(y_text_df)
    return X_text_array, y_text_array



In [21]:
del lda_model, train_id2word, train_corpus, bigram_train

lda_model = gensim.models.ldamulticore.LdaMulticore.load('model/lda_model.model')
train_id2word =  pickle.load(open("model/train_id2word.pkl", 'rb'))
lda_model2 = gensim.models.ldamulticore.LdaMulticore.load('model/lda_model2.model')

In [22]:
X_text_array1, y_text_array1 = prepare_text_for_prediction(X_valid, y_valid, train_id2word,lda_model)
X_text_array2, y_text_array2 = prepare_text_for_prediction(X_valid, y_valid, train_id2word,lda_model2)

In [23]:
valid_vecs = np.concatenate([X_text_array1, X_text_array2], axis=1)
X_valid_vecs = np.array(valid_vecs)
valid_y= np.array(y_valid)

In [24]:
np.shape(X_valid_vecs) , np.shape(y_valid)

((3252, 18), (3252,))

In [25]:
X_valid_text_vec2, X_test_text_vec2, y_valid2, y_test2 = train_test_split(X_valid_vecs , y_valid, test_size=0.5, random_state=123)


# Step 3: modelling 

In [26]:
def grid_search_CV(grid_para, X, y, test_X, test_y):

    model_1 = GridSearchCV(XGBClassifier(eval_metric='mlogloss'),
                               grid_para, cv=3)
    model_1.fit(X, y, eval_set = [(test_X, test_y)],
                early_stopping_rounds = 10)
    print(model_1.best_score_)
    print(model_1.best_params_)
    
    {'colsample_bytree': 0.5, 'eta': 0.15, 'gamma': 0.15, 'max_depth': 7, 'num_parallel_tree': 5, 'reg_alpha': 0.5, 'reg_lambda': 0.1, 'sampling_method': 'gradient_based'}
    {'colsample_bytree': 0.5, 'eta': 0.1, 'gamma': 0.15, 'max_depth': 7, 'num_parallel_tree': 5, 'reg_alpha': 0.6, 'reg_lambda': 0.1, 'sampling_method': 'gradient_based'}
    {'colsample_bytree': 0.5, 'eta': 0.1, 'gamma': 0.1, 'max_depth': 7, 'num_parallel_tree': 5, 'reg_alpha': 0.5, 'reg_lambda': 0.1, 'sampling_method': 'gradient_based'}
    {'colsample_bytree': 0.45, 'eta': 0.1, 'gamma': 0.1, 'max_depth': 7, 'num_parallel_tree': 5, 'reg_alpha': 0.45, 'reg_lambda': 0.15, 'sampling_method': 'gradient_based'}

    xgb_m = XGBClassifier(eval_metric = ['mlogloss'],
                          objective='multi:softprob',
                          booster = 'gbtree',
                          grow_policy='lossguide',
                          max_depth=model_1.best_params_['max_depth'],
                          reg_alpha=model_1.best_params_['reg_alpha'],
                          reg_lambda=model_1.best_params_['reg_lambda'],
                          eta=model_1.best_params_['eta'],
                          num_parallel_tree = model_1.best_params_['num_parallel_tree'],
                          gamma=model_1.best_params_['gamma'],
                          sampling_method=model_1.best_params_['sampling_method'],
                          colsample_bytree=model_1.best_params_['colsample_bytree'],
                          seed=123,
                          n_jobs = -1,
                            )
    xgb_m.fit(X, y)
    xgb_m.save_model('model/xgb_model_text.model')
    return  xgb_m

In [None]:

max_depth = [7, 5]  #5, 
eta = [.1, .15]  
gamma = [.15, .1] 
sampling_method = ['gradient_based']  
reg_lambda = [.1, .15]
colsample_bytree = [.45, .5] 
alphax = [.55, .5, .4] 
num_parallel_tree = [5] 
grid_para_2 = dict(max_depth=max_depth, reg_lambda=reg_lambda, reg_alpha=alphax,
                 colsample_bytree=colsample_bytree, eta=eta, gamma=gamma,
                 sampling_method=sampling_method, 
                 num_parallel_tree=num_parallel_tree)
xgb_model = grid_search_CV(grid_para_2, X2, y2, X_valid_text_vec2, y_valid2) #

[0]	validation_0-mlogloss:2.02337




[1]	validation_0-mlogloss:1.97586
[2]	validation_0-mlogloss:1.93720
[3]	validation_0-mlogloss:1.90324
[4]	validation_0-mlogloss:1.87467
[5]	validation_0-mlogloss:1.85030
[6]	validation_0-mlogloss:1.82599
[7]	validation_0-mlogloss:1.80635
[8]	validation_0-mlogloss:1.78778
[9]	validation_0-mlogloss:1.77035
[10]	validation_0-mlogloss:1.75442
[11]	validation_0-mlogloss:1.74072
[12]	validation_0-mlogloss:1.72915
[13]	validation_0-mlogloss:1.71861
[14]	validation_0-mlogloss:1.70796
[15]	validation_0-mlogloss:1.69752
[16]	validation_0-mlogloss:1.68887
[17]	validation_0-mlogloss:1.68152
[18]	validation_0-mlogloss:1.67444
[19]	validation_0-mlogloss:1.66794
[20]	validation_0-mlogloss:1.66253
[21]	validation_0-mlogloss:1.65703
[22]	validation_0-mlogloss:1.65215
[23]	validation_0-mlogloss:1.64743
[24]	validation_0-mlogloss:1.64380
[25]	validation_0-mlogloss:1.64012
[26]	validation_0-mlogloss:1.63655
[27]	validation_0-mlogloss:1.63302
[28]	validation_0-mlogloss:1.63029
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98257
[2]	validation_0-mlogloss:1.94596
[3]	validation_0-mlogloss:1.91433
[4]	validation_0-mlogloss:1.88630
[5]	validation_0-mlogloss:1.86265
[6]	validation_0-mlogloss:1.84017
[7]	validation_0-mlogloss:1.82136
[8]	validation_0-mlogloss:1.80366
[9]	validation_0-mlogloss:1.78741
[10]	validation_0-mlogloss:1.77227
[11]	validation_0-mlogloss:1.75910
[12]	validation_0-mlogloss:1.74677
[13]	validation_0-mlogloss:1.73541
[14]	validation_0-mlogloss:1.72555
[15]	validation_0-mlogloss:1.71593
[16]	validation_0-mlogloss:1.70835
[17]	validation_0-mlogloss:1.70101
[18]	validation_0-mlogloss:1.69437
[19]	validation_0-mlogloss:1.68768
[20]	validation_0-mlogloss:1.68162
[21]	validation_0-mlogloss:1.67624
[22]	validation_0-mlogloss:1.67140
[23]	validation_0-mlogloss:1.66733
[24]	validation_0-mlogloss:1.66375
[25]	validation_0-mlogloss:1.66028
[26]	validation_0-mlogloss:1.65681
[27]	validation_0-mlogloss:1.65361
[28]	validation_0-mlogloss:1.65038
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98805
[2]	validation_0-mlogloss:1.95339
[3]	validation_0-mlogloss:1.92267
[4]	validation_0-mlogloss:1.89679
[5]	validation_0-mlogloss:1.87525
[6]	validation_0-mlogloss:1.85289
[7]	validation_0-mlogloss:1.83526
[8]	validation_0-mlogloss:1.81801
[9]	validation_0-mlogloss:1.80286
[10]	validation_0-mlogloss:1.78827
[11]	validation_0-mlogloss:1.77585
[12]	validation_0-mlogloss:1.76402
[13]	validation_0-mlogloss:1.75443
[14]	validation_0-mlogloss:1.74465
[15]	validation_0-mlogloss:1.73493
[16]	validation_0-mlogloss:1.72742
[17]	validation_0-mlogloss:1.72032
[18]	validation_0-mlogloss:1.71399
[19]	validation_0-mlogloss:1.70777
[20]	validation_0-mlogloss:1.70312
[21]	validation_0-mlogloss:1.69838
[22]	validation_0-mlogloss:1.69354
[23]	validation_0-mlogloss:1.68909
[24]	validation_0-mlogloss:1.68556
[25]	validation_0-mlogloss:1.68237
[26]	validation_0-mlogloss:1.67874
[27]	validation_0-mlogloss:1.67574
[28]	validation_0-mlogloss:1.67311
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.97593
[2]	validation_0-mlogloss:1.93726
[3]	validation_0-mlogloss:1.90344
[4]	validation_0-mlogloss:1.87498
[5]	validation_0-mlogloss:1.85041
[6]	validation_0-mlogloss:1.82600
[7]	validation_0-mlogloss:1.80633
[8]	validation_0-mlogloss:1.78764
[9]	validation_0-mlogloss:1.77019
[10]	validation_0-mlogloss:1.75468
[11]	validation_0-mlogloss:1.74084
[12]	validation_0-mlogloss:1.72911
[13]	validation_0-mlogloss:1.71865
[14]	validation_0-mlogloss:1.70813
[15]	validation_0-mlogloss:1.69783
[16]	validation_0-mlogloss:1.68939
[17]	validation_0-mlogloss:1.68213
[18]	validation_0-mlogloss:1.67527
[19]	validation_0-mlogloss:1.66859
[20]	validation_0-mlogloss:1.66308
[21]	validation_0-mlogloss:1.65775
[22]	validation_0-mlogloss:1.65296
[23]	validation_0-mlogloss:1.64835
[24]	validation_0-mlogloss:1.64459
[25]	validation_0-mlogloss:1.64084
[26]	validation_0-mlogloss:1.63702
[27]	validation_0-mlogloss:1.63375
[28]	validation_0-mlogloss:1.63089
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98239
[2]	validation_0-mlogloss:1.94593
[3]	validation_0-mlogloss:1.91416
[4]	validation_0-mlogloss:1.88613
[5]	validation_0-mlogloss:1.86230
[6]	validation_0-mlogloss:1.83983
[7]	validation_0-mlogloss:1.82091
[8]	validation_0-mlogloss:1.80312
[9]	validation_0-mlogloss:1.78669
[10]	validation_0-mlogloss:1.77162
[11]	validation_0-mlogloss:1.75837
[12]	validation_0-mlogloss:1.74618
[13]	validation_0-mlogloss:1.73498
[14]	validation_0-mlogloss:1.72497
[15]	validation_0-mlogloss:1.71559
[16]	validation_0-mlogloss:1.70804
[17]	validation_0-mlogloss:1.70055
[18]	validation_0-mlogloss:1.69383
[19]	validation_0-mlogloss:1.68726
[20]	validation_0-mlogloss:1.68149
[21]	validation_0-mlogloss:1.67631
[22]	validation_0-mlogloss:1.67143
[23]	validation_0-mlogloss:1.66745
[24]	validation_0-mlogloss:1.66379
[25]	validation_0-mlogloss:1.66017
[26]	validation_0-mlogloss:1.65696
[27]	validation_0-mlogloss:1.65393
[28]	validation_0-mlogloss:1.65075
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98801
[2]	validation_0-mlogloss:1.95318
[3]	validation_0-mlogloss:1.92248
[4]	validation_0-mlogloss:1.89669
[5]	validation_0-mlogloss:1.87509
[6]	validation_0-mlogloss:1.85265
[7]	validation_0-mlogloss:1.83513
[8]	validation_0-mlogloss:1.81774
[9]	validation_0-mlogloss:1.80251
[10]	validation_0-mlogloss:1.78803
[11]	validation_0-mlogloss:1.77565
[12]	validation_0-mlogloss:1.76377
[13]	validation_0-mlogloss:1.75407
[14]	validation_0-mlogloss:1.74421
[15]	validation_0-mlogloss:1.73465
[16]	validation_0-mlogloss:1.72740
[17]	validation_0-mlogloss:1.72023
[18]	validation_0-mlogloss:1.71387
[19]	validation_0-mlogloss:1.70789
[20]	validation_0-mlogloss:1.70342
[21]	validation_0-mlogloss:1.69867
[22]	validation_0-mlogloss:1.69390
[23]	validation_0-mlogloss:1.68936
[24]	validation_0-mlogloss:1.68550
[25]	validation_0-mlogloss:1.68209
[26]	validation_0-mlogloss:1.67864
[27]	validation_0-mlogloss:1.67572
[28]	validation_0-mlogloss:1.67290
[29]	validation_0-mlogloss:1.



[0]	validation_0-mlogloss:2.02336
[1]	validation_0-mlogloss:1.97595
[2]	validation_0-mlogloss:1.93687
[3]	validation_0-mlogloss:1.90258
[4]	validation_0-mlogloss:1.87383
[5]	validation_0-mlogloss:1.84926
[6]	validation_0-mlogloss:1.82540
[7]	validation_0-mlogloss:1.80607
[8]	validation_0-mlogloss:1.78747
[9]	validation_0-mlogloss:1.77003
[10]	validation_0-mlogloss:1.75430
[11]	validation_0-mlogloss:1.74058
[12]	validation_0-mlogloss:1.72914
[13]	validation_0-mlogloss:1.71829
[14]	validation_0-mlogloss:1.70781
[15]	validation_0-mlogloss:1.69744
[16]	validation_0-mlogloss:1.68876
[17]	validation_0-mlogloss:1.68154
[18]	validation_0-mlogloss:1.67454
[19]	validation_0-mlogloss:1.66781
[20]	validation_0-mlogloss:1.66257
[21]	validation_0-mlogloss:1.65725
[22]	validation_0-mlogloss:1.65235
[23]	validation_0-mlogloss:1.64767
[24]	validation_0-mlogloss:1.64412
[25]	validation_0-mlogloss:1.64017
[26]	validation_0-mlogloss:1.63646
[27]	validation_0-mlogloss:1.63316
[28]	validation_0-mlogloss:1.6



[1]	validation_0-mlogloss:1.98247
[2]	validation_0-mlogloss:1.94589
[3]	validation_0-mlogloss:1.91422
[4]	validation_0-mlogloss:1.88639
[5]	validation_0-mlogloss:1.86257
[6]	validation_0-mlogloss:1.84008
[7]	validation_0-mlogloss:1.82103
[8]	validation_0-mlogloss:1.80324
[9]	validation_0-mlogloss:1.78705
[10]	validation_0-mlogloss:1.77177
[11]	validation_0-mlogloss:1.75862
[12]	validation_0-mlogloss:1.74633
[13]	validation_0-mlogloss:1.73528
[14]	validation_0-mlogloss:1.72540
[15]	validation_0-mlogloss:1.71573
[16]	validation_0-mlogloss:1.70807
[17]	validation_0-mlogloss:1.70064
[18]	validation_0-mlogloss:1.69393
[19]	validation_0-mlogloss:1.68732
[20]	validation_0-mlogloss:1.68108
[21]	validation_0-mlogloss:1.67599
[22]	validation_0-mlogloss:1.67132
[23]	validation_0-mlogloss:1.66721
[24]	validation_0-mlogloss:1.66356
[25]	validation_0-mlogloss:1.65969
[26]	validation_0-mlogloss:1.65651
[27]	validation_0-mlogloss:1.65339
[28]	validation_0-mlogloss:1.65007
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98823
[2]	validation_0-mlogloss:1.95344
[3]	validation_0-mlogloss:1.92270
[4]	validation_0-mlogloss:1.89664
[5]	validation_0-mlogloss:1.87509
[6]	validation_0-mlogloss:1.85298
[7]	validation_0-mlogloss:1.83529
[8]	validation_0-mlogloss:1.81805
[9]	validation_0-mlogloss:1.80277
[10]	validation_0-mlogloss:1.78802
[11]	validation_0-mlogloss:1.77559
[12]	validation_0-mlogloss:1.76372
[13]	validation_0-mlogloss:1.75420
[14]	validation_0-mlogloss:1.74423
[15]	validation_0-mlogloss:1.73463
[16]	validation_0-mlogloss:1.72698
[17]	validation_0-mlogloss:1.71975
[18]	validation_0-mlogloss:1.71335
[19]	validation_0-mlogloss:1.70711
[20]	validation_0-mlogloss:1.70218
[21]	validation_0-mlogloss:1.69747
[22]	validation_0-mlogloss:1.69264
[23]	validation_0-mlogloss:1.68820
[24]	validation_0-mlogloss:1.68461
[25]	validation_0-mlogloss:1.68132
[26]	validation_0-mlogloss:1.67797
[27]	validation_0-mlogloss:1.67504
[28]	validation_0-mlogloss:1.67266
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.97573
[2]	validation_0-mlogloss:1.93703
[3]	validation_0-mlogloss:1.90331
[4]	validation_0-mlogloss:1.87460
[5]	validation_0-mlogloss:1.85002
[6]	validation_0-mlogloss:1.82574
[7]	validation_0-mlogloss:1.80598
[8]	validation_0-mlogloss:1.78736
[9]	validation_0-mlogloss:1.76981
[10]	validation_0-mlogloss:1.75423
[11]	validation_0-mlogloss:1.74034
[12]	validation_0-mlogloss:1.72893
[13]	validation_0-mlogloss:1.71828
[14]	validation_0-mlogloss:1.70771
[15]	validation_0-mlogloss:1.69738
[16]	validation_0-mlogloss:1.68904
[17]	validation_0-mlogloss:1.68171
[18]	validation_0-mlogloss:1.67480
[19]	validation_0-mlogloss:1.66806
[20]	validation_0-mlogloss:1.66273
[21]	validation_0-mlogloss:1.65732
[22]	validation_0-mlogloss:1.65235
[23]	validation_0-mlogloss:1.64769
[24]	validation_0-mlogloss:1.64406
[25]	validation_0-mlogloss:1.63997
[26]	validation_0-mlogloss:1.63609
[27]	validation_0-mlogloss:1.63290
[28]	validation_0-mlogloss:1.62988
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98263
[2]	validation_0-mlogloss:1.94616
[3]	validation_0-mlogloss:1.91442
[4]	validation_0-mlogloss:1.88658
[5]	validation_0-mlogloss:1.86281
[6]	validation_0-mlogloss:1.84047
[7]	validation_0-mlogloss:1.82160
[8]	validation_0-mlogloss:1.80367
[9]	validation_0-mlogloss:1.78717
[10]	validation_0-mlogloss:1.77195
[11]	validation_0-mlogloss:1.75877
[12]	validation_0-mlogloss:1.74660
[13]	validation_0-mlogloss:1.73549
[14]	validation_0-mlogloss:1.72565
[15]	validation_0-mlogloss:1.71601
[16]	validation_0-mlogloss:1.70836
[17]	validation_0-mlogloss:1.70099
[18]	validation_0-mlogloss:1.69444
[19]	validation_0-mlogloss:1.68778
[20]	validation_0-mlogloss:1.68173
[21]	validation_0-mlogloss:1.67648
[22]	validation_0-mlogloss:1.67170
[23]	validation_0-mlogloss:1.66777
[24]	validation_0-mlogloss:1.66395
[25]	validation_0-mlogloss:1.66011
[26]	validation_0-mlogloss:1.65685
[27]	validation_0-mlogloss:1.65364
[28]	validation_0-mlogloss:1.65035
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98827
[2]	validation_0-mlogloss:1.95346
[3]	validation_0-mlogloss:1.92278
[4]	validation_0-mlogloss:1.89692
[5]	validation_0-mlogloss:1.87542
[6]	validation_0-mlogloss:1.85327
[7]	validation_0-mlogloss:1.83565
[8]	validation_0-mlogloss:1.81842
[9]	validation_0-mlogloss:1.80309
[10]	validation_0-mlogloss:1.78828
[11]	validation_0-mlogloss:1.77580
[12]	validation_0-mlogloss:1.76389
[13]	validation_0-mlogloss:1.75430
[14]	validation_0-mlogloss:1.74471
[15]	validation_0-mlogloss:1.73518
[16]	validation_0-mlogloss:1.72768
[17]	validation_0-mlogloss:1.72059
[18]	validation_0-mlogloss:1.71415
[19]	validation_0-mlogloss:1.70804
[20]	validation_0-mlogloss:1.70344
[21]	validation_0-mlogloss:1.69868
[22]	validation_0-mlogloss:1.69392
[23]	validation_0-mlogloss:1.68951
[24]	validation_0-mlogloss:1.68567
[25]	validation_0-mlogloss:1.68264
[26]	validation_0-mlogloss:1.67902
[27]	validation_0-mlogloss:1.67605
[28]	validation_0-mlogloss:1.67340
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.97606
[2]	validation_0-mlogloss:1.93714
[3]	validation_0-mlogloss:1.90292
[4]	validation_0-mlogloss:1.87428
[5]	validation_0-mlogloss:1.84972
[6]	validation_0-mlogloss:1.82581
[7]	validation_0-mlogloss:1.80650
[8]	validation_0-mlogloss:1.78763
[9]	validation_0-mlogloss:1.76972
[10]	validation_0-mlogloss:1.75414
[11]	validation_0-mlogloss:1.74012
[12]	validation_0-mlogloss:1.72847
[13]	validation_0-mlogloss:1.71814
[14]	validation_0-mlogloss:1.70762
[15]	validation_0-mlogloss:1.69749
[16]	validation_0-mlogloss:1.68873
[17]	validation_0-mlogloss:1.68137
[18]	validation_0-mlogloss:1.67457
[19]	validation_0-mlogloss:1.66786
[20]	validation_0-mlogloss:1.66253
[21]	validation_0-mlogloss:1.65720
[22]	validation_0-mlogloss:1.65230
[23]	validation_0-mlogloss:1.64768
[24]	validation_0-mlogloss:1.64414
[25]	validation_0-mlogloss:1.64040
[26]	validation_0-mlogloss:1.63678
[27]	validation_0-mlogloss:1.63344
[28]	validation_0-mlogloss:1.63050
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98255
[2]	validation_0-mlogloss:1.94575
[3]	validation_0-mlogloss:1.91422
[4]	validation_0-mlogloss:1.88629
[5]	validation_0-mlogloss:1.86233
[6]	validation_0-mlogloss:1.84017
[7]	validation_0-mlogloss:1.82132
[8]	validation_0-mlogloss:1.80332
[9]	validation_0-mlogloss:1.78698
[10]	validation_0-mlogloss:1.77215
[11]	validation_0-mlogloss:1.75909
[12]	validation_0-mlogloss:1.74699
[13]	validation_0-mlogloss:1.73596
[14]	validation_0-mlogloss:1.72584
[15]	validation_0-mlogloss:1.71657
[16]	validation_0-mlogloss:1.70911
[17]	validation_0-mlogloss:1.70172
[18]	validation_0-mlogloss:1.69500
[19]	validation_0-mlogloss:1.68863
[20]	validation_0-mlogloss:1.68243
[21]	validation_0-mlogloss:1.67733
[22]	validation_0-mlogloss:1.67251
[23]	validation_0-mlogloss:1.66852
[24]	validation_0-mlogloss:1.66457
[25]	validation_0-mlogloss:1.66052
[26]	validation_0-mlogloss:1.65704
[27]	validation_0-mlogloss:1.65391
[28]	validation_0-mlogloss:1.65097
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98763
[2]	validation_0-mlogloss:1.95284
[3]	validation_0-mlogloss:1.92213
[4]	validation_0-mlogloss:1.89608
[5]	validation_0-mlogloss:1.87479
[6]	validation_0-mlogloss:1.85250
[7]	validation_0-mlogloss:1.83448
[8]	validation_0-mlogloss:1.81727
[9]	validation_0-mlogloss:1.80209
[10]	validation_0-mlogloss:1.78752
[11]	validation_0-mlogloss:1.77496
[12]	validation_0-mlogloss:1.76310
[13]	validation_0-mlogloss:1.75353
[14]	validation_0-mlogloss:1.74383
[15]	validation_0-mlogloss:1.73441
[16]	validation_0-mlogloss:1.72669
[17]	validation_0-mlogloss:1.71969
[18]	validation_0-mlogloss:1.71319
[19]	validation_0-mlogloss:1.70719
[20]	validation_0-mlogloss:1.70255
[21]	validation_0-mlogloss:1.69810
[22]	validation_0-mlogloss:1.69363
[23]	validation_0-mlogloss:1.68919
[24]	validation_0-mlogloss:1.68560
[25]	validation_0-mlogloss:1.68236
[26]	validation_0-mlogloss:1.67877
[27]	validation_0-mlogloss:1.67591
[28]	validation_0-mlogloss:1.67349
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.97653
[2]	validation_0-mlogloss:1.93751
[3]	validation_0-mlogloss:1.90310
[4]	validation_0-mlogloss:1.87447
[5]	validation_0-mlogloss:1.84989
[6]	validation_0-mlogloss:1.82601
[7]	validation_0-mlogloss:1.80638
[8]	validation_0-mlogloss:1.78773
[9]	validation_0-mlogloss:1.77037
[10]	validation_0-mlogloss:1.75476
[11]	validation_0-mlogloss:1.74089
[12]	validation_0-mlogloss:1.72905
[13]	validation_0-mlogloss:1.71860
[14]	validation_0-mlogloss:1.70831
[15]	validation_0-mlogloss:1.69802
[16]	validation_0-mlogloss:1.68937
[17]	validation_0-mlogloss:1.68207
[18]	validation_0-mlogloss:1.67516
[19]	validation_0-mlogloss:1.66865
[20]	validation_0-mlogloss:1.66323
[21]	validation_0-mlogloss:1.65798
[22]	validation_0-mlogloss:1.65307
[23]	validation_0-mlogloss:1.64839
[24]	validation_0-mlogloss:1.64491
[25]	validation_0-mlogloss:1.64124
[26]	validation_0-mlogloss:1.63731
[27]	validation_0-mlogloss:1.63403
[28]	validation_0-mlogloss:1.63118
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98244
[2]	validation_0-mlogloss:1.94569
[3]	validation_0-mlogloss:1.91399
[4]	validation_0-mlogloss:1.88576
[5]	validation_0-mlogloss:1.86194
[6]	validation_0-mlogloss:1.84000
[7]	validation_0-mlogloss:1.82117
[8]	validation_0-mlogloss:1.80327
[9]	validation_0-mlogloss:1.78698
[10]	validation_0-mlogloss:1.77176
[11]	validation_0-mlogloss:1.75864
[12]	validation_0-mlogloss:1.74629
[13]	validation_0-mlogloss:1.73499
[14]	validation_0-mlogloss:1.72495
[15]	validation_0-mlogloss:1.71564
[16]	validation_0-mlogloss:1.70825
[17]	validation_0-mlogloss:1.70111
[18]	validation_0-mlogloss:1.69452
[19]	validation_0-mlogloss:1.68796
[20]	validation_0-mlogloss:1.68216
[21]	validation_0-mlogloss:1.67693
[22]	validation_0-mlogloss:1.67223
[23]	validation_0-mlogloss:1.66833
[24]	validation_0-mlogloss:1.66453
[25]	validation_0-mlogloss:1.66069
[26]	validation_0-mlogloss:1.65729
[27]	validation_0-mlogloss:1.65409
[28]	validation_0-mlogloss:1.65088
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98827
[2]	validation_0-mlogloss:1.95336
[3]	validation_0-mlogloss:1.92259
[4]	validation_0-mlogloss:1.89657
[5]	validation_0-mlogloss:1.87534
[6]	validation_0-mlogloss:1.85282
[7]	validation_0-mlogloss:1.83493
[8]	validation_0-mlogloss:1.81770
[9]	validation_0-mlogloss:1.80253
[10]	validation_0-mlogloss:1.78794
[11]	validation_0-mlogloss:1.77543
[12]	validation_0-mlogloss:1.76351
[13]	validation_0-mlogloss:1.75394
[14]	validation_0-mlogloss:1.74413
[15]	validation_0-mlogloss:1.73435
[16]	validation_0-mlogloss:1.72659
[17]	validation_0-mlogloss:1.71948
[18]	validation_0-mlogloss:1.71308
[19]	validation_0-mlogloss:1.70720
[20]	validation_0-mlogloss:1.70257
[21]	validation_0-mlogloss:1.69804
[22]	validation_0-mlogloss:1.69331
[23]	validation_0-mlogloss:1.68892
[24]	validation_0-mlogloss:1.68521
[25]	validation_0-mlogloss:1.68183
[26]	validation_0-mlogloss:1.67840
[27]	validation_0-mlogloss:1.67548
[28]	validation_0-mlogloss:1.67286
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98327
[2]	validation_0-mlogloss:1.94597
[3]	validation_0-mlogloss:1.91399
[4]	validation_0-mlogloss:1.88520
[5]	validation_0-mlogloss:1.86181
[6]	validation_0-mlogloss:1.83761
[7]	validation_0-mlogloss:1.81875
[8]	validation_0-mlogloss:1.80017
[9]	validation_0-mlogloss:1.78288
[10]	validation_0-mlogloss:1.76792
[11]	validation_0-mlogloss:1.75476
[12]	validation_0-mlogloss:1.74307
[13]	validation_0-mlogloss:1.73190
[14]	validation_0-mlogloss:1.72136
[15]	validation_0-mlogloss:1.71148
[16]	validation_0-mlogloss:1.70319
[17]	validation_0-mlogloss:1.69569
[18]	validation_0-mlogloss:1.68840
[19]	validation_0-mlogloss:1.68165
[20]	validation_0-mlogloss:1.67525
[21]	validation_0-mlogloss:1.66990
[22]	validation_0-mlogloss:1.66455
[23]	validation_0-mlogloss:1.65930
[24]	validation_0-mlogloss:1.65533
[25]	validation_0-mlogloss:1.65112
[26]	validation_0-mlogloss:1.64757
[27]	validation_0-mlogloss:1.64407
[28]	validation_0-mlogloss:1.64097
[29]	validation_0-mlogloss:1.



[1]	validation_0-mlogloss:1.98860
[2]	validation_0-mlogloss:1.95321
[3]	validation_0-mlogloss:1.92312
[4]	validation_0-mlogloss:1.89615
[5]	validation_0-mlogloss:1.87364
[6]	validation_0-mlogloss:1.85116
[7]	validation_0-mlogloss:1.83322
[8]	validation_0-mlogloss:1.81526
[9]	validation_0-mlogloss:1.79882
[10]	validation_0-mlogloss:1.78372
[11]	validation_0-mlogloss:1.77024
[12]	validation_0-mlogloss:1.75864
[13]	validation_0-mlogloss:1.74789
[14]	validation_0-mlogloss:1.73778
[15]	validation_0-mlogloss:1.72828
[16]	validation_0-mlogloss:1.72004
[17]	validation_0-mlogloss:1.71261
[18]	validation_0-mlogloss:1.70593
[19]	validation_0-mlogloss:1.69956
[20]	validation_0-mlogloss:1.69367
[21]	validation_0-mlogloss:1.68826
[22]	validation_0-mlogloss:1.68300
[23]	validation_0-mlogloss:1.67839
[24]	validation_0-mlogloss:1.67434
[25]	validation_0-mlogloss:1.67005
[26]	validation_0-mlogloss:1.66661
[27]	validation_0-mlogloss:1.66294
[28]	validation_0-mlogloss:1.66012
[29]	validation_0-mlogloss:1.

In [None]:
ax= plt.subplot()
cm = confusion_matrix(xgb_model.predict( X_test_text_vec2), y_test2)
sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap='Blues'); 

# labels, title and ticks
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix'); 


In [None]:
print(classification_report(xgb_model.predict( X_test_text_vec2), y_test2))

In [None]:
print(cohen_kappa_score(xgb_model.predict(X_test_text_vec2), y_test2))

In [None]:
pd.DataFrame(xgb_model.predict(X_valid_vecs),columns=['pred']).to_csv('test/valid_test_prediction.csv')