## Ensemble methods

In [51]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.ensemble import BaggingClassifier,VotingClassifier,AdaBoostClassifier
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.metrics import accuracy_score,f1_score


In [3]:
df = pd.read_csv('data/dataset.csv')
df = df[df['genre'].notnull()]
df=df.drop(['track','artist','uri'],axis=1)
df.head(5)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,decade,genre
1,0.498,0.505,3,-12.475,1,0.0337,0.018,0.107,0.176,0.797,101.801,213613,4,48.8251,10,0,60,pop
6,0.662,0.272,0,-18.883,1,0.0313,0.36,0.228,0.0963,0.591,143.507,134360,4,47.82155,7,0,60,jazz
9,0.511,0.603,2,-7.637,1,0.028,0.0385,2e-06,0.142,0.685,128.336,157293,4,43.36534,9,1,60,pop
11,0.52,0.411,9,-12.411,1,0.0315,0.786,0.0012,0.146,0.35,120.29,207573,4,37.54527,9,1,60,pop
13,0.746,0.666,2,-10.408,1,0.146,0.543,0.0,0.0488,0.84,94.024,192427,4,78.59848,5,1,60,country


In [32]:
X=df.drop(['decade','genre','target'],axis=1)
stand_scale=StandardScaler()
le=LabelEncoder()
#df['is_pop']=df['genre']=='pop'
y=le.fit_transform(df['genre'])
X=stand_scale.fit_transform(X)
X_train,X_test,y_train,y_test=train_test_split(X, y)

In [33]:
log_reg=LogisticRegression(multi_class='multinomial',solver='lbfgs')
dec_tree=DecisionTreeClassifier()
knn=KNeighborsClassifier()

log_reg.fit(X_train,y_train)
dec_tree.fit(X_train,y_train)
knn.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier()

In [34]:
log_rec_pred=log_reg.predict(X_test)
dec_tree_pred=dec_tree.predict(X_test)
knn_pred=knn.predict(X_test)

In [39]:
average_pred=(log_rec_pred+dec_tree_pred+knn_pred)//3
acc=accuracy_score(y_test, average_pred)
print(acc)

0.5940516052787079


In [40]:
print(f'LogisticRegression accuracy {accuracy_score(y_test, log_rec_pred)}')
print(f'DecisionTree accuracy {accuracy_score(y_test, dec_tree_pred)}')
print(f'KNeighborsClassifier accuracy {accuracy_score(y_test, knn_pred)}')

LogisticRegression accuracy 0.7449281071498917
DecisionTree accuracy 0.6903683277526098
KNeighborsClassifier accuracy 0.7149891668308056


In [44]:
voting_clf=VotingClassifier(estimators=[('KNN',knn),('DTree',dec_tree),('LogReg',log_reg)],voting='hard')
voting_clf.fit(X_train,y_train)
predic=voting_clf.predict(X_test)
acc=accuracy_score(y_test,predic)
f1=f1_score(y_test,predic,average='macro')
print(f'Accuracy score: {acc}')
print(f'F1 score: {f1}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy score: 0.7427614733110104
F1 score: 0.5656075557875482


In [45]:
log_baggin_model=BaggingClassifier(base_estimator=log_reg,n_estimators=5)
knn_baggin_model=BaggingClassifier(base_estimator=knn,n_estimators=5)
dtree_baggin_model=BaggingClassifier(base_estimator=dec_tree,n_estimators=5)

In [49]:
def bagging_ensemble(model):
    k_folds=KFold(n_splits=20)
    results=cross_val_score(model, X_train, y_train,cv=k_folds)
    print(results.mean())


In [50]:
bagging_ensemble(log_baggin_model)
bagging_ensemble(knn_baggin_model)
bagging_ensemble(dtree_baggin_model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.7457633622012755
0.716870242566591
0.7292205138286754


In [52]:
k_f=KFold(n_splits=10)
model=AdaBoostClassifier(n_estimators=10)
result=cross_val_score(model,X_test,y_test,cv=k_f)
print(result.mean())


0.6730357669788318
