In [36]:
%matplotlib inline
 
import numpy as np
import pandas as pd
 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler

In [136]:
# Read in data
data = pd.read_csv('data.csv')
data.drop(['artists', 'id', 'name', 'release_date'], axis = 1, inplace = True)
data_current = data[data.year >= 2010]
data_current = data_current[~data_current.duplicated() == 1]
data_current.head(10)

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,year
9087,0.887,0.319,187333,0.201,0,0.0,7,0.904,-17.796,1,27,0.0623,117.153,0.239,2018
9091,0.938,0.269,236800,0.129,0,5e-06,7,0.683,-18.168,0,26,0.0576,82.332,0.16,2018
9111,0.881,0.644,313093,0.212,0,2.2e-05,11,0.798,-14.118,1,19,0.0347,117.072,0.441,2020
9117,0.955,0.627,295093,0.184,0,0.000162,1,0.0986,-15.533,1,19,0.045,115.864,0.299,2020
9119,0.888,0.581,183440,0.331,0,1.5e-05,6,0.147,-14.087,1,19,0.243,88.303,0.642,2020
9121,0.93,0.442,147907,0.399,0,0.000499,6,0.912,-12.661,1,19,0.078,121.662,0.554,2020
9129,0.949,0.57,64173,0.176,0,0.0,6,0.147,-22.676,0,19,0.299,135.687,0.348,2020
9137,0.911,0.565,232640,0.153,0,0.0,10,0.358,-21.606,0,18,0.378,103.309,0.434,2020
9139,0.932,0.598,233520,0.212,0,2.3e-05,6,0.692,-15.078,0,18,0.0406,107.183,0.172,2020
9143,0.879,0.367,213840,0.307,0,0.0,11,0.73,-12.42,0,18,0.0568,172.867,0.284,2020


In [123]:
X = data_current[["acousticness", "danceability", "duration_ms", "energy", "explicit", "instrumentalness", 
                  "key", "liveness", "loudness", "mode", "speechiness", "tempo", "valence", "year"]]
def groups(series):
    if 75 <= series <= 100:
        return 3
    elif 50 <= series < 75:
        return 2
    elif 25 <= series < 50:
        return 1
    elif series < 25:
        return 0

y = data_current['popularity'].apply(groups)

In [124]:
X = pd.get_dummies(X, columns = ['explicit', 'key', 'mode', 'year'])
X.head(5)

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,...,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018,year_2019,year_2020,year_2021
9087,0.887,0.319,187333,0.201,0.0,0.904,-17.796,0.0623,117.153,0.239,...,0,0,0,0,0,0,1,0,0,0
9091,0.938,0.269,236800,0.129,5e-06,0.683,-18.168,0.0576,82.332,0.16,...,0,0,0,0,0,0,1,0,0,0
9111,0.881,0.644,313093,0.212,2.2e-05,0.798,-14.118,0.0347,117.072,0.441,...,0,0,0,0,0,0,0,0,1,0
9117,0.955,0.627,295093,0.184,0.000162,0.0986,-15.533,0.045,115.864,0.299,...,0,0,0,0,0,0,0,0,1,0
9119,0.888,0.581,183440,0.331,1.5e-05,0.147,-14.087,0.243,88.303,0.642,...,0,0,0,0,0,0,0,0,1,0


In [125]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state = 0, test_size = 0.2)

In [126]:
# Gaussian Naive Bayes Model
model = GaussianNB()                        
model.fit(Xtrain, ytrain)                   
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.5159146554739419

In [127]:
# Cross Validation Score
cross_val_score(model, X, y, cv = 5)

array([0.51381602, 0.52710738, 0.53462749, 0.52500874, 0.53594543])

In [128]:
# Support Vector Machine
model = SVC(kernel = 'rbf', C = 1, max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)



0.3050017488632389

In [129]:
# K Nearest Neighbors
scores = []
for i in range(1, 10):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    scores.append(accuracy_score(ytest, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.6014340678558937 1


In [9]:
# Cross Validation Score
cross_val_score(model, X, y, cv = 5)



array([0.34452606, 0.34610003, 0.34819867, 0.34714935, 0.34668532])

In [130]:
# K-Means
scores = []
for i in range(1, 5):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(Xtrain)
    y_kmeans = kmeans.predict(Xtest)
    scores.append(accuracy_score(ytest, y_kmeans))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.5916404337180833 0


In [131]:
# Logistic Regression
model = LogisticRegression(max_iter = 100)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.5916404337180833

# Models with Standard Scaling

In [132]:
ss = StandardScaler()
ss.fit(X)
X_ss = ss.transform(X)
Xtrain, Xtest, ytrain, ytest = train_test_split(X_ss, y, random_state = 0, test_size = 0.2)

In [133]:
# Gaussian Naive Bayes Model
model = GaussianNB()                        
model.fit(Xtrain, ytrain)                   
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.5250087443161945

In [29]:
# Cross Validation Score
cross_val_score(model, X, y, cv = 5)



array([0.34592515, 0.34435117, 0.34802378, 0.34837356, 0.349484  ])

In [134]:
# Support Vector Machine
model = SVC(kernel = 'rbf', C = 1, max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)



0.637110877929346

In [135]:
# K Nearest Neighbors - takes a long time
scores = []
for i in range(1, 50):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    scores.append(accuracy_score(ytest, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.7245540398740818 7


In [25]:
# Cross Validation Score
cross_val_score(model, X, y, cv = 5)



array([0.3273872 , 0.33578174, 0.33543197, 0.34102833, 0.33391639])

In [26]:
# K-Means
scores = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(Xtrain)
    y_kmeans = kmeans.predict(Xtest)
    scores.append(accuracy_score(ytest, y_kmeans))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.34435117173837004 0


In [27]:
# Logistic Regression
model = LogisticRegression(max_iter = 100)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.34400139909059113

# Models with Normalized Scaling

In [31]:
normalized = Normalizer()
normalized.fit(X)
X_n = normalized.transform(X)
Xtrain, Xtest, ytrain, ytest = train_test_split(X_n, y, random_state = 0, test_size = 0.2)

In [32]:
# Gaussian Naive Bayes Model
model = GaussianNB()                        
model.fit(Xtrain, ytrain)                   
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.012941587967820916

In [33]:
# Cross Validation Score
cross_val_score(model, X, y, cv = 5)



array([0.34592515, 0.34435117, 0.34802378, 0.34837356, 0.349484  ])

In [34]:
# Support Vector Machine
model = SVC(kernel = 'rbf', C = 1, max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.34435117173837004

In [35]:
# K Nearest Neighbors
scores = []
for i in range(1, 50):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    scores.append(accuracy_score(ytest, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.3438265127667016 43


In [36]:
# Cross Validation Score
cross_val_score(model, X, y, cv = 5)



array([0.34872333, 0.34854844, 0.34819867, 0.34872333, 0.34721008])

In [37]:
# K-Means
scores = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(Xtrain)
    y_kmeans = kmeans.predict(Xtest)
    scores.append(accuracy_score(ytest, y_kmeans))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.34435117173837004 0


In [76]:
# Logistic Regression
model = LogisticRegression(max_iter = 100)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
y_model[:15]
# accuracy_score(ytest, y_model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Still can do:
- subset data from original dataset with the variables most correlated to popularity & run models on that

- use gridsearchcv to determine best parameters for each model

# Subset data based on most correlated variables

In [9]:
# Use data_current & get dummy variables
data_temp = pd.get_dummies(data_current, columns = ['explicit', 'key', 'mode', 'year'])

In [11]:
# Correlation plot to see which features are most correlated to popularity
data_temp.corr()['popularity'].sort_values(ascending = False)

popularity          1.000000
explicit_1          0.349313
loudness            0.191963
danceability        0.088074
acousticness        0.058428
speechiness         0.056047
year_2019           0.049276
year_2011           0.047500
key_1               0.041702
valence             0.041355
year_2017           0.040786
key_8               0.039976
year_2012           0.035287
mode_1              0.033417
key_5               0.024744
year_2015           0.022765
year_2010           0.022286
year_2016           0.018476
year_2014           0.017046
key_3               0.015330
key_6               0.011774
key_0               0.005831
year_2018           0.002716
key_10             -0.004188
key_2              -0.009147
key_11             -0.017197
key_4              -0.018370
year_2013          -0.031157
mode_0             -0.033417
key_7              -0.037000
key_9              -0.038000
year_2020          -0.045642
tempo              -0.051334
energy             -0.087903
liveness      

Will take everything above |0.1| -> ['explicit_1', 'loudness', 'liveness', 'duration_ms', 'year_2021', 'explicit_0', 'instrumentalness']

In [13]:
# Subset data
X_s = data_temp[['explicit_1', 'loudness', 'liveness', 'duration_ms', 'year_2021', 'explicit_0', 'instrumentalness']]
y_s = data_temp['popularity']

In [20]:
# Split into train & test data
Xtrain, Xtest, ytrain, ytest = train_test_split(X_s, y_s, random_state = 0, test_size = 0.2)

Test models on subset

In [15]:
# Gaussian Naive Bayes Model
model = GaussianNB()                        
model.fit(Xtrain, ytrain)                   
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.34207764952780695

In [17]:
# Cross Validation Score
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34977265, 0.34977265, 0.34977265, 0.34977265, 0.34983383])

In [18]:
# Support Vector Machine
model = SVC(kernel = 'rbf', C = 1, max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)



0.34435117173837004

In [None]:
# Cross Validation Score - takes a long time
cross_val_score(model, X_s, y_s, cv = 5)

In [21]:
# K Nearest Neighbors
scores = []
for i in range(1, 50):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    scores.append(accuracy_score(ytest, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.344700944386149 36


In [22]:
# Cross Validation Score
model = KNeighborsClassifier(n_neighbors = scores.index(max_val))
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34330185, 0.34540049, 0.3478489 , 0.34697447, 0.34703516])

In [23]:
# K-Means
scores = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(Xtrain)
    y_kmeans = kmeans.predict(Xtest)
    scores.append(accuracy_score(ytest, y_kmeans))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.34435117173837004 0


In [25]:
# Cross Validation Score
model = KMeans(n_clusters = 1)
cross_val_score(model, X_s, y_s, cv = 5)

array([-2.16303849e+14, -2.10584411e+14, -5.58087846e+14, -2.45181187e+14,
       -9.61498373e+13])

In [27]:
# Logistic Regression
model = LogisticRegression(max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.34435117173837004

In [28]:
# Cross Validation Score - takes a long time
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34977265, 0.34977265, 0.34977265, 0.34977265, 0.34983383])

# Subsetted Data with Standard Scaling

In [29]:
ss = StandardScaler()
ss.fit(X_s)
X_ss = ss.transform(X_s)
Xtrain, Xtest, ytrain, ytest = train_test_split(X_ss, y_s, random_state = 0, test_size = 0.2)

In [30]:
# Gaussian Naive Bayes Model
model = GaussianNB()                        
model.fit(Xtrain, ytrain)                   
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.0027981811822315496

In [33]:
# Cross Validation Score
cross_val_score(model, X_s, y_s, cv = 10)



array([0.34557538, 0.34732424, 0.34242742, 0.3490731 , 0.34837356,
       0.34802378, 0.34732424, 0.34977265, 0.34872333, 0.34989503])

In [34]:
# Support Vector Machine - takes a long time
model = SVC(kernel = 'rbf', C = 1, max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)



0.34435117173837004

In [None]:
# Cross Validation Score
cross_val_score(model, X_s, y_s, cv = 5)

In [35]:
# K Nearest Neighbors
scores = []
for i in range(1, 50):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    scores.append(accuracy_score(ytest, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.341378104232249 46


In [37]:
# Cross Validation Score
model = KNeighborsClassifier(n_neighbors = scores.index(max_val))
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34802378, 0.3478489 , 0.34837356, 0.34819867, 0.34686024])

In [38]:
# K-Means
scores = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(Xtrain)
    y_kmeans = kmeans.predict(Xtest)
    scores.append(accuracy_score(ytest, y_kmeans))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.34435117173837004 0


In [39]:
# Cross Validation Score
model = KMeans(n_clusters = 1)
cross_val_score(model, X_s, y_s, cv = 5)

array([-2.16303849e+14, -2.10584411e+14, -5.58087846e+14, -2.45181187e+14,
       -9.61498373e+13])

In [41]:
# Logistic Regression - long
model = LogisticRegression(max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.34435117173837004

In [42]:
# Cross Validation Score
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34977265, 0.34977265, 0.34977265, 0.34977265, 0.34983383])

# Subsetted data with Normal Scaling

In [43]:
normalized = Normalizer()
normalized.fit(X_s)
X_n = normalized.transform(X_s)
Xtrain, Xtest, ytrain, ytest = train_test_split(X_n, y_s, random_state = 0, test_size = 0.2)

In [44]:
# Gaussian Naive Bayes Model
model = GaussianNB()                        
model.fit(Xtrain, ytrain)                   
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.004896817068905212

In [46]:
# Cross Validation Score
cross_val_score(model, X_s, y_s, cv = 10)



array([0.34557538, 0.34732424, 0.34242742, 0.3490731 , 0.34837356,
       0.34802378, 0.34732424, 0.34977265, 0.34872333, 0.34989503])

In [47]:
# Support Vector Machine
model = SVC(kernel = 'rbf', C = 1, max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.34435117173837004

In [48]:
# Cross Validation Score
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34977265, 0.34977265, 0.34977265, 0.34977265, 0.34983383])

In [49]:
# K Nearest Neighbors
scores = []
for i in range(1, 50):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    scores.append(accuracy_score(ytest, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.341727876880028 47


In [50]:
# Cross Validation Score
model = KNeighborsClassifier(n_neighbors = scores.index(max_val))
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34819867, 0.34802378, 0.34872333, 0.34889822, 0.34703516])

In [51]:
# K-Means
scores = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(Xtrain)
    y_kmeans = kmeans.predict(Xtest)
    scores.append(accuracy_score(ytest, y_kmeans))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.34435117173837004 0


In [52]:
# Cross Validation Score
model = KMeans(n_clusters = 1)
cross_val_score(model, X_s, y_s, cv = 5)

array([-2.16303849e+14, -2.10584411e+14, -5.58087846e+14, -2.45181187e+14,
       -9.61498373e+13])

In [55]:
# Logistic Regression - long
model = LogisticRegression(max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.34435117173837004

In [57]:
# Cross Validation Score - long
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34977265, 0.34977265, 0.34977265, 0.34977265, 0.34983383])

# Subsetted data with MinMax Scaling

In [58]:
minmax = MinMaxScaler()
minmax.fit(X_s)
X_m = minmax.transform(X_s)
Xtrain, Xtest, ytrain, ytest = train_test_split(X_m, y_s, random_state = 0, test_size = 0.2)

In [59]:
# Gaussian Naive Bayes Model
model = GaussianNB()                        
model.fit(Xtrain, ytrain)                   
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.0027981811822315496

In [60]:
# Cross Validation Score
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34627492, 0.34540049, 0.3478489 , 0.34854844, 0.34930908])

In [61]:
# Support Vector Machine
model = SVC(kernel = 'rbf', C = 1, max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)



0.34435117173837004

In [62]:
# Cross Validation Score
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34977265, 0.34977265, 0.34977265, 0.34977265, 0.34983383])

In [63]:
# K Nearest Neighbors
scores = []
for i in range(1, 50):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    scores.append(accuracy_score(ytest, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.33945435466946483 44


In [64]:
# Cross Validation Score
model = KNeighborsClassifier(n_neighbors = scores.index(max_val))
cross_val_score(model, X_s, y_s, cv = 5)



array([0.34610003, 0.34697447, 0.34749913, 0.34872333, 0.34616057])

In [65]:
# K-Means
scores = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(Xtrain)
    y_kmeans = kmeans.predict(Xtest)
    scores.append(accuracy_score(ytest, y_kmeans))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.34435117173837004 0


In [None]:
# Cross Validation Score
model = KMeans(n_clusters = 1)
cross_val_score(model, X_s, y_s, cv = 5)

In [None]:
# Logistic Regression
model = LogisticRegression(max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

In [None]:
# Cross Validation Score
cross_val_score(model, X_s, y_s, cv = 5)

# Smaller Subset

In [66]:
# Subset data
X_s = data_temp[['explicit_1', 'loudness', 'explicit_0', 'instrumentalness']]
y_s = data_temp['popularity']
# Split into train & test data
Xtrain, Xtest, ytrain, ytest = train_test_split(X_s, y_s, random_state = 0, test_size = 0.2)

In [69]:
# Gaussian Naive Bayes Model
model = GaussianNB()                        
model.fit(Xtrain, ytrain)                   
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

0.002273522210563134

In [70]:
# Cross Validation Score
cross_val_score(model, X_s, y_s, cv = 5)



array([0.00139909, 0.00244841, 0.00209864, 0.00279818, 0.00052475])

In [71]:
# Support Vector Machine
model = SVC(kernel = 'rbf', C = 1, max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)



0.34435117173837004

In [72]:
# K Nearest Neighbors
scores = []
for i in range(1, 50):
    model = KNeighborsClassifier(n_neighbors = i)
    model.fit(Xtrain, ytrain)
    y_model = model.predict(Xtest)
    scores.append(accuracy_score(ytest, y_model))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.3405036726128017 47


In [73]:
# K-Means
scores = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters = i)
    kmeans.fit(Xtrain)
    y_kmeans = kmeans.predict(Xtest)
    scores.append(accuracy_score(ytest, y_kmeans))
    
max_val = max(scores)
print(max_val, scores.index(max_val))

0.34435117173837004 0


In [74]:
# Logistic Regression
model = LogisticRegression(max_iter = 1000)
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
accuracy_score(ytest, y_model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.34435117173837004