In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.ensemble import RandomForestClassifier

# Load dataset
df = pd.read_csv('cc_dataset.csv')
df.dropna(inplace=True)

# Define features and target
feature_cols = ['duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode',
                'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                'time_signature', 'track_genre']

# Extract 'popularity' and scale it
popularity_data = df[['popularity']]
scaler = StandardScaler()
popularity_scaled = scaler.fit_transform(popularity_data)

# Determine optimal number of clusters
sse = []
silhouette_avg = []
k_range = range(2, 11)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(popularity_scaled)
    sse.append(kmeans.inertia_)
    cluster_labels = kmeans.labels_
    silhouette_avg.append(silhouette_score(popularity_scaled, cluster_labels))

# Choose optimal k (e.g., k=4)
optimal_k = 4

# Apply K-Means clustering
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(popularity_scaled)
df['popularity_category'] = kmeans.labels_

# Map cluster labels to names (optional)
cluster_names = {i: f'Cluster_{i}' for i in range(optimal_k)}
df['popularity_category'] = df['popularity_category'].map(cluster_names)

# Handle categorical variables
X = pd.get_dummies(df[feature_cols], columns=['track_genre', 'key', 'mode', 'time_signature'], drop_first=True)

# Scale numerical features
numerical_cols = ['duration_ms', 'danceability', 'energy', 'loudness', 'speechiness',
                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Prepare target variable
y = df['popularity_category'].astype(str)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Train the model
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Evaluate the model
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Classification Report:\n", metrics.classification_report(y_test, y_pred))


Accuracy: 0.5744
Classification Report:
               precision    recall  f1-score   support

   Cluster_0       0.70      0.71      0.71       331
   Cluster_1       0.35      0.32      0.33       226
   Cluster_2       0.57      0.51      0.54       401
   Cluster_3       0.59      0.71      0.64       292

    accuracy                           0.57      1250
   macro avg       0.55      0.56      0.56      1250
weighted avg       0.57      0.57      0.57      1250



In [2]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Create Decision Tree Classifier object
dtc = DecisionTreeClassifier(max_depth=5)

# Train the classifier
dtc.fit(X_train, y_train)

# Predict on test dataset
y_pred = dtc.predict(X_test)

# Predict on training set to check for overfitting
train_predict = dtc.predict(X_train)

# Output accuracy
print(f"Default Decision Tree")
print(f"Accuracy train: {metrics.accuracy_score(y_train, train_predict)}")
print(f"Accuracy test: {metrics.accuracy_score(y_test, y_pred)}")

# Create Random Forest Classifier object (with 20 estimators)
forest = RandomForestClassifier(n_estimators=20, max_depth=3, random_state=1)

# Perform cross-validation for Random Forest (with 20 estimators)
cv_scores = cross_val_score(forest, X, y, cv=5)
print(f"Cross-validation scores (20 estimators): {cv_scores}")
print(f"Mean cross-validation score (20 estimators): {cv_scores.mean()}")

# Train and evaluate Random Forest with more estimators for comparison
forest_more_estimators = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

# Perform cross-validation for Random Forest (with more estimators)
cv_scores_more = cross_val_score(forest_more_estimators, X, y, cv=5)
print(f"Cross-validation scores (100 estimators): {cv_scores_more}")
print(f"Mean cross-validation score (100 estimators): {cv_scores_more.mean()}")

# Fit the model with more estimators
forest_more_estimators.fit(X_train, y_train)

# Predict on test dataset
y_test_pred_more = forest_more_estimators.predict(X_test)

# Predict on training set
y_train_pred_more = forest_more_estimators.predict(X_train)

# Output accuracy
forest_train_more = metrics.accuracy_score(y_train, y_train_pred_more)
forest_test_more = metrics.accuracy_score(y_test, y_test_pred_more)
print(f"Random Forest (more estimators): Accuracy on training data: {forest_train_more}")
print(f"Random Forest (more estimators): Accuracy on test data: {forest_test_more}")


Default Decision Tree
Accuracy train: 0.6453272363532437
Accuracy test: 0.612
Cross-validation scores (20 estimators): [0.485      0.561      0.502      0.781      0.56556557]
Mean cross-validation score (20 estimators): 0.5789131131131132
Cross-validation scores (100 estimators): [0.408      0.441      0.507      0.813      0.58858859]
Mean cross-validation score (100 estimators): 0.5515177177177176
Random Forest (more estimators): Accuracy on training data: 0.6473278079451272
Random Forest (more estimators): Accuracy on test data: 0.626


In [7]:
# Predict on test dataset
y_test_pred_more = forest_more_estimators.predict(X_test)

# Predict on training set
y_train_pred_more = forest_more_estimators.predict(X_train)

# Output accuracy
forest_train_more = metrics.accuracy_score(y_train, y_train_pred_more)
forest_test_more = metrics.accuracy_score(y_test, y_test_pred_more)
print(f"Test Two (more estimators, adjusted max depth): Accuracy of testing: {forest_test_more}")
print(f"Random forest train / test accuracies (more estimators, adjusted max depth): {forest_train_more} / {forest_test_more}")  # Play with the number of estimators
forest.fit(X_train, y_train)

# Predict on test dataset
y_test_pred = forest.predict(X_test)

# Predict on training set
y_train_pred = forest.predict(X_train)

# Output accuracy
forest_train = metrics.accuracy_score(y_train, y_train_pred)
forest_test = metrics.accuracy_score(y_test, y_test_pred)
print(f"Test One : Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"Random forest train / test accuracies: {forest_train} / {forest_test}")
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
# Tune hyperparameters using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 7, 10, 15],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=1), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Output best parameters from GridSearchCV
'''
bootstrap: True
max_depth: 15
max_features: 'sqrt'
min_samples_leaf: 1
min_samples_split: 2
n_estimators: 200
'''


Test Two (more estimators, adjusted max depth): Accuracy of testing: 0.626
Random forest train / test accuracies (more estimators, adjusted max depth): 0.6473278079451272 / 0.626
Test One : Accuracy of testing: 0.584
Random forest train / test accuracies: 0.5970277222063447 / 0.584
Fitting 3 folds for each of 1920 candidates, totalling 5760 fits
[CV] END bootstrap=True, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END bootstrap=True, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100


KeyboardInterrupt



In [11]:
from six import StringIO

# Train the best model using the manually provided best parameters
best_forest = RandomForestClassifier(bootstrap=True, max_depth=15, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=200, random_state=1)
best_forest.fit(X_train, y_train)

y_test_pred_best = best_forest.predict(X_test)
forest_test_best = metrics.accuracy_score(y_test, y_test_pred_best)
print(f"Accuracy of the best Random Forest model: {forest_test_best}")

# Visualize the decision tree
class_names = [str(i) for i in sorted(df['popularity_category'].unique())]
dot_data = StringIO()
export_graphviz(model, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols, class_names=class_names)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('spotifyTree.png')
# Image(graph.create_png())


Accuracy of the best Random Forest model: 0.6833333333333333


NameError: name 'export_graphviz' is not defined

## KNN 

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import export_graphviz
import pydotplus
'''
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from six import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

# Load dataset from CSV file
df = pd.read_csv('cc_dataset.csv')

# Drop rows with missing values to avoid errors during training
df.dropna(inplace=True)

# Define features and target variable
feature_cols = ['duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode',
                'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                'time_signature', 'track_genre']

# Create bins for the target variable 'popularity' to reduce processing time
df['popularity_category'] = pd.cut(df['popularity'], bins=[0, 30, 60, 100], labels=['Low', 'Medium', 'High'])
df = df.dropna(subset=feature_cols + ['popularity_category'])

X = df[feature_cols]  # Features
y = df['popularity_category'].astype(str)  # Binned target variable

# Split dataset into training set and test set (70% training and 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Create Decision Tree Classifier object
dtc = DecisionTreeClassifier(max_depth=5)
'''
# Train the classifier
dtc.fit(X_train, y_train)

# Predict on test dataset
y_pred = dtc.predict(X_test)

# Predict on training set to check for overfitting
train_predict = dtc.predict(X_train)

# Output accuracy
print(f"Default Decision Tree")
print(f"Accuracy train: {metrics.accuracy_score(y_train, train_predict)}")
print(f"Accuracy test: {metrics.accuracy_score(y_test, y_pred)}")

# Create Random Forest Classifier object with best parameters from GridSearchCV
best_forest_params = {
    'n_estimators': 200,
    'max_depth': 15,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'bootstrap': True,
    'random_state': 1
}

best_forest = RandomForestClassifier(**best_forest_params)

# Perform cross-validation for the Random Forest with best parameters
cv_scores_best = cross_val_score(best_forest, X, y, cv=5)
print(f"Cross-validation scores (Best Parameters): {cv_scores_best}")
print(f"Mean cross-validation score (Best Parameters): {cv_scores_best.mean()}")

# Train the best model
best_forest.fit(X_train, y_train)

# Predict on test dataset with best model
y_test_pred_best = best_forest.predict(X_test)

# Predict on training set with best model
y_train_pred_best = best_forest.predict(X_train)

# Output accuracy for best Random Forest model
forest_train_best = metrics.accuracy_score(y_train, y_train_pred_best)
forest_test_best = metrics.accuracy_score(y_test, y_test_pred_best)
print(f"Best Random Forest Model - Train Accuracy: {forest_train_best}")
print(f"Best Random Forest Model - Test Accuracy: {forest_test_best}")

# KNN Classifier with GridSearchCV for hyperparameter tuning
knn = KNeighborsClassifier()
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, cv=5, n_jobs=-1, verbose=2)
grid_search_knn.fit(X_train, y_train)

# Output best parameters from GridSearchCV for KNN
print(f"Best parameters for KNN: {grid_search_knn.best_params_}")

# Train the best KNN model
best_knn = grid_search_knn.best_estimator_

# Perform cross-validation for the best KNN model
cv_scores_knn = cross_val_score(best_knn, X, y, cv=5)
print(f"Cross-validation scores (Best KNN): {cv_scores_knn}")
print(f"Mean cross-validation score (Best KNN): {cv_scores_knn.mean()}")

# Train the best KNN model
best_knn.fit(X_train, y_train)

# Predict on test dataset with best KNN model
y_test_pred_knn = best_knn.predict(X_test)

# Predict on training set with best KNN model
y_train_pred_knn = best_knn.predict(X_train)

# Output accuracy for best KNN model
knn_train_best = metrics.accuracy_score(y_train, y_train_pred_knn)
knn_test_best = metrics.accuracy_score(y_test, y_test_pred_knn)
print(f"Best KNN Model - Train Accuracy: {knn_train_best}")
print(f"Best KNN Model - Test Accuracy: {knn_test_best}")

# Visualize the decision tree (optional)
class_names = [str(i) for i in sorted(df['popularity_category'].unique())]
dot_data = StringIO()
export_graphviz(dtc, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols, class_names=class_names)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('KNNspotifyTree3.png')
Image(graph.create_png())


Default Decision Tree
Accuracy train: 0.6453272363532437
Accuracy test: 0.6126666666666667
Cross-validation scores (Best Parameters): [0.423      0.48       0.483      0.765      0.56956957]
Mean cross-validation score (Best Parameters): 0.544113913913914
Best Random Forest Model - Train Accuracy: 0.9688482423549586
Best Random Forest Model - Test Accuracy: 0.6833333333333333
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.2s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.1s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   0.2s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=

ValueError: Length of feature_names, 15 does not match number of features, 31

[CV] END ...metric=manhattan, n_neighbors=3, weights=uniform; total time=   0.1s
[CV] END ..metric=manhattan, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ..metric=manhattan, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ..metric=manhattan, n_neighbors=3, weights=distance; total time=   0.1s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.2s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.3s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.2s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.1s
[CV] END ...metric=manhattan, n_neighbors=7, weights=uniform; total time=   0.2s
[CV] END ...metric=manhattan, n_neighbors=7, weights=uniform; total time=   0.2s
[CV] END ...metric=manhattan, n_neighbors=7, weights=uniform; total time=   0.2s
[CV] END ..metric=manhattan, n_neighbors=7, weights=distance; total time=   0.2s
[CV] END ..metric=manhattan,

#### RandomForestClassifier

In [None]:
forest = RandomForestClassifier(n_estimators=20,max_depth=3, random_state=1)#Play with the number of estimators
forest.fit(X_train, y_train)
y_test_pred=forest.predict(X_test)
y_train_pred=forest.predict(X_train)
forest_train = metrics.accuracy_score(y_train, y_train_pred)
forest_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Test One : Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"Random forest train / test accuracies: {forest_train} / {forest_test}") 

In [None]:
forest = RandomForestClassifier(n_estimators=2000,max_depth=20, random_state=1)#Play with the number of estimators
forest.fit(X_train, y_train)
y_test_pred=forest.predict(X_test)
y_train_pred=forest.predict(X_train)
forest_train = metrics.accuracy_score(y_train, y_train_pred)
forest_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Test Two : Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"Random forest train / test accuracies: {forest_train} / {forest_test}") 

In [None]:
forest = RandomForestClassifier(n_estimators=20,max_depth=1000, random_state=1)#Play with the number of estimators
forest.fit(X_train, y_train)
y_test_pred=forest.predict(X_test)
y_train_pred=forest.predict(X_train)
forest_train = metrics.accuracy_score(y_train, y_train_pred)
forest_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Test 3 : Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"Random forest train / test accuracies: {forest_train} / {forest_test}") 

In [None]:
forest = RandomForestClassifier(n_estimators=90,max_depth=30, random_state=1)#Play with the number of estimators
forest.fit(X_train, y_train)
y_test_pred=forest.predict(X_test)
y_train_pred=forest.predict(X_train)
forest_train = metrics.accuracy_score(y_train, y_train_pred)
forest_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Test 4 : Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"Random forest train / test accuracies: {forest_train} / {forest_test}") 

In [None]:
forest = RandomForestClassifier(n_estimators=90,max_depth=300, random_state=1)#Play with the number of estimators
forest.fit(X_train, y_train)
y_test_pred=forest.predict(X_test)
y_train_pred=forest.predict(X_train)
forest_train = metrics.accuracy_score(y_train, y_train_pred)
forest_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Test 5 : Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"Random forest train / test accuracies: {forest_train} / {forest_test}") 

In [None]:
forest = RandomForestClassifier(n_estimators=9800,max_depth=100, random_state=1)#Play with the number of estimators
forest.fit(X_train, y_train)
y_test_pred=forest.predict(X_test)
y_train_pred=forest.predict(X_train)
forest_train = metrics.accuracy_score(y_train, y_train_pred)
forest_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Test 6 : Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"Random forest train / test accuracies: {forest_train} / {forest_test}") 

#### AdaBoostClassifier

In [None]:
#Boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=1,max_depth=10)#max_depth
#tree = tree.fit(X_train,y_train)
ada = AdaBoostClassifier(estimator=tree,n_estimators=20, random_state=1) #number of trees 
adaModel = ada.fit(X_train, y_train)
y_test_pred=adaModel.predict(X_test)
y_train_pred=adaModel.predict(X_train)
adaModel_train = metrics.accuracy_score(y_train, y_train_pred)
adaModel_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"AdaBoost train / test accuracies: {adaModel_train} / {adaModel_test}")

In [None]:
#Boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=1,max_depth=200)#max_depth
#tree = tree.fit(X_train,y_train)
ada = AdaBoostClassifier(estimator=tree,n_estimators=20, random_state=1) #number of trees 
adaModel = ada.fit(X_train, y_train)
y_test_pred=adaModel.predict(X_test)
y_train_pred=adaModel.predict(X_train)
adaModel_train = metrics.accuracy_score(y_train, y_train_pred)
adaModel_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"AdaBoost train / test accuracies: {adaModel_train} / {adaModel_test}")

In [None]:
#Boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=1,max_depth=10)#max_depth
#tree = tree.fit(X_train,y_train)
ada = AdaBoostClassifier(estimator=tree,n_estimators=200, random_state=1) #number of trees 
adaModel = ada.fit(X_train, y_train)
y_test_pred=adaModel.predict(X_test)
y_train_pred=adaModel.predict(X_train)
adaModel_train = metrics.accuracy_score(y_train, y_train_pred)
adaModel_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"AdaBoost train / test accuracies: {adaModel_train} / {adaModel_test}")

In [None]:
#Boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=1,max_depth=25)#max_depth
#tree = tree.fit(X_train,y_train)
ada = AdaBoostClassifier(estimator=tree,n_estimators=2024, random_state=1) #number of trees 
adaModel = ada.fit(X_train, y_train)
y_test_pred=adaModel.predict(X_test)
y_train_pred=adaModel.predict(X_train)
adaModel_train = metrics.accuracy_score(y_train, y_train_pred)
adaModel_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"AdaBoost train / test accuracies: {adaModel_train} / {adaModel_test}")

In [None]:
#Boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=1,max_depth=2024)#max_depth
#tree = tree.fit(X_train,y_train)
ada = AdaBoostClassifier(estimator=tree,n_estimators=25, random_state=1) #number of trees 
adaModel = ada.fit(X_train, y_train)
y_test_pred=adaModel.predict(X_test)
y_train_pred=adaModel.predict(X_train)
adaModel_train = metrics.accuracy_score(y_train, y_train_pred)
adaModel_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"AdaBoost train / test accuracies: {adaModel_train} / {adaModel_test}")

In [None]:
#Boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=1,max_depth=100000)#max_depth
#tree = tree.fit(X_train,y_train)
ada = AdaBoostClassifier(estimator=tree,n_estimators=2000000, random_state=1) #number of trees 
adaModel = ada.fit(X_train, y_train)
y_test_pred=adaModel.predict(X_test)
y_train_pred=adaModel.predict(X_train)
adaModel_train = metrics.accuracy_score(y_train, y_train_pred)
adaModel_test = metrics.accuracy_score(y_test, y_test_pred)
# Model Accuracy, how often is the classifier correct?
print(f"Accuracy of testing: {metrics.accuracy_score(y_test, y_test_pred)}")
print(f"AdaBoost train / test accuracies: {adaModel_train} / {adaModel_test}")