In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Read the merged clustering CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="clustering.csv")

# Preview the first five rows as the cell's rich output
df.head(n=5)

Unnamed: 0,coin,price,1h,24h,7d,24h_volume,mkt_cap,kmeans_cluster
0,Bitcoin,40859.46,0.022,0.03,0.055,35390760000.0,770991500000.0,1
1,Ethereum,2744.41,0.024,0.034,0.065,19748700000.0,327104400000.0,0
2,Tether,1.0,-0.001,-0.001,0.0,57934970000.0,79965160000.0,0
3,BNB,383.43,0.018,0.028,0.004,1395854000.0,64043820000.0,0
4,USD Coin,0.999874,-0.001,0.0,-0.0,3872274000.0,52222140000.0,0


In [6]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier



In [7]:
# Features: the six columns immediately before the last column.
# NOTE(review): per the df.head() preview these appear to be price, 1h, 24h,
# 7d, 24h_volume and mkt_cap -- confirm against the CSV.
X = df.iloc[:, -7:-1]

# Target: the last column, selected as a 1-D Series rather than a
# single-column DataFrame, so estimators receive labels of shape (n_samples,)
# instead of a column vector they have to coerce.
y = df.iloc[:, -1]

# Split dataset: hold out 30% of the rows for testing, with a fixed seed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [8]:
#  Define Function to Run All Classifiers

def _flatten_labels(labels):
    """Return single-column 2-D labels as a 1-D array; pass anything else through."""
    arr = np.asarray(labels)
    if arr.ndim == 2 and arr.shape[1] == 1:
        return arr.ravel()
    return labels


def evaluate_classifiers(X_train, y_train, X_test, y_test, random_state=1):

    """
    Train a fixed set of classifiers and return their accuracy on the test set.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for scoring, respectively.
    y_train, y_test : array-like
        Class labels. A single-column 2-D input (e.g. a one-column DataFrame)
        is flattened to 1-D before use.
    random_state : int, default 1
        Seed forwarded to the estimators that are given one here: Decision
        Tree, Random Forest, Gradient Boosting and XGBoost.

    Returns
    -------
    dict
        Maps each classifier's display name to its accuracy on the test set.
    """

    # Estimators expect labels of shape (n_samples,); flatten column vectors
    # up front instead of relying on each estimator to coerce them.
    y_train = _flatten_labels(y_train)
    y_test = _flatten_labels(y_test)

    # NOTE(review): features are passed to every model as-is (no scaling);
    # scale-sensitive models may score poorly on unscaled inputs -- confirm
    # whether that is intended.
    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=500),
        "Decision Tree": DecisionTreeClassifier(random_state=random_state),
        "Random Forest": RandomForestClassifier(n_estimators=200, random_state=random_state),
        "Gradient Boosting": GradientBoostingClassifier(random_state=random_state),
        "Support Vector Machine": SVC(),
        "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=random_state)
    }

    results = {}
    for name, clf in classifiers.items():
        # Fit on the training split, then score predictions on the test split.
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results[name] = accuracy_score(y_test, y_pred)

    return results


In [9]:
evaluate_classifiers(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

{'Logistic Regression': 0.45302013422818793,
 'Decision Tree': 0.9932885906040269,
 'Random Forest': 0.9899328859060402,
 'Gradient Boosting': 0.9932885906040269,
 'Support Vector Machine': 0.9865771812080537,
 'K-Nearest Neighbors': 0.9932885906040269,
 'XGBoost': 0.9932885906040269}

Here, we select KNN as the final model. It is tied with Decision Tree, Gradient Boosting and XGBoost for the highest test accuracy (about 99.3%).

In [10]:
# Refit the chosen K-Nearest Neighbors model (k=5) on the training split.
final_model = KNeighborsClassifier(n_neighbors=5)
# np.ravel hands the estimator 1-D labels of shape (n_samples,), whether
# y_train is a single-column DataFrame or already a Series.
final_model.fit(X_train, np.ravel(y_train))

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [11]:
# Predict labels for the held-out rows with the fitted model
y_prediction = final_model.predict(X_test)

# Fraction of held-out rows whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=y_prediction)

0.9932885906040269

In [14]:
import pickle

# Persist the fitted KNN model to disk so it can be reloaded later.
# The file is opened in binary write mode and closed by the context manager.
with open("Final Model.pkl", mode="wb") as model_file:
    pickle.dump(final_model, model_file)
