In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
data = pd.read_csv("Data/data.csv")

In [10]:
data.columns

Index(['Reviewer Name', 'Review Title', 'Place of Review', 'Up Votes',
       'Down Votes', 'Month', 'Review text', 'Ratings'],
      dtype='object')

In [11]:
data.dropna(inplace=True)

In [12]:
data.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
5,Baji Sankar,Mind-blowing purchase,"Certified Buyer, Hyderabad",173.0,45.0,Oct 2018,Good quality product. Delivered on time.READ MORE,5
6,Flipkart Customer,Must buy!,"Certified Buyer, Doom Dooma",403.0,121.0,Jan 2020,BEST PURCHASE It is a good quality and is more...,5


In [13]:
columns_to_drop = ['Reviewer Name', 'Place of Review','Up Votes', 'Down Votes','Month']
data.drop(columns=columns_to_drop, inplace=True)
data.dropna(subset=['Review text'], inplace=True)

In [14]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline

In [15]:
data['Ratings'] = data['Ratings'].apply(lambda rating: 'negative' if rating <= 2 else 'positive')
X = data[['Review text']]
y = data['Ratings']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(6009, 1) (6009,)
(2004, 1) (2004,)


In [17]:
X_train.head()

Unnamed: 0,Review text
3352,NiceREAD MORE
6434,not value for moneyREAD MORE
4452,Best playing experience with this shuttle cork...
7289,SuperREAD MORE
2981,Overall good.READ MORE


In [18]:
import re
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
## initialise the inbuilt Stemmer
stemmer = PorterStemmer()
## We can also use Lemmatizer instead of Stemmer
lemmatizer = WordNetLemmatizer()

In [19]:
def preprocess(raw_text, flag):
    # Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", str(raw_text))
    
    raw_text = raw_text.replace("READ MORE","")
    raw_text = raw_text.replace("read","")
    
    
    # change sentence to lower case
    sentence = sentence.lower()

    # tokenize into words
    tokens = sentence.split()
    
    # remove stop words                
    clean_tokens = [t for t in tokens if not t in stopwords.words("english")]
    
    # Stemming/Lemmatization
    if(flag == 'stem'):
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens), len(clean_tokens)])

In [20]:
from tqdm import tqdm, tqdm_notebook

In [21]:
tqdm.pandas()

In [22]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Abinay
[nltk_data]     Rachakonda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
X_train_clean = X_train['Review text'].progress_apply(lambda text: preprocess(text,flag = 'lemma'))

X_train_clean.shape

100%|██████████████████████████████████████████████████████████████████████████████████████████| 6009/6009 [00:12<00:00, 472.64it/s]


(6009, 2)

In [24]:
X_test_clean = X_test['Review text'].progress_apply(lambda text: preprocess(text,flag = 'lemma'))

X_test_clean.shape

100%|██████████████████████████████████████████████████████████████████████████████████████████| 2004/2004 [00:03<00:00, 511.52it/s]


(2004, 2)

In [34]:
pip install Mlflow

Collecting MlflowNote: you may need to restart the kernel to use updated packages.

  Using cached mlflow-2.11.3-py3-none-any.whl (19.7 MB)
Collecting graphene<4
  Using cached graphene-3.3-py2.py3-none-any.whl (128 kB)
Collecting querystring-parser<2
  Using cached querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting docker<8,>=4.0.0
  Using cached docker-7.0.0-py3-none-any.whl (147 kB)
Collecting sqlparse<1,>=0.4.0
  Using cached sqlparse-0.4.4-py3-none-any.whl (41 kB)
Collecting alembic!=1.10.0,<2
  Using cached alembic-1.13.1-py3-none-any.whl (233 kB)
Collecting waitress<4
  Using cached waitress-3.0.0-py3-none-any.whl (56 kB)
Collecting Mako
  Using cached Mako-1.3.2-py3-none-any.whl (78 kB)
Collecting graphql-relay<3.3,>=3.1
  Using cached graphql_relay-3.2.0-py3-none-any.whl (16 kB)
Collecting aniso8601<10,>=8
  Using cached aniso8601-9.0.1-py2.py3-none-any.whl (52 kB)
Collecting graphql-core<3.3,>=3.1
  Using cached graphql_core-3.2.3-py3-none-any.whl (202 kB)
Inst



In [25]:
import mlflow

mlflow.set_experiment("badmintor_data_prediction")

<Experiment: artifact_location='file:///C:/Users/Abinay%20Rachakonda/Desktop/MLflow%20for%20Experiment%20Tracking%20and%20Model%20Management/mlruns/542283279915906153', creation_time=1711807853590, experiment_id='542283279915906153', last_update_time=1711807853590, lifecycle_stage='active', name='badmintor_data_prediction', tags={}>

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [26]:
# Define pipeline steps
pipe_1 = Pipeline(
    [
        ('scaler', TfidfVectorizer()),
        ('classifier', KNeighborsClassifier())
    ]
)


# Observe the Key Value Pair format
parameter_grid_1 = [
    {
        'scaler': [CountVectorizer(), TfidfVectorizer()],
        'classifier__n_neighbors' : [i for i in range(3, 21, 2)],              
        'classifier__p' : [1, 2, 3]
    }
]

In [27]:
X_train_clean

Unnamed: 0,0,1
3352,niceread,1
6434,value moneyread,2
4452,best playing experience shuttle cork amazing p...,7
7289,superread,1
2981,overall good read,3
...,...,...
5686,good read,2
5850,superread,1
1320,received original product awesome read,5
8063,good n genuine productread,4


In [28]:
clf = GridSearchCV(
    estimator=pipe_1, 
    param_grid=parameter_grid_1, 
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    verbose=1
)

# Initialize the auto logger
# max_tuning_runs=None will make sure that all the runs are recorded.
# By default top 5 runs will be recorded for each experiment
mlflow.sklearn.autolog(max_tuning_runs=None)

with mlflow.start_run() as run:
    %time clf.fit(X_train_clean[0],y_train)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Fitting 5 folds for each of 54 candidates, totalling 270 fits


90 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 578, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 251, in patch_with_managed_run
    result = patch_function(original, *args, **kwargs)
  File "C:\Users\Abi

CPU times: total: 4min 35s
Wall time: 3min 21s


In [29]:
pipe_2 = Pipeline(
    [
        ('scaler', TfidfVectorizer()),
        ('classifier', SVC())
    ]
)


# Observe the Key Value Pair format
parameter_grid_2 = [
    {
        'scaler': [ CountVectorizer(),TfidfVectorizer()],
        'classifier__kernel' : ['rbf'], 
        'classifier__C' : [0.1, 0.01, 1, 10, 100]
    }, 
    {
        'scaler': [CountVectorizer(), TfidfVectorizer()],
        'classifier__kernel' : ['poly'], 
        'classifier__degree' : [2, 3, 4, 5], 
        'classifier__C' : [0.1, 0.01, 1, 10, 100]
    }, 
    {
        'scaler': [CountVectorizer(),TfidfVectorizer()],
        'classifier__kernel' : ['linear'], 
        'classifier__C' : [0.1, 0.01, 1, 10, 100]
    }
]

In [30]:
clf = GridSearchCV(
    estimator=pipe_2, 
    param_grid=parameter_grid_2, 
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    verbose=1
)

# Initialize the auto logger
# max_tuning_runs=None will make sure that all the runs are recorded.
# By default top 5 runs will be recorded for each experiment
mlflow.sklearn.autolog(max_tuning_runs=None)

with mlflow.start_run() as run:
    %time clf.fit(X_train_clean[0], y_train)



Fitting 5 folds for each of 60 candidates, totalling 300 fits
CPU times: total: 4min 35s
Wall time: 6min 28s


In [34]:
pipelines = {
    'knn' : Pipeline([
        ('scaler', TfidfVectorizer()),
        ('classifier', KNeighborsClassifier())
    ]), 
    'svc' : Pipeline([
        ('scaler', TfidfVectorizer()),
        ('classifier', SVC())
    ]),
    'logistic_regression': Pipeline([
        ('scaler', TfidfVectorizer()),
        ('classifier', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('scaler', TfidfVectorizer()),
        ('classifier', RandomForestClassifier())
    ]),
    'decision_tree': Pipeline([
        ('scaler',TfidfVectorizer()),
        ('classifier', DecisionTreeClassifier())
    ]),
    'naive_bayes': Pipeline([
        ('scaler', TfidfVectorizer()),
        ('classifier', GaussianNB())
    ])
}

# Define parameter grid for each algorithm
param_grids = {
    'knn': [
        {
            'scaler': [CountVectorizer(),TfidfVectorizer()],
            'classifier__n_neighbors' : [i for i in range(3, 21, 2)], 
            'classifier__p' : [1, 2, 3]
        }
    ],
    'svc': [
        {
            'scaler': [CountVectorizer(),TfidfVectorizer()],
            'classifier__kernel' : ['rbf'], 
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        }, 
        {
            'scaler': [CountVectorizer(),TfidfVectorizer()],
            'classifier__kernel' : ['poly'], 
            'classifier__degree' : [2, 3, 4, 5], 
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        }, 
        {
            'scaler': [CountVectorizer(),TfidfVectorizer()],
            'classifier__kernel' : ['linear'], 
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        }
    ],
    'logistic_regression': [
        {
            'scaler': [CountVectorizer(),TfidfVectorizer()],
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l2']
        }, 
        {
            'scaler': [CountVectorizer(),TfidfVectorizer()],
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l1'], 
            'classifier__solver': ['liblinear']
        }, 
        {
            'scaler': [CountVectorizer(),TfidfVectorizer()],
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['elasticnet'], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga']
        }
    ],
    'random_forest': [
        {
            'scaler': [CountVectorizer(),TfidfVectorizer()],
            'classifier__n_estimators': [50, 100, 200]
        }
    ],
    'decision_tree': [
        {
            'scaler': [CountVectorizer(),TfidfVectorizer()],
            'classifier__max_depth': [None, 5, 10]
        }
    ],
    'naive_bayes': [
        {
             'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000], 
            'classifier__alpha' : [0.1, 0.5, 1, 10]
        }
    ]
}

In [35]:
best_models = {}

# Run the Pipeline
for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time grid_search.fit(X_train_clean[0], y_train)
        
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(X_test_clean[0], y_test))
    
    best_models[algo] = grid_search.best_estimator_
    print()

********** knn **********




Fitting 5 folds for each of 54 candidates, totalling 270 fits


90 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 578, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 251, in patch_with_managed_run
    result = patch_function(original, *args, **kwargs)
  File "C:\Users\Abi

CPU times: total: 8min 59s
Wall time: 8min 11s
Train Score:  0.9033129629911846
Test Score:  0.9051896207584831

********** svc **********




Fitting 5 folds for each of 60 candidates, totalling 300 fits
CPU times: total: 8min 52s
Wall time: 15min 27s
Train Score:  0.9187889736921949
Test Score:  0.9226546906187625

********** logistic_regression **********




Fitting 5 folds for each of 30 candidates, totalling 150 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



CPU times: total: 3min 1s
Wall time: 5min 14s
Train Score:  0.9192868948643739
Test Score:  0.9276447105788423

********** random_forest **********




Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: total: 3min 39s
Wall time: 7min 4s
Train Score:  0.9161256357361655
Test Score:  0.9216566866267465

********** decision_tree **********




Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: total: 11.4 s
Wall time: 26.6 s
Train Score:  0.9106343715234532
Test Score:  0.9156686626746507

********** naive_bayes **********




Fitting 5 folds for each of 12 candidates, totalling 60 fits


ValueError: Invalid parameter 'vectorization' for estimator Pipeline(steps=[('scaler', TfidfVectorizer()), ('classifier', GaussianNB())]). Valid parameters are: ['memory', 'steps', 'verbose'].

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [33]:
import time
import joblib
import os

In [36]:
dev = "Rachakonda Abhinay"
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )

    # Fit
    start_fit_time = time.time()
    grid_search.fit(X_train, y_train)
    end_fit_time = time.time()

    # Predict
    start_predict_time = time.time()
    y_pred = grid_search.predict(X_test)
    end_predict_time = time.time()

    # Saving the best model
    joblib.dump(grid_search.best_estimator_, f'best_models/{algo}.pkl')
    model_size = os.path.getsize(f'best_models/{algo}.pkl')

    # Pring Log
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(X_test, y_test))
    print("Fit Time: ", end_fit_time - start_fit_time)
    print("Predict Time: ", end_predict_time - start_predict_time)
    print("Model Size: ", model_size)
    
    print()

    # Start the experiment run
    with mlflow.start_run() as run:
        # Log tags with mlflow.set_tag()
        mlflow.set_tag("developer", dev)

        # Log Parameters with mlflow.log_param()
        mlflow.log_param("algorithm", algo)
        mlflow.log_param("hyperparameter_grid", param_grids[algo])
        mlflow.log_param("best_hyperparameter", grid_search.best_params_)

        # Log Metrics with mlflow.log_metric()
        mlflow.log_metric("train_score", grid_search.best_score_)
        mlflow.log_metric("test_score", grid_search.score(X_test, y_test))
        mlflow.log_metric("fit_time", end_fit_time - start_fit_time)
        mlflow.log_metric("predict_time", end_predict_time - start_predict_time)
        mlflow.log_metric("model_size", model_size)

        # Log Model using mlflow.sklearn.log_model()
        mlflow.sklearn.log_model(grid_search.best_estimator_, f"{algo}_model")

********** knn **********


2024/04/01 01:06:44 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '949f6fd10e7c45dc9c5d86b8a31d1d08', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 5 folds for each of 54 candidates, totalling 270 fits


ValueError: 
All the 270 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 578, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 251, in patch_with_managed_run
    result = patch_function(original, *args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\sklearn\__init__.py", line 1659, in patched_fit
    return original(self, *args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 559, in call_original
    return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 494, in call_original_fn_with_event_logging
    original_fn_result = original_fn(*og_args, **og_kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 556, in _original_fn
    original_result = original(*_og_args, **_og_kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 578, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 251, in patch_with_managed_run
    result = patch_function(original, *args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\sklearn\__init__.py", line 1659, in patched_fit
    return original(self, *args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 559, in call_original
    return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 494, in call_original_fn_with_event_logging
    original_fn_result = original_fn(*og_args, **og_kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 556, in _original_fn
    original_result = original(*_og_args, **_og_kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\neighbors\_classification.py", line 207, in fit
    return self._fit(X, y)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\neighbors\_base.py", line 407, in _fit
    X, y = self._validate_data(
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\utils\validation.py", line 1092, in check_X_y
    check_consistent_length(X, y)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\utils\validation.py", line 387, in check_consistent_length
    raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [1, 4807]

--------------------------------------------------------------------------------
215 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 458, in safe_patch_function
    return original(*args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 458, in safe_patch_function
    return original(*args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\neighbors\_classification.py", line 207, in fit
    return self._fit(X, y)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\neighbors\_base.py", line 407, in _fit
    X, y = self._validate_data(
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\utils\validation.py", line 1092, in check_X_y
    check_consistent_length(X, y)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\utils\validation.py", line 387, in check_consistent_length
    raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [1, 4807]

--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 458, in safe_patch_function
    return original(*args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\mlflow\utils\autologging_utils\safety.py", line 458, in safe_patch_function
    return original(*args, **kwargs)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\neighbors\_classification.py", line 207, in fit
    return self._fit(X, y)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\neighbors\_base.py", line 407, in _fit
    X, y = self._validate_data(
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\utils\validation.py", line 1092, in check_X_y
    check_consistent_length(X, y)
  File "C:\Users\Abinay Rachakonda\anaconda\lib\site-packages\sklearn\utils\validation.py", line 387, in check_consistent_length
    raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [1, 4808]
