In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline

In [3]:
# Load dataset
df = pd.read_csv('data.csv')

In [4]:
# Drop rows with missing values
df.dropna(subset=['Review text'], inplace=True)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
vectorizer = TfidfVectorizer()

In [7]:
X = vectorizer.fit_transform(df['Review text'])
y = df['Ratings']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

print(X_train.shape, X_test.shape)

(6382, 3497) (2128, 3497)


In [9]:
import mlflow
from sklearn.preprocessing import MaxAbsScaler
mlflow.set_experiment("Sentimental Analysis")

2024/03/27 09:57:11 INFO mlflow.tracking.fluent: Experiment with name 'Sentimental Analysis' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/rameshbabu/MLFlow%20Projects/mlruns/153116874839389910', creation_time=1711513631650, experiment_id='153116874839389910', last_update_time=1711513631650, lifecycle_stage='active', name='Sentimental Analysis', tags={}>

In [10]:
pipelines = {
    'kneighbors': Pipeline([
        ('scaler', MaxAbsScaler()),
        ('classifier', KNeighborsClassifier())
    ]),
    'knn' : Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', KNeighborsClassifier())
    ]), 
    'svc' : Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', SVC())
    ]),
    'logistic_regression': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression())
    ]),
    'random_forest': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier())
    ]),
    'decision_tree': Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', DecisionTreeClassifier())
    ])
}

# Define parameter grid for each algorithm
param_grids = {
    'kneighbors': [
        {
        'scaler': [StandardScaler(), MaxAbsScaler()],
        'classifier__n_neighbors' : [i for i in range(3, 21, 2)],              
        'classifier__p' : [1, 2, 3]
        }
    ],
    'knn': [
        {
            'scaler': [StandardScaler(), MaxAbsScaler()],
            'classifier__n_neighbors' : [i for i in range(3, 21, 2)], 
            'classifier__p' : [1, 2, 3]
        }
    ],
    'svc': [
        {
            'scaler': [StandardScaler(), MaxAbsScaler()],
            'classifier__kernel' : ['rbf'], 
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        }, 
        {
            'scaler': [StandardScaler(), MaxAbsScaler()],
            'classifier__kernel' : ['poly'], 
            'classifier__degree' : [2, 3, 4, 5], 
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        }, 
        {
            'scaler': [StandardScaler(), MaxAbsScaler()],
            'classifier__kernel' : ['linear'], 
            'classifier__C' : [0.1, 0.01, 1, 10, 100]
        }
    ],
    'logistic_regression': [
        {
            'scaler': [StandardScaler(), MaxAbsScaler()],
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l2']
        }, 
        {
            'scaler': [StandardScaler(), MaxAbsScaler()],
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l1'], 
            'classifier__solver': ['liblinear']
        }, 
        {
            'scaler': [StandardScaler(), MaxAbsScaler()],
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['elasticnet'], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga']
        }
    ],
    'random_forest': [
        {
            'scaler': [StandardScaler(), MaxAbsScaler()],
            'classifier__n_estimators': [50, 100, 200]
        }
    ],
    'decision_tree': [
        {
            'scaler': [StandardScaler(), MaxAbsScaler()],
            'classifier__max_depth': [None, 5, 10]
        }
    ]
}

In [None]:
best_models = {}

# Run the Pipeline
for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    mlflow.sklearn.autolog(max_tuning_runs=None)
    
    with mlflow.start_run() as run:
        %time grid_search.fit(X_train, y_train)
        
    print('Train Score: ', grid_search.best_score_)
    print('Test Score: ', grid_search.score(X_test, y_test))
    
    best_models[algo] = grid_search.best_estimator_
    print()

********** kneighbors **********




Fitting 5 folds for each of 54 candidates, totalling 270 fits


180 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rameshbabu/miniconda3/envs/mlflow/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rameshbabu/miniconda3/envs/mlflow/lib/python3.8/site-packages/mlflow/utils/autologging_utils/safety.py", line 578, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
  File "/Users/rameshbabu/miniconda3/envs/mlflow/lib/python3.8/site-packages/mlflow/utils/autologging_utils/safety.py", line 251, in patch_with_managed_run
    result = patch_function(o

CPU times: user 5min 30s, sys: 4.59 s, total: 5min 34s
Wall time: 3min 1s
Train Score:  0.6369473909019719
Test Score:  0.6226503759398496

********** knn **********




Fitting 5 folds for each of 54 candidates, totalling 270 fits


180 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rameshbabu/miniconda3/envs/mlflow/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rameshbabu/miniconda3/envs/mlflow/lib/python3.8/site-packages/mlflow/utils/autologging_utils/safety.py", line 578, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
  File "/Users/rameshbabu/miniconda3/envs/mlflow/lib/python3.8/site-packages/mlflow/utils/autologging_utils/safety.py", line 251, in patch_with_managed_run
    result = patch_function(o

CPU times: user 5min 27s, sys: 5.01 s, total: 5min 32s
Wall time: 2min 54s
Train Score:  0.6369473909019719




Test Score:  0.6226503759398496

********** svc **********
Fitting 5 folds for each of 60 candidates, totalling 300 fits


150 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/rameshbabu/miniconda3/envs/mlflow/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/rameshbabu/miniconda3/envs/mlflow/lib/python3.8/site-packages/mlflow/utils/autologging_utils/safety.py", line 578, in safe_patch_function
    patch_function(call_original, *args, **kwargs)
  File "/Users/rameshbabu/miniconda3/envs/mlflow/lib/python3.8/site-packages/mlflow/utils/autologging_utils/safety.py", line 251, in patch_with_managed_run
    result = patch_function(o

CPU times: user 11min 16s, sys: 8.21 s, total: 11min 24s
Wall time: 11min 42s
Train Score:  0.648543804911099




Test Score:  0.6320488721804511

********** logistic_regression **********
Fitting 5 folds for each of 30 candidates, totalling 150 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

CPU times: user 8min 31s, sys: 6.98 s, total: 8min 38s
Wall time: 9min 33s
Train Score:  0.6599809015546331
Test Score:  0.6419172932330827

********** random_forest **********
Fitting 5 folds for each of 6 candidates, totalling 30 fits
