import required Libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

Read Datafile

In [3]:
df = pd.read_csv("D:/Epsilon AI internship/Project_1/cleaned_data.csv")

Train test split

In [5]:
# Select features (X) and the target (y)
X = df[['votes', 'online_order', 'book_table', 'approx_cost(for two people)', 'location']]
X = pd.get_dummies(X, drop_first=True)  # One-hot encode categorical variables

y = df['rate_flag']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predictions
y_pred_log = log_reg.predict(X_test)

# Evaluate
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))


Logistic Regression Accuracy: 0.8182742148592337
              precision    recall  f1-score   support

           0       0.80      0.92      0.86      5779
           1       0.86      0.67      0.75      4060

    accuracy                           0.82      9839
   macro avg       0.83      0.80      0.80      9839
weighted avg       0.82      0.82      0.81      9839



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Random Forest Classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_clf.predict(X_test)

# Evaluate
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.971236914320561
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      5779
           1       0.97      0.96      0.96      4060

    accuracy                           0.97      9839
   macro avg       0.97      0.97      0.97      9839
weighted avg       0.97      0.97      0.97      9839



In [8]:
# K-Nearest Neighbors
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

# Predictions
y_pred_knn = knn_clf.predict(X_test)

# Evaluate
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))


KNN Accuracy: 0.9137107429616831
              precision    recall  f1-score   support

           0       0.92      0.93      0.93      5779
           1       0.90      0.89      0.90      4060

    accuracy                           0.91      9839
   macro avg       0.91      0.91      0.91      9839
weighted avg       0.91      0.91      0.91      9839



In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train, y_train)

# Predictions
y_pred_gb = gb_clf.predict(X_test)

# Evaluate
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))


Gradient Boosting Accuracy: 0.8457160280516313
              precision    recall  f1-score   support

           0       0.88      0.85      0.87      5779
           1       0.80      0.84      0.82      4060

    accuracy                           0.85      9839
   macro avg       0.84      0.84      0.84      9839
weighted avg       0.85      0.85      0.85      9839



As we Realise random forest gave the highest accuracy

Tuning#

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier


For logistic regression, you might want to tune the regularization strength C.

In [12]:
log_reg_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],  # L1 = Lasso, L2 = Ridge
    'solver': ['liblinear']   # For L1, use 'liblinear' or 'saga' solver
}


For the random forest, you’ll tune parameters such as number of trees n_estimators, maximum depth max_depth, and minimum samples per split min_samples_split.

In [13]:
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


K-Nearest Neighbors (KNN)
For KNN, tune the number of neighbors n_neighbors and the distance metric (p for Minkowski distance).

In [14]:
knn_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1: Manhattan Distance, 2: Euclidean Distance
}


For Gradient Boosting, you might tune the learning rate, number of estimators, and maximum depth of the trees.

In [15]:
gb_params = {
    'learning_rate': [0.01, 0.1, 0.05],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [16]:
log_reg = LogisticRegression()
grid_search_log = GridSearchCV(log_reg, param_grid=log_reg_params, cv=5, scoring='accuracy')
grid_search_log.fit(X_train, y_train)

print("Best Logistic Regression Params:", grid_search_log.best_params_)
print("Best Logistic Regression Accuracy:", grid_search_log.best_score_)


Best Logistic Regression Params: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best Logistic Regression Accuracy: 0.8153586912987165


In [17]:
rf_clf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf_clf, param_grid=rf_params, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

print("Best Random Forest Params:", grid_search_rf.best_params_)
print("Best Random Forest Accuracy:", grid_search_rf.best_score_)


KeyboardInterrupt: 

In [19]:
knn_clf = KNeighborsClassifier()
grid_search_knn = GridSearchCV(knn_clf, param_grid=knn_params, cv=5, scoring='accuracy')
grid_search_knn.fit(X_train, y_train)

print("Best KNN Params:", grid_search_knn.best_params_)
print("Best KNN Accuracy:", grid_search_knn.best_score_)


KeyboardInterrupt: 

In [20]:
gb_clf = GradientBoostingClassifier(random_state=42)
grid_search_gb = GridSearchCV(gb_clf, param_grid=gb_params, cv=5, scoring='accuracy')
grid_search_gb.fit(X_train, y_train)

print("Best Gradient Boosting Params:", grid_search_gb.best_params_)
print("Best Gradient Boosting Accuracy:", grid_search_gb.best_score_)


KeyboardInterrupt: 