In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
mmt = pd.read_csv(r"C:\Users\mohds\Downloads\Sem -3\NLP Project\Project\Sentiment Dataset\MakeMyTrip.csv")
yatra = pd.read_csv(r"C:\Users\mohds\Downloads\Sem -3\NLP Project\Project\Sentiment Dataset\yatra.csv")
booking = pd.read_csv(r"C:\Users\mohds\Downloads\Sem -3\NLP Project\Project\Sentiment Dataset\booking.csv")
goibibo = pd.read_csv(r"C:\Users\mohds\Downloads\Sem -3\NLP Project\Project\Sentiment Dataset\Goibibo.csv")

In [3]:
data = pd.concat([mmt, yatra, booking, goibibo], axis = 0)
data.reset_index(inplace=True)
data.drop('index', axis=1, inplace=True)
data.dropna(inplace=True)
data.isna().sum()

review_description    0
polarity              0
Sentiment             0
dtype: int64

In [4]:
print(f"Shape of Data: {data.shape}")
data

Shape of Data: (7992, 3)


Unnamed: 0,review_description,polarity,Sentiment
0,different another try connect service call dis...,-0.250000,Negative
1,hope review someone bad mood trip price higher...,0.135714,Positive
2,abysmally poor service would mistake rely make...,-0.133333,Negative
3,many lack try senior able time senior citizen ...,0.218750,Positive
4,immediate travel encounter technical issue tra...,0.300000,Positive
...,...,...,...
7989,cash scam waste keep open time answer earn tra...,-0.078571,Negative
7990,highly worst service name covid care fool arou...,-0.400000,Negative
7991,fact try level best get money back unfortunate...,0.150000,Positive
7992,actively last last week first time raise suppo...,0.088889,Positive


In [5]:
X_train, X_test, y_train, y_test = train_test_split(data['review_description'], data['Sentiment'], test_size=0.2, random_state=42)

# Convert text data to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(use_idf=True, 
                                   stop_words='english', 
                                   analyzer='word')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [6]:
X_train_tfidf

<6393x3631 sparse matrix of type '<class 'numpy.float64'>'
	with 82890 stored elements in Compressed Sparse Row format>

### Decision Tree

In [37]:
params = {'max_depth': [2,3,4,5], 
          'min_samples_split': [100, 150, 200, 250, 225],
          'min_samples_leaf': [50, 75, 115, 100]}

In [38]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
gridcv = GridSearchCV(DecisionTreeClassifier(), 
                      params, 
                      verbose = 1,
                      cv = 10)
gridcv.fit(X_train_tfidf, y_train)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [39]:
from sklearn.tree import DecisionTreeClassifier
ModelDT = DecisionTreeClassifier(**gridcv.best_params_)

In [40]:
ModelDT.fit(X_train_tfidf, y_train)

In [41]:
y_pred_train_DT = ModelDT.predict(X_train_tfidf)
y_pred_test_DT = ModelDT.predict(X_test_tfidf)

In [51]:
print("\nClassification Report on Train (Decision Tree):\n", classification_report(y_train, y_pred_train_DT))


Classification Report on Train (Decision Tree):
               precision    recall  f1-score   support

    Negative       0.92      0.73      0.81      3059
    Positive       0.79      0.95      0.86      3334

    accuracy                           0.84      6393
   macro avg       0.86      0.84      0.84      6393
weighted avg       0.85      0.84      0.84      6393



In [52]:
print("\nClassification Report on Test (Decision Tree):\n", classification_report(y_test, y_pred_test_DT))


Classification Report on Test (Decision Tree):
               precision    recall  f1-score   support

    Negative       0.92      0.71      0.80       756
    Positive       0.78      0.94      0.86       843

    accuracy                           0.83      1599
   macro avg       0.85      0.83      0.83      1599
weighted avg       0.85      0.83      0.83      1599



### Random Forest

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
gridcv = GridSearchCV(RandomForestClassifier(), 
                      params, 
                      verbose = 1,
                      cv = 10)
gridcv.fit(X_train_tfidf, y_train)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [46]:
from sklearn.ensemble import RandomForestClassifier
ModelRF = RandomForestClassifier(**gridcv.best_params_, 
                                 n_estimators = 25, 
                                 criterion = 'gini', 
                                 max_features='sqrt'
  )

In [47]:
ModelRF.fit(X_train_tfidf, y_train)

In [48]:
y_pred_train_RF = ModelRF.predict(X_train_tfidf)
y_pred_test_RF = ModelRF.predict(X_test_tfidf)

In [49]:
print("\nClassification Report on Train (Random Forest):\n", classification_report(y_train, y_pred_train_RF))


Classification Report on Train (Random Forest):
               precision    recall  f1-score   support

    Negative       0.85      0.65      0.74      3059
    Positive       0.74      0.89      0.81      3334

    accuracy                           0.78      6393
   macro avg       0.79      0.77      0.77      6393
weighted avg       0.79      0.78      0.77      6393



In [50]:
print("\nClassification Report on Test (Random Forest):\n", classification_report(y_test, y_pred_test_RF))


Classification Report on Test (Random Forest):
               precision    recall  f1-score   support

    Negative       0.85      0.63      0.73       756
    Positive       0.73      0.90      0.81       843

    accuracy                           0.77      1599
   macro avg       0.79      0.77      0.77      1599
weighted avg       0.79      0.77      0.77      1599



### Gradient Boosting

In [6]:
params = {'max_depth': [2,3,4,5], 
          'min_samples_split': [100, 150, 200, 250, 225],
          'min_samples_leaf': [50, 75, 115, 100], 
          'learning_rate':[0.7, 0.6], 
          'n_estimators': [25]
         }

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(random_state=42,loss="log_loss",criterion="squared_error")

gscv_GBM = GridSearchCV(estimator=model, 
                    param_grid=params,
                    cv=10,
                    verbose=1,
                    n_jobs=-1,
                    scoring='accuracy')

gscv_GBM.fit(X_train_tfidf, y_train)

Fitting 10 folds for each of 160 candidates, totalling 1600 fits


In [8]:
Model_GB = GradientBoostingClassifier(**gscv_GBM.best_params_)
Model_GB.fit(X_train_tfidf, y_train)

In [9]:
y_pred_train_GB = Model_GB.predict(X_train_tfidf)
y_pred_test_GB = Model_GB.predict(X_test_tfidf)

In [10]:
print("\nClassification Report on Train (Gradient Boost):\n", classification_report(y_train, y_pred_train_GB))


Classification Report on Train (Gradient Boost):
               precision    recall  f1-score   support

    Negative       0.94      0.84      0.89      3059
    Positive       0.87      0.95      0.91      3334

    accuracy                           0.90      6393
   macro avg       0.91      0.90      0.90      6393
weighted avg       0.90      0.90      0.90      6393



In [11]:
print("\nClassification Report on Test (Gradient Boost):\n", classification_report(y_test, y_pred_test_GB))


Classification Report on Test (Gradient Boost):
               precision    recall  f1-score   support

    Negative       0.93      0.83      0.88       756
    Positive       0.86      0.94      0.90       843

    accuracy                           0.89      1599
   macro avg       0.89      0.89      0.89      1599
weighted avg       0.89      0.89      0.89      1599



### Navie-Bayes

In [33]:
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB(alpha=1)  # Laplace smoothing (alpha) = 1
model.fit(X_train_tfidf, y_train)

In [34]:
y_pred_train_NB = model.predict(X_train_tfidf)
y_pred_test_NB = model.predict(X_test_tfidf)

In [35]:
print("\nClassification Report on Train (Naive-Bayes):\n", classification_report(y_train, y_pred_train_NB))


Classification Report on Train (Naive-Bayes):
               precision    recall  f1-score   support

    Negative       0.81      0.90      0.85      3059
    Positive       0.90      0.80      0.85      3334

    accuracy                           0.85      6393
   macro avg       0.85      0.85      0.85      6393
weighted avg       0.85      0.85      0.85      6393



In [36]:
print("\nClassification Report on Test (Naive-Bayes):\n", classification_report(y_test, y_pred_test_NB))


Classification Report on Test (Naive-Bayes):
               precision    recall  f1-score   support

    Negative       0.76      0.86      0.81       756
    Positive       0.86      0.76      0.81       843

    accuracy                           0.81      1599
   macro avg       0.81      0.81      0.81      1599
weighted avg       0.81      0.81      0.81      1599



### Accuracy Comparision

In [67]:
from sklearn.metrics import accuracy_score
def model_evaluation(prediction, actual):
    accuracy = accuracy_score(prediction, actual)
    return accuracy

In [72]:
dt_train = model_evaluation(y_pred_train_DT, y_train)
dt_test = model_evaluation(y_pred_test_DT, y_test)
rf_train = model_evaluation(y_pred_train_RF, y_train)
rf_test = model_evaluation(y_pred_test_RF, y_test)
gb_train = model_evaluation(y_pred_train_GB, y_train)
gb_test = model_evaluation(y_pred_test_GB, y_test)
nb_train = model_evaluation(y_pred_train_NB, y_train)
nb_test = model_evaluation(y_pred_test_NB, y_test)

In [77]:
print(f"Accuracy of the 'Decision Tree Train Model' is : {np.round(dt_train*100)}%")
print(f"Accuracy of the 'Decision Tree Test Model' is : {np.round(dt_test*100)}%")
print()
print(f"Accuracy of the 'Random Forest Train Model' is : {np.round(rf_train*100)}%")
print(f"Accuracy of the 'Random Forest Test Model' is : {np.round(rf_test*100)}%")
print()
print(f"Accuracy of the 'Gradient Boosting Train Model' is : {np.round(gb_train*100)}%")
print(f"Accuracy of the 'Gradient Boosting Test Model' is : {np.round(gb_test*100)}%")
print()
print(f"Accuracy of the 'Naive-Bayes Train Model' is : {np.round(nb_train*100)}%")
print(f"Accuracy of the 'Naive-Bayes Test Model' is : {np.round(nb_test*100)}%")

Accuracy of the 'Decision Tree Train Model' is : 84.0%
Accuracy of the 'Decision Tree Test Model' is : 83.0%

Accuracy of the 'Random Forest Train Model' is : 78.0%
Accuracy of the 'Random Forest Test Model' is : 77.0%

Accuracy of the 'Gradient Boosting Train Model' is : 90.0%
Accuracy of the 'Gradient Boosting Test Model' is : 89.0%

Accuracy of the 'Naive-Bayes Train Model' is : 85.0%
Accuracy of the 'Naive-Bayes Test Model' is : 81.0%


- The models show a progression in performance from Decision Tree to Random Forest to Naive-Bayes and finally to Gradient Boosting.
- `Gradient Boosting` with an accuracy of `90% on Train` and `89% on Test`, appears to be the most effective model.

X_train, X_test, y_train, y_test = train_test_split(data['review_description'], data['Sentiment'], test_size=0.2, random_state=42)

count_vectorizer = CountVectorizer(stop_words='english')
X_train_cv = count_vectorizer.fit_transform(X_train)
X_train_dtm = pd.DataFrame(X_train_cv.toarray(), columns=count_vectorizer.get_feature_names_out())
X_test_dtm = count_vectorizer.transform(X_test)

##### Model Export

In [12]:
import pickle

pickle.dump(Model_GB, open(r"C:\Users\mohds\Downloads\Sem -3\NLP Project\Project\Deployment\build.pkl", 'wb'))

In [13]:
import pickle

pickle.dump(tfidf_vectorizer, open(r"C:\Users\mohds\Downloads\Sem -3\NLP Project\Project\Deployment\tfidf_vectorizer.pkl", 'wb'))