In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from google.colab import drive
import pandas as pd
import plotly.express as px

# Mount Google Drive
drive.mount('/content/drive')

# Load data
df = pd.read_csv('/content/drive/My Drive/emotions.csv')

# Features and labels
X = df['Comment']
y = df['Emotion']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Vectorize text data
vec = TfidfVectorizer(max_df=0.95, min_df=3, stop_words='english')  # Adjusted max_df
X_train_vectorized = vec.fit_transform(X_train)
X_test_vectorized = vec.transform(X_test)

# Initialize and train the classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_vectorized, y_train)

# Training and test accuracy
y_train_pred = classifier.predict(X_train_vectorized)
y_test_pred = classifier.predict(X_test_vectorized)

print("Train accuracy:", accuracy_score(y_train, y_train_pred))
print("Test accuracy:", accuracy_score(y_test, y_test_pred))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train accuracy: 0.9983152827918171
Test accuracy: 0.9472502805836139


In [3]:

classifier =RandomForestClassifier()
classifier.fit(X_train_vectorized, y_train)

y_pred = classifier.predict(X_test_vectorized)

print("Test accuracy:", accuracy_score(y_test, y_pred))

y_pred = classifier.predict(X_train_vectorized)

print("Train accuracy:", accuracy_score(y_train, y_pred))

from sklearn.model_selection import GridSearchCV

params = {
   'n_estimators': [100, 200, 300],
   'max_features': [10, 20, 30],
   'min_samples_split': [4, 6, 8],
   'min_samples_leaf': [4, 6, 8]
}

grid = GridSearchCV(classifier, params, cv = 3, verbose = 3)
model = grid.fit(X_train_vectorized, y_train)



print('Best hyperparameters are '+str(model.best_params_))


final_model = model.best_estimator_
y_pred = final_model.predict(X_test_vectorized)

print("Test accuracy:", accuracy_score(y_test, y_pred))


y_pred = final_model.predict(X_train_vectorized)

print("Train accuracy:", accuracy_score(y_train, y_pred))


Test accuracy: 0.9455667789001122
Train accuracy: 0.9983152827918171
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV 1/3] END max_features=10, min_samples_leaf=4, min_samples_split=4, n_estimators=100;, score=0.908 total time=   0.9s
[CV 2/3] END max_features=10, min_samples_leaf=4, min_samples_split=4, n_estimators=100;, score=0.888 total time=   1.5s
[CV 3/3] END max_features=10, min_samples_leaf=4, min_samples_split=4, n_estimators=100;, score=0.894 total time=   0.5s
[CV 1/3] END max_features=10, min_samples_leaf=4, min_samples_split=4, n_estimators=200;, score=0.906 total time=   1.1s
[CV 2/3] END max_features=10, min_samples_leaf=4, min_samples_split=4, n_estimators=200;, score=0.903 total time=   1.1s
[CV 3/3] END max_features=10, min_samples_leaf=4, min_samples_split=4, n_estimators=200;, score=0.918 total time=   1.1s
[CV 1/3] END max_features=10, min_samples_leaf=4, min_samples_split=4, n_estimators=300;, score=0.903 total time=   1.0s
[CV 2/3] END max_featu

I suppose the problem below arose because the "y_test" of the final model was not assigned. Here the "y_test" is from the previous model and there is a mismatch between the "y_test" and the "y_pre" in the lines below.

In [8]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

import plotly.express as px

fig = px.imshow(cm,
                labels=dict(x="Predicted values", y="Actual values", color="Productivity"),
                x=['anger', 'fear', 'joy'],
                y=['anger', 'fear', 'joy']
               )


fig.update_xaxes(side="bottom")

fig.update_traces(text=cm, texttemplate="%{text}")

fig.show()

print("Classification Report:\n", classification_report(y_test, y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [1782, 4155]