In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train and evaluate a Random Forest on bigram counts of the review text.
# Steps: load CSV -> clean text column -> train/test split -> bigram
# document-term matrices -> fit classifier -> print per-class metrics.

# Load your CSV data into a pandas DataFrame
df = pd.read_csv('../../../data/processed/tokenized_data.csv')

# Preprocessing (tokenization, removing stopwords, etc.) if needed.
# CountVectorizer lower-cases every document, so each entry must be a str:
# missing reviews become empty strings and any non-string cell is stringified.
# This is done once, on a derived Series, BEFORE splitting -- it avoids
# in-place mutation of the split outputs (SettingWithCopyWarning) and leaves
# `df` itself untouched.
review_text = df['reviewText'].fillna('').astype(str)

# Split the data into training and testing sets (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    review_text,
    df['overall'],
    test_size=0.2,
    random_state=42,
)

# Initialize the CountVectorizer for bigrams only
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2))  # Only bigrams

# Fit the vocabulary on the training data only, then build its
# document-term matrix
X_train_ngrams = ngram_vectorizer.fit_transform(X_train)

# Transform the test data with the already-fitted vectorizer so that no
# vocabulary information leaks from the test set
X_test_ngrams = ngram_vectorizer.transform(X_test)

# Initialize and train the Random Forest classifier.
# Seeded with the same value as the split so the reported metrics are
# repeatable from run to run.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_ngrams, y_train)

# Evaluate the model on the held-out test set
y_pred = rf_model.predict(X_test_ngrams)
print(classification_report(y_test, y_pred))