## EECS 4412 Project

Author: Jason Lau

Student#: 218835066

In [None]:
from google.colab import drive
import pandas as pd
# Make Google Drive visible to this Colab runtime.
drive.mount('/content/drive')

# Locations of the Yelp training and test CSVs on the mounted Drive.
train_csv_path = '/content/drive/MyDrive/train_yelp_60k.csv'
test_csv_path = '/content/drive/MyDrive/test_yelp_60k.csv'

# Training data: the ID column is discarded as soon as the file is read.
train_df = pd.read_csv(train_csv_path).drop(columns=["ID"])

# Test data: read the file, set the IDs aside (they are needed again when the
# predictions are written out), then drop the column from the working frame.
pre_test_df = pd.read_csv(test_csv_path)
ids = pre_test_df['ID']
test_df = pre_test_df.drop(columns=["ID"])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

#NLTK data: tokenizer models and the stopword corpus used by the preprocessing below
for resource in ('punkt_tab', 'punkt', 'stopwords'):
  nltk.download(resource)

#Preprocessing
#Stopword set is loaded on first use and then reused by every later call,
#instead of being re-read from the corpus for each review
_STOP_WORDS_CACHE = {}

def _english_stop_words():
  """Return the NLTK English stopword list as a frozenset, building it only once."""
  if "english" not in _STOP_WORDS_CACHE:
    _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words('english'))
  return _STOP_WORDS_CACHE["english"]

def _is_punctuation(token):
  """Return True when every character of the token is in string.punctuation."""
  return all(ch in string.punctuation for ch in token)

def preprocess_text(text):
  """Normalize one piece of text for the bag-of-words vectorizer.

  Steps: lowercase, tokenize with word_tokenize, drop tokens made up only of
  punctuation characters, drop English stopwords, and join the remaining
  tokens with single spaces.

  Args:
    text: The raw text. None and float NaN are treated as an empty document;
      any other non-string value is converted with str() first.

  Returns:
    The cleaned text as a single space-separated string ("" for missing input).
  """
  #Missing values would otherwise fail on .lower()
  if not isinstance(text, str):
    if text is None or (isinstance(text, float) and text != text):
      return ""
    text = str(text)

  #Lowercase the text
  text = text.lower()

  #Tokenization
  tokens = word_tokenize(text)

  #Remove punctuation and stopwords in one pass. The punctuation check is
  #per character: a plain `token not in string.punctuation` is a substring
  #test and lets multi-character tokens such as "..." through.
  stop_words = _english_stop_words()
  kept = [
    token for token in tokens
    if not _is_punctuation(token) and token not in stop_words
  ]

  #Recombine tokens into a string
  return " ".join(kept)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pandas as pd

#classifier
clf = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=5)

X = train_df["Text"].apply(preprocess_text)  #Features, preprocessing
y = train_df["Class"]  #Target

#Convert string labels to numeric values
le = LabelEncoder()
y_encoded = le.fit_transform(y)

#Split data into train and test sets (80/20).
#The text, the encoded labels and the original string labels go through ONE
#call so all three are guaranteed to stay row-aligned, rather than relying on
#a second call with the same random_state to reproduce the split.
(X_train, X_test,
 y_train_encoded, y_test_encoded,
 y_train, y_test) = train_test_split(X, y_encoded, y, test_size=0.2, random_state=42)

#BOW approach using TfidfVectorizer (fitted on the training split only)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

#Applying SMOTE for class balancing (training split only; the hold-out split is not resampled)
smote = SMOTE(random_state=42)
X_train_bow_smote, y_train_encoded_smote = smote.fit_resample(X_train_bow, y_train_encoded)

#Feature selection using SelectKBest and chi2
k = 1500  #Number of features to select
select_k_best = SelectKBest(chi2, k=k)

#Fit the selector on the resampled training data, then apply it to the hold-out split
X_train_bow_selected = select_k_best.fit_transform(X_train_bow_smote, y_train_encoded_smote)
X_test_bow_selected = select_k_best.transform(X_test_bow)

#XGBoost Evaluation
print("Evaluating XGBoost:")

#BOW Evaluation
clf.fit(X_train_bow_selected, y_train_encoded_smote)
y_pred_encoded = clf.predict(X_test_bow_selected)

#Convert numeric predictions back to original labels for evaluation
y_pred_bow = le.inverse_transform(y_pred_encoded)

bow_accuracy = accuracy_score(y_test, y_pred_bow)
bow_precision = precision_score(y_test, y_pred_bow, average='macro')
bow_recall = recall_score(y_test, y_pred_bow, average='macro')
bow_f1 = f1_score(y_test, y_pred_bow, average='macro')

#Show Results
#The title goes on its own line: printing it together with the report pushes
#the report's header row to the right and misaligns the columns.
print("BOW Classification Report For XGBoost")
print(classification_report(y_test, y_pred_bow))
print(f"Accuracy: {bow_accuracy:.4f}  Macro precision: {bow_precision:.4f}  "
      f"Macro recall: {bow_recall:.4f}  Macro F1: {bow_f1:.4f}")



#Final predictions on the provided test file, using the fitted XGBoost pipeline

#Clean the test text with the same preprocessing function used for training
test_texts = test_df["Text"].apply(preprocess_text)

#Reuse the already-fitted vectorizer and feature selector (transform only, no refit)
X_final_test_bow = vectorizer.transform(test_texts)
X_final_test_selected = select_k_best.transform(X_final_test_bow)

#Predict encoded classes, then map them back to the original labels
final_preds_encoded = clf.predict(X_final_test_selected)
final_preds = le.inverse_transform(final_preds_encoded)

#Show how many test rows were assigned to each predicted class
prediction_counts = pd.Series(final_preds).value_counts()
print("XGBoost Predictions:")
print(prediction_counts)

#Pair each prediction with the ID kept from the test file and save to CSV
submission_df = pd.DataFrame({"ID": ids, "Class": final_preds})
submission_df.to_csv("/content/drive/MyDrive/yelp_predictions.csv", index=False)