In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Read the master CSV; the code below uses its 'Body' (text) and 'label' columns.
dataset_path = '/content/masterDatasetFinal.csv'
masterData = pd.read_csv(dataset_path)

# IMPORTANT: run this step every time.
# Rows whose 'Body' is NaN are removed before splitting and vectorizing.
masterData = masterData.dropna(subset=['Body'])

# 80/20 train/test partition of the whole frame, so labels travel with their
# rows; the fixed random_state makes the partition repeatable.
train_data, test_data = train_test_split(
    masterData,
    test_size=0.20,
    random_state=42,
)

# Target vectors for each partition.
y_train = train_data['label']
y_test = test_data['label']

# Persist each partition's labels as a single-column CSV (no index column).
train_data[['label']].to_csv('train_labels.csv', index=False)
test_data[['label']].to_csv('test_labels.csv', index=False)

# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['Body'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['Body'])

def _fit_and_report(model, accuracy_label, matrix_label):
    """Fit *model* on the TF-IDF training data and print its test-set metrics.

    Reads the module-level ``X_train_tfidf``, ``y_train``, ``X_test_tfidf``
    and ``y_test`` prepared earlier in this script.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        accuracy_label: Text printed in front of the accuracy value.
        matrix_label: Heading printed on the line above the confusion matrix.

    Returns:
        Tuple ``(predictions, accuracy, confusion)`` computed on the test set.
    """
    model.fit(X_train_tfidf, y_train)
    predictions = model.predict(X_test_tfidf)

    score = accuracy_score(y_test, predictions)
    print(accuracy_label, score)

    matrix = confusion_matrix(y_test, predictions)
    print(matrix_label)
    print(matrix)

    return predictions, score, matrix


# Each model is trained and reported in turn. y_pred / accuracy / conf_matrix
# are rebound after every model, so once this section finishes they hold the
# SVM results (the last model evaluated).

# Logistic regression
logistic_model = LogisticRegression(random_state=42)
y_pred, accuracy, conf_matrix = _fit_and_report(
    logistic_model,
    "Accuracy Logistic Regression:",
    "Confusion Matrix Logistic Regression:",
)

# Random forest
random_forest_model = RandomForestClassifier(random_state=42)
y_pred, accuracy, conf_matrix = _fit_and_report(
    random_forest_model,
    "Accuracy of Random Forest:",
    "Confusion Matrix Random Forest :",
)

# Linear-kernel SVM
# NOTE(review): probability=True makes fit() slower and is only needed for
# predict_proba, which nothing in this section calls. It is kept in case
# later code relies on it -- confirm, and drop it if unused.
svm_model = SVC(kernel='linear', random_state=42, probability=True)
y_pred, accuracy, conf_matrix = _fit_and_report(
    svm_model,
    "Accuracy SVM:",
    "Confusion Matrix SVM:",
)

Accuracy Logistic Regression: 0.9571150097465887
Confusion Matrix Logistic Regression:
[[1001   19]
 [  47  472]]
Accuracy of Random Forest: 0.9525666016894087
Confusion Matrix Random Forest :
[[1010   10]
 [  63  456]]
Accuracy SVM: 0.9740090968161144
Confusion Matrix SVM:
[[1000   20]
 [  20  499]]
