In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

# Load the three pre-split datasets, discarding rows that have no 'Body' text
trainData = pd.read_csv('/content/trainData.csv').dropna(subset=['Body'])
testData = pd.read_csv('/content/testData.csv').dropna(subset=['Body'])
valData = pd.read_csv('/content/validationData.csv').dropna(subset=['Body'])

# Split each frame into the raw text feature ('Body') and the target ('label')
X_train = trainData['Body']
y_train = trainData['label']
X_test = testData['Body']
y_test = testData['label']
X_val = valData['Body']
y_val = valData['label']


# Define a pipeline combining a bag-of-words feature extractor with a
# logistic regression classifier
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', LogisticRegression(random_state=42)),
])

# Hyperparameter grid; keys use the '<step name>__<parameter>' convention
parameters = {
    'bow__max_df': [0.75, 0.85],
    'bow__ngram_range': [(1, 1), (1, 2)],  # unigrams, or unigrams + bigrams
    'clf__C': [0.1, 1, 10],
}

# Using GridSearchCV to find the best parameters on the validation set.
# NOTE(review): the search is 5-fold cross-validation *within* the validation
# split only; the training split is not seen until the final fit below.
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_val, y_val)

# Print best score and parameters.
# `best_params_` holds only the tuned values. The previous code used
# `best_estimator_.get_params()`, which also contains the fitted step objects
# ('steps', 'bow', 'clf'); passing those to `set_params` made `pipeline` share
# its estimators with `grid_search.best_estimator_`, so the re-fit below
# silently overwrote the estimator stored on the grid search.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_params_
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Use the best parameters to re-train the final model on the training split
pipeline.set_params(**best_parameters)
pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out test data
predictions = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))
print("Classification Report:")
print(classification_report(y_test, predictions))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best score: 0.954
Best parameters set:
	bow__max_df: 0.75
	bow__ngram_range: (1, 1)
	clf__C: 1
Accuracy: 0.9731601731601731
Confusion Matrix:
[[754   9]
 [ 22 370]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       763
           1       0.98      0.94      0.96       392

    accuracy                           0.97      1155
   macro avg       0.97      0.97      0.97      1155
weighted avg       0.97      0.97      0.97      1155



In [4]:
# Define a pipeline combining a bag-of-words feature extractor with a
# random forest classifier
pipeline_rf = Pipeline([
    ('bow', CountVectorizer()),  # raw token counts
    ('clf', RandomForestClassifier(random_state=42)),
])

# Parameters to tune; keys use the '<step name>__<parameter>' convention
parameters_rf = {
    'bow__max_df': [0.75, 0.85],
    'bow__ngram_range': [(1, 1), (1, 2)],  # unigrams, or unigrams + bigrams
    'clf__n_estimators': [100, 300],  # number of trees in the forest
    'clf__max_depth': [10, 20, None]  # maximum depth of the tree
}

# Using GridSearchCV to find the best parameters on the validation set.
# NOTE(review): the search is 5-fold cross-validation *within* the validation
# split only; the training split is not seen until the final fit below.
grid_search_rf = GridSearchCV(pipeline_rf, parameters_rf, cv=5, n_jobs=-1, verbose=1)
grid_search_rf.fit(X_val, y_val)

# Print best score and parameters.
# `best_params_` holds only the tuned values. The previous code used
# `best_estimator_.get_params()`, which also contains the fitted step objects
# ('steps', 'bow', 'clf'); passing those to `set_params` made `pipeline_rf`
# share its estimators with `grid_search_rf.best_estimator_`, so the re-fit
# below silently overwrote the estimator stored on the grid search.
print("Best score (Random Forest): %0.3f" % grid_search_rf.best_score_)
print("Best parameters set (Random Forest):")
best_parameters_rf = grid_search_rf.best_params_
for param_name in sorted(parameters_rf.keys()):
    print("\t%s: %r" % (param_name, best_parameters_rf[param_name]))

# Use the best parameters to re-train the final model on the training split
pipeline_rf.set_params(**best_parameters_rf)
pipeline_rf.fit(X_train, y_train)

# Evaluate the model on the held-out test data
predictions_rf = pipeline_rf.predict(X_test)
print("Accuracy (Random Forest):", accuracy_score(y_test, predictions_rf))
print("Confusion Matrix (Random Forest):")
print(confusion_matrix(y_test, predictions_rf))
print("Classification Report (Random Forest):")
print(classification_report(y_test, predictions_rf))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score (Random Forest): 0.950
Best parameters set (Random Forest):
	bow__max_df: 0.75
	bow__ngram_range: (1, 1)
	clf__max_depth: None
	clf__n_estimators: 100
Accuracy (Random Forest): 0.9705627705627705
Confusion Matrix (Random Forest):
[[757   6]
 [ 28 364]]
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       763
           1       0.98      0.93      0.96       392

    accuracy                           0.97      1155
   macro avg       0.97      0.96      0.97      1155
weighted avg       0.97      0.97      0.97      1155



In [6]:
# Define a pipeline combining a bag-of-words feature extractor with a
# support vector classifier
pipeline_svm = Pipeline([
    ('bow', CountVectorizer()),  # raw token counts
    ('clf', SVC(random_state=42)),
])

# Parameters to tune; keys use the '<step name>__<parameter>' convention
parameters_svm = {
    'bow__max_df': [0.75, 0.85],
    'bow__ngram_range': [(1, 1), (1, 2)],  # unigrams, or unigrams + bigrams
    'clf__C': [0.1, 1, 10],
    'clf__kernel': ['linear', 'rbf']
}

# Using GridSearchCV to find the best parameters on the validation set.
# NOTE(review): the search is 5-fold cross-validation *within* the validation
# split only; the training split is not seen until the final fit below.
grid_search_svm = GridSearchCV(pipeline_svm, parameters_svm, cv=5, n_jobs=-1, verbose=1)
grid_search_svm.fit(X_val, y_val)

# Print best score and parameters.
# `best_params_` holds only the tuned values. The previous code used
# `best_estimator_.get_params()`, which also contains the fitted step objects
# ('steps', 'bow', 'clf'); passing those to `set_params` made `pipeline_svm`
# share its estimators with `grid_search_svm.best_estimator_`, so the re-fit
# below silently overwrote the estimator stored on the grid search.
print("Best score (SVM): %0.3f" % grid_search_svm.best_score_)
print("Best parameters set (SVM):")
best_parameters_svm = grid_search_svm.best_params_
for param_name in sorted(parameters_svm.keys()):
    print("\t%s: %r" % (param_name, best_parameters_svm[param_name]))

# Use the best parameters to re-train the final model on the training split
pipeline_svm.set_params(**best_parameters_svm)
pipeline_svm.fit(X_train, y_train)

# Evaluate the model on the held-out test data
predictions_svm = pipeline_svm.predict(X_test)
print("Accuracy (SVM):", accuracy_score(y_test, predictions_svm))
print("Confusion Matrix (SVM):")
print(confusion_matrix(y_test, predictions_svm))
print("Classification Report (SVM):")
print(classification_report(y_test, predictions_svm))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best score (SVM): 0.942
Best parameters set (SVM):
	bow__max_df: 0.75
	bow__ngram_range: (1, 1)
	clf__C: 10
	clf__kernel: 'rbf'
Accuracy (SVM): 0.9627705627705627
Confusion Matrix (SVM):
[[761   2]
 [ 41 351]]
Classification Report (SVM):
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       763
           1       0.99      0.90      0.94       392

    accuracy                           0.96      1155
   macro avg       0.97      0.95      0.96      1155
weighted avg       0.96      0.96      0.96      1155

