In [6]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Read the train / test / validation CSV splits.
trainData = pd.read_csv('/content/trainData.csv')
testData = pd.read_csv('/content/testData.csv')
valData = pd.read_csv('/content/validationData.csv')

# Rows without a 'Body' value cannot be tokenized later, so drop them.
trainData = trainData.dropna(subset=['Body'])
testData = testData.dropna(subset=['Body'])
valData = valData.dropna(subset=['Body'])

# Features are the raw 'Body' text; targets are the 'label' column.
X_train, y_train = trainData['Body'], trainData['label']
X_test, y_test = testData['Body'], testData['label']
X_val, y_val = valData['Body'], valData['label']

# Whitespace tokenizer shared by embedding training and vectorization
def tokenize_text(text):
    """Split ``text`` into tokens on runs of whitespace.

    No lower-casing or punctuation stripping is applied; leading and
    trailing whitespace produce no empty tokens.
    """
    tokens = text.split()
    return tokens

# Train Word2Vec embeddings on the tokenized training bodies only.
# min_count=1 keeps every token seen in the training text in the vocabulary.
model = Word2Vec(
    sentences=list(map(tokenize_text, X_train)),
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    """Turn raw documents into fixed-length vectors by averaging word embeddings.

    Each document is tokenized with ``tokenize_text``; the vectors of the
    tokens found in ``model.wv`` are averaged. A document with no known
    token is mapped to an all-zero vector.

    Parameters
    ----------
    model : trained embedding model
        Must expose ``vector_size`` and a ``wv`` lookup that supports
        ``token in wv`` and ``wv[token]``.
    """

    def __init__(self, model):
        # Store the constructor argument only. Derived values are computed
        # on demand so they cannot go stale after ``set_params(model=...)``.
        self.model = model

    @property
    def size(self):
        """Embedding dimensionality, always read from the current ``model``."""
        return self.model.vector_size

    def fit(self, X, y=None):
        """Do nothing (the embedding model is already trained); return ``self``."""
        return self

    def transform(self, X):
        """Return an array with one averaged embedding row per document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Documents to vectorize.

        Returns
        -------
        numpy.ndarray
            One row per document, each of length ``self.size``.
        """
        wv = self.model.wv
        # Fallback for documents with no in-vocabulary token; np.mean does not
        # mutate its input, so one shared list is safe to reuse.
        fallback = [np.zeros(self.size)]
        return np.array([
            np.mean([wv[word] for word in tokenize_text(text) if word in wv]
                    or fallback, axis=0)
            for text in X
        ])

# Pipeline: averaged Word2Vec features -> standardization -> logistic regression.
pipeline = Pipeline([
    ('w2v', Word2VecVectorizer(model)),
    ('scale', StandardScaler()),  # Add scaling
    ('clf', LogisticRegression(random_state=42, max_iter=10000, solver='lbfgs')),
])

# Hyper-parameter grid for GridSearchCV (3 values of C x 3 solvers).
parameters = {
    'clf__C': [0.1, 1, 10],
    'clf__solver': ['lbfgs', 'sag', 'saga']
}

# The search runs 5-fold cross-validation on the validation split.
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_val, y_val)

# Report the best cross-validation score and the winning parameters.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
# best_params_ holds only the searched parameters. Pushing the full
# best_estimator_.get_params() dict into set_params would also replace the
# pipeline's steps with the objects owned by grid_search.best_estimator_,
# so the fit below would silently refit that estimator as well.
best_parameters = grid_search.best_params_
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Use the best parameters to re-train the final model on the training split.
pipeline.set_params(**best_parameters)
pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out test data.
predictions = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))
print("Classification Report:")
print(classification_report(y_test, predictions))


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best score: 0.941
Best parameters set:
	clf__C: 10
	clf__solver: 'lbfgs'
Accuracy: 0.9601731601731601
Confusion Matrix:
[[747  16]
 [ 30 362]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       763
           1       0.96      0.92      0.94       392

    accuracy                           0.96      1155
   macro avg       0.96      0.95      0.96      1155
weighted avg       0.96      0.96      0.96      1155

