# Models Training 3
In this notebook we explore the impact of using word embeddings when training the models from the previous notebooks.

## Imports

In [None]:
!pip install -q spacy gensim nltk
!python -m spacy download es_core_news_md
!pip install xgboost

In [50]:
import numpy as np
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# import custom helper module
import importlib
import helpers
importlib.reload(helpers)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<module 'helpers' from '/notebooks/helpers.py'>

## Load dataset

In [None]:
# Load the train/test split through the project helper module.
# NOTE(review): `force_reload=True` presumably bypasses a cached split and rebuilds
# it from the CSV — confirm in helpers.load_dataset.
X_train, X_test, y_train, y_test = helpers.load_dataset("training_data_clean.csv", force_reload=True)

# Print a sample from the training split as a sanity check
# (presumably one example with its label — confirm in helpers.print_text)
helpers.print_text(X_train, y_train)


In [31]:
# Print a training sample with its label (output format: "[index] text --> label").
# NOTE(review): this repeats the print_text call at the end of the previous cell.
helpers.print_text(X_train, y_train)

----------------------------------------------------------------------------------------------------
[9780] jade helm really martial law texas ranger relay see inside military train --> 0
----------------------------------------------------------------------------------------------------


## Tokenization and Word2Vec Embeddings

In [9]:
# Split every news title into word tokens with NLTK; both splits are
# processed the same way.
X_train_tok, X_test_tok = (
    titles.apply(word_tokenize) for titles in (X_train, X_test)
)

# Preview the first ten tokenized training titles
X_train_tok.head(10)

0    [republicans, punish, georgia, governor, refus...
1    [father, soldier, slay, niger, defend, preside...
2    [south, dakotas, governor, veto, loosen, conce...
3    [turkeys, erdogan, say, take, jerusalem, resol...
4    [bill, maher, insult, trump, suppose, masculin...
5    [dem, senator, switch, party, call, nfl, owner...
6    [ryan, say, trump, play, constructive, role, h...
7    [epa, chief, want, scientist, debate, climate,...
8    [macron, rebuffs, assad, accusations, france, ...
9     [factbox, trump, fill, top, job, administration]
Name: text, dtype: object

In [10]:
# Collect the tokenized training titles as a plain list of token lists
sentences = X_train_tok.tolist()

# Train the Word2Vec model on the training titles only.
# `seed` fixes the random initialisation and sampling so that the embeddings
# (and every metric computed from them below) can be reproduced. gensim
# documents that a fully deterministic run also needs a single worker thread
# (and a fixed PYTHONHASHSEED), hence workers=1 at the cost of slower training.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep every token, even those seen only once
    workers=1,
    seed=42,
)

# Save the trained model for future use
w2v_model.save("word2vec.model")

In [11]:
# Sanity-check the trained embeddings on the word 'fake':
# look up its vector and its nearest neighbours in the embedding space.
vector = w2v_model.wv['fake']
similar_words = w2v_model.wv.most_similar('fake')

# Show the raw vector first, then the (word, similarity) neighbours
print(vector)
print(similar_words)

[ 0.1009098   0.42369646  0.16905278  0.11438417  0.15395679 -1.0888929
  0.04600015  1.2877456  -0.45858213 -0.00980897 -0.61340445 -0.4006611
  0.35187316  0.3254912   0.05728637 -0.90026647  0.3217223  -0.5676425
  0.08375329 -1.6426866   0.87598896  0.08074313 -0.16266617  0.02741987
 -0.56688917  0.13562736 -0.3372227  -0.18075691 -0.53378326 -0.02543711
  0.21432543  1.1239637   0.5955622  -0.20186833  0.10383294  0.39326224
 -0.7218565  -0.68641555 -0.34389305 -0.72810245  0.50160515 -0.46925777
 -0.45580345 -0.37470594 -0.00776394 -0.7345219  -1.1103567  -0.03710944
  0.78701025  0.23681286 -0.05692272 -0.06359635 -0.01440202 -0.37049296
 -0.18431452  0.1768427  -0.00879263  0.43672043 -0.12425543  0.4377274
  0.36666018  0.3076133  -0.49570087 -0.4651558  -0.30933484  0.8210563
  0.3556036  -0.05598972 -0.5506664   0.17317593 -0.27808043 -0.04978522
  0.04907046 -0.6970299   0.81413674  0.40380546  0.4773325  -0.51699656
 -0.3426096   0.42831302 -0.27656826  0.556901   -0.6685

In [13]:
def sentence_to_vec(sentence, model):
    """Return the mean word vector of a tokenized sentence.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of a single sentence.
    model : object
        Trained embedding model exposing ``wv`` (supports ``token in wv`` and
        ``wv[token]``) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all in-vocabulary tokens, or a
        zero vector of length ``model.vector_size`` when no token is known.
    """
    # Look up only the tokens the model has a vector for
    known_vectors = [model.wv[token] for token in sentence if token in model.wv]

    # Nothing recognised (or empty sentence): fall back to an all-zero embedding
    if not known_vectors:
        return np.zeros(model.vector_size)

    # Average across tokens, keeping the embedding dimension
    return np.mean(known_vectors, axis=0)

# Embed every tokenized title as the average of its word vectors and stack
# the results into one NumPy array per split, ready for model training.
X_train_emb = np.array(
    [sentence_to_vec(tokens, w2v_model) for tokens in X_train_tok]
)
X_test_emb = np.array(
    [sentence_to_vec(tokens, w2v_model) for tokens in X_test_tok]
)

In [17]:
# X_train_emb / X_test_emb hold one averaged Word2Vec sentence embedding per row.
# Scale every row to unit L2 norm so the classifiers see comparable magnitudes.
X_train_norm = normalize(X_train_emb, norm='l2')
X_test_norm = normalize(X_test_emb, norm='l2')

# X_train_norm and X_test_norm now contain the L2-normalized embeddings

In [18]:
# Fit a logistic regression (default hyper-parameters) on the normalized embeddings
clf = LogisticRegression().fit(X_train_norm, y_train)

# Predict the held-out test split and score it
y_pred = clf.predict(X_test_norm)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, followed by per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      3515
           1       0.86      0.84      0.85      3316

    accuracy                           0.85      6831
   macro avg       0.85      0.85      0.85      6831
weighted avg       0.85      0.85      0.85      6831



In [15]:
# Fit a linear-kernel SVM on the normalized embeddings
# (the 'kernel' argument is the obvious knob to experiment with)
svm_model = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train_norm, y_train)

# Predict the held-out test split and score it
y_pred = svm_model.predict(X_test_norm)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, followed by the detailed per-class report
print(f"SVM Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))


SVM Accuracy: 0.8647342995169082
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      3515
           1       0.87      0.85      0.86      3316

    accuracy                           0.86      6831
   macro avg       0.86      0.86      0.86      6831
weighted avg       0.86      0.86      0.86      6831



In [21]:
# Convert the dataset into DMatrix, XGBoost's internal data structure
train_dmatrix = xgb.DMatrix(X_train_norm, label=y_train)
test_dmatrix = xgb.DMatrix(X_test_norm, label=y_test)

# Set up the XGBoost parameters
params = {
    'objective': 'binary:logistic',  # Binary classification, predictions are probabilities
    'eval_metric': 'logloss',        # Logarithmic loss as evaluation metric
    'max_depth': 6,                  # Maximum depth of a tree
    'eta': 0.1,                      # Learning rate
    'subsample': 0.8,                # Fraction of samples used per tree
    'colsample_bytree': 0.8,         # Fraction of features used per tree
    'seed': 42                       # Random seed for reproducibility
}

# Train the XGBoost model with 100 boosting rounds
model = xgb.train(params, train_dmatrix, num_boost_round=100)

# Predict class-1 probabilities for the test set, then threshold them at 0.5.
# The comparison is vectorized over the whole prediction array instead of
# looping over it element by element in Python.
y_pred_prob = model.predict(test_dmatrix)
y_pred = (y_pred_prob > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


XGBoost Accuracy: 0.89
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      3515
           1       0.89      0.88      0.88      3316

    accuracy                           0.89      6831
   macro avg       0.89      0.89      0.89      6831
weighted avg       0.89      0.89      0.89      6831



In [22]:
# Base estimator whose hyper-parameters are tuned below
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Search space: 3 depths x 3 learning rates x 2 ensemble sizes x 2 subsample
# ratios = 36 candidate configurations
param_grid = dict(
    max_depth=[3, 5, 6],
    learning_rate=[0.01, 0.1, 0.3],
    n_estimators=[100, 200],
    subsample=[0.8, 1.0],
)

# Exhaustive search with 5-fold cross-validation on the training split,
# scored by accuracy and run on all available cores
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train_norm, y_train)

# Report the winning configuration
print(f"Best parameters found: {grid_search.best_params_}")

# Score the best estimator on the held-out test split
y_pred = grid_search.best_estimator_.predict(X_test_norm)
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost (with tuning) Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 36 candidates, totalling 180 fits




Best parameters found: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 1.0}
XGBoost (with tuning) Accuracy: 0.89
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      3515
           1       0.89      0.88      0.88      3316

    accuracy                           0.89      6831
   macro avg       0.89      0.89      0.89      6831
weighted avg       0.89      0.89      0.89      6831



In [23]:
# Random Forest baseline on the same normalized embeddings:
# 100 trees with a fixed seed, fitted on the training split
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_norm, y_train
)

# Predict the held-out test split and score it
y_pred_rf = rf_classifier.predict(X_test_norm)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Accuracy as a percentage, followed by the per-class report
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 89.28%
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      3515
           1       0.90      0.88      0.89      3316

    accuracy                           0.89      6831
   macro avg       0.89      0.89      0.89      6831
weighted avg       0.89      0.89      0.89      6831

