In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

# --- Bot behaviour analysis: baseline random-forest regression of BotScore ---
# Predictors are six columns of joined_df; the target is its 'BotScore' column.
feature_cols_bot = [
    'followers_count',
    'friends_count',
    'favourites_count',
    'statuses_count',
    'listed_count',
    'following',
]
X_bot, y_bot = joined_df[feature_cols_bot], joined_df['BotScore']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
(X_train_bot, X_test_bot,
 y_train_bot, y_test_bot) = train_test_split(
    X_bot, y_bot, test_size=0.2, random_state=42,
)

# Baseline forest: 100 trees, seeded. fit() returns the estimator itself, so
# construction and training are chained.
rf_model_bot = RandomForestRegressor(
    n_estimators=100, random_state=42,
).fit(X_train_bot, y_train_bot)

# Predict the held-out rows and report the error on them.
y_pred_bot = rf_model_bot.predict(X_test_bot)
mse = mean_squared_error(y_true=y_test_bot, y_pred=y_pred_bot)
rmse = np.sqrt(mse)  # same units as BotScore
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

# 10-fold cross-validation of the baseline forest on the training split only.
# scikit-learn reports *negated* MSE for this scorer (higher is better), so the
# mean is sign-flipped for display; the standard deviation is unaffected by
# the sign.
cv_scores = cross_val_score(
    estimator=rf_model_bot,
    X=X_train_bot,
    y=y_train_bot,
    scoring='neg_mean_squared_error',
    cv=10,
)
print(f'Mean CV MSE: {-np.mean(cv_scores)}')
print(f'Standard Deviation of CV MSE: {np.std(cv_scores)}')

# Grid search for hyperparameter tuning of the bot-score forest.
#
# max_features uses 1.0 (consider every feature at each split) rather than the
# legacy string 'auto': 'auto' was deprecated in scikit-learn 1.1 and removed
# in 1.3, where it fails parameter validation. For regressors 'auto' meant
# "all features", so 1.0 keeps the same search space and also works on older
# releases.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
}

# 3-fold CV over the 36 combinations above, scored by negated MSE and run on
# all available cores. With the default refit=True the best combination is
# retrained on the whole training split and exposed as
# grid_search.best_estimator_.
grid_search = GridSearchCV(
    estimator=rf_model_bot,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_bot, y_train_bot)

# Learning curve of the tuned forest: 5-fold CV error at ten training-set
# sizes spanning 10%..100% of the training split.
train_sizes, train_scores, val_scores = learning_curve(
    grid_search.best_estimator_,
    X_train_bot,
    y_train_bot,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Scores come back as negated MSE with one column per fold; flip the sign and
# summarise across folds (axis=1) for each training-set size.
train_mean = (-train_scores).mean(axis=1)
train_std = (-train_scores).std(axis=1)
val_mean = (-val_scores).mean(axis=1)
val_std = (-val_scores).std(axis=1)

plt.figure(figsize=(10, 6))

# Mean error curves first, then a +/- one standard deviation band around each.
for _curve, _legend in ((train_mean, 'Training error'),
                        (val_mean, 'Validation error')):
    plt.plot(train_sizes, _curve, label=_legend)
for _centre, _spread, _shade in ((train_mean, train_std, 'gray'),
                                 (val_mean, val_std, 'gainsboro')):
    plt.fill_between(train_sizes, _centre - _spread, _centre + _spread, color=_shade)

plt.xlabel('Number of training samples')
plt.ylabel('Mean Squared Error')
plt.title('Learning Curve for Random Forest Regressor')
plt.legend(loc='best')
plt.grid()
plt.show()

# Score every row of the dataset with the tuned forest and keep the result as
# a new column of joined_df.
# NOTE(review): X_bot includes the rows the forest was trained on, so those
# predictions are in-sample — confirm this is acceptable for downstream use.
predicted_BotScore = grid_search.best_estimator_.predict(X_bot)
joined_df['predicted_BotScore'] = predicted_BotScore

# Bot-detection accuracy: threshold predicted and true scores at 0.5 and
# measure how often the resulting 0/1 labels agree on the test split.
# NOTE(review): y_pred_bot holds the predictions of the baseline rf_model_bot,
# not of grid_search.best_estimator_ — confirm that is the model to report.
y_pred_binary, y_test_binary = (
    np.where(scores >= 0.5, 1, 0) for scores in (y_pred_bot, y_test_bot)
)

accuracy = (y_pred_binary == y_test_binary).mean()
print(f'Accuracy for Bot Detection: {accuracy * 100:.2f}%')

from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn import Softmax
import torch

# Initialize the BERT tokenizer and sequence-classification model.
#
# Both are loaded from the SAME checkpoint. A tokenizer from a different
# checkpoint (e.g. 'bert-base-uncased') uses a different vocabulary, so the
# token ids it produces would index the wrong rows of this model's embedding
# table and the predicted sentiment would not be reliable.
SENTIMENT_CHECKPOINT = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = BertTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)

# Softmax over dim=1 (the class dimension of the (batch, classes) logits)
# turns logits into probabilities.
softmax = Softmax(dim=1)

def get_sentiment(text):
    """Return the index of the most probable sentiment class for ``text``.

    The text is tokenized (truncated to at most 512 tokens), run through the
    module-level ``model`` without gradient tracking, and the position of the
    largest softmax probability is returned as a plain Python int.

    NOTE(review): the index is presumably 0-based over the classifier's
    labels; map it to whatever scale downstream code expects — confirm
    against the model's label mapping.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        class_probs = softmax(model(**encoded).logits)

    # argmax without a dim works on the flattened tensor, which for a single
    # input text is simply the class index.
    return torch.argmax(class_probs).item()

# Score each tweet individually and keep the class index as a new column.
tweet_sentiments = joined_df['tweet'].apply(get_sentiment)
joined_df['sentiment_score'] = tweet_sentiments

# Preview the first few tweets next to their sentiment scores.
sentiment_preview = joined_df[['tweet', 'sentiment_score']].head()
print(sentiment_preview)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb

# --- Predict 'majority_target' from engagement columns plus sentiment ---
feature_cols = [
    'mentions',
    'quotes',
    'replies',
    'retweets',
    'favourites',
    'hashtags',
    'sentiment_score',
]
X, y = joined_df[feature_cols], joined_df['majority_target']

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2,
)

# Gradient-boosted classifier with 100 boosting rounds, seeded.
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Accuracy on the held-out rows.
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)
print(f'Accuracy for predicting majority_target using XGBoost: {accuracy_xgb}')

# Keep the model's prediction for every row as a new column.
# NOTE(review): X includes the training rows, so those predictions are
# in-sample — confirm this is acceptable for downstream use.
joined_df['predicted_majority_target'] = xgb_model.predict(X)

# --- Final model: predict 'BinaryNumTarget' from the three derived columns ---
feature_cols = ['predicted_BotScore', 'sentiment_score', 'predicted_majority_target']
X, y = joined_df[feature_cols], joined_df['BinaryNumTarget']

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2,
)

# Random forest with 100 trees, seeded. fit() returns the estimator itself,
# so construction and training are chained.
rf_model = RandomForestClassifier(
    n_estimators=100, random_state=42,
).fit(X_train, y_train)

# Accuracy on the held-out rows.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy for predicting Fake News in Context: {accuracy}')

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluation report for the final classifier on the held-out rows
# (y_test vs. y_pred).

# Accuracy was already computed when the model was evaluated.
print(f'Accuracy: {accuracy}')

# Precision: share of predicted positives that are truly positive.
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

# Recall: share of true positives that were found. Reported alongside
# precision because F1 is the harmonic mean of the two.
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

# Confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')
