In [2]:
import pandas as pd
from textblob import TextBlob

# Load the dataset
data = pd.read_csv('2020-04-09 Coronavirus Tweets.csv')

# Rename columns to shorter names. Reassigning (instead of inplace=True) keeps
# the step chainable and avoids mutating a frame that may be referenced elsewhere.
data = data.rename(
    columns={'favourites_count': 'likes', 'retweet_count': 'retweets', 'friends_count': 'friends'}
)

# Verify column names and types.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
data.info()

# Calculate total engagement as likes + retweets + follower count.
# NOTE(review): followers_count is presumably an audience-size figure rather
# than a per-tweet interaction count -- confirm it is meant to be part of
# "engagement".
data['total_engagement'] = data['likes'] + data['retweets'] + data['followers_count']

# Determine high engagement: rows at or above the 75th percentile of
# total_engagement are labelled 1, all others 0.
high_engagement_threshold = data['total_engagement'].quantile(0.75)
data['high_engagement'] = (data['total_engagement'] >= high_engagement_threshold).astype(int)

# Perform sentiment analysis: TextBlob polarity score for each tweet's text.
data['sentiment'] = data['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Check the first few rows of the processed data
print(data[['text', 'total_engagement', 'high_engagement', 'sentiment']].head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473226 entries, 0 to 473225
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   status_id             473226 non-null  int64  
 1   user_id               473226 non-null  int64  
 2   created_at            473226 non-null  object 
 3   screen_name           473226 non-null  object 
 4   text                  473226 non-null  object 
 5   source                473224 non-null  object 
 6   reply_to_status_id    53740 non-null   float64
 7   reply_to_user_id      64528 non-null   float64
 8   reply_to_screen_name  64528 non-null   object 
 9   is_quote              473226 non-null  bool   
 10  is_retweet            473226 non-null  bool   
 11  likes                 473226 non-null  int64  
 12  retweets              473226 non-null  int64  
 13  country_code          20005 non-null   object 
 14  place_full_name       20097 non-null   object 
 15  

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from scipy.sparse import hstack

# Vectorize the text data (unigrams and bigrams, TF-IDF weighted).
# NOTE(review): the vectorizer is fitted on the full corpus before
# cross-validation, so its vocabulary/IDF weights also see the held-out folds.
# Wrapping it in a sklearn Pipeline would avoid that; left unchanged here.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
text_features = tfidf_vectorizer.fit_transform(data['text'])

# Stack text features with the numeric sentiment score.
# 'total_engagement' is deliberately NOT used as a feature: the label
# 'high_engagement' is computed by thresholding that very column, so including
# it would leak the target into the inputs and make the scores meaningless.
# format='csr' is requested explicitly because hstack may otherwise return a
# COO matrix, which does not support the row indexing used in the fold loop.
features = hstack([text_features, data[['sentiment']].values], format='csr')

# Labels are the 'high_engagement' column
labels = data['high_engagement'].values

# Classifier setup and parameter tuning with GridSearchCV.
# NOTE(review): RBF-kernel SVC fit time grows at least quadratically with the
# number of samples; on the full dataset this search (12 parameter combinations
# x 5 folds) may not finish in practical time. Consider subsampling or a
# linear model if that is the case.
svc = SVC(kernel='rbf')
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.001, 0.01, 0.1, 1]
}
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(features, labels)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# K-fold cross-validation with the best parameters: refit on each training
# split and print a per-fold classification report for the held-out split.
skf = StratifiedKFold(n_splits=5)
for fold_idx, (train_index, test_index) in enumerate(skf.split(features, labels), start=1):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # best_params_ holds exactly the tuned 'C' and 'gamma' keys from param_grid.
    best_svc = SVC(kernel='rbf', **grid_search.best_params_)
    best_svc.fit(X_train, y_train)
    predictions = best_svc.predict(X_test)
    print(f"Classification Report for fold {fold_idx}:")
    print(classification_report(y_test, predictions))
