# Project 4: Regression Analysis and Define Your Own Task

## Part 2

### Question 9.1

In [1]:
import json
import numpy as np

def calculate_statistics(filename):
    with open(filename, 'r') as file:
        tweets = [json.loads(line) for line in file]
    
    num_tweets = len(tweets)
    if num_tweets == 0:
        print(f"No tweets found in {filename}")
        return
    
    times = [tweet['citation_date'] for tweet in tweets]
    max_time = max(times)
    min_time = min(times)
    total_followers = sum(tweet['author']['followers'] for tweet in tweets)
    total_retweets = sum(tweet['metrics']['citations']['total'] for tweet in tweets)
    
    avg_tweets_per_hour = num_tweets * 3600 / (max_time - min_time)
    avg_followers_per_tweet = total_followers / num_tweets
    avg_retweets_per_tweet = total_retweets / num_tweets
    
    print(filename)
    print('Average number of tweets per hour:', avg_tweets_per_hour)
    print('Average number of followers of users posting the tweets per tweet:', avg_followers_per_tweet)
    print('Average number of retweets per tweet:', avg_retweets_per_tweet)
    print('-' * 50)

files = ['ECE219_tweet_data/tweets_#gohawks.txt', 'ECE219_tweet_data/tweets_#gopatriots.txt', 
         'ECE219_tweet_data/tweets_#nfl.txt', 'ECE219_tweet_data/tweets_#patriots.txt', 
         'ECE219_tweet_data/tweets_#sb49.txt', 'ECE219_tweet_data/tweets_#superbowl.txt']

for file in files:
    calculate_statistics(file)

ECE219_tweet_data/tweets_#gohawks.txt
Average number of tweets per hour: 292.48785062173687
Average number of followers of users posting the tweets per tweet: 2217.9237355281984
Average number of retweets per tweet: 2.0132093991319877
--------------------------------------------------
ECE219_tweet_data/tweets_#gopatriots.txt
Average number of tweets per hour: 40.954698006061946
Average number of followers of users posting the tweets per tweet: 1427.2526051635405
Average number of retweets per tweet: 1.4081919101697078
--------------------------------------------------
ECE219_tweet_data/tweets_#nfl.txt
Average number of tweets per hour: 397.0213901819841
Average number of followers of users posting the tweets per tweet: 4662.37544523693
Average number of retweets per tweet: 1.5344602655543254
--------------------------------------------------
ECE219_tweet_data/tweets_#patriots.txt
Average number of tweets per hour: 750.8942646068899
Average number of followers of users posting the tweet

### Question 9.2

In [None]:
import json
import math
import matplotlib.pyplot as plt
from datetime import datetime
import pytz

pst_tz = pytz.timezone('America/Los_Angeles')

def report_tweets(filename):
    with open(filename, 'r') as file:
        lines = file.readlines()
        min_time = math.inf
        max_time = 0
        n_tweets = {}
        
        # Determine min and max time
        for line in lines:
            json_obj = json.loads(line)
            citation_date = json_obj['citation_date']
            min_time = min(min_time, citation_date)
            max_time = max(max_time, citation_date)
        
        # Calculate total hours and initialize tweet count list
        total_hours = math.ceil((max_time - min_time) / 3600)
        n_tweets = [0] * total_hours
        
        # Count tweets in each hour
        for line in lines:
            json_obj = json.loads(line)
            index = math.floor((json_obj['citation_date'] - min_time) / 3600)
            n_tweets[index] += 1
        
        return n_tweets

q2_files = ['ECE219_tweet_data/tweets_#nfl.txt','ECE219_tweet_data/tweets_#superbowl.txt']

for file in q2_files:
    n_tweets = report_tweets(file)
    plt.figure(figsize=(10,6))
    plt.bar(range(len(n_tweets)),n_tweets)
    plt.xlabel('Hours over time')
    plt.ylabel('Number of tweets')
    plt.title('Number of tweets per hour for '+file)

plt.show()

### Question 10

#### Preprocess Data

In [1]:
import json
import pandas as pd
import random
import os

# Define the data files
data_files = ['ECE219_tweet_data/tweets_#gohawks.txt', 'ECE219_tweet_data/tweets_#gopatriots.txt', 
              'ECE219_tweet_data/tweets_#nfl.txt', 'ECE219_tweet_data/tweets_#patriots.txt', 
              'ECE219_tweet_data/tweets_#sb49.txt', 'ECE219_tweet_data/tweets_#superbowl.txt']

# Define the subsampling ratio (e.g., 10%)
subsample_ratio = 0.01

# Initialize an empty list to store subsampled tweets
subsampled_tweets = []

# Iterate over each data file
for file in data_files:
    # Extract the label from the file name
    label = os.path.splitext(os.path.basename(file))[0].split('_')[-1]
    # Open the file and read each line
    with open(file, 'r') as f:
        # Read a subsample of tweets from each file
        for line in f:
            # Randomly decide whether to include the tweet in the subsample
            if random.random() < subsample_ratio:
                # Parse the tweet from JSON
                tweet = json.loads(line)
                # Add the label to the tweet
                tweet['label'] = label
                # Append the tweet to the subsampled list
                subsampled_tweets.append(tweet)

# Create a DataFrame from the subsampled tweets
tweet_df = pd.DataFrame(subsampled_tweets)

# Print the first few rows of the DataFrame
print(tweet_df.head())


   firstpost_date                                              title  \
0      1421210358  I've been doing this page for almost a year no...   
1      1421244982  Good Morning #12s 4 days left, lets get it #Go...   
2      1421250919  ‚òï üèà There's a #Storm coming #12s ‚õÖ ‚òÅ ‚ö° üèà #GBvs...   
3      1421255373  Congrats to @DakotaDrenth on his 100th career ...   
4      1421259406                                 #GoHawks luh Bitxh   

                                                 url  \
0  http://twitter.com/Girlsof12/status/5552226009...   
1  http://twitter.com/JustQ17/status/555367826077...   
2  http://twitter.com/kingkaps7/status/5553927262...   
3  http://twitter.com/HMSwrestling/status/5554114...   
4  http://twitter.com/_B3NJI/status/5554283220928...   

                                               tweet  \
0  {'contributors': None, 'truncated': False, 'te...   
1  {'contributors': None, 'truncated': False, 'te...   
2  {'contributors': None, 'truncated': False, 't

In [2]:
import pandas as pd
import pytz

# Assuming 'tweet_df' is your DataFrame containing the tweet data
# Let's say you have two columns 'firstpost_date' and 'citation_date' containing UNIX timestamps

# Define the PST time zone
pst_tz = pytz.timezone('America/Los_Angeles')

# Convert UNIX timestamps to datetime objects in PST time zone
tweet_df['firstpost_date_pst'] = pd.to_datetime(tweet_df['firstpost_date'], unit='s', utc=True).dt.tz_convert(pst_tz)
tweet_df['citation_date_pst'] = pd.to_datetime(tweet_df['citation_date'], unit='s', utc=True).dt.tz_convert(pst_tz)

# Print the DataFrame to verify the changes
print(tweet_df.head())

# Save the DataFrame as a CSV file
tweet_df.to_csv('tweet_data_with_pst.csv', index=False)

   firstpost_date                                              title  \
0      1421210358  I've been doing this page for almost a year no...   
1      1421244982  Good Morning #12s 4 days left, lets get it #Go...   
2      1421250919  ‚òï üèà There's a #Storm coming #12s ‚õÖ ‚òÅ ‚ö° üèà #GBvs...   
3      1421255373  Congrats to @DakotaDrenth on his 100th career ...   
4      1421259406                                 #GoHawks luh Bitxh   

                                                 url  \
0  http://twitter.com/Girlsof12/status/5552226009...   
1  http://twitter.com/JustQ17/status/555367826077...   
2  http://twitter.com/kingkaps7/status/5553927262...   
3  http://twitter.com/HMSwrestling/status/5554114...   
4  http://twitter.com/_B3NJI/status/5554283220928...   

                                               tweet  \
0  {'contributors': None, 'truncated': False, 'te...   
1  {'contributors': None, 'truncated': False, 'te...   
2  {'contributors': None, 'truncated': False, 't

In [3]:
# Set the max_colwidth parameter to None to display entire contents of the metrics column
pd.set_option('display.max_colwidth', None)

# Print the first row of the DataFrame
print(tweet_df.head(1))

   firstpost_date  \
0      1421210358   

                                                                                                                                        title  \
0  I've been doing this page for almost a year now and there are 2 ladies that get the most favorites. @jaismiles and @Jamie12thlady #GoHawks   

                                                      url  \
0  http://twitter.com/Girlsof12/status/555222600930975745   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

#### Clean Data Frame

In [4]:
import pandas as pd
import json
import datetime
import math
import os

# Define the data files
data_file = 'ECE219_tweet_data/tweets_#gohawks.txt'  # Choose one data file for testing

# Initialize an empty DataFrame to store features
feature_df = pd.DataFrame(columns=['Time of the day', 'Number of tweets', 
                                   'Number of retweets', 'Number of followers', 
                                   'Tweet text', 'Hashtag'])


# Function to extract features and add to DataFrame
def report_features_to_df(filename, percent=5):
    with open(filename, 'r') as file:
        lines = file.readlines()
        
        # Calculate the number of lines to process based on the percentage
        num_lines_to_process = math.ceil(len(lines) * percent / 100)
        
        # Initialize an empty DataFrame to store features
        features = pd.DataFrame(columns=['Time of the day', 'Number of tweets', 
                                         'Number of retweets', 'Number of followers', 
                                         'Tweet text', 'Hashtag'])
        
        # Process each line up to the calculated number of lines to process
        for i in range(num_lines_to_process):
            json_obj = json.loads(lines[i])
            
            # Add features to DataFrame
            index = i  # Assuming the index of the first row is 0
            features.loc[index, 'Time of the day'] = datetime.datetime.fromtimestamp(json_obj['citation_date']).hour
            features.loc[index, 'Number of tweets'] = 1
            features.loc[index, 'Number of retweets'] = json_obj['metrics']['citations']['total']
            features.loc[index, 'Number of followers'] = json_obj['author']['followers']
            features.loc[index, 'Tweet text'] = json_obj['tweet']['text']
            # Extract hashtags from entities, hashtags, and tweet text
            hashtags = set()
            if 'entities' in json_obj['tweet'] and 'hashtags' in json_obj['tweet']['entities']:
                for hashtag in json_obj['tweet']['entities']['hashtags']:
                    hashtags.add(hashtag['text'].lower())
            if 'hashtags' in json_obj['tweet'] and json_obj['tweet']['hashtags'] is not None:
                for hashtag in json_obj['tweet']['hashtags']:
                    hashtags.add(hashtag.lower())
            if 'text' in json_obj['tweet']:
                text = json_obj['tweet']['text']
                for word in text.split():
                    if word.startswith('#'):
                        hashtags.add(word[1:].lower())
            features.loc[index, 'Hashtag'] = ', '.join(hashtags) if hashtags else None
        
        return features

# Extract features from 5% of the data file
features_df = report_features_to_df(data_file, percent=5)

# Print the extracted features
print(features_df)

     Time of the day Number of tweets Number of retweets Number of followers  \
0                 12                1                  5              1752.0   
1                 12                1                  2               258.0   
2                 22                1                  5                22.0   
3                 22                1                  2                22.0   
4                 22                1                  2                22.0   
...              ...              ...                ...                 ...   
8452              20                1                  1              1802.0   
8453              20                1                  1               200.0   
8454              20                1                  1                11.0   
8455              20                1                  1                99.0   
8456              20                1                  1                27.0   

                                       

In [1]:
import pandas as pd
import json
import datetime
import math
import os

# Initialize an empty DataFrame to store features
feature_df = pd.DataFrame(columns=['Time of the day', 'Number of tweets', 
                                   'Number of retweets', 'Number of followers', 
                                   'Tweet text', 'Hashtag'])

# Function to extract features and add to DataFrame
def report_features_to_df(filename, percent=5):
    # Extract hashtag label from the file name
    hashtag_label = os.path.basename(filename).split('_')[-1].split('.')[0][1:]
    
    with open(filename, 'r') as file:
        lines = file.readlines()
        
        # Calculate the number of lines to process based on the percentage
        num_lines_to_process = math.ceil(len(lines) * percent / 100)
        
        # Initialize an empty DataFrame to store features
        features = pd.DataFrame(columns=['Time of the day', 'Number of tweets', 
                                         'Number of retweets', 'Number of followers', 
                                         'Tweet text', 'Hashtag', 'Hashtag Label'])
        
        # Process each line up to the calculated number of lines to process
        for i in range(num_lines_to_process):
            json_obj = json.loads(lines[i])
            
            # Add features to DataFrame
            index = i  # Assuming the index of the first row is 0
            features.loc[index, 'Time of the day'] = datetime.datetime.fromtimestamp(json_obj['citation_date']).hour
            features.loc[index, 'Number of tweets'] = 1
            features.loc[index, 'Number of retweets'] = json_obj['metrics']['citations']['total']
            features.loc[index, 'Number of followers'] = json_obj['author']['followers']
            features.loc[index, 'Tweet text'] = json_obj['tweet']['text']
            # Extract hashtags from entities, hashtags, and tweet text
            hashtags = set()
            if 'entities' in json_obj['tweet'] and 'hashtags' in json_obj['tweet']['entities']:
                for hashtag in json_obj['tweet']['entities']['hashtags']:
                    hashtags.add(hashtag['text'].lower())
            if 'hashtags' in json_obj['tweet'] and json_obj['tweet']['hashtags'] is not None:
                for hashtag in json_obj['tweet']['hashtags']:
                    hashtags.add(hashtag.lower())
            if 'text' in json_obj['tweet']:
                text = json_obj['tweet']['text']
                for word in text.split():
                    if word.startswith('#'):
                        hashtags.add(word[1:].lower())
            features.loc[index, 'Hashtag'] = ', '.join(hashtags) if hashtags else None
            features.loc[index, 'Hashtag Label'] = hashtag_label
        
        return features


In [2]:
# Define the data files
data_files = ['ECE219_tweet_data/tweets_#gohawks.txt', 'ECE219_tweet_data/tweets_#gopatriots.txt', 
              'ECE219_tweet_data/tweets_#nfl.txt', 'ECE219_tweet_data/tweets_#patriots.txt', 
              'ECE219_tweet_data/tweets_#sb49.txt', 'ECE219_tweet_data/tweets_#superbowl.txt']

# Initialize an empty DataFrame to store combined features
combined_features_df = pd.DataFrame(columns=['Time of the day', 'Number of tweets', 
                                             'Number of retweets', 'Number of followers', 
                                             'Tweet text', 'Hashtag', 'Hashtag Label'])

# Extract features from each data file and concatenate them
for file in data_files:
    features_df = report_features_to_df(file, percent=5)
    combined_features_df = pd.concat([combined_features_df, features_df])

# Reset index of the combined dataframe
combined_features_df.reset_index(drop=True, inplace=True)

# Print the combined features dataframe
print(combined_features_df)

       Time of the day Number of tweets Number of retweets  \
0                   12                1                  5   
1                   12                1                  2   
2                   22                1                  5   
3                   22                1                  2   
4                   22                1                  2   
...                ...              ...                ...   
141186               8                1                  3   
141187               8                1                  1   
141188               8                1                  1   
141189               8                1                  1   
141190               8                1                  1   

       Number of followers                                         Tweet text  \
0                   1752.0  I &lt;3 our defense! #GoHawks http://t.co/U1pc...   
1                    258.0  twelfth dogs are ready! #gohawks #dogslife htt...   
2           

#### Hashtag Classification Model

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

# Assuming combined_features_df is already loaded as a DataFrame
data = combined_features_df

# Encode 'Tweet text' using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(data['Tweet text'])

# Convert numerical features to appropriate data types
numerical_features = ['Number of tweets', 'Number of retweets', 'Number of followers', 'Time of the day']
for feature in numerical_features:
    data[feature] = pd.to_numeric(data[feature], errors='coerce')  # coerce invalid parsing to NaN

# Remove rows with NaN values
data.dropna(subset=numerical_features, inplace=True)

# Convert numerical features to sparse format
X_num_sparse = scipy.sparse.csr_matrix(data[numerical_features].values)

# Concatenate numerical and textual features
X = scipy.sparse.hstack([X_text, X_num_sparse])

# Encode labels
le = LabelEncoder()
y = le.fit_transform(data['Hashtag Label'])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForest model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred, target_names=le.classes_))


              precision    recall  f1-score   support

     gohawks       0.90      0.93      0.91      1681
  gopatriots       0.87      0.84      0.86       223
         nfl       0.82      0.78      0.80      2331
    patriots       0.89      0.88      0.88      4470
        sb49       0.99      0.99      0.99      7437
   superbowl       0.95      0.97      0.96     12097

    accuracy                           0.94     28239
   macro avg       0.91      0.90      0.90     28239
weighted avg       0.94      0.94      0.94     28239



#### Hashtag Classification Baseline Model

In [30]:
from sklearn.utils import shuffle
from sklearn.metrics import classification_report

# Shuffle the training labels
shuffled_y_train = shuffle(y_train, random_state=42)

# Print the shuffled labels
print("Shuffled Training Labels:")
print(shuffled_y_train)

# Calculate the class distribution in the training data
class_distribution = {label: count / len(shuffled_y_train) for label, count in zip(*np.unique(shuffled_y_train, return_counts=True))}
print("\nClass Distribution in the Training Data:")
print(class_distribution)

# Randomly assign labels based on the class distribution
random_pred_baseline = np.random.choice(list(class_distribution.keys()), size=len(y_test), p=list(class_distribution.values()))

# Print classification report for the baseline model
print("\nBaseline Classification Report:")
print(classification_report(y_test, random_pred_baseline, target_names=le.classes_, zero_division=1))



Shuffled Training Labels:
[3 5 2 ... 4 2 5]

Class Distribution in the Training Data:
{0: 0.05999008428358949, 1: 0.008437212267157731, 2: 0.08252177916283022, 3: 0.1554819746440966, 4: 0.2633508038812947, 5: 0.43021814576103123}

Baseline Classification Report:
              precision    recall  f1-score   support

     gohawks       0.06      0.06      0.06      1681
  gopatriots       0.01      0.01      0.01       223
         nfl       0.10      0.10      0.10      2331
    patriots       0.15      0.15      0.15      4470
        sb49       0.26      0.26      0.26      7437
   superbowl       0.42      0.43      0.43     12097

    accuracy                           0.29     28239
   macro avg       0.17      0.17      0.17     28239
weighted avg       0.29      0.29      0.29     28239



#### Retweet Model

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Assuming combined_features_df is already loaded as a DataFrame
data = combined_features_df

# Feature Engineering
# Encode 'Tweet text' using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(data['Tweet text'])

# Include numerical features
X_num = data[['Number of followers', 'Time of the day']]  # Add other relevant numerical features

# Combine textual and numerical features
X = pd.concat([pd.DataFrame(X_text.toarray()), X_num], axis=1)

# Convert feature names to strings
X.columns = X.columns.astype(str)

# Target variable
y = data['Number of retweets']  # Replace 'Number of retweets' with 'Number of likes' or 'Number of quotes' as needed

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a simpler model like Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Compute Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Compute Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Compute Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)

# Compute R-squared (R^2) score
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared (R^2) Score:", r2)


Mean Absolute Error: 5.535046352890463
Mean Squared Error: 365.61456975189583
Root Mean Squared Error: 19.121050435368236
R-squared (R^2) Score: -0.6435524383198834




#### Optimize the Model for Better Performance

In [8]:
# Import required libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import scipy.sparse

# Assuming combined_features_df is already loaded as a DataFrame
data = combined_features_df

# Encode 'Tweet text' using TF-IDF
print("Encoding 'Tweet text' using TF-IDF...")
tfidf = TfidfVectorizer(max_features=500)  # Reduced number of features
X_text = tfidf.fit_transform(data['Tweet text'])

# Ensure numerical features are correctly parsed
print("Parsing numerical features...")
numerical_features = ['Number of followers', 'Time of the day']  # Add other relevant numerical features

# Convert numerical features to appropriate data types and handle missing values
for feature in numerical_features:
    data[feature] = pd.to_numeric(data[feature], errors='coerce')  # Convert to numeric, coerce errors to NaN

# Remove rows with NaN values in numerical features
data.dropna(subset=numerical_features, inplace=True)

# Extract numerical features after ensuring they are numeric
X_num = data[numerical_features].values

# Convert numerical features to sparse format
print("Converting numerical features to sparse format...")
X_num_sparse = scipy.sparse.csr_matrix(X_num)

# Combine textual and numerical features
print("Combining textual and numerical features...")
X_combined = scipy.sparse.hstack([X_text, X_num_sparse])

# Standardize numerical features
print("Standardizing numerical features...")
scaler = StandardScaler(with_mean=False)  # Set with_mean=False for sparse matrices
X_combined[:, -X_num.shape[1]:] = scaler.fit_transform(X_combined[:, -X_num.shape[1]:])

# Apply PCA for Dimensionality Reduction
print("Applying PCA for Dimensionality Reduction...")
pca = PCA(n_components=100)  # Reduced number of components
X = pca.fit_transform(X_combined.toarray())

# Target variable
y = data['Number of retweets']

# Split the data into train and test sets
print("Splitting the data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor model with hyperparameter tuning using RandomizedSearchCV
print("Training Random Forest Regressor model with hyperparameter tuning...")
param_distributions = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

# There are 2 * 2 * 2 = 8 combinations in the search space
n_iter = 8  # Match the number of parameter combinations

rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions, n_iter=n_iter, cv=3, n_jobs=-1, verbose=2)
random_search.fit(X_train, y_train)

best_rf = random_search.best_estimator_

# Predict on the test set
print("Making predictions on the test set...")
y_pred = best_rf.predict(X_test)

# Compute evaluation metrics
print("Computing evaluation metrics...")
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("Evaluation Metrics:")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared (R^2) Score:", r2)

Encoding 'Tweet text' using TF-IDF...
Parsing numerical features...
Converting numerical features to sparse format...
Combining textual and numerical features...
Standardizing numerical features...
Applying PCA for Dimensionality Reduction...
Splitting the data into train and test sets...
Training Random Forest Regressor model with hyperparameter tuning...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END .max_depth=10, min_samples_split=5, n_estimators=50; total time= 6.0min
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=13.0min
[CV] END .max_depth=20, min_samples_split=5, n_estimators=50; total time=13.3min
[CV] END .max_depth=10, min_samples_split=2, n_estimators=50; total time= 6.1min
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=13.1min
[CV] END .max_depth=20, min_samples_split=5, n_estimators=50; total time=13.4min
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=12.6min
[CV] END max_d

In [None]:
# Import required libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import scipy.sparse

# Assuming combined_features_df is already loaded as a DataFrame
data = combined_features_df

# Encode 'Tweet text' using TF-IDF
print("Encoding 'Tweet text' using TF-IDF...")
tfidf = TfidfVectorizer(max_features=500)  # Reduced number of features
X_text = tfidf.fit_transform(data['Tweet text'])

# Ensure numerical features are correctly parsed
print("Parsing numerical features...")
numerical_features = ['Number of followers', 'Time of the day']  # Add other relevant numerical features

# Convert numerical features to appropriate data types and handle missing values
for feature in numerical_features:
    data[feature] = pd.to_numeric(data[feature], errors='coerce')  # Convert to numeric, coerce errors to NaN

# Remove rows with NaN values in numerical features
data.dropna(subset=numerical_features, inplace=True)

# Extract numerical features after ensuring they are numeric
X_num = data[numerical_features].values

# Convert numerical features to sparse format
print("Converting numerical features to sparse format...")
X_num_sparse = scipy.sparse.csr_matrix(X_num)

# Combine textual and numerical features
print("Combining textual and numerical features...")
X_combined = scipy.sparse.hstack([X_text, X_num_sparse])

# Standardize numerical features
print("Standardizing numerical features...")
scaler = StandardScaler(with_mean=False)  # Set with_mean=False for sparse matrices
X_combined = scipy.sparse.hstack([X_text, scaler.fit_transform(X_num_sparse)])

# Apply PCA for Dimensionality Reduction
print("Applying PCA for Dimensionality Reduction...")
pca = PCA(n_components=100)  # Reduced number of components
X = pca.fit_transform(X_combined.toarray())

# Target variable
y = data['Number of retweets']

# Split the data into train and test sets
print("Splitting the data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Gradient Boosting Regressor model with hyperparameter tuning using RandomizedSearchCV
print("Training Gradient Boosting Regressor model with hyperparameter tuning...")
param_distributions = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}

# There are 2 * 2 * 2 = 8 combinations in the search space
n_iter = 8  # Match the number of parameter combinations

gbr = GradientBoostingRegressor(random_state=42)
random_search = RandomizedSearchCV(estimator=gbr, param_distributions=param_distributions, n_iter=n_iter, cv=3, n_jobs=-1, verbose=2)
random_search.fit(X_train, y_train)

best_gbr = random_search.best_estimator_

# Predict on the test set
print("Making predictions on the test set...")
y_pred = best_gbr.predict(X_test)

# Compute evaluation metrics
print("Computing evaluation metrics...")
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("Evaluation Metrics:")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared (R^2) Score:", r2)

#### Retweet Baseline Model

In [9]:
from sklearn.dummy import DummyRegressor

# Assuming combined_features_df is already loaded as a DataFrame
data = combined_features_df

# Extracting target variable
y = data['Number of retweets']

# Predicting the mean value of the target variable as a baseline
dummy_regressor = DummyRegressor(strategy='mean')
dummy_regressor.fit(X_train, y_train)
y_pred_baseline = dummy_regressor.predict(X_test)

# Compute evaluation metrics for the baseline model
mae_baseline = mean_absolute_error(y_test, y_pred_baseline)
mse_baseline = mean_squared_error(y_test, y_pred_baseline)
rmse_baseline = np.sqrt(mse_baseline)
r2_baseline = r2_score(y_test, y_pred_baseline)

# Print evaluation metrics for the baseline model
print("Baseline Model Evaluation Metrics:")
print("Mean Absolute Error:", mae_baseline)
print("Mean Squared Error:", mse_baseline)
print("Root Mean Squared Error:", rmse_baseline)
print("R-squared (R^2) Score:", r2_baseline)

Baseline Model Evaluation Metrics:
Mean Absolute Error: 2.86311979720438
Mean Squared Error: 222.95390778487854
Root Mean Squared Error: 14.931641161803968
R-squared (R^2) Score: -0.002247911021285187


#### Relative Time Model

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd

# Assuming combined_features_df is already loaded as a DataFrame
data = combined_features_df

# Feature Engineering
# Example: Convert categorical variables to numerical using one-hot encoding
data = pd.get_dummies(data, columns=["Hashtag Label"])

# Example: Extract features from text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(data['Tweet text'])

# Combine textual features with other numerical features
X_num = data.drop(['Tweet text', 'Hashtag'], axis=1)
X = pd.concat([pd.DataFrame(X_text.toarray()), X_num], axis=1)

# Convert feature names to strings
X.columns = X.columns.astype(str)

# Target variable
y = data['Time of the day']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared (R^2) Score:", r2)


Mean Absolute Error: 6.293879696934464e-13
Mean Squared Error: 7.7329435180892385e-25
Root Mean Squared Error: 8.793715664091737e-13
R-squared (R^2) Score: 1.0




In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Assuming combined_features_df is already loaded as a DataFrame
data = combined_features_df

# Feature Engineering
# Example: Convert categorical variables to numerical using one-hot encoding
data = pd.get_dummies(data, columns=["Hashtag Label"])

# Example: Extract features from text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(data['Tweet text'])

# Combine textual features with other numerical features
X_num = data.drop(['Tweet text', 'Hashtag'], axis=1)
X = pd.concat([pd.DataFrame(X_text.toarray()), X_num.reset_index(drop=True)], axis=1)

# Convert feature names to strings
X.columns = X.columns.astype(str)

# Target variable
y = data['Time of the day']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics for Linear Regression
print("Linear Regression Model")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared (R^2) Score:", r2)

# Baseline model: Predict the mean value of the target variable
y_baseline_pred = np.full_like(y_test, y_train.mean())

# Evaluation for the baseline model
baseline_mae = mean_absolute_error(y_test, y_baseline_pred)
baseline_mse = mean_squared_error(y_test, y_baseline_pred)
baseline_rmse = mean_squared_error(y_test, y_baseline_pred, squared=False)
baseline_r2 = r2_score(y_test, y_baseline_pred)

# Print evaluation metrics for Baseline Model
print("\nBaseline Model (Mean Predictor)")
print("Mean Absolute Error:", baseline_mae)
print("Mean Squared Error:", baseline_mse)
print("Root Mean Squared Error:", baseline_rmse)
print("R-squared (R^2) Score:", baseline_r2)


Linear Regression Model
Mean Absolute Error: 6.293879696934464e-13
Mean Squared Error: 7.7329435180892385e-25
Root Mean Squared Error: 8.793715664091737e-13
R-squared (R^2) Score: 1.0

Baseline Model (Mean Predictor)
Mean Absolute Error: 4.717447501682071
Mean Squared Error: 32.07096568575375
Root Mean Squared Error: 5.663123315428841
R-squared (R^2) Score: -0.016396757844456156


