### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

### Reading the dataset

In [2]:
# Load the semicolon-separated Twitter export into a DataFrame.
csv_path = './Database/userstwitter.csv'
data = pd.read_csv(csv_path, sep=';')

# Preview the first rows to sanity-check the parse.
data.head()



Unnamed: 0,item_id,user_who_published,user_id,text,is_retweet,is_liked,is_quoted,retweet_count,like_count,quote_count,reply_count,user_tweets_count,user_likes_count,user_followees_count,user_followers_count,user_listed_count,tweet_date,user_is_verified,user_registration_date
0,18973154081,100220864,100220864,My Official First single out NOW on I Tunes! J...,0,0,0,1326,5168,374,156,4769,507,93,43302107,33942,2010-07-20 06:21:57,1,2009-12-29 13:07:04
1,133863536169451520,114894699,114894699,"system is currently having issues, we are work...",0,0,0,3563,425,12,313,60,0,1,2107812,3166,2011-11-08 11:08:48,1,2010-02-16 23:35:05
2,133992333908770816,114894699,114894699,service has been fully restored. sorry for th...,0,0,0,669,260,7,208,60,0,1,2107812,3166,2011-11-08 19:40:36,1,2010-02-16 23:35:05
3,142304671904694272,114894699,114894699,sorry there was a brief outage today approxima...,0,0,0,350,137,0,89,60,0,1,2107812,3166,2011-12-01 18:10:52,1,2010-02-16 23:35:05
4,143726488821305345,114894699,114894699,"sorry, system is having problems. we are work...",0,0,0,2067,143,1,83,60,0,1,2107812,3166,2011-12-05 16:20:39,1,2010-02-16 23:35:05


### 1 Feature Engineering

In [None]:
# Count columns that are coerced to numeric here and min-max scaled in section 1.2.
columns_to_normalize = [
    'retweet_count', 'like_count', 'quote_count', 'reply_count',
    'user_tweets_count', 'user_likes_count', 'user_followees_count',
    'user_followers_count', 'user_listed_count',
]


def _recency_score(timestamps):
    """Map timestamps to a score in [0, 1], where 1 is the most recent entry.

    Age is measured in whole days relative to the newest timestamp in the
    series instead of the wall clock, so the result does not depend on when
    the notebook is executed.

    Parameters
    ----------
    timestamps : pandas.Series
        Datetime-like values (strings are parsed with ``pd.to_datetime``).

    Returns
    -------
    pandas.Series
        ``1 - age_days / max(age_days)``. Missing timestamps stay NaN. If all
        timestamps are less than one day apart, every valid row scores 1.0
        instead of running into a 0/0 division.
    """
    timestamps = pd.to_datetime(timestamps)
    age_days = (timestamps.max() - timestamps).dt.days
    oldest = age_days.max()
    if not oldest > 0:  # zero span, empty input or all-missing (NaN compares False)
        return age_days * 0.0 + 1.0
    return 1 - age_days / oldest


# Coerce every count column to a numeric dtype; unparseable entries become NaN
# and are replaced by 0 at the end of this cell.
for column in columns_to_normalize:
    data[column] = pd.to_numeric(data[column], errors='coerce')

data['user_who_published'] = data['user_who_published'].astype('int64')

# Replace both date columns (in place, same column names) by their recency score.
for date_column in ('tweet_date', 'user_registration_date'):
    # A numeric dtype means the column already holds scores from a previous run
    # of this cell; parsing those floats as timestamps would silently corrupt them.
    if not pd.api.types.is_numeric_dtype(data[date_column]):
        data[date_column] = _recency_score(data[date_column])

data[columns_to_normalize] = data[columns_to_normalize].fillna(0)

data.head()



#### 1.1 Visualizing engagement metrics distribution

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt

# Prepare the figure with specified dimensions
plt.figure(figsize=(6, 4))

# Per-column plot settings: column name, color, linestyle, marker.
# Columns are selected by name: the previous positional indices (6-14) did not
# line up with these labels in the column order printed by data.head().
column_settings = [
    ('retweet_count', 'blue', '-', 'o'),
    ('like_count', 'green', '--', '^'),
    ('quote_count', 'red', '-.', 's'),
    ('reply_count', 'purple', ':', 'd'),
    ('user_tweets_count', 'yellow', '-', '1'),
    ('user_likes_count', 'black', '--', '2'),
    ('user_followees_count', 'gray', '-.', '3'),
    ('user_followers_count', 'orange', ':', '4'),
    ('user_listed_count', 'pink', '-', '|'),
]

# Plot Q-Q plots and perform Shapiro-Wilk tests
for label, color, linestyle, marker in column_settings:
    values = data[label]

    # probplot returns the theoretical/ordered quantiles and the least-squares fit.
    (osm, osr), (slope, intercept, _fit_r) = stats.probplot(values, dist="norm", plot=None)
    plt.plot(osm, osr, marker=marker, linestyle='', color=color, label=label)  # Plot data points
    plt.plot(osm, slope*osm + intercept, linestyle=linestyle, color=color)  # Plot regression line

    # Shapiro-Wilk test for each column.
    # NOTE(review): the SciPy documentation states the p-value may be inaccurate
    # for more than 5000 samples - confirm the dataset size before relying on it.
    stat, p = stats.shapiro(values)
    print(f'{label} - Shapiro-Wilk Test statistics={stat:.3f}, p={p:.3f}')

plt.title('Q-Q Plot')
plt.legend(loc='best')  # Improved legend placement
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')
plt.tight_layout()  # Adjust layout to make sure everything fits without overlap
plt.show()

#### 1.2 Normalizing engagement metrics using min-max

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every count column with min-max scaling: learn the per-column
# minimum and maximum first, then transform and write the result back into
# the same columns of `data`.
scaler = MinMaxScaler()
scaler.fit(data[columns_to_normalize])
scaled_counts = scaler.transform(data[columns_to_normalize])
data[columns_to_normalize] = scaled_counts

# Preview the scaled columns.
data[columns_to_normalize].head()

#### 1.3 Visualizing engagement metrics distribution after normalization

In [None]:
# Prepare the figure
plt.figure(figsize=(6, 4))

# Per-column plot settings: column name, color, linestyle, marker.
# Columns are selected by name: the previous positional indices (6-14) did not
# line up with these labels in the column order printed by data.head().
# NOTE(review): the positional list also carried an unlabeled tenth entry
# (position 15); it is left out because the intended column cannot be
# identified by name - TODO confirm whether another column should be plotted.
column_settings = [
    ('retweet_count', 'blue', '-', 'o'),
    ('like_count', 'green', '--', '^'),
    ('quote_count', 'red', '-.', 's'),
    ('reply_count', 'purple', ':', 'd'),
    ('user_tweets_count', 'yellow', '-', '1'),
    ('user_likes_count', 'black', '--', '2'),
    ('user_followees_count', 'gray', '-.', '3'),
    ('user_followers_count', 'orange', ':', '4'),
    ('user_listed_count', 'pink', '-', '|'),
]

# Plot Q-Q plots and perform Shapiro-Wilk tests
for label, color, linestyle, marker in column_settings:
    values = data[label]

    # probplot returns the theoretical/ordered quantiles and the least-squares fit.
    (osm, osr), (slope, intercept, _fit_r) = stats.probplot(values, dist="norm", plot=None)
    plt.plot(osm, osr, marker=marker, linestyle='', color=color, label=label)  # Plot data points
    plt.plot(osm, slope*osm + intercept, linestyle=linestyle, color=color)  # Plot regression line

    # Shapiro-Wilk test for each column.
    # NOTE(review): the SciPy documentation states the p-value may be inaccurate
    # for more than 5000 samples - confirm the dataset size before relying on it.
    stat, p = stats.shapiro(values)
    print(f'{label} - Shapiro-Wilk Test statistics={stat:.3f}, p={p:.3f}')

plt.title('Q-Q Plot')
plt.legend(loc='best')  # Improved legend placement
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Ordered Values')
plt.tight_layout()  # Adjust layout to make sure everything fits without overlap
plt.show()

### 2 User Metrics

In [None]:
# Aggregate the per-tweet metrics to one row per publishing user.
grouped_data = data.groupby('user_who_published').agg(
    total_likes=('like_count', 'sum'),
    total_retweets=('retweet_count', 'sum'),
    total_replies=('reply_count', 'sum'),
    total_tweets=('item_id', 'count'),  # Assumes each row is one tweet; adjust the column name otherwise
    followers_count=('user_followers_count', 'max'),  # Assumes followers_count doesn't change per user in the dataset
    recency=('tweet_date', 'sum')  # Sum of the per-tweet recency scores of this user
).reset_index()

# NOTE(review): `/` and `*` are evaluated left to right, so this computes
# (1 + likes + retweets + replies + recency) / total_tweets, and then MULTIPLIES
# the result by followers_count. Confirm that multiplying (rather than dividing)
# by the follower count is the intended definition of the engagement rate.
grouped_data['engagement_rate'] = (1 + grouped_data['total_likes'] + grouped_data['total_retweets'] + grouped_data['total_replies'] + grouped_data['recency']) / grouped_data['total_tweets'] * grouped_data['followers_count']

# Take an explicit copy: engagement_df gets new columns later on, and adding
# columns to a plain slice of grouped_data triggers SettingWithCopyWarning.
engagement_df = grouped_data[['user_who_published', 'engagement_rate']].copy()

# Attach each user's engagement rate to every one of their tweets.
data_with_engagement = pd.merge(data, engagement_df, on='user_who_published', how='left')

data_with_engagement.head()

In [None]:
def calculate_follower_followee_ratio(followers, followees):
    """Return ``followers`` divided by ``followees``.

    When ``followees`` equals 0 the division is skipped and ``followers`` is
    returned unchanged, which avoids a division by zero.
    """
    return followers if followees == 0 else followers / followees

In [None]:
# Scatter of each tweet's recency score against its author's engagement rate.
# The title previously said "Followers Count" although the x-axis shows the
# tweet date column.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data_with_engagement['tweet_date'], data_with_engagement['engagement_rate'], alpha=0.5)
ax.set_title('Engagement Rate vs Tweet Date')
# tweet_date holds the normalized recency score computed in section 1.
ax.set_xlabel('Tweet Date (recency score, 1 = most recent)')
ax.set_ylabel('Engagement Rate')
ax.grid(True, which="both", ls="--")
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Start from the two base columns only. This makes the cell safe to re-run:
# without it, a second run would merge 'followers_count' again and produce
# 'followers_count_x' / 'followers_count_y' instead of 'followers_count'.
engagement_df = engagement_df[['user_who_published', 'engagement_rate']]

# Extract the engagement rates as a 2-D array for clustering.
X = engagement_df[['engagement_rate']].values

# Standardize the feature before running k-means.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: within-cluster sum of squares (inertia) for k = 1..10.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# Replace 'n_clusters' with the chosen number based on the elbow method
n_clusters = 7 # Example value
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(X_scaled)

# Add the cluster labels. assign() returns a new frame, so no column is written
# into a slice of another DataFrame (avoids SettingWithCopyWarning).
engagement_df = engagement_df.assign(cluster=y_kmeans)

# Per-cluster summary: mean engagement rate and number of users.
cluster_analysis = engagement_df.groupby('cluster').agg(
    average_engagement=('engagement_rate', 'mean'),
    count=('user_who_published', 'count')
).reset_index()

print(cluster_analysis)

# Bring each user's followers count into engagement_df for the plotting cell below.
engagement_df = pd.merge(engagement_df, grouped_data[['user_who_published', 'followers_count']], on='user_who_published', how='left')



In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Per-cluster styling. `colors` and `markers` were referenced here without being
# defined anywhere earlier in the notebook, which raises NameError on a fresh
# kernel. Both lists are indexed modulo their length so any n_clusters works.
cluster_colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
                  'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
cluster_markers = ['o', 's', '^', 'D', 'v', 'P', 'X', '*', '<', '>']

# All points across clusters. Every row carries a label in range(n_clusters),
# so the full columns are the same point set as concatenating cluster by cluster.
x_all = engagement_df['followers_count'].to_numpy()
y_all = engagement_df['engagement_rate'].to_numpy()

# Fit a linear model
coefficients = np.polyfit(x_all, y_all, 1)  # 1 for linear
polynomial = np.poly1d(coefficients)

# Generate a series of x values for plotting the fitted line
x_line = np.linspace(x_all.min(), x_all.max(), 100)
# Calculate the y values based on the coefficients from the polyfit
y_line = polynomial(x_line)

# Now plot the original scatter plot, one series per cluster
plt.figure(figsize=(10, 6))
for i in range(n_clusters):
    clustered_data = engagement_df[engagement_df['cluster'] == i]
    plt.scatter(clustered_data['followers_count'], clustered_data['engagement_rate'],
                s=100, color=cluster_colors[i % len(cluster_colors)], label=f'Cluster {i}',
                marker=cluster_markers[i % len(cluster_markers)], alpha=0.5)

# And plot the fitted line
plt.plot(x_line, y_line, 'r-', label='Fitted Line')

plt.title('Clusters of Engagement Rate vs. Followers Count with Fitted Line')
plt.xlabel('Followers Count')
plt.ylabel('Engagement Rate')
plt.legend()
plt.show()
