In [None]:
# Build one feature row per user from tweet-level and profile-level data.
#
# Inputs (defined in earlier cells): `tweets_df` with columns user_id, retweets,
# likes, tweet_id; `users_df` with columns user_id, join_date, followers_count,
# following_count.
# Output: `features_df` with user_id plus four numeric features.

# Per-user tweet aggregates: mean engagement and number of tweets.
tweet_aggregates = (
    tweets_df.groupby('user_id')
    .agg({
        'retweets': 'mean',
        'likes': 'mean',
        'tweet_id': 'count',
    })
    .reset_index()
    .rename(columns={'retweets': 'avg_retweets', 'likes': 'avg_likes', 'tweet_id': 'total_tweets'})
)

# Left join keeps every user, including those with no rows in tweets_df.
merged_df = pd.merge(users_df, tweet_aggregates, on='user_id', how='left')

# Users without tweets come out of the left join as NaN. Treat them as having
# zero tweets and zero average engagement so downstream models get finite input.
merged_df = merged_df.fillna({'total_tweets': 0, 'avg_retweets': 0.0, 'avg_likes': 0.0})

# Account age in whole days. Converting through pd.to_datetime makes the
# subtraction work whether join_date holds date objects, strings, or datetime64.
# NOTE(review): assumes join_date is timezone-naive -- confirm against the loader.
today = pd.Timestamp.now().normalize()
join_dates = pd.to_datetime(merged_df['join_date']).dt.normalize()
merged_df['days_since_join'] = (today - join_dates).dt.days

# Tweets per day. Accounts created today (age 0) are counted as one day old so
# the division never produces inf.
account_age_days = merged_df['days_since_join'].clip(lower=1)
merged_df['tweet_frequency'] = merged_df['total_tweets'] / account_age_days

# Followers-to-following ratio. A user who follows nobody is treated as
# following one account so the ratio stays finite instead of inf/NaN.
following_safe = merged_df['following_count'].clip(lower=1)
merged_df['followers_following_ratio'] = merged_df['followers_count'] / following_safe

# Take an explicit copy so later cells can add columns to features_df without
# triggering SettingWithCopyWarning or touching merged_df.
features_df = merged_df[['user_id', 'tweet_frequency', 'avg_retweets', 'avg_likes', 'followers_following_ratio']].copy()

features_df


In [None]:
import numpy as np

# Euclidean distance between the feature vectors of the first two users.
#
# The feature columns are selected by name rather than by position ("everything
# after user_id"), so the result does not change if extra columns such as
# `label` are added to features_df and this cell is re-run.
distance_columns = ['tweet_frequency', 'avg_retweets', 'avg_likes', 'followers_following_ratio']

# Cast to float explicitly so the vectors are never object-dtype arrays.
first_two_users = features_df[distance_columns].iloc[:2].to_numpy(dtype=float)
user_1_features = first_two_users[0]
user_2_features = first_two_users[1]  # IndexError if features_df has fewer than two rows

# With the default `ord`, np.linalg.norm of a 1-D difference is the L2 distance.
euclidean_distance = np.linalg.norm(user_1_features - user_2_features)
euclidean_distance


In [None]:
# Attach placeholder labels (1 for genuine, 0 for fake) and split into X / y.
#
# The labels are drawn at random, not observed, so any model trained on them
# can only be expected to perform at chance level.

# Seed the legacy global RNG so the same labels are produced on every run.
np.random.seed(42)

# Size the draw from the frame itself so the label vector always matches the
# number of rows, independent of any separately maintained user count.
random_labels = np.random.choice([0, 1], size=len(features_df))

# .assign returns a new frame (overwriting `label` if it already exists), which
# keeps this cell idempotent and avoids SettingWithCopyWarning.
features_df = features_df.assign(label=random_labels)

# Splitting the data into features (X) and target (y)
X = features_df.drop(columns=['user_id', 'label'])
y = features_df['label']

features_df[['user_id', 'label']]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Train a logistic-regression classifier on X / y and report test-set metrics.

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit with scikit-learn's default hyperparameters.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predicted class for each held-out row.
y_pred = clf.predict(X_test)

# Fraction of held-out rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)

# Pin the class order so the matrix is always 2x2 (rows = true class,
# columns = predicted class, in the order 0 then 1), even when the small test
# split or the predictions happen to contain only one of the two classes.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

accuracy, conf_matrix
