In [216]:
import pandas as pd
import json

## Load JSONL (newline-delimited JSON) into DataFrame

In [219]:
# Read the newline-delimited JSON file (one JSON object per line) into a frame.
# Automatic date conversion is switched off; the timestamp columns are parsed
# explicitly with a fixed format further down.
df = pd.read_json(
    "data.txt",
    convert_dates=False,
    lines=True,
)

In [220]:
df.head(2)

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,retweeted_status,is_quote_status,timestamp_ms,display_text_range,extended_entities,extended_tweet,quoted_status_id,quoted_status_id_str,quoted_status,withheld_in_countries
0,Fri Apr 04 05:30:11 +0000 2014,451954866139590656,451954866139590656,25. نيك ورعان صغار سعودي http://t.co/zYBDYjTQc...,"<a href=""http://tweetadder.com"" rel=""nofollow""...",False,,,,,...,,,,,,,,,,
1,Fri Apr 04 05:32:57 +0000 2014,451955562407010304,451955562407010304,#ثقتنا_في_الخليفة يا سلمان شعب #البحرين يهديك ...,"<a href=""http://ifttt.com"" rel=""nofollow"">IFTT...",False,,,,,...,,,,,,,,,,


## Convert rows to list of dicts

In [224]:
# One dict per row, keyed by column name.
data = df.to_dict("records")

## Extract useful information

In [226]:
# Flatten every tweet record into a fixed-order row holding only the fields
# that the later cells use.
flattened_data = []
for tweet in data:
    # Nested objects are only trusted when they are real dicts; anything else
    # is replaced by an empty dict so the lookups below fall back to defaults.
    user = tweet.get("user")
    if not isinstance(user, dict):
        user = {}
    entities = tweet.get("entities")
    if not isinstance(entities, dict):
        entities = {}

    # Tweet-level fields (prefer the string id and the full text when present).
    tweet_id = tweet.get("id_str") or tweet.get("id")
    text = tweet.get("full_text") or tweet.get("text")
    created_at = tweet.get("created_at")
    retweet_count = tweet.get("retweet_count", 0)
    favorite_count = tweet.get("favorite_count", 0)
    lang = tweet.get("lang")

    # Author-level fields.
    username = user.get("screen_name")
    user_followers_count = user.get("followers_count", 0)
    user_created_at = user.get("created_at")

    # Number of hashtag entities attached to the tweet.
    hashtag_count = len(entities.get("hashtags", []))

    row = [
        tweet_id, text, created_at, username, user_created_at,
        user_followers_count, hashtag_count, retweet_count, favorite_count, lang,
    ]
    flattened_data.append(row)

## Convert flattened data into a DataFrame

In [229]:
# Column labels, listed in the same order as the values of each flattened row.
flat_columns = [
    "tweet_id",
    "text",
    "tweet_created_at",
    "username",
    "user_created_at",
    "user_followers_count",
    "hashtag_count",
    "retweet_count",
    "favorite_count",
    "lang",
]
flat_df = pd.DataFrame(data=flattened_data, columns=flat_columns)

In [231]:
flat_df.head(5)

Unnamed: 0,tweet_id,text,tweet_created_at,username,user_created_at,user_followers_count,hashtag_count,retweet_count,favorite_count,lang
0,451954866139590656,25. نيك ورعان صغار سعودي http://t.co/zYBDYjTQc...,Fri Apr 04 05:30:11 +0000 2014,serfearnar3,Tue Mar 11 07:46:40 +0000 2014,172,8,0,0,ar
1,451955562407010304,#ثقتنا_في_الخليفة يا سلمان شعب #البحرين يهديك ...,Fri Apr 04 05:32:57 +0000 2014,ebri_ebre,Fri Jan 03 21:34:48 +0000 2014,70,5,0,0,ar
2,451956103476424704,Thanks @MaupiEnCo #FF @BoomerZazou @HARDYenDAI...,Fri Apr 04 05:35:06 +0000 2014,amadeus_pluis,Fri Jul 13 21:03:47 +0000 2012,769,1,0,0,en
3,451957428880674816,RT @heidi_striker: Make time for others and th...,Fri Apr 04 05:40:22 +0000 2014,Palleg1964,Thu Jan 09 18:35:51 +0000 2014,568,1,0,0,en
4,451957445636534272,RT @anneotago: @jendab @chelsea_goulton @LogiB...,Fri Apr 04 05:40:26 +0000 2014,chelsea_goulton,Fri Jan 11 04:28:15 +0000 2013,93,1,0,0,en


## Check missing values

In [234]:
flat_df.isnull().sum()

tweet_id                0
text                    0
tweet_created_at        0
username                0
user_created_at         0
user_followers_count    0
hashtag_count           0
retweet_count           0
favorite_count          0
lang                    0
dtype: int64

In [236]:
flat_df.dtypes

tweet_id                 int64
text                    object
tweet_created_at        object
username                object
user_created_at         object
user_followers_count     int64
hashtag_count            int64
retweet_count            int64
favorite_count           int64
lang                    object
dtype: object

In [238]:
# Twitter timestamps look like "Fri Apr 04 05:30:11 +0000 2014".
date_format = "%a %b %d %H:%M:%S %z %Y"

# Parse both timestamp columns the same way; values that do not match the
# format become NaT instead of raising.
for timestamp_col in ("tweet_created_at", "user_created_at"):
    flat_df[timestamp_col] = pd.to_datetime(
        flat_df[timestamp_col], format=date_format, errors="coerce"
    )

### Account age in days when tweet was posted

In [241]:
# Whole days elapsed between account creation and the tweet.
account_age = flat_df["tweet_created_at"] - flat_df["user_created_at"]
flat_df["account_age_days"] = account_age.dt.days

In [243]:
flat_df.head(5)

Unnamed: 0,tweet_id,text,tweet_created_at,username,user_created_at,user_followers_count,hashtag_count,retweet_count,favorite_count,lang,account_age_days
0,451954866139590656,25. نيك ورعان صغار سعودي http://t.co/zYBDYjTQc...,2014-04-04 05:30:11+00:00,serfearnar3,2014-03-11 07:46:40+00:00,172,8,0,0,ar,23
1,451955562407010304,#ثقتنا_في_الخليفة يا سلمان شعب #البحرين يهديك ...,2014-04-04 05:32:57+00:00,ebri_ebre,2014-01-03 21:34:48+00:00,70,5,0,0,ar,90
2,451956103476424704,Thanks @MaupiEnCo #FF @BoomerZazou @HARDYenDAI...,2014-04-04 05:35:06+00:00,amadeus_pluis,2012-07-13 21:03:47+00:00,769,1,0,0,en,629
3,451957428880674816,RT @heidi_striker: Make time for others and th...,2014-04-04 05:40:22+00:00,Palleg1964,2014-01-09 18:35:51+00:00,568,1,0,0,en,84
4,451957445636534272,RT @anneotago: @jendab @chelsea_goulton @LogiB...,2014-04-04 05:40:26+00:00,chelsea_goulton,2013-01-11 04:28:15+00:00,93,1,0,0,en,448


### 1. Account Age

In [246]:
# Recompute the account age in whole days, then express it in 365-day years.
age_delta = flat_df["tweet_created_at"].sub(flat_df["user_created_at"])
flat_df["account_age_days"] = age_delta.dt.days
flat_df["account_age_years"] = flat_df["account_age_days"].div(365)

In [248]:
flat_df.head(2)

Unnamed: 0,tweet_id,text,tweet_created_at,username,user_created_at,user_followers_count,hashtag_count,retweet_count,favorite_count,lang,account_age_days,account_age_years
0,451954866139590656,25. نيك ورعان صغار سعودي http://t.co/zYBDYjTQc...,2014-04-04 05:30:11+00:00,serfearnar3,2014-03-11 07:46:40+00:00,172,8,0,0,ar,23,0.063014
1,451955562407010304,#ثقتنا_في_الخليفة يا سلمان شعب #البحرين يهديك ...,2014-04-04 05:32:57+00:00,ebri_ebre,2014-01-03 21:34:48+00:00,70,5,0,0,ar,90,0.246575


### 2. Engagement Ratios

In [251]:
# Each denominator is the raw count plus one, so a count of zero cannot
# trigger a division by zero.
followers = flat_df["user_followers_count"]
retweets_plus_one = flat_df["retweet_count"] + 1
favorites_plus_one = flat_df["favorite_count"] + 1

flat_df["followers_per_retweet"] = followers / retweets_plus_one
flat_df["followers_per_favorite"] = followers / favorites_plus_one
flat_df["retweet_to_favorite_ratio"] = flat_df["retweet_count"] / favorites_plus_one


In [253]:
flat_df.head(2)

Unnamed: 0,tweet_id,text,tweet_created_at,username,user_created_at,user_followers_count,hashtag_count,retweet_count,favorite_count,lang,account_age_days,account_age_years,followers_per_retweet,followers_per_favorite,retweet_to_favorite_ratio
0,451954866139590656,25. نيك ورعان صغار سعودي http://t.co/zYBDYjTQc...,2014-04-04 05:30:11+00:00,serfearnar3,2014-03-11 07:46:40+00:00,172,8,0,0,ar,23,0.063014,172.0,172.0,0.0
1,451955562407010304,#ثقتنا_في_الخليفة يا سلمان شعب #البحرين يهديك ...,2014-04-04 05:32:57+00:00,ebri_ebre,2014-01-03 21:34:48+00:00,70,5,0,0,ar,90,0.246575,70.0,70.0,0.0


### 3. Tweet Content Features

In [256]:
tweet_text = flat_df["text"]

# Character length of each tweet
flat_df["text_length"] = tweet_text.str.len()

# Count of whitespace-separated tokens
flat_df["word_count"] = tweet_text.str.split().str.len()

# Occurrences of "@" (every "@" character counts, not only valid handles)
flat_df["mention_count"] = tweet_text.str.count("@")

# Occurrences of an http:// or https:// prefix
flat_df["url_count"] = tweet_text.str.count("http[s]?://")

# 1 when the tweet carries at least one hashtag, otherwise 0
flat_df["has_hashtags"] = flat_df["hashtag_count"].gt(0).astype(int)

### 4. Temporal Features

In [259]:
tweet_time = flat_df["tweet_created_at"].dt

# Hour of the day the tweet was posted (0-23)
flat_df["tweet_hour"] = tweet_time.hour

# Weekday index (Monday=0 ... Sunday=6)
flat_df["tweet_dayofweek"] = tweet_time.dayofweek

# 1 for Saturday (5) or Sunday (6), otherwise 0
weekend_days = [5, 6]
flat_df["is_weekend"] = flat_df["tweet_dayofweek"].isin(weekend_days).astype(int)

### 5. Interaction Features

In [262]:
# Total engagement per tweet, then the same total relative to the author's
# follower count (+1 so a zero follower count does not divide by zero).
total_engagement = flat_df["retweet_count"].add(flat_df["favorite_count"])
followers_plus_one = flat_df["user_followers_count"].add(1)

flat_df["engagement_sum"] = total_engagement
flat_df["engagement_per_follower"] = flat_df["engagement_sum"].div(followers_plus_one)

### 6. Language Encoding

In [265]:
# Replace each language label with the integer code of its category.
lang_as_category = flat_df["lang"].astype('category')
flat_df["lang_encoded"] = lang_as_category.cat.codes

In [269]:
flat_df.head(2)

Unnamed: 0,tweet_id,text,tweet_created_at,username,user_created_at,user_followers_count,hashtag_count,retweet_count,favorite_count,lang,...,word_count,mention_count,url_count,has_hashtags,tweet_hour,tweet_dayofweek,is_weekend,engagement_sum,engagement_per_follower,lang_encoded
0,451954866139590656,25. نيك ورعان صغار سعودي http://t.co/zYBDYjTQc...,2014-04-04 05:30:11+00:00,serfearnar3,2014-03-11 07:46:40+00:00,172,8,0,0,ar,...,14,0,1,1,5,4,0,0,0.0,0
1,451955562407010304,#ثقتنا_في_الخليفة يا سلمان شعب #البحرين يهديك ...,2014-04-04 05:32:57+00:00,ebri_ebre,2014-01-03 21:34:48+00:00,70,5,0,0,ar,...,16,0,1,1,5,4,0,0,0.0,0


In [346]:
# Persist the engineered dataset; the row index is not written to the file.
flat_df.to_csv(
    path_or_buf="24RP14528_Final_DataSet.csv",
    index=False,
)

# Explanation of choices and assumptions

## 1. Filling Missing Values

#### Numeric columns: Filled with 0 (e.g., user_followers_count, retweet_count, favorite_count).
#### Assumption: If a value is missing, it’s likely the user had no followers or the tweet had no engagement. Using 0 avoids errors during modeling.
#### Alternative: Could use median/mean imputation, but 0 makes sense for counts.

#### Categorical columns: Filled with "unknown" (e.g., username, lang).
#### Assumption: Missing usernames or language is rare but possible; labeling them "unknown" keeps the dataset consistent.

## 2. Dropping Duplicates

#### Choice: Drop duplicate tweet_ids.
#### Assumption: Each tweet should be unique; duplicates could come from repeated data collection. Keeping duplicates could bias engagement statistics.

## 3. Handling Invalid Dates

#### Choice: Use pd.to_datetime(..., errors="coerce") → converts invalid dates to NaT. Then drop NaT.
#### Assumption: Invalid or missing timestamps are rare; dropping them is safer than guessing.
#### Alternative: Could fill missing dates with a default value, but that might distort account age calculations.

## 4. Derived Features

#### Account age, text length, word count, mentions, URLs, engagement ratios, etc.
#### Assumption: These features capture user influence, tweet activity, and engagement potential.
#### Reasoning: Logistic Regression, Random Forest, and Gradient Boosting can all benefit from these numeric features to detect patterns.

## Only the selected features are useful for engagement prediction or user behavior modeling; other columns were unnecessary.

## 1. Logistic Regression

In [355]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

In [342]:
# Model input columns. Each name is used as a column selector on flat_df
# (dropna subset and feature matrix), so it must match an existing column.
features = [
    # The tweet-length column is created as "text_length" in the content
    # features cell; the previous name 'tweet_text_length' is not produced by
    # any visible cell and would raise a KeyError on a fresh kernel.
    'text_length',
    'account_age_days',
    'user_followers_count',
    'has_hashtags',
    'tweet_hour',
    'tweet_dayofweek',
    'is_weekend'
]

In [378]:
# Binary target: 1 when retweets plus favorites exceed 5, otherwise 0.
combined_engagement = flat_df['retweet_count'] + flat_df['favorite_count']
flat_df['high_engagement'] = combined_engagement.gt(5).astype(int)
# Show how many rows fall in each class.
print(flat_df['high_engagement'].value_counts())

high_engagement
0    39092
Name: count, dtype: int64


In [380]:
# Keep only rows that have a value in every model feature and in the target.
required_columns = [*features, 'high_engagement']
flat_df_clean = flat_df.dropna(subset=required_columns)

In [382]:
# Feature matrix and binary target, both taken from the cleaned frame.
X, y = flat_df_clean[features], flat_df_clean['high_engagement']

#### Train/test split

In [385]:
# Hold out 20% of the rows for testing, with a fixed seed and stratification
# on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [387]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [389]:
# Re-label the target with a threshold of 10 on the precomputed engagement sum.
# NOTE(review): this only rewrites the column on flat_df. flat_df_clean, X, y
# and the train/test split were built in earlier cells and are not refreshed
# by this assignment, so the model below still trains on the earlier labels.
is_high_engagement = flat_df['engagement_sum'].ge(10)
flat_df['high_engagement'] = is_high_engagement.astype(int)
print(flat_df['high_engagement'].value_counts())

high_engagement
0    39092
Name: count, dtype: int64


In [391]:
# Logistic regression with a fixed seed and a raised iteration cap.
# fit() needs at least two distinct classes in y_train; with a single class it
# raises the ValueError shown in this cell's output.
model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

#### I wasn't able to continue because `high_engagement` contains only one class: `retweet_count` and `favorite_count` are both zero for every tweet, so their sum never reaches the threshold.