## Importing libs

In [27]:
import pandas as pd
import os
from tqdm import tqdm 
import numpy as np
import pickle
import seaborn as sns
import datetime
import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None

## Importing data

In [52]:
df = (
    pd.read_csv('tweets/all_tweets.csv')
    .iloc[:, 1:]
)

In [53]:
max_date = pd.to_datetime(df['created_at'].astype(str).str[:10]).max()
max_date

Timestamp('2023-01-29 00:00:00')

In [54]:
df.shape

(359036, 28)

In [55]:
users = (
    pd.read_pickle('data_users.pkl')
    .rename(columns={'username': 'author'})[['author','followers_count',
                                             'following_count', 'tweet_count', 'created_at']]
)

In [56]:
users['days_of_creation'] = (max_date - pd.to_datetime(users['created_at'].astype(str).str[:10])).dt.days

In [57]:
users = users.drop(columns=['created_at'])

In [58]:
users.shape

(204, 5)

## Dataprep

In [59]:
df = users.merge(df, on=['author'])
df.shape

(359036, 32)

In [61]:
df = df.drop(columns=['author_id', 'withheld'])

In [62]:
df['ratio_like'] = df['like_count'] / df['impression_count']
df['ratio_retweet'] = df['retweet_count'] / df['impression_count']

#### Botometer

In [63]:
bot_df = pd.DataFrame(columns=[
    'author',
    'majority_lang',
    'english',
    'universal',
    'eng_astroturf',
    'eng_fake_follower',
    'eng_financial',
    'eng_other',
    'eng_overall',
    'eng_self_declared',
    'eng_spammer',
    'uni_astroturf',
    'uni_fake_follower',
    'uni_financial',
    'uni_other',
    'uni_overall',
    'uni_self_declared',
    'uni_spammer'
])

for i in tqdm(os.listdir('users_botometer/')):
    with open(f'users_botometer/{i}', 'rb') as f:
        bot_col = pickle.load(f)
        
    bot_feat = [
        bot_col['user']['user_data']['screen_name'],
        bot_col['user']['majority_lang'],
        bot_col['cap']['english'],
        bot_col['cap']['universal'],
        bot_col['raw_scores']['english']['astroturf'],
        bot_col['raw_scores']['english']['fake_follower'],
        bot_col['raw_scores']['english']['financial'],
        bot_col['raw_scores']['english']['other'],
        bot_col['raw_scores']['english']['overall'],
        bot_col['raw_scores']['english']['self_declared'],
        bot_col['raw_scores']['english']['spammer'],
        bot_col['raw_scores']['universal']['astroturf'],
        bot_col['raw_scores']['universal']['fake_follower'],
        bot_col['raw_scores']['universal']['financial'],
        bot_col['raw_scores']['universal']['other'],
        bot_col['raw_scores']['universal']['overall'],
        bot_col['raw_scores']['universal']['self_declared'],
        bot_col['raw_scores']['universal']['spammer']
    ]
    
    bot_df.loc[len(bot_df)] = bot_feat

100%|██████████████████████████████████████████████████████████████████| 204/204 [00:01<00:00, 123.69it/s]


In [64]:
bot_df.head()

Unnamed: 0,author,majority_lang,english,universal,eng_astroturf,eng_fake_follower,eng_financial,eng_other,eng_overall,eng_self_declared,eng_spammer,uni_astroturf,uni_fake_follower,uni_financial,uni_other,uni_overall,uni_self_declared,uni_spammer
0,19KHK701,tr,0.819915,0.767421,0.31,0.56,0.29,0.82,0.82,0.01,0.05,0.27,0.4,0.21,0.49,0.26,0.09,0.03
1,21meralsimsek,tr,0.779582,0.743721,0.18,0.18,0.03,0.57,0.31,0.0,0.04,0.22,0.19,0.0,0.13,0.23,0.0,0.02
2,692khk,tr,0.796862,0.800688,0.37,0.47,0.15,0.67,0.67,0.0,0.07,0.22,0.48,0.11,0.46,0.37,0.0,0.04
3,Adana_KHK,tr,0.803671,0.789743,0.06,0.23,0.01,0.76,0.76,0.0,0.03,0.07,0.11,0.0,0.36,0.31,0.0,0.0
4,AfsinSuleyman,tr,0.796667,0.767421,0.23,0.19,0.01,0.61,0.61,0.0,0.02,0.26,0.08,0.0,0.31,0.26,0.0,0.0


In [67]:
df = df.merge(bot_df.drop(columns=['majority_lang']))

## Model

### Target

In [69]:
df_like = df[
    (df['impression_count'] > 0) &
    (df['ratio_like'] <= 1)
]

In [71]:
df_retweet = df[
    (df['impression_count'] > 0) &
    (df['ratio_retweet'] <= 1)
]

In [90]:
pd.concat([
    df_like['ratio_like'].describe(),
    df_retweet['ratio_retweet'].describe(),
], axis=1)

Unnamed: 0,ratio_like,ratio_retweet
count,21234.0,21016.0
mean,0.023062,0.012231
std,0.030491,0.026889
min,0.0,0.0
25%,0.004197,0.0
50%,0.013761,0.002116
75%,0.032182,0.014615
max,1.0,1.0


In [91]:
df_like['ratio_like'] = pd.qcut(df_like['ratio_like'], 3, ['low', 'medium', 'high'])
df_retweet['ratio_retweet'] = pd.qcut(df_like['ratio_retweet'], 3, ['low', 'medium', 'high'])

### Features

In [104]:
features = [
    'created_at'
    ,'days_of_creation'
    ,'followers_count'
    ,'following_count'
    ,'days_of_creation'
    ,'in_reply_to_user_id'
    ,'text'
    ,'media_keys'
    ,'hashtags'
    ,'mentions'
    ,'is_retweet'
    ,'possibly_sensitive'
    ,'english'
    ,'universal'
    ,'eng_astroturf'
    ,'eng_fake_follower'
    ,'eng_financial'
    ,'eng_other'
    ,'eng_overall'
    ,'eng_self_declared'
    ,'eng_spammer'
    ,'uni_astroturf'
    ,'uni_fake_follower'
    ,'uni_financial'
    ,'uni_other'
    ,'uni_overall'
    ,'uni_self_declared'
    ,'uni_spammer'
]

In [105]:
def create_features(data, target, features, max_date=max_date):
    y = data[target]
    
    x = data[features]
    x['created_at'] = (max_date - pd.to_datetime(x['created_at'].str[:10])).dt.days
    x['in_reply_to_user'] = np.where(x['in_reply_to_user_id'].isna(), 0, 1)
    x['text_length'] = x['text'].apply(lambda x: len(x))
    x['has_media'] = np.where(x['media_keys'].isna(), 0, 1)
    x['has_hashtags'] = np.where(x['hashtags'].isna(), 0, 1)
    x['has_mentions'] = np.where(x['mentions'].isna(), 0, 1)
    x['is_retweet'] = x['is_retweet'].astype(int)
    x['possibly_sensitive'] = x['possibly_sensitive'].astype(int)
    
    x = x.drop(columns=[
        'in_reply_to_user_id',
        'text',
        'is_retweet',
        'possibly_sensitive',
        'media_keys',
        'hashtags',
        'mentions'
    ])
    
    return x, y

#### Logistic Regression

In [124]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

##### Ratio likes

In [151]:
x, y = create_features(df_like, 'ratio_like', features)

In [152]:
lr = LogisticRegression()

In [153]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [154]:
lr.fit(x_train, y_train)

In [155]:
y_pred = lr.predict(x_test)

In [156]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.52      0.55      0.53      1768
         low       0.45      0.38      0.41      1733
      medium       0.40      0.43      0.41      1808

    accuracy                           0.45      5309
   macro avg       0.45      0.45      0.45      5309
weighted avg       0.45      0.45      0.45      5309



In [157]:
f1_score(y_test, y_pred, average='macro')

0.451892825067253

In [158]:
pd.DataFrame(lr.coef_, index=lr.classes_, columns=x.columns).T.round(3)

Unnamed: 0,high,low,medium
created_at,0.001,-0.0,-0.0
days_of_creation,-0.0,0.0,0.0
followers_count,-0.0,0.0,0.0
following_count,0.0,-0.0,0.0
days_of_creation,-0.0,0.0,0.0
english,0.0,-0.0,-0.0
universal,0.0,-0.0,-0.0
eng_astroturf,0.0,-0.0,-0.0
eng_fake_follower,0.0,-0.0,-0.0
eng_financial,0.0,-0.0,-0.0


##### Ratio retweets

In [159]:
x, y = create_features(df_retweet, 'ratio_retweet', features)

In [160]:
lr = LogisticRegression()

In [161]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [162]:
lr.fit(x_train, y_train)

In [163]:
y_pred = lr.predict(x_test)

In [164]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.51      0.65      0.58      1698
         low       0.62      0.27      0.38      1804
      medium       0.53      0.70      0.60      1752

    accuracy                           0.54      5254
   macro avg       0.56      0.54      0.52      5254
weighted avg       0.56      0.54      0.52      5254



In [165]:
f1_score(y_test, y_pred, average='macro')

0.5193830179689486

In [166]:
pd.DataFrame(lr.coef_, index=lr.classes_, columns=x.columns).T.round(3)

Unnamed: 0,high,low,medium
created_at,0.001,0.0,-0.001
days_of_creation,-0.0,0.0,0.0
followers_count,-0.0,-0.0,0.0
following_count,0.0,-0.0,-0.0
days_of_creation,-0.0,0.0,0.0
english,0.0,0.0,-0.0
universal,0.0,-0.0,-0.0
eng_astroturf,0.0,0.0,-0.0
eng_fake_follower,0.0,-0.0,-0.0
eng_financial,0.0,0.0,-0.0


#### Hashtags

In [192]:
from unidecode import unidecode

In [195]:
lst_hashtags = []

for i in tqdm(df['hashtags']):
    
    try:
        hashtags = [j['tag'] for j in eval(i)]
        hashtags = unidecode(' '.join(hashtags))
    except:
        hashtags = ''
    
    lst_hashtags.append(hashtags)

 ... (more hidden) ...


In [282]:
data = df.copy()

In [283]:
data['hashtags'] = lst_hashtags

In [279]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

In [306]:
for i in TARGETS:
    data_ = data[
        (data['impression_count'] > 0) &
        (data[i] <= 1)
    ]

    pipe = Pipeline([
        ('bow', CountVectorizer()),
        ('rf', RandomForestClassifier())
    ])
    
    y = pd.qcut(data_[i], 3, labels=['low', 'medium', 'high'])

    pipe.fit(data_['hashtags'], y)

    y_pred = pipe.predict(data_['hashtags'])

    print(classification_report(y, y_pred))
    print(f1_score(y, y_pred, average='macro'))
    print('='*80)
    
    

              precision    recall  f1-score   support

        high       0.65      0.42      0.51      7003
         low       0.37      0.78      0.51      7006
      medium       0.87      0.23      0.36      7007

    accuracy                           0.48     21016
   macro avg       0.63      0.48      0.46     21016
weighted avg       0.63      0.48      0.46     21016

0.4608801175257676
              precision    recall  f1-score   support

        high       0.63      0.40      0.49      7078
         low       0.39      0.83      0.53      7078
      medium       0.81      0.20      0.32      7078

    accuracy                           0.47     21234
   macro avg       0.61      0.47      0.44     21234
weighted avg       0.61      0.47      0.44     21234

0.4448270717425849


In [304]:
for i in TARGETS:
    data_ = data[
        (data['impression_count'] > 0) &
        (data[i] <= 1)
    ]

    pipe = Pipeline([
        ('bow', CountVectorizer()),
        ('lr', LogisticRegression())
    ])
    
    y = pd.qcut(data_[i], 3, labels=['low', 'medium', 'high'])

    pipe.fit(data_['hashtags'], y)

    y_pred = pipe.predict(data_['hashtags'])

    print(classification_report(y, y_pred))
    print(f1_score(y, y_pred, average='macro'))
    print('='*80)
    
    

              precision    recall  f1-score   support

        high       0.65      0.41      0.50      7003
         low       0.80      0.03      0.06      7006
      medium       0.42      0.98      0.59      7007

    accuracy                           0.47     21016
   macro avg       0.62      0.47      0.38     21016
weighted avg       0.62      0.47      0.38     21016

0.38335249279781136
              precision    recall  f1-score   support

        high       0.63      0.39      0.48      7078
         low       0.39      0.82      0.53      7078
      medium       0.77      0.19      0.31      7078

    accuracy                           0.47     21234
   macro avg       0.60      0.47      0.44     21234
weighted avg       0.60      0.47      0.44     21234

0.4392598931574363
