In [None]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

In [None]:
df = pd.read_csv('tweet_data.csv')
print("number of users: ", df.username.unique().shape[0])
print("columns", df.columns)

userlist_shuffled = pd.Series(df.username.unique()).sample(frac=1, random_state=26).reset_index(drop=True).tolist()

mid = 4 * (len(userlist_shuffled) // 5)
train = df[df.username.isin(userlist_shuffled[:mid])]
test = df[df.username.isin(userlist_shuffled[mid:])]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print("train shape: ", train.shape, "test shape", test.shape)

In [None]:
def get_retweet(tw):
    if tw.startswith('RT @'): 
        return tw.split(" ")[1][1:-1]
    else:
        return ""

def concat_retweets(df, column):
    """
    concatenates tweets for each user
    returns dataframe including usernames and corresponding concatenated tweets
    """
    map_user = list()
    map_retweets = list()
    
    for user in tqdm(df.username.unique()):
        map_user.append(user)
        map_retweets.append(" ".join([get_retweet(text) for text in df[df.username == user][column].values]))
  
    return pd.DataFrame({"username":map_user, "retweet_list":map_retweets}, columns=["username", "retweet_list"])

In [None]:
map_user_tweet_tr = concat_retweets(train, 'text')
map_user_tweet_ts = concat_retweets(test, 'text')

merged_tr = map_user_tweet_tr.merge(train[['username',
                   'realname','meslek', 'age_group',
                   'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

merged_ts = map_user_tweet_ts.merge(test[['username',
                   'realname','meslek', 'age_group',
                   'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

In [None]:
merged = merged_tr.append(merged_ts)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

N = 20

vectorizer = TfidfVectorizer(ngram_range=(1,1), 
                     max_features=N,
                     analyzer='word')
vectorizer.fit(merged["retweet_list"])

In [68]:
train_rt_csr = vectorizer.transform(merged_tr["retweet_list"])
test_rt_csr = vectorizer.transform(merged_ts["retweet_list"])

In [69]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

models = [SVC(), RandomForestClassifier()]
labels = ["age_enc", "gender_enc"]

In [70]:
for model_to_fit in models:
    for label in labels:
        # fit model
        model = model_to_fit
        model.fit(train_rt_csr, merged_tr[label])
        
        preds=list()
        for test_vec_csr in tqdm(test_rt_csr):
            pred = model.predict(test_vec_csr)[0]
            preds.append(pred)
        
        if merged_ts[label].nunique() > 2:
            print(label, 'macro', type(model).__name__, "f1:", "{:.3f},".format(f1_score(preds, merged_ts[label], average='macro')),
                "acc:", "{:.3f},".format(accuracy_score(preds, merged_ts[label])),
                 "pre:", "{:.3f},".format(precision_score(preds, merged_ts[label], average='macro')),
                 "rec:", "{:.3f}".format(recall_score(preds, merged_ts[label], average='macro')))
            print(pd.Series(preds).value_counts())
        else:
            print(label, 'binary', type(model).__name__, "f1:", "{:.3f},".format(f1_score(preds, merged_ts[label], average='binary', pos_label=0)),
                "acc:", "{:.3f},".format(accuracy_score(preds, merged_ts[label])),
                 "pre:", "{:.3f},".format(precision_score(preds, merged_ts[label], average='binary', pos_label=0)),
                 "rec:", "{:.3f}".format(recall_score(preds, merged_ts[label], average='binary', pos_label=0)))
            print(pd.Series(preds).value_counts())

0it [00:00, ?it/s]

age_enc macro SVC f1: 0.227, acc: 0.342, pre: 0.257, rec: 0.321
0    73
3    38
2    36
4     2
dtype: int64


  _warn_prf(average, modifier, msg_start, len(result))


0it [00:00, ?it/s]

gender_enc binary SVC f1: 0.700, acc: 0.564, pre: 0.905, rec: 0.571
0    133
1     16
dtype: int64


0it [00:00, ?it/s]

age_enc macro RandomForestClassifier f1: 0.294, acc: 0.423, pre: 0.317, rec: 0.332
0    74
2    30
3    29
1     8
4     8
dtype: int64


  _warn_prf(average, modifier, msg_start, len(result))


0it [00:00, ?it/s]

gender_enc binary RandomForestClassifier f1: 0.718, acc: 0.611, pre: 0.881, rec: 0.607
0    122
1     27
dtype: int64
