In [16]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.utils import compute_class_weight

from sklearn.feature_extraction.text import TfidfVectorizer

import scipy

In [6]:
df = pd.read_csv('data/tweet_data.csv')
print("number of users: ", df.username.unique().shape[0])
print("columns", df.columns)

userlist_shuffled = pd.Series(df.username.unique()).sample(frac=1, random_state=26).reset_index(drop=True).tolist()

mid = 4 * (len(userlist_shuffled) // 5)
train = df[df.username.isin(userlist_shuffled[:mid])]
test = df[df.username.isin(userlist_shuffled[mid:])]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print("train shape: ", train.shape, "test shape", test.shape)

number of users:  1050
columns Index(['id_str', 'created_at', 'text', 'entities', 'retweeted', 'username',
       'realname', 'gender', 'age', 'meslek', 'year', 'year_relative',
       'age_normalized', 'gender_enc', 'age_group', 'age_group_norm',
       'age_enc', 'age_enc_norm'],
      dtype='object')
train shape:  (1641471, 18) test shape (391212, 18)


In [7]:
def get_retweet(tw):
    if tw.startswith('RT @'): 
        return tw.split(" ")[1][1:-1]
    else:
        return ""

def concat_retweets(df, column):
    """
    concatenates tweets for each user
    returns dataframe including usernames and corresponding concatenated tweets
    """
    map_user = list()
    map_retweets = list()
    
    for user in tqdm(df.username.unique()):
        map_user.append(user)
        map_retweets.append(" ".join([get_retweet(text) for text in df[df.username == user][column].values]))
  
    return pd.DataFrame({"username":map_user, "retweet_list":map_retweets}, columns=["username", "retweet_list"])

In [38]:
map_user_tweet_tr = concat_retweets(train, 'text')
map_user_tweet_ts = concat_retweets(test, 'text')

merged_tr = map_user_tweet_tr.merge(train[['username',
                   'realname','meslek', 'age_group',
                   'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

merged_ts = map_user_tweet_ts.merge(test[['username',
                   'realname','meslek', 'age_group',
                   'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

100%|██████████| 840/840 [01:00<00:00, 13.91it/s]
100%|██████████| 210/210 [00:03<00:00, 54.81it/s]


In [39]:
dup_usernames = np.load('duplicated_usernames.npy', allow_pickle=True)
dup_usernames = np.append(dup_usernames, 'gamzercel')

merged_tr = merged_tr[~merged_tr.username.isin(dup_usernames)]
merged_ts = merged_ts[~merged_ts.username.isin(dup_usernames)]

### validation

In [33]:
mid = (merged_tr.shape[0] // 4) * 3
merged_ts = merged_tr[mid:]
merged_tr = merged_tr[:mid]

In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

results = []

models = [SVC(), RandomForestClassifier(), LogisticRegression(), KNeighborsClassifier()]
labels = ["age_enc", "gender_enc"]

for label in labels:
    for model_to_fit in [SVC(), RandomForestClassifier(), LogisticRegression(), KNeighborsClassifier()]:
        if type(model_to_fit).__name__ == 'RandomForestClassifier':
            num_runs = 3
        else:
            num_runs = 1

        if label == "age_enc":
            N = 50
        elif label == "gender_enc":
            N = 200
        else:
            raise Exception("Here is exception")

        for _ in range(num_runs):
            vectorizer = TfidfVectorizer(ngram_range=(1,1), 
                        max_features=N,
                        analyzer='word')
            vectorizer.fit(merged_tr["retweet_list"])

            train_rt_csr = vectorizer.transform(merged_tr["retweet_list"])
            test_rt_csr = vectorizer.transform(merged_ts["retweet_list"])

            # fit model
            model = model_to_fit
            model.fit(train_rt_csr, merged_tr[label])

            preds=list()
            for test_vec_csr in test_rt_csr:
                pred = model.predict(test_vec_csr)[0]
                preds.append(pred)

            average = None
            if 'age_enc':
                average = 'macro'
            else:
                average = 'binary'

            f1 = "{:.3f}".format(f1_score(preds, merged_ts[label], average=average))
            acc = "{:.3f}".format(accuracy_score(preds, merged_ts[label]))
            pre = "{:.3f}".format(precision_score(preds, merged_ts[label], average=average))
            rec = "{:.3f}".format(recall_score(preds, merged_ts[label], average=average))

            results.append([type(model_to_fit).__name__, f1, acc, pre, rec])

In [37]:
pd.DataFrame(results, columns=["model", "f1","acc","pre","rec"])

Unnamed: 0,model,f1,acc,pre,rec
0,SVC,0.377,0.455,0.373,0.54
1,RandomForestClassifier,0.297,0.416,0.319,0.318
2,RandomForestClassifier,0.31,0.392,0.315,0.393
3,RandomForestClassifier,0.333,0.416,0.337,0.391
4,LogisticRegression,0.368,0.45,0.369,0.459
5,KNeighborsClassifier,0.323,0.421,0.335,0.38
6,SVC,0.593,0.603,0.596,0.603
7,RandomForestClassifier,0.625,0.632,0.626,0.632
8,RandomForestClassifier,0.606,0.612,0.608,0.612
9,RandomForestClassifier,0.629,0.636,0.631,0.637


0	SVC	                    0.377	0.455	0.373	0.540
1	RandomForestClassifier	0.297	0.416	0.319	0.318
2	RandomForestClassifier	0.310	0.392	0.315	0.393
3	RandomForestClassifier	0.333	0.416	0.337	0.391
4	LogisticRegression	    0.368	0.450	0.369	0.459
5	KNeighborsClassifier	0.323	0.421	0.335	0.380
6	SVC	                    0.593	0.603	0.596	0.603
7	RandomForestClassifier	0.625	0.632	0.626	0.632
8	RandomForestClassifier	0.606	0.612	0.608	0.612
9	RandomForestClassifier	0.629	0.636	0.631	0.637
10	LogisticRegression	    0.584	0.593	0.587	0.593
11	KNeighborsClassifier	0.598	0.598	0.602	0.603

### test

In [40]:
import warnings
warnings.filterwarnings("ignore")

results = []

models = [SVC(), RandomForestClassifier(), LogisticRegression(), KNeighborsClassifier()]
labels = ["age_enc", "gender_enc"]

for label in labels:
    for model_to_fit in models:
        if type(model_to_fit).__name__ == 'RandomForestClassifier':
            num_runs = 3
        else:
            num_runs = 1

        if label == "age_enc":
            N = 50
        elif label == "gender_enc":
            N = 200
        else:
            raise Exception("Here is exception")

        for _ in range(num_runs):

            vectorizer = TfidfVectorizer(ngram_range=(1,1), 
                            max_features=N,
                            analyzer='word')
            vectorizer.fit(merged_tr["retweet_list"])

            train_rt_csr = vectorizer.transform(merged_tr["retweet_list"])
            test_rt_csr = vectorizer.transform(merged_ts["retweet_list"])

            # fit model
            model = model_to_fit
            model.fit(train_rt_csr, merged_tr[label])

            preds=list()
            for test_vec_csr in test_rt_csr:
                pred = model.predict(test_vec_csr)[0]
                preds.append(pred)

            average = None
            if merged_ts[label].nunique() > 2:
                average = 'macro'
            else:
                average = 'binary'

            f1 = "{:.3f}".format(f1_score(preds, merged_ts[label], average=average))
            acc = "{:.3f}".format(accuracy_score(preds, merged_ts[label]))
            pre = "{:.3f}".format(precision_score(preds, merged_ts[label], average=average))
            rec = "{:.3f}".format(recall_score(preds, merged_ts[label], average=average))

            results.append([type(model_to_fit).__name__, f1, acc, pre, rec])

In [41]:
pd.DataFrame(results, columns=["model", "f1","acc","pre","rec"])

Unnamed: 0,model,f1,acc,pre,rec
0,SVC,0.372,0.449,0.37,0.435
1,RandomForestClassifier,0.315,0.406,0.322,0.336
2,RandomForestClassifier,0.341,0.44,0.349,0.364
3,RandomForestClassifier,0.327,0.415,0.33,0.364
4,LogisticRegression,0.344,0.42,0.344,0.398
5,KNeighborsClassifier,0.297,0.391,0.306,0.332
6,SVC,0.62,0.633,0.705,0.554
7,RandomForestClassifier,0.632,0.657,0.693,0.581
8,RandomForestClassifier,0.633,0.652,0.705,0.574
9,RandomForestClassifier,0.632,0.657,0.693,0.581



0	SVC	                    0.372	0.449	0.370	0.435
1	RandomForestClassifier	0.315	0.406	0.322	0.336
2	RandomForestClassifier	0.341	0.440	0.349	0.364
3	RandomForestClassifier	0.327	0.415	0.330	0.364
4	LogisticRegression	    0.344	0.420	0.344	0.398
5	KNeighborsClassifier	0.297	0.391	0.306	0.332
6	SVC	                    0.620	0.633	0.705	0.554
7	RandomForestClassifier	0.632	0.657	0.693	0.581
8	RandomForestClassifier	0.633	0.652	0.705	0.574
9	RandomForestClassifier	0.632	0.657	0.693	0.581
10	LogisticRegression	    0.551	0.638	0.523	0.582
11	KNeighborsClassifier	0.610	0.599	0.739	0.520