In [6]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from sklearn.utils import compute_class_weight

from sklearn.feature_extraction.text import TfidfVectorizer

import scipy

In [7]:
def progress_transform(serie, vectorizer, chunk_size = 10_000):
    number_chunks = len(serie) // chunk_size
    tf_idf_matrix = scipy.sparse.csr.csr_matrix([])
    for c, chunk in tqdm(enumerate(np.array_split(serie, number_chunks))):
        if not c:
            tf_idf_matrix = vectorizer.transform(chunk)
        else:
            tf_idf_matrix_part = vectorizer.transform(chunk)
            tf_idf_matrix = scipy.sparse.vstack((tf_idf_matrix, tf_idf_matrix_part))

    return tf_idf_matrix

In [8]:
def concat_tweets(df, column):
    """
    concatenates tweets for each user
    returns dataframe including usernames and corresponding concatenated tweets
    """
    map_user = list()
    map_tweets = list()
    
    for user in tqdm(df.username.unique()):
        map_user.append(user)
        tweets_concat = " ".join(df[df.username == user][column].values)
        map_tweets.append(tweets_concat)
        
    return pd.DataFrame({"username":map_user, "tw_concat":map_tweets}, columns=["username", "tw_concat"])

In [9]:
train=pd.read_csv("tweets_train.csv", low_memory=False)
test=pd.read_csv("tweets_test.csv", low_memory=False)

In [11]:
train.dropna(subset=["processed_text"], inplace=True)
test.dropna(subset=["processed_text"], inplace=True)

In [31]:
YEARS = [2020, 2019, 2018]

for YEAR in YEARS:
    for max_features in [5_000, 10_000]:
        
        print(YEAR)
        train_year = train[train.year.isin(range(YEAR,2022))]
        test_year = test[test.year.isin(range(YEAR,2022))]

        map_user_tweet_tr = concat_tweets(train_year, 'processed_text')
        map_user_tweet_ts = concat_tweets(test_year, 'processed_text')

        merged_tr = map_user_tweet_tr.merge(train[['username',
                           'realname','meslek', 'age_group',
                           'age_enc', 'gender', 'gender_enc']].drop_duplicates(subset=['username']), how="left", left_on="username", right_on="username")

        merged_ts = map_user_tweet_ts.merge(test[['username',
                           'realname','meslek', 'age_group',
                           'age_enc', 'gender', 'gender_enc']].drop_duplicates(subset=['username']), how="left", left_on="username", right_on="username")

        vectorizer = TfidfVectorizer(ngram_range=(1,2), 
                             max_features=max_features,
                             analyzer='word')

        fitted_vectorizer = vectorizer.fit(merged_tr.tw_concat)

        train_vecs_csr = progress_transform(merged_tr.tw_concat, fitted_vectorizer, chunk_size = 100)
        test_vecs_csr = progress_transform(merged_ts.tw_concat, fitted_vectorizer, chunk_size = 100)


        labels = labels = ["age_enc", "gender_enc"]
        BIAS = False

        for label in labels:

            models = [SVC(), RandomForestClassifier()]

            if BIAS:
                classes = merged_tr[label].unique().tolist()
                classWeight = compute_class_weight('balanced', classes, merged_tr[label]) 
                classWeight = dict(enumerate(classWeight))
                models = [SVC(class_weight=classWeight), RandomForestClassifier(class_weight=classWeight)]

            for model_to_fit in models:
                # fit model
                model = model_to_fit
                model.fit(train_vecs_csr, merged_tr[label])

                preds=list()
                for test_vec_csr in tqdm(test_vecs_csr):
                    pred = model.predict(test_vec_csr)[0]
                    preds.append(pred)

                average = None
                if merged_ts[label].nunique() > 2:
                    average = 'macro'
                else:
                    average = 'binary'

                print(label, type(model).__name__, "f1:", "{:.3f},".format(f1_score(preds, merged_ts[label], average=average)),
                    "acc:", "{:.3f},".format(accuracy_score(preds, merged_ts[label])),
                     "pre:", "{:.3f},".format(precision_score(preds, merged_ts[label], average=average)),
                     "rec:", "{:.3f}".format(recall_score(preds, merged_ts[label], average=average)))

2020


  0%|          | 0/541 [00:00<?, ?it/s]

  0%|          | 0/140 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


age_enc SVC f1: 0.253, acc: 0.457, pre: 0.306, rec: 0.218


0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


age_enc RandomForestClassifier f1: 0.216, acc: 0.400, pre: 0.260, rec: 0.211


0it [00:00, ?it/s]

gender_enc SVC f1: 0.586, acc: 0.707, pre: 0.475, rec: 0.763


0it [00:00, ?it/s]

gender_enc RandomForestClassifier f1: 0.588, acc: 0.700, pre: 0.492, rec: 0.732
2019


  0%|          | 0/551 [00:00<?, ?it/s]

  0%|          | 0/145 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


age_enc SVC f1: 0.287, acc: 0.483, pre: 0.331, rec: 0.288


0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


age_enc RandomForestClassifier f1: 0.244, acc: 0.414, pre: 0.279, rec: 0.244


0it [00:00, ?it/s]

gender_enc SVC f1: 0.588, acc: 0.710, pre: 0.484, rec: 0.750


0it [00:00, ?it/s]

gender_enc RandomForestClassifier f1: 0.667, acc: 0.745, pre: 0.597, rec: 0.755
2018


  0%|          | 0/557 [00:00<?, ?it/s]

  0%|          | 0/146 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


age_enc SVC f1: 0.285, acc: 0.479, pre: 0.330, rec: 0.279


0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


age_enc RandomForestClassifier f1: 0.261, acc: 0.459, pre: 0.308, rec: 0.259


0it [00:00, ?it/s]

gender_enc SVC f1: 0.629, acc: 0.733, pre: 0.532, rec: 0.767


0it [00:00, ?it/s]

gender_enc RandomForestClassifier f1: 0.667, acc: 0.747, pre: 0.597, rec: 0.755
