In [13]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.utils import compute_class_weight

from sklearn.feature_extraction.text import TfidfVectorizer

import scipy

In [2]:
def progress_transform(serie, vectorizer, chunk_size = 10_000):
    number_chunks = len(serie) // chunk_size
    tf_idf_matrix = scipy.sparse.csr.csr_matrix([])
    for c, chunk in tqdm(enumerate(np.array_split(serie, number_chunks))):
        if not c:
            tf_idf_matrix = vectorizer.transform(chunk)
        else:
            tf_idf_matrix_part = vectorizer.transform(chunk)
            tf_idf_matrix = scipy.sparse.vstack((tf_idf_matrix, tf_idf_matrix_part))

    return tf_idf_matrix

### Filtered by years

In [8]:
def concat_tweets(df, column):
    """
    concatenates tweets for each user
    returns dataframe including usernames and corresponding concatenated tweets
    """
    map_user = list()
    map_tweets = list()
    
    for user in tqdm(df.username.unique()):
        map_user.append(user)
        tweets_concat = " ".join(df[df.username == user][column].values)
        map_tweets.append(tweets_concat)
        
    return pd.DataFrame({"username":map_user, "tw_concat":map_tweets}, columns=["username", "tw_concat"])

In [9]:
train=pd.read_csv("tweets_train.csv", low_memory=False)
test=pd.read_csv("tweets_test.csv", low_memory=False)

In [11]:
train.dropna(subset=["processed_text"], inplace=True)
test.dropna(subset=["processed_text"], inplace=True)

In [46]:
YEARS = [2020, 2019, 2018]

results = []

for YEAR in YEARS:
    
    print(YEAR)
    train_year = train[train.year.isin(range(YEAR,2022))]
    test_year = test[test.year.isin(range(YEAR,2022))]

    map_user_tweet_tr = concat_tweets(train_year, 'processed_text')
    map_user_tweet_ts = concat_tweets(test_year, 'processed_text')

    merged_tr = map_user_tweet_tr.merge(train[['username',
                       'realname','meslek', 'age_group',
                       'age_enc', 'gender', 'gender_enc']].drop_duplicates(subset=['username']), how="left", left_on="username", right_on="username")

    merged_ts = map_user_tweet_ts.merge(test[['username',
                       'realname','meslek', 'age_group',
                       'age_enc', 'gender', 'gender_enc']].drop_duplicates(subset=['username']), how="left", left_on="username", right_on="username")

    
    for max_features in [5_000, 10_000]:
        
        vectorizer = TfidfVectorizer(ngram_range=(1,2), 
                             max_features=max_features,
                             analyzer='word')

        fitted_vectorizer = vectorizer.fit(merged_tr.tw_concat)

        train_vecs_csr = progress_transform(merged_tr.tw_concat, fitted_vectorizer, chunk_size = 100)
        test_vecs_csr = progress_transform(merged_ts.tw_concat, fitted_vectorizer, chunk_size = 100)


        labels = ["gender_enc"] # age_enc
        BIAS = False

        for label in labels:

            models = [SVC(), RandomForestClassifier()]

            if BIAS:
                classes = merged_tr[label].unique().tolist()
                classWeight = compute_class_weight('balanced', classes, merged_tr[label]) 
                classWeight = dict(enumerate(classWeight))
                models = [SVC(class_weight=classWeight), RandomForestClassifier(class_weight=classWeight)]

            for model_to_fit in models:
                # fit model
                model = model_to_fit
                model.fit(train_vecs_csr, merged_tr[label])

                preds=list()
                for test_vec_csr in tqdm(test_vecs_csr):
                    pred = model.predict(test_vec_csr)[0]
                    preds.append(pred)

                average = None
                if merged_ts[label].nunique() > 2:
                    average = 'macro'
                else:
                    average = 'binary'
                
                f1 = "{:.3f}".format(f1_score(preds, merged_ts[label], average=average))
                acc = "{:.3f}".format(accuracy_score(preds, merged_ts[label]))
                pre = "{:.3f}".format(precision_score(preds, merged_ts[label], average=average))
                rec = "{:.3f}".format(recall_score(preds, merged_ts[label], average=average))
                
                results.append([f1, acc, pre, rec])
                
#                 print(label, type(model).__name__, "f1:", "{:.3f},".format(f1_score(preds, merged_ts[label], average=average)),
#                                                   "acc:", "{:.3f},".format(accuracy_score(preds, merged_ts[label])),
#                                                   "pre:", "{:.3f},".format(precision_score(preds, merged_ts[label], average=average)),
#                                                   "rec:", "{:.3f}".format(recall_score(preds, merged_ts[label], average=average)))

2020


  0%|          | 0/541 [00:00<?, ?it/s]

  0%|          | 0/140 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

2019


  0%|          | 0/551 [00:00<?, ?it/s]

  0%|          | 0/145 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

2018


  0%|          | 0/557 [00:00<?, ?it/s]

  0%|          | 0/146 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [47]:
pd.DataFrame(results, columns=["f1","acc","pre","rec"])

Unnamed: 0,f1,acc,pre,rec
0,0.586,0.707,0.475,0.763
1,0.615,0.714,0.525,0.744
2,0.592,0.714,0.475,0.784
3,0.58,0.7,0.475,0.744
4,0.588,0.71,0.484,0.75
5,0.661,0.738,0.597,0.74
6,0.588,0.71,0.484,0.75
7,0.667,0.752,0.581,0.783
8,0.629,0.733,0.532,0.767
9,0.654,0.753,0.548,0.81


### Validation

In [3]:
merged_tr=pd.read_csv("data/user_tweets_train.csv")
merged_ts=pd.read_csv("data/user_tweets_test.csv")
# print(merged_tr.shape, merged_ts.shape)

dup_usernames = np.load('duplicated_usernames.npy', allow_pickle=True)

merged_tr.dropna(subset=["tw_concat_stemmed"], inplace=True)
merged_tr = merged_tr[~merged_tr.username.isin(dup_usernames)]

merged_ts.dropna(subset=["tw_concat_stemmed"], inplace=True)
merged_ts = merged_ts[~merged_ts.username.isin(dup_usernames)]

In [4]:
print(merged_tr.shape, merged_ts.shape)

(833, 9) (207, 9)


In [5]:
mid = (merged_tr.shape[0] // 4) * 3
merged_ts = merged_tr[mid:]
merged_tr = merged_tr[:mid]

In [6]:
print(merged_tr.shape, merged_ts.shape)

(624, 9) (209, 9)


In [7]:
merged_ts.gender_enc.value_counts()

0    110
1     99
Name: gender_enc, dtype: int64

In [8]:
BIAS = False
results = []

models = [SVC(), RandomForestClassifier(), LogisticRegression(), KNeighborsClassifier()]
models = [LogisticRegression()]

labels = ["age_enc", "gender_enc"]
labels = ["gender_enc"]

preds_all = []

for max_features in [5_000, 10_000, 20_000]:

    for label in labels:
        
        # if label == 'age_enc':
        #     max_features = 20_000
        # else:
        #     max_features = 5_000

        # vectorizer = TfidfVectorizer(ngram_range=(3,3), 
        #                     max_features=max_features,
        #                     analyzer='char_wb')

        vectorizer = TfidfVectorizer(ngram_range=(1,2), 
                            max_features=max_features,
                            analyzer='word')


        fitted_vectorizer = vectorizer.fit(merged_tr.tw_concat_stemmed) # tw_concat_processed_text tw_concat_stemmed

        train_vecs_csr = progress_transform(merged_tr.tw_concat_stemmed, fitted_vectorizer, chunk_size = 100)
        test_vecs_csr = progress_transform(merged_ts.tw_concat_stemmed, fitted_vectorizer, chunk_size = 100)

        for model_to_fit in models:
            
            if type(model_to_fit).__name__ == 'RandomForestClassifier':
                num_runs = 3
            else:
                num_runs = 1

            for _ in range(num_runs):

                if BIAS:
                    classes = merged_tr[label].unique().tolist()
                    classWeight = compute_class_weight('balanced', classes, merged_tr[label]) 
                    classWeight = dict(enumerate(classWeight))
                    models = [SVC(class_weight=classWeight), RandomForestClassifier(class_weight=classWeight)]

                # fit model
                model = model_to_fit
                model.fit(train_vecs_csr, merged_tr[label])

                preds=list()
                for test_vec_csr in test_vecs_csr:
                    pred = model.predict(test_vec_csr)[0]
                    preds.append(pred)

                preds_all.append(preds)

                acc = "{:.3f}".format(accuracy_score(preds, merged_ts[label]))
                
                f1_macro = "{:.3f}".format(f1_score(preds, merged_ts[label], average='macro'))
                pre_macro = "{:.3f}".format(precision_score(preds, merged_ts[label], average='macro'))
                rec_macro = "{:.3f}".format(recall_score(preds, merged_ts[label], average='macro'))

                f1_weighted = "{:.3f}".format(f1_score(preds, merged_ts[label], average='weighted'))
                pre_weighted = "{:.3f}".format(precision_score(preds, merged_ts[label], average='weighted'))
                rec_weighted = "{:.3f}".format(recall_score(preds, merged_ts[label], average='weighted'))

                # f1_female = "{:.3f}".format(f1_score(preds, merged_ts[label], average='binary', pos_label=1))
                # pre_female = "{:.3f}".format(precision_score(preds, merged_ts[label], average='binary',  pos_label=1))
                # rec_female = "{:.3f}".format(recall_score(preds, merged_ts[label], average='binary',  pos_label=1))
                
                results.append([type(model_to_fit).__name__, acc, f1_macro, pre_macro, rec_macro, f1_weighted ])
                

  tf_idf_matrix = scipy.sparse.csr.csr_matrix([])
6it [00:15,  2.62s/it]
2it [00:05,  2.51s/it]
  tf_idf_matrix = scipy.sparse.csr.csr_matrix([])
6it [00:14,  2.41s/it]
2it [00:04,  2.32s/it]
  tf_idf_matrix = scipy.sparse.csr.csr_matrix([])
6it [00:15,  2.66s/it]
2it [00:05,  2.58s/it]


In [53]:
# print("{:.3f}".format(precision_score(preds_all[2], merged_ts[label], average='binary', pos_label=1)),
# "{:.3f}".format(precision_score(merged_ts[label], preds_all[2],  average='binary', pos_label=1)),
# "{:.3f}".format(recall_score(preds_all[2], merged_ts[label],  average='binary', pos_label=1)),
# "{:.3f}".format(recall_score(merged_ts[label], preds_all[2],  average='binary', pos_label=1)))

# confusion_matrix(merged_ts[label], preds_all[2], labels=[1,0])

# print("{:.3f}".format(precision_score(preds_all[2], merged_ts[label], average='binary', pos_label=0)),
# "{:.3f}".format(precision_score(merged_ts[label], preds_all[2],  average='binary', pos_label=0)),
# "{:.3f}".format(recall_score(preds_all[2], merged_ts[label],  average='binary', pos_label=0)),
# "{:.3f}".format(recall_score(merged_ts[label], preds_all[2],  average='binary', pos_label=0)))

0.747 0.712 0.712 0.747


In [10]:
pd.DataFrame(results, columns=["model", "acc", 'f1_macro', 'pre_macro', 'rec_macro', 'f1_weighted'])#.to_csv('trash.csv')

### Training

In [4]:
merged_tr=pd.read_csv("data/user_tweets_train.csv")
merged_ts=pd.read_csv("data/user_tweets_test.csv")

dup_usernames = np.load('duplicated_usernames.npy', allow_pickle=True)

merged_tr.dropna(subset=["tw_concat_stemmed"], inplace=True)
merged_tr = merged_tr[~merged_tr.username.isin(dup_usernames)]

merged_ts.dropna(subset=["tw_concat_stemmed"], inplace=True)
merged_ts = merged_ts[~merged_ts.username.isin(dup_usernames)]

In [8]:
merged_tr.columns

Index(['username', 'tw_concat_stemmed', 'age_group',
       'tw_concat_processed_text', 'realname', 'meslek', 'age_enc', 'gender',
       'gender_enc'],
      dtype='object')

In [9]:
BIAS = False
results = []

models = [SVC(), RandomForestClassifier(), LogisticRegression(), KNeighborsClassifier()]
labels = ["age_enc", "gender_enc"] 

for label in labels:
    
    if label == 'age_enc':
        max_features = 20_000
    else:
        max_features = 5_000

    vectorizer = TfidfVectorizer(ngram_range=(3,3), 
                        max_features=max_features,
                        analyzer='char_wb')

    # vectorizer = TfidfVectorizer(ngram_range=(1,2), 
    #                     max_features=max_features,
    #                     analyzer='word')


    fitted_vectorizer = vectorizer.fit(merged_tr.tw_concat_processed_text)

    train_vecs_csr = progress_transform(merged_tr.tw_concat_processed_text, fitted_vectorizer, chunk_size = 100)
    test_vecs_csr = progress_transform(merged_ts.tw_concat_processed_text, fitted_vectorizer, chunk_size = 100)

    for model_to_fit in [SVC(), KNeighborsClassifier()]:
        
        if type(model_to_fit).__name__ == 'RandomForestClassifier':
            num_runs = 3
        else:
            num_runs = 1

        for _ in range(num_runs):

            if BIAS:
                classes = merged_tr[label].unique().tolist()
                classWeight = compute_class_weight('balanced', classes, merged_tr[label]) 
                classWeight = dict(enumerate(classWeight))
                models = [SVC(class_weight=classWeight), RandomForestClassifier(class_weight=classWeight)]

            # fit model
            model = model_to_fit
            model.fit(train_vecs_csr, merged_tr[label])

            preds=list()
            for test_vec_csr in test_vecs_csr:
                pred = model.predict(test_vec_csr)[0]
                preds.append(pred)

            average = None
            if merged_ts[label].nunique() > 2:
                average = 'macro'
            else:
                average = 'binary'
            
            f1 = "{:.3f}".format(f1_score(preds, merged_ts[label], average=average))
            acc = "{:.3f}".format(accuracy_score(preds, merged_ts[label]))
            pre = "{:.3f}".format(precision_score(preds, merged_ts[label], average=average))
            rec = "{:.3f}".format(recall_score(preds, merged_ts[label], average=average))
            
            results.append([type(model_to_fit).__name__, f1, acc, pre, rec])
                

8it [00:45,  5.65s/it]
2it [00:11,  5.52s/it]
  _warn_prf(average, modifier, msg_start, len(result))
8it [00:56,  7.05s/it]
2it [00:12,  6.42s/it]


In [10]:
pd.DataFrame(results, columns=[type(model_to_fit).__name__,"f1","acc","pre","rec"])

Unnamed: 0,KNeighborsClassifier,f1,acc,pre,rec
0,SVC,0.366,0.498,0.39,0.374
1,KNeighborsClassifier,0.397,0.483,0.4,0.432
2,SVC,0.711,0.749,0.727,0.696
3,KNeighborsClassifier,0.658,0.633,0.83,0.545


In [14]:
recall_score(preds, merged_ts[label], average='binary', pos_label=0)

0.7304347826086957

0	SVC	                    0.366	0.498	0.390	0.374
1	KNeighborsClassifier	0.397	0.483	0.400	0.432
2	SVC	                    0.711	0.749	0.727	0.696
3	KNeighborsClassifier	0.658	0.633	0.830	0.545

### Best Model Performance SVC

In [7]:
# merged_tr=pd.read_csv("turkish_dataset/dataset/train_prep.csv").dropna(subset=["tw_concat_stemmed"])
# merged_ts=pd.read_csv("turkish_dataset/dataset/test_prep.csv").dropna(subset=["tw_concat_stemmed"])

In [2]:
# data_ids = pd.read_csv("data/celeb_filtered.csv", usecols=["user_id"])
celeb_info = pd.read_json(f'celebrity_profiling/ACL-19/celebrity-profiling/webis-celebrity-corpus-2019-distribution.ndjson', lines=True)

In [None]:
labels = list()
for label in celeb_info.labels:
    try:
        labels.append(label['sex or gender (P21)'].split()[0])
    except:
        labels.append(float("nan"))

celeb_info["gender"] = labels

In [26]:
celeb_info[celeb_info["gender"].isin(["female", "male"])].shape

male           46635
female         18315
transgender       40
non-binary        18
Name: gender, dtype: int64

In [8]:
label = "label_x"
model = SVC(probability=True)
max_features = 10_000

vectorizer = TfidfVectorizer(ngram_range=(1,2), 
                    max_features=max_features,
                    analyzer='word')

fitted_vectorizer = vectorizer.fit(merged_tr.tw_concat_stemmed)

train_vecs_csr = progress_transform(merged_tr.tw_concat_stemmed, fitted_vectorizer, chunk_size = 100)
test_vecs_csr = progress_transform(merged_ts.tw_concat_stemmed, fitted_vectorizer, chunk_size = 100)

model.fit(train_vecs_csr, merged_tr[label])

preds=list()
for test_vec_csr in tqdm(test_vecs_csr):
    pred = model.predict(test_vec_csr)[0]
    preds.append(pred)

average = None
if merged_ts[label].nunique() > 2:
    average = 'macro'
else:
    average = 'binary'

33it [00:04,  6.75it/s]
19it [00:03,  6.33it/s]
1924it [00:44, 43.29it/s]


In [10]:
f1 = "{:.3f}".format(f1_score(preds, merged_ts[label], average=average, pos_label='male'))
acc = "{:.3f}".format(accuracy_score(preds, merged_ts[label]))
pre = "{:.3f}".format(precision_score(preds, merged_ts[label], average=average, pos_label='male'))
rec = "{:.3f}".format(recall_score(preds, merged_ts[label], average=average, pos_label='male'))

print([f1, acc, pre, rec])

['0.781', '0.800', '0.784', '0.778']


In [12]:
# import pickle

# with open('models/gender.svc.bin', 'wb') as handle:
#     pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('models/gender.vectorizer.bin', 'wb') as handle2:
#     pickle.dump(vectorizer, handle2, protocol=pickle.HIGHEST_PROTOCOL)