In [7]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

In [8]:
df = pd.read_csv('tweet_data.csv')
print("number of users: ", df.username.unique().shape[0])
print("columns", df.columns)

userlist_shuffled = pd.Series(df.username.unique()).sample(frac=1, random_state=26).reset_index(drop=True).tolist()

mid = 4 * (len(userlist_shuffled) // 5)
train = df[df.username.isin(userlist_shuffled[:mid])]
test = df[df.username.isin(userlist_shuffled[mid:])]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print("train shape: ", train.shape, "test shape", test.shape)

number of users:  729
columns Index(['id_str', 'created_at', 'text', 'entities', 'retweeted', 'username',
       'realname', 'gender', 'age', 'meslek', 'gender_enc', 'age_group',
       'age_enc'],
      dtype='object')
train shape:  (1094507, 13) test shape (275204, 13)


### Preprocessing

In [9]:
import re

def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

text = df.text[0]
print(text) # with emoji

print(remove_emoji(text))

RT @Bozoklularr: Herkesin üniversitesi açılmış, bir kafelerde,sokaklarda story atıyorlar.
Allah'ım nasip et.🤲
RT @Bozoklularr: Herkesin üniversitesi açılmış, bir kafelerde,sokaklarda story atıyorlar.
Allah'ım nasip et.


In [10]:
def preprocess(entities_df, text_df):
    if isinstance(text_df, list): 
        prep_list = list()
        for ent_df, txt_df in tqdm(zip(entities_df, text_df)):
            prep_text=preprocess_ind(ent_df, txt_df)
            prep_list.append(prep_text)
        return prep_list
    else:
        return preprocess_ind(entities_df, text_df)
    
def preprocess_ind(entities_df, text_df):
    ent_dict = eval(entities_df)
    text = text_df
    
    # Extract entities directly from tweet entities
    texts_to_extract = []
    for key in ent_dict.keys():
        if ent_dict[key]:
            for ent_dict_2 in ent_dict[key]:
                inds = ent_dict_2['indices']
                texts_to_extract.append(text[inds[0]:inds[1]])
    for text_to_extract in texts_to_extract:
        text = text.replace(text_to_extract, '')
        
    # extract any words starting with # or @ or links starting with https
    for tag in text.split(): 
        if tag.startswith("#") or tag.startswith("@") or tag.startswith("https://"):
            text=text.replace(tag, '')
    
    # extract newline and RT
    text = text.replace('\n', ' ')
    text = text.replace('RT : ', '')
    
    #demojidfy
    text = remove_emoji(text)
    
    return text

In [11]:
preprocess(df.entities[0], df.text[0])

"Herkesin üniversitesi açılmış, bir kafelerde,sokaklarda story atıyorlar. Allah'ım nasip et."

In [12]:
train["processed_text"] = preprocess(train.entities.tolist(), train.text.tolist())
test["processed_text"] = preprocess(test.entities.tolist(), test.text.tolist())

0it [00:00, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["processed_text"] = preprocess(train.entities.tolist(), train.text.tolist())


0it [00:00, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["processed_text"] = preprocess(test.entities.tolist(), test.text.tolist())


#### stemming

In [13]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

stemmer=SnowballStemmer('porter')
tokenizer = RegexpTokenizer(r'\w+')

def stem(text_df, stemmer):
    stemmed_list = []
    if isinstance(text_df, list): 
        for text in tqdm(text_df):
            stemmed_list.append(stem_ind(text, stemmer))
        return stemmed_list
    else:
        return stem_ind(text_df, stemmer)
        
def stem_ind(text_df, stemmer):
    # Stem each word
    stemmed =""
    #import pdb; pdb.set_trace()
    worded = tokenizer.tokenize(text_df)
    for word in worded:
        stemmed = stemmed + stemmer.stem(word) + ' '
    return stemmed

In [14]:
train["stemmed"] = stem(train["processed_text"].tolist(), stemmer)
test["stemmed"] = stem(test["processed_text"].tolist(), stemmer)

  0%|          | 0/1094507 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["stemmed"] = stem(train["processed_text"].tolist(), stemmer)


  0%|          | 0/275204 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["stemmed"] = stem(test["processed_text"].tolist(), stemmer)


In [18]:
def concat_tweets(df, column):
    """
    concatenates tweets for each user
    returns dataframe including usernames and corresponding concatenated tweets
    """
    map_user = list()
    map_tweets = list()
    
    for user in tqdm(df.username.unique()):
        map_user.append(user)
        tweets_concat = " ".join(df[df.username == user][column].values)
        map_tweets.append(tweets_concat)
        
    return pd.DataFrame({"username":map_user, f"tw_concat_{column}":map_tweets}, columns=["username", f"tw_concat_{column}"])

In [19]:
map_user_tweet_tr_stem = concat_tweets(train, 'stemmed')
map_user_tweet_ts_stem = concat_tweets(test, 'stemmed')

map_user_tweet_tr_process = concat_tweets(train, 'processed_text')
map_user_tweet_ts_process = concat_tweets(test, 'processed_text')

  0%|          | 0/580 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/580 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

In [21]:
map_user_tweet_tr = map_user_tweet_tr_stem.merge(map_user_tweet_tr_process, on="username")
map_user_tweet_ts = map_user_tweet_ts_stem.merge(map_user_tweet_ts_process, on="username")

In [24]:
# # This cell just to be run for specifying years
train['year'] = train.created_at.apply(lambda x: x[:4])
test['year'] = test.created_at.apply(lambda x: x[:4])

merged_tr = map_user_tweet_tr.merge(train[['username',
                   'realname','meslek', 'age_group',
                   'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

merged_ts = map_user_tweet_ts.merge(test[['username',
                   'realname','meslek', 'age_group',
                   'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

In [27]:
merged_tr.to_csv("user_tweets_train.csv", index=False)
merged_ts.to_csv("user_tweets_test.csv", index=False)

In [None]:


# train_2021 = train[train.year == "2021"]
# test_2021 = test[test.year == "2021"]

# map_user_tweet_tr = concat_tweets(train_2021, 'stemmed')
# map_user_tweet_ts = concat_tweets(test_2021, 'stemmed')

# merged_tr = map_user_tweet_tr.merge(train_2021[['username',
#                    'realname','meslek', 'age_group',
#                    'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

# merged_ts = map_user_tweet_ts.merge(test_2021[['username',
#                    'realname','meslek', 'age_group',
#                    'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

### Prepare Data for BERTTurk

In [None]:
# map_user_tweet_tr = concat_tweets(train, 'processed_text')
# map_user_tweet_ts = concat_tweets(test, 'processed_text')

# merged_tr = map_user_tweet_tr.merge(train[['username',
#                    'realname','meslek', 'age_group',
#                    'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

# merged_ts = map_user_tweet_ts.merge(test[['username',
#                    'realname','meslek', 'age_group',
#                    'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

# def process_data(df, label):
#     column_titles = ['sentence1', 'label']
#     df = df.rename(columns={'tw_concat':'sentence1', label:'label'})
#     df = df[column_titles]
#     df = df.reindex(columns=column_titles)
#     return df

# process_data(merged_tr, 'gender_enc').to_csv('train_gender.csv',index=False)
# process_data(merged_ts, 'gender_enc').to_csv('test_gender.csv',index=False)

# process_data(merged_tr, 'age_enc').to_csv('train_age.csv',index=False)
# process_data(merged_ts, 'age_enc').to_csv('test_age.csv',index=False)