In [6]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm

In [3]:
df = pd.read_csv('tweet_data.csv')
print("number of users: ", df.username.unique().shape[0])
print("columns", df.columns)

userlist_shuffled = pd.Series(df.username.unique()).sample(frac=1, random_state=26).reset_index(drop=True).tolist()

mid = 4 * (len(userlist_shuffled) // 5)
train = df[df.username.isin(userlist_shuffled[:mid])]
test = df[df.username.isin(userlist_shuffled[mid:])]

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print("train shape: ", train.shape, "test shape", test.shape)

number of users:  1050
columns Index(['id_str', 'created_at', 'text', 'entities', 'retweeted', 'username',
       'realname', 'gender', 'age', 'meslek', 'year', 'year_relative',
       'age_normalized', 'gender_enc', 'age_group', 'age_group_norm',
       'age_enc', 'age_enc_norm'],
      dtype='object')
train shape:  (1641471, 18) test shape (391212, 18)


In [3]:
# This cell just to be run for specifying years
# train['year'] = train.created_at.apply(lambda x: x[:4])
# test['year'] = test.created_at.apply(lambda x: x[:4])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['year'] = train.created_at.apply(lambda x: x[:4])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['year'] = test.created_at.apply(lambda x: x[:4])


### Preprocessing

In [4]:
import re

def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

text = df.text[0]
print(text) # with emoji

print(remove_emoji(text))

RT @Bozoklularr: Herkesin üniversitesi açılmış, bir kafelerde,sokaklarda story atıyorlar.
Allah'ım nasip et.🤲
RT @Bozoklularr: Herkesin üniversitesi açılmış, bir kafelerde,sokaklarda story atıyorlar.
Allah'ım nasip et.


In [5]:
def preprocess(entities_df, text_df):
    if isinstance(text_df, list): 
        prep_list = list()
        for ent_df, txt_df in tqdm(zip(entities_df, text_df)):
            prep_text=preprocess_ind(ent_df, txt_df)
            prep_list.append(prep_text)
        return prep_list
    else:
        return preprocess_ind(entities_df, text_df)
    
def preprocess_ind(entities_df, text_df):
    ent_dict = eval(entities_df)
    text = text_df
    
    # Extract entities directly from tweet entities
    texts_to_extract = []
    for key in ent_dict.keys():
        if ent_dict[key]:
            for ent_dict_2 in ent_dict[key]:
                inds = ent_dict_2['indices']
                texts_to_extract.append(text[inds[0]:inds[1]])
    for text_to_extract in texts_to_extract:
        text = text.replace(text_to_extract, '')
        
    # extract any words starting with # or @ or links starting with https
    for tag in text.split(): 
        if tag.startswith("#") or tag.startswith("@") or tag.startswith("https://"):
            text=text.replace(tag, '')
    
    # extract newline and RT
    text = text.replace('\n', ' ')
    text = text.replace('RT : ', '')
    
    #demojidfy
    text = remove_emoji(text)
    
    return text

In [6]:
preprocess(df.entities[0], df.text[0])

"Herkesin üniversitesi açılmış, bir kafelerde,sokaklarda story atıyorlar. Allah'ım nasip et."

In [7]:
train["processed_text"] = preprocess(train.entities.tolist(), train.text.tolist())
test["processed_text"] = preprocess(test.entities.tolist(), test.text.tolist())

1641471it [01:52, 14581.22it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["processed_text"] = preprocess(train.entities.tolist(), train.text.tolist())
391212it [00:27, 14036.97it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["processed_text"] = preprocess(test.entities.tolist(), test.text.tolist())


#### stemming

In [8]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from TurkishStemmer import TurkishStemmer

stemmer = TurkishStemmer()

# stemmer=SnowballStemmer('porter')
tokenizer = RegexpTokenizer(r'\w+')

def stem(text_df, stemmer):
    stemmed_list = []
    if isinstance(text_df, list): 
        for text in tqdm(text_df):
            stemmed_list.append(stem_ind(text, stemmer))
        return stemmed_list
    else:
        return stem_ind(text_df, stemmer)
        
def stem_ind(text_df, stemmer):
    # Stem each word
    stemmed =""
    #import pdb; pdb.set_trace()
    worded = tokenizer.tokenize(text_df)
    for word in worded:
        stemmed = stemmed + stemmer.stem(word) + ' '
    return stemmed

In [9]:
train["stemmed"] = stem(train["processed_text"].tolist(), stemmer)
test["stemmed"] = stem(test["processed_text"].tolist(), stemmer)

100%|██████████| 1641471/1641471 [05:15<00:00, 5206.01it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["stemmed"] = stem(train["processed_text"].tolist(), stemmer)
100%|██████████| 391212/391212 [01:12<00:00, 5373.49it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["stemmed"] = stem(test["processed_text"].tolist(), stemmer)


In [10]:
train.to_csv("tweets_prep_train.csv", index=False)
test.to_csv("tweets_prep_test.csv", index=False)

#### Filter

In [29]:
# temporary will be deleted
import pandas as pd
train = pd.read_csv("data/tweets_prep_train.csv")
test = pd.read_csv("data/tweets_prep_test.csv")

train = train[train.processed_text.notna() & train.stemmed.notna()]
test = test[test.processed_text.notna() & test.stemmed.notna()]

In [30]:
# temporary will be deleted
most_commons = list()
with open(f"data/age_specific_words.txt", encoding="utf-8" ,mode="r") as file:
    for line in file.readlines():
        most_commons.append(line.replace("\n", ""))
        
train_idxs = train.processed_text.apply(lambda x: any([(most_common in x) for most_common in most_commons]))
train = train[train_idxs]

test_idxs = test.processed_text.apply(lambda x: any([(most_common in x) for most_common in most_commons]))
test = test[test_idxs]

In [4]:
def concat_tweets(df, column:str, interval:str = None):
    """
    concatenates tweets for each user
    returns dataframe including usernames and corresponding concatenated tweets
    """
    map_user = list()
    map_tweets = list()
    map_inter = list()
    
    if interval:
        
        unique_username_inter = pd.unique([(username, inter) for username, inter in zip(df["username"], df[interval])])

        for (username, inter) in tqdm(unique_username_inter):
            
            map_user.append(username)
            map_inter.append(inter)

            tweets_concat = " ".join(df[(df.username == username) & (df[interval] == inter)][column].values)
            map_tweets.append(tweets_concat)
            
        return pd.DataFrame.from_dict({"username":map_user, f"tw_concat_{column}":map_tweets, interval:map_inter})

    else:

        for username in tqdm(df["username"].unique()):
            
            map_user.append(username)

            tweets_concat = " ".join(df[df["username"] == username][column].values)
            map_tweets.append(tweets_concat)
            
        return pd.DataFrame.from_dict({"username":map_user, f"tw_concat_{column}":map_tweets})

In [31]:
map_user_tweet_tr_stem = concat_tweets(train, 'stemmed')
map_user_tweet_ts_stem = concat_tweets(test, 'stemmed')

map_user_tweet_tr_process = concat_tweets(train, 'processed_text')
map_user_tweet_ts_process = concat_tweets(test, 'processed_text')

100%|██████████| 839/839 [01:10<00:00, 11.92it/s]
100%|██████████| 210/210 [00:36<00:00,  5.76it/s]
100%|██████████| 839/839 [01:16<00:00, 10.93it/s]
100%|██████████| 210/210 [00:09<00:00, 22.20it/s]


In [32]:
map_user_tweet_tr = map_user_tweet_tr_stem.merge(map_user_tweet_tr_process, on=["username"])
map_user_tweet_ts = map_user_tweet_ts_stem.merge(map_user_tweet_ts_process, on=["username"])

In [33]:
merged_tr = map_user_tweet_tr.merge(train[['username',
                   'realname','meslek', 'age_group',
                   'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on=["username"], right_on=["username"])

merged_ts = map_user_tweet_ts.merge(test[['username',
                   'realname','meslek', 'age_group',
                   'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on=["username"], right_on=["username"])

In [34]:
merged_tr.to_csv("user_tweets_train_filtby_age.csv", index=False)
merged_ts.to_csv("user_tweets_test_filtby_age.csv", index=False)

### Filter only current age group

In [53]:
# Since we just interested in the tweets for the current age group, filter only the current age group tweets
# merged_tr = pd.read_csv("data/user_tweets_train_norm.csv")
# merged_ts = pd.read_csv("data/user_tweets_test_norm.csv")

merged_tr.to_csv("user_tweets_train.csv", index=False)
merged_ts.to_csv("user_tweets_test.csv", index=False)

### Prepare Data for BERTTurk

In [None]:
# map_user_tweet_tr = concat_tweets(train, 'processed_text')
# map_user_tweet_ts = concat_tweets(test, 'processed_text')

# merged_tr = map_user_tweet_tr.merge(train[['username',
#                    'realname','meslek', 'age_group',
#                    'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

# merged_ts = map_user_tweet_ts.merge(test[['username',
#                    'realname','meslek', 'age_group',
#                    'age_enc', 'gender', 'gender_enc']].drop_duplicates(), how="left", left_on="username", right_on="username")

# def process_data(df, label):
#     column_titles = ['sentence1', 'label']
#     df = df.rename(columns={'tw_concat':'sentence1', label:'label'})
#     df = df[column_titles]
#     df = df.reindex(columns=column_titles)
#     return df

# process_data(merged_tr, 'gender_enc').to_csv('train_gender.csv',index=False)
# process_data(merged_ts, 'gender_enc').to_csv('test_gender.csv',index=False)

# process_data(merged_tr, 'age_enc').to_csv('train_age.csv',index=False)
# process_data(merged_ts, 'age_enc').to_csv('test_age.csv',index=False)