In [1]:
import time
code_start = time.time()

# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import re

In [1]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, KFold, cross_val_predict

In [4]:
import torch
from torch.optim import Adam
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [5]:
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [7]:
torch.cuda.empty_cache()

In [8]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Processing Data

In [9]:
df_initial=pd.read_csv('labeled_tweet_table_Age.csv', encoding='utf8')

In [10]:
df_initial.head()

Unnamed: 0,Tweet,Screen Name,img_path,Under 21
0,@AdvoBarryRoux @GetVidBot,_____zac_____,0,1
1,"The owner of drip doesn't even have 100 mill, ...",_____zac_____,0,1
2,even Lekau the owner of Drip was saying that i...,_____zac_____,0,1
3,"@casspernyovest is cappin that ""R100m"" figure...",_____zac_____,0,1
4,I want a recipe from @JBscotchSA for #JBLemona...,_____zac_____,0,1


In [11]:
df_initial.shape

(106314, 4)

In [12]:
regexMap={r"<[\w'/'\s]*>": "",r"[\'\"\-]+": "",r"@[\w]+":"",r"#[\w]+":"",\
          r"https?:\/\/[\w+@:%._\+~#=]{1,256}\.[\w+()]{1,6}\b([\w+()@:%_\+.~#?&\/\/=]*)":"",\
          r"https?:\/\/[\w+@:%._\+~#=]{1,256}\.[\w+()]{1,6}\b([\w+()@:%_\+.~#?&\/\/=]*)\b(\;\w+\=\w+)":"",\
         r"[\w+@:%._\+~#=]{1,256}\.[\w+()]{1,6}\b([\w+()@:%_\+.~#?&\/\/=]*)":""}
def preprocess(datainput):
    t=datainput
    for regx in regexMap.keys():
        t = re.sub(regx, regexMap[regx], t)
    return t

In [13]:
df_initial["Tweet"]=df_initial["Tweet"].apply(preprocess)

In [14]:
df_initial.head()

Unnamed: 0,Tweet,Screen Name,img_path,Under 21
0,,_____zac_____,0,1
1,"The owner of drip doesnt even have 100 mill, d...",_____zac_____,0,1
2,even Lekau the owner of Drip was saying that i...,_____zac_____,0,1
3,"is cappin that R100m figure is so inflated, ...",_____zac_____,0,1
4,I want a recipe from for ! If youre looking f...,_____zac_____,0,1


In [15]:
df_initial.shape

(106314, 4)

In [16]:
df = df_initial[["Tweet","Screen Name","Under 21"]]

In [17]:
df.head()

Unnamed: 0,Tweet,Screen Name,Under 21
0,,_____zac_____,1
1,"The owner of drip doesnt even have 100 mill, d...",_____zac_____,1
2,even Lekau the owner of Drip was saying that i...,_____zac_____,1
3,"is cappin that R100m figure is so inflated, ...",_____zac_____,1
4,I want a recipe from for ! If youre looking f...,_____zac_____,1


In [18]:
df.shape

(106314, 3)

In [19]:
df['Under 21'].value_counts()

0    56739
1    49575
Name: Under 21, dtype: int64

In [20]:
screen_names_list = list(df['Screen Name'].unique())

print(screen_names_list[0:5])
print(len(screen_names_list))

['_____zac_____', '___aleia', '___schaeffer___', '__drewc', '__EmilyRice__']
1145


# Naive Bayes Classifier

## Dataset [Concatenate strings for all users]

In [21]:
tweets_dict = dict()

for i,screen_name in enumerate(screen_names_list):
    
    tweets_list = df[df['Screen Name']==screen_name]["Tweet"].tolist()
    tweets_dict[i] = [screen_name,' '.join(df[df['Screen Name']==screen_name]["Tweet"].tolist()),df[df['Screen Name']==screen_name]["Under 21"].unique()[0]]

In [22]:
tweets_NB = pd.DataFrame.from_dict(tweets_dict , orient='index')
tweets_NB = tweets_NB.rename(columns={0: 'Screen Name', 1: 'Tweets', 2: 'Under 21'})

In [23]:
train_tweets_NB, test_tweets_NB = train_test_split(tweets_NB,train_size=0.8, random_state=24)

## Vectorizing Words

In [24]:
def Vectorizing(train_tweets_NB, test_tweets_NB,stop_words_vectorizer):
    stop_words_vectorizer.fit(train_tweets_NB["Tweets"].values)
    
    print("Number of words in Vocabulary-",len(stop_words_vectorizer.vocabulary_))
    
    x_input=stop_words_vectorizer.transform(train_tweets_NB["Tweets"].values)
    x_test_input=stop_words_vectorizer.transform(test_tweets_NB["Tweets"].values)
    
    return x_input, x_test_input

## Model and training

In [25]:
def model_NB(train_tweets_NB, test_tweets_NB,nb,stop_words_vectorizer):

    x_input, x_test_input = Vectorizing(train_tweets_NB, test_tweets_NB,stop_words_vectorizer)
    nb.fit(x_input,train_tweets_NB["Under 21"])
    
    y_pred_train = nb.predict(x_input)
    print("Train accurary-",metrics.accuracy_score(train_tweets_NB["Under 21"].values, y_pred_train))
    
    y_pred_test = nb.predict(x_test_input)
    print("Test accurary-",metrics.accuracy_score(test_tweets_NB["Under 21"].values, y_pred_test))
    
    print("Classification Report\n",classification_report(y_true=test_tweets_NB["Under 21"].values,y_pred=y_pred_test))

In [26]:
def main_NB(train_tweets_NB, test_tweets_NB, tweets_NB):
    nb = MultinomialNB()
    stop_words_vectorizer=CountVectorizer(stop_words='english')
    # model_NB(train_tweets_NB, test_tweets_NB,nb,stop_words_vectorizer)
    KFold_model(nb, tweets_NB, stop_words_vectorizer)


In [None]:
def KFold_model(nb, tweets_NB, stop_words_vectorizer):
    stop_words_vectorizer.fit(tweets_NB["Tweets"].values)
    print("Number of words in Vocabulary-",len(stop_words_vectorizer.vocabulary_))
    x_input = stop_words_vectorizer.transform(tweets_NB["Tweets"].values)
    y_pred = cross_val_predict(nb, x_input, tweets_NB["Under 21"], cv=5)
    print("Classification Report\n",classification_report(y_true=tweets_NB["Under 21"].values,y_pred=y_pred))


In [27]:
main_NB(train_tweets_NB, test_tweets_NB, tweets_NB)

Number of words in Vocabulary- 38049
Train accurary- 0.9748908296943232
Test accurary- 0.6768558951965066
Classification Report
               precision    recall  f1-score   support

           0       0.65      0.84      0.73       122
           1       0.73      0.50      0.59       107

    accuracy                           0.68       229
   macro avg       0.69      0.67      0.66       229
weighted avg       0.69      0.68      0.67       229



# BERT Model

## Dataset [Split dataset by users]

In [28]:
train_tweets_sn, test_tweets_sn = train_test_split(screen_names_list,train_size=0.8, random_state=24)

In [29]:
print(len(train_tweets_sn),len(test_tweets_sn))

916 229


In [30]:
train_tweets_df = df[df["Screen Name"]==train_tweets_sn[0]]
for x in train_tweets_sn[1:]:
    train_tweets_df = train_tweets_df.append(df[df["Screen Name"]==x])
train_tweets_df

Unnamed: 0,Tweet,Screen Name,Under 21
74852,"Gonna play Minecraft tonight! , cause I cant r...",holyheckitshope,1
74853,One of the runs Im watching on GDQs YouTube fe...,holyheckitshope,1
74854,More of Stardew Valley tonight! ☀🌊 Beach Farm!...,holyheckitshope,1
74855,"Hayley I love yooooouuu ☀🌊 Beach Farm! Fall, ...",holyheckitshope,1
74856,Didnt get a chance to say earlier when I retw...,holyheckitshope,1
...,...,...,...
40620,Going live on Twitch at 10:30pm est,SarcsmNMistakes,1
40621,Watch us Live on Twitch\n\n,SarcsmNMistakes,1
40622,Gonna stream tonight on twitch,SarcsmNMistakes,1
40623,Subscribe to my YouTube channel as well! Thanks!,SarcsmNMistakes,1


In [31]:
test_tweets_df = df[df["Screen Name"]==test_tweets_sn[0]]
for x in test_tweets_sn[1:]:
    test_tweets_df = test_tweets_df.append(df[df["Screen Name"]==x])
test_tweets_df

Unnamed: 0,Tweet,Screen Name,Under 21
88895,I see u 👀,megannlindstrom,1
88896,The reason I have a crippling caffeine addicti...,megannlindstrom,1
88897,“Not gonna lie that tweet was kinda cheug” 😑,megannlindstrom,1
88898,I taught Adam the word “Cheugy” and he won’t s...,megannlindstrom,1
88899,Nothin like steppin out the gym and breathing ...,megannlindstrom,1
...,...,...,...
79543,2 &amp; 3 🤍✨,Damarii_98,0
79544,Amen 🙏🏼🤍✨,Damarii_98,0
79545,both of our placements are in here hehe 🤍✨,Damarii_98,0
79546,Affirmed and claimed this thank you 🙏🏼🤍,Damarii_98,0


In [32]:
print(train_tweets_df.shape)
print(test_tweets_df.shape)

(84797, 3)
(21517, 3)


In [33]:
print(train_tweets_df["Under 21"].value_counts())
print(test_tweets_df["Under 21"].value_counts())

0    45349
1    39448
Name: Under 21, dtype: int64
0    11390
1    10127
Name: Under 21, dtype: int64


## Dataloader

In [34]:
class Tweet_Dataset(Dataset):
    def __init__(self,dataset,tokenizer,max_len):
        
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len    
        
    def __len__(self):
        return len(self.dataset["Tweet"])
    
    def __getitem__(self, index):
        
        tweet = str(self.dataset.iloc[index,0])
        label = self.dataset.iloc[index,2]
        
        encoding_input = self.tokenizer.encode_plus(tweet,max_length=self.max_len, add_special_tokens=True,\
                                               return_token_type_ids=False,pad_to_max_length=True, return_attention_mask=True,\
                                               return_tensors='pt',truncation=True)
        
        
        return {'tweet':tweet,'label':label,'input_ids':encoding_input['input_ids'].flatten(),\
                'attention_mask':encoding_input['attention_mask'].flatten()} 

In [35]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
token_lens = []
for txt in df["Tweet"]:
    tokens = tokenizer.encode(txt)
    token_lens.append(len(tokens))
print(max(token_lens))

150


## Creating a model

In [36]:
class Classifier(torch.nn.Module):
    def __init__(self):
        
        super(Classifier, self).__init__()
        
        self.bert_model=BertModel.from_pretrained("bert-base-cased")
        
        self.dropout = nn.Dropout(p=0.3)
        
        self.linear = nn.Linear(self.bert_model.config.hidden_size,2) 
        
    def forward(self,input_ids, attention_mask):
        
        last_hidden_layer,pooled_output = self.bert_model(input_ids=input_ids,attention_mask=attention_mask, return_dict=False)
        
        dropout_output = self.dropout(pooled_output)
        
        linear_output = self.linear(dropout_output)
        
        return linear_output

## Training and Testing

In [37]:
def train_loop(dataloader, model, loss_fn, optimizer,device, scheduler):
    
    size = len(dataloader.dataset)
    model=model.train()
    losses=0 
    accuracy=0 
    
    for d in dataloader:
        
        
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['label'].to(device) 
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids,attention_mask=attention_mask)

        _, preds = torch.max(outputs, dim=1)
        
        loss = loss_fn(outputs, targets)

        # Backpropagation
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        
        correct = (preds == targets).float()
        acc=torch.sum(correct)
        accuracy+=acc.item()  
        
        scheduler.step()
        
        losses+=loss.item()   
        
    return accuracy/size, losses/size

In [38]:
def test_loop(dataloader, model, device):
    
    model=model.eval()
    
    predictions = []
    
    with torch.no_grad():
        for d in dataloader:
            
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['label'].to(device)
        
            outputs = model(input_ids=input_ids,attention_mask=attention_mask)
            
            _, preds = torch.max(outputs, dim=1)
            
            predictions = predictions + preds.tolist()
    
    values, counts = np.unique(predictions, return_counts=True)
    ind = np.argmax(counts)
    final_pred = values[ind]
 
    return final_pred

In [39]:
def age_prediction(train_twitter_loader, test_tweets_df, test_tweets_sn, model, loss, optimizer, device, scheduler, epochs, tokenizer, max_len, batch_size, test_twitter_loader):
    best_test_acc = 0
    
    for t in range(epochs):
        print(f'Epoch {t + 1}/{epochs}')
        print('-' * 10)

        start=time.time()
                    
        train_acc, train_loss = train_loop(train_twitter_loader, model, loss, optimizer, device, scheduler)

        correct_pred = 0
        
        predictions=[]
        target_values=[]
        
        for y in test_tweets_sn:
            if test_twitter_loader is not None:
                test_dataset = Tweet_Dataset(test_tweets_df[test_tweets_df["Screen Name"]==y],tokenizer,max_len)
                test_twitter_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
            
            test_pred = test_loop(test_twitter_loader, model, device)
            
            test_label = test_tweets_df[test_tweets_df["Screen Name"]==y]["Under 21"].unique()
            
            if(test_pred==test_label[0]):
                    correct_pred+=1
            
            predictions.append(test_pred)
            target_values.append(test_label[0])
        
        end=time.time()
        print("time taken-",round((end-start)/60.0,2),"minutes")

        print("Train Loss {} | Train Accuracy: {}%".format(round(train_loss, 3), round(train_acc*100, 3)))
        
        test_acc = correct_pred/len(test_tweets_sn)
        print("Test Accuracy: {}%".format(round(test_acc*100, 3)))
        
        print("Classification Report\n",classification_report(y_true=target_values,y_pred=predictions))

        if test_acc > best_test_acc:
            best_test_acc = test_acc
            # Save the parameters of the model
            torch.save(model.state_dict(), 'model_param.pt')

In [None]:
def reset_weights(m):
    if isinstance(m, nn.Linear):
        m.reset_parameters()

In [40]:
def main(train_tweets_df, test_tweets_df, test_tweets_sn):
    
    learning_rate=3.1e-5  
    epochs = 5
    
    MAX_LEN = 180  
    BATCH_SIZE = 16

    model = Classifier()
    model = model.to(device)

    kfold=KFold(n_splits=5,shuffle=True)
    
    loss=nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)
    
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    
    train_dataset = Tweet_Dataset(train_tweets_df,tokenizer,MAX_LEN)
    train_twitter_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    dataset = Tweet_Dataset(screen_names_list, tokenizer, MAX_LEN)
    
    total_steps = len(train_twitter_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)

    for fold,(train_idx, test_idx) in enumerate(kfold.split(dataset)):
        print('------------fold no---------{}----------------------'.format(fold))
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)

        trainloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=BATCH_SIZE, sampler=train_subsampler)
        testloader = torch.utils.data.DataLoader(
                      dataset,
                      batch_size=BATCH_SIZE, sampler=test_subsampler)

        model.apply(reset_weights)
        
        age_prediction(trainloader, test_tweets_df, test_tweets_sn, model, loss, optimizer, device, scheduler, epochs, tokenizer, MAX_LEN, BATCH_SIZE, testloader)
    
    # age_prediction(train_twitter_loader, test_tweets_df, test_tweets_sn, model, loss, optimizer, device, scheduler, epochs, tokenizer, MAX_LEN, BATCH_SIZE)

In [41]:
main(train_tweets_df, test_tweets_df, test_tweets_sn)

Epoch 1/5
----------
time taken- 17.62 minutes
Train Loss 0.042 | Train Accuracy: 58.756%
Test Accuracy: 62.009%
Classification Report
               precision    recall  f1-score   support

           0       0.59      0.98      0.73       122
           1       0.88      0.21      0.35       107

    accuracy                           0.62       229
   macro avg       0.74      0.60      0.54       229
weighted avg       0.73      0.62      0.55       229

Epoch 2/5
----------
time taken- 17.49 minutes
Train Loss 0.039 | Train Accuracy: 64.739%
Test Accuracy: 66.376%
Classification Report
               precision    recall  f1-score   support

           0       0.63      0.91      0.74       122
           1       0.79      0.38      0.52       107

    accuracy                           0.66       229
   macro avg       0.71      0.65      0.63       229
weighted avg       0.70      0.66      0.64       229

Epoch 3/5
----------
time taken- 17.49 minutes
Train Loss 0.031 | Train Ac

## Prediction

In [42]:
def prediction_loop(test_tweets_df,model,device,test_tweets_sn, tokenizer, max_len, batch_size):
    
    model=model.eval()
    
    predictions=[]
    target_values=[]
       
    for y in test_tweets_sn:
            
        test_dataset = Tweet_Dataset(test_tweets_df[test_tweets_df["Screen Name"]==y],tokenizer,max_len)
        test_twitter_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

        test_pred = test_loop(test_twitter_loader, model, device)

        test_label = test_tweets_df[test_tweets_df["Screen Name"]==y]["Under 21"].unique()

        predictions.append(test_pred)
        target_values.append(test_label[0])
        
    return predictions, target_values

In [43]:
model_pred = Classifier()
model_pred.load_state_dict(torch.load('model_param.pt'))
model_pred = model_pred.to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

MAX_LEN = 180 
BATCH_SIZE = 16

In [44]:
y_pred,y=prediction_loop(test_tweets_df,model_pred,device, test_tweets_sn, tokenizer, MAX_LEN, BATCH_SIZE )

In [45]:
print(classification_report(y_true=y,y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.76      0.76      0.76       122
           1       0.73      0.73      0.73       107

    accuracy                           0.75       229
   macro avg       0.75      0.75      0.75       229
weighted avg       0.75      0.75      0.75       229



In [None]:
folds = 5
epochs = 5
kfold = KFold(n_splits=5, shuffle=True)



In [46]:
print("time taken for notebook-",round((time.time()-code_start)/60.0,2))

time taken for notebook- 90.11
