# Importing Libraries

In [1]:
import time
code_start = time.time()
import json
import numpy as np
import pandas as pd
import re

In [2]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
import torch
from torch.optim import Adam
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup

In [4]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [6]:
torch.cuda.empty_cache()

In [7]:
import warnings
warnings.simplefilter(action='ignore')

# Orginal Dataset [Reading, Combining]

In [8]:
with open("./labeled_users_1145/tweets.json", encoding="utf8") as file:
    text = file.read()
    tweets_1145 = json.loads(text)

In [9]:
len(tweets_1145)

2678

In [10]:
tweets_1145['strwbrrymlkt'][0:50]

['@_spacejamtwo THIRTY WHAT',
 'I love my best friend https://t.co/f1fhHWmEK4',
 'THANK YOU https://t.co/iGF1lyEWI0',
 '@_spacejamtwo I genuinely can not gage the size of you in this photo and it bothers me',
 '@hanpanmangaki IM SCREAMING THIS IS SO AMAZING!!!!!!!',
 'OH MY GOD ITS ME!!!!!! LOOK AT HOW AMAZING THIS IS!!!!!!!! https://t.co/kgQJjgDNni',
 '@joonmoonchild @Scootsies ITS BEAUTIFUL',
 'I am selling kf94 masks!!! Please dm me if you are interested!! Would appreciate rts 💖💖💖 https://t.co/sIpIrjXXlX',
 '@sukixtsuki 🥺💖',
 '@meixins THANKS MEI💖💖💖',
 '@sukixtsuki Islynyc on jnsta!!!!!!',
 'These are real glasses and not a snapchat filter,, OKAY!? https://t.co/Dih01yzeLX',
 '@h0tcomedian Charge them double instead',
 'My chris evans white sweater 🔪 https://t.co/3cr4DDfgiN',
 'Its my best friends birthday!! https://t.co/T5gtX1kzNw',
 'Im only here to upload selfies lmfaooooo https://t.co/T7xp5xctaI',
 '@joonmoonchild LMFMAOOS WHAT?!?!?',
 '@h0tcomedian THANKS GIRLLL',
 'Bitch!!! I c

In [11]:
len(tweets_1145['___Dals'])

100

In [12]:
users_1145 = pd.read_csv("./labeled_users_1145/labeled_users.csv", lineterminator='\n')

In [13]:
len(users_1145)

1145

In [14]:
users_1145

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,screen_name,user_id,lang,name,location,description,protected,followers_count,...,profile_background_url,profile_image_url,user.name,num.tweets.used.Lexicon.prediction,Lexicon.age.prediction,Lexicon.gender.prediction..index.,lexicon.gender.prediction,human.labeled.gender,human.labeled.age,age
0,0,1,_____zac_____,4.614412e+08,en,zac ¢,"Maryland, USA",_____Û___È_Ü´Ù,False,208,...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1226134911...,@_____zac_____,100.0,27.652434,-1.457167,M,,23.0,1.0
1,1,2,___aleia,7.650000e+17,en,_æ___ dad ___æ_,"Ohio, USA",BLACK. LIVES. MATTER.,False,466,...,,http://pbs.twimg.com/profile_images/1271280679...,@___aleia,100.0,24.111464,0.985713,F,,19.0,0.0
2,3,4,___schaeffer___,1.257110e+09,en,Brenden Schaeffer,The Lou,Culver-Stockton College '20 ¢ Ô_Ô_Ô KM 1548...,False,811,...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1268044218...,@___schaeffer___,59.0,35.518352,-3.591586,M,,22.0,0.0
3,8,9,__drewc,1.050000e+18,en,drew,"New York, USA",_öÂ_öé _öÂ_ö_ _ç´Ù È \r\r26 #NewYork,False,27,...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1090809548...,@__drewc,134.0,24.910635,1.969121,F,,26.0,1.0
4,9,10,__EmilyRice__,3.797155e+09,en,em,"Marble Falls, TX",#TXST22,False,158,...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/8415201103...,@__EmilyRice__,100.0,25.191925,2.382856,F,,20.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140,3268,3269,zmeadows_18,7.050000e+17,und,Z Meadows,"Circleville, OH",|OUCÈ23__|,False,556,...,,http://pbs.twimg.com/profile_images/1252321514...,@zmeadows_18,100.0,32.385038,-1.729790,M,,19.0,0.0
1141,3271,3272,ZoeBerrier,9.020000e+17,en,Zoâ _êâ,"Millersville, PA",MU 2021 (she/her)\r\rQueen of putting lipstick...,False,94,...,,http://pbs.twimg.com/profile_images/1241199033...,@ZoeBerrier,100.0,22.585143,1.243141,F,F,20.0,0.0
1142,3272,3273,ZoeCalamaco,3.214954e+09,no,Zoe _,San Angelo tx/ aspermont tx,Angelo state,False,475,...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1277399838...,@ZoeCalamaco,100.0,21.348766,1.939069,F,F,21.0,0.0
1143,3274,3275,ZoPeachy,9.890000e+17,en,Zobella Thee Alpha __ê_____´Ù__ ...,"New England/Boston, MA",Harlot for hire. FinDom. 27. Nonbinary. they/t...,False,396,...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1153648270...,@ZoPeachy,100.0,34.894953,2.058720,F,,27.0,1.0


In [15]:
users_1145["Under_21"] = (users_1145["human.labeled.age"] < 21 ).astype(int)

In [16]:
users_1145["Under_21"].value_counts()

0    718
1    427
Name: Under_21, dtype: int64

In [17]:
list_of_screen_names = users_1145['screen_name'].unique()
print(len(list_of_screen_names))

1145


In [18]:
ageDict = users_1145.set_index('screen_name').to_dict()['Under_21']
imgPathDict = users_1145.set_index('screen_name').to_dict()['Unnamed: 0']

tweetList, screenNameList, imgPathList, ageList = [], [], [], []

for screen_name in tweets_1145:
    
    if((screen_name in list_of_screen_names)==False):
        continue
    for tweet in tweets_1145[screen_name]:
        
        tweetList.append(tweet)
        
        screenNameList.append(screen_name)
        
        imgPathList.append(imgPathDict[screen_name])
        
        ageList.append(ageDict[screen_name])
        
df_1 = pd.DataFrame({'Tweet': tweetList, 'Screen Name': screenNameList, 'img_path': imgPathList, 'Under 21': ageList})
df_1.to_csv('labeled_tweet_table_Age.csv', index=False)
df_1

Unnamed: 0,Tweet,Screen Name,img_path,Under 21
0,@AdvoBarryRoux @GetVidBot,_____zac_____,0,0
1,"The owner of drip doesn't even have 100 mill, ...",_____zac_____,0,0
2,even Lekau the owner of Drip was saying that i...,_____zac_____,0,0
3,"@casspernyovest is cappin that ""R100m"" figure...",_____zac_____,0,0
4,I want a recipe from @JBscotchSA for #JBLemona...,_____zac_____,0,0
...,...,...,...,...
106309,uci fucking evil for making me go to school to...,zzzakari4,3279,0
106310,"incredibly hot take, but i just think its funn...",zzzakari4,3279,0
106311,@valedesmadre u knw how i am 😫 its too much pr...,zzzakari4,3279,0
106312,will be needing a couple more business days to...,zzzakari4,3279,0


# Processing Dataset

In [19]:
df_initial=pd.read_csv('labeled_tweet_table_Age.csv', encoding='utf8')

In [20]:
regexMap={r"<[\w'/'\s]*>": "",r"[\'\"\-]+": "",r"@[\w]+":"",r"#[\w]+":"",\
          r"https?:\/\/[\w+@:%._\+~#=]{1,256}\.[\w+()]{1,6}\b([\w+()@:%_\+.~#?&\/\/=]*)":"",\
          r"https?:\/\/[\w+@:%._\+~#=]{1,256}\.[\w+()]{1,6}\b([\w+()@:%_\+.~#?&\/\/=]*)\b(\;\w+\=\w+)":"",\
         r"[\w+@:%._\+~#=]{1,256}\.[\w+()]{1,6}\b([\w+()@:%_\+.~#?&\/\/=]*)":""}
def preprocess(datainput):
    t=datainput
    for regx in regexMap.keys():
        t = re.sub(regx, regexMap[regx], t)
    return t

In [21]:
df_initial["Tweet"]=df_initial["Tweet"].apply(preprocess)

In [22]:
df_2 = df_initial[["Tweet","Screen Name","Under 21"]]

In [23]:
df_2.to_csv('Age_Tweets_Processed.csv', index=False)
df_2

Unnamed: 0,Tweet,Screen Name,Under 21
0,,_____zac_____,0
1,"The owner of drip doesnt even have 100 mill, d...",_____zac_____,0
2,even Lekau the owner of Drip was saying that i...,_____zac_____,0
3,"is cappin that R100m figure is so inflated, ...",_____zac_____,0
4,I want a recipe from for ! If youre looking f...,_____zac_____,0
...,...,...,...
106309,uci fucking evil for making me go to school to...,zzzakari4,0
106310,"incredibly hot take, but i just think its funn...",zzzakari4,0
106311,u knw how i am 😫 its too much pressure to jus...,zzzakari4,0
106312,will be needing a couple more business days to...,zzzakari4,0


In [24]:
df_2['Under 21'].value_counts()

0    67044
1    39270
Name: Under 21, dtype: int64

In [25]:
screen_names_list = df_2['Screen Name'].unique()

print(screen_names_list[0:5])
print(len(screen_names_list))

['_____zac_____' '___aleia' '___schaeffer___' '__drewc' '__EmilyRice__']
1145


In [26]:
def final_classification_report(cr_list, cm_list):
    n = len(cr_list)
    acc, prec_0, rec_0, f1_0, prec_1, rec_1, f1_1 = 0, 0, 0, 0, 0, 0, 0
    cm = np.zeros((2,2))
    
    for i,cr in enumerate(cr_list):
        acc += cr['accuracy']
        prec_0 += cr['0']['precision']
        rec_0 += cr['0']['recall']
        f1_0 += cr['0']['f1-score']
        prec_1 += cr['1']['precision']
        rec_1 += cr['1']['recall']
        f1_1 += cr['1']['f1-score']
        
        cm += cm_list[i]
    
    print("Overall Accuracy-",round(acc/n,3),"\n")
    print("------(Age >= 21)------\n")
    print("Precision-",round(prec_0/n,3))
    print("Recall-",round(rec_0/n,3))
    print("F1-",round(f1_0/n,3))
    print("\n------(Age < 21)------\n")
    print("Precision-",round(prec_1/n,3))
    print("Recall-",round(rec_1/n,3))
    print("F1-",round(f1_1/n,3))
    print("\nConfusion Matrix-\n",cm)
    

## Dataset [Concatenate strings for all users]

In [27]:
def df_concat(screen_names,df):
    tweets_dict = dict()

    for i,screen_name in enumerate(screen_names):
    
        tweets_list = df[df['Screen Name']==screen_name]["Tweet"].tolist()
        tweets_dict[i] = [screen_name,' '.join(tweets_list),df[df['Screen Name']==screen_name]["Under 21"].unique()[0]]
    tweets_NB = pd.DataFrame.from_dict(tweets_dict , orient='index')
    tweets_NB = tweets_NB.rename(columns={0: 'Screen Name', 1: 'Tweets', 2: 'Under 21'})
    return tweets_NB

In [28]:
df_concat(screen_names_list,df_2)

Unnamed: 0,Screen Name,Tweets,Under 21
0,_____zac_____,"The owner of drip doesnt even have 100 mill,...",0
1,___aleia,I haven’t talked to this girl since my sophomo...,1
2,___schaeffer___,☝🏼👋🏼 37149 congrats sis keep workin!! ...,0
3,__drewc,Yo rly Try Cash App using my code and we’ll ea...,0
4,__EmilyRice__,yes but come to san marcos and live with me 🥰...,1
...,...,...,...
1140,zmeadows_18,ROLL BOBBIES ROLL💚🖤💚🖤💚 We Are Texans! Im takin...,1
1141,ZoeBerrier,Weve evolved past the need for those silly lit...,1
1142,ZoeCalamaco,one person followed me // automatically checke...,0
1143,ZoPeachy,Good morning! Say it back ♡ Happy Friday! Sen...,0


# Naive Bayes Classifier

In [29]:
NB_start = time.time()

## Vectorizing Words

In [30]:
def Vectorizing(train_tweets_NB, test_tweets_NB,stop_words_vectorizer):
    stop_words_vectorizer.fit(train_tweets_NB["Tweets"].values)
    
    x_input=stop_words_vectorizer.transform(train_tweets_NB["Tweets"].values)
    x_test_input=stop_words_vectorizer.transform(test_tweets_NB["Tweets"].values)
    
    return x_input, x_test_input

## Model and training

In [31]:
def model_NB(train_tweets_NB, test_tweets_NB,nb,stop_words_vectorizer):
    
    x_input, x_test_input = Vectorizing(train_tweets_NB, test_tweets_NB,stop_words_vectorizer)
    
    nb.fit(x_input,train_tweets_NB["Under 21"])
    
    y_pred_train = nb.predict(x_input)
    print("Train accuracy-",round(metrics.accuracy_score(train_tweets_NB["Under 21"].values, y_pred_train),3))
    
    y_pred_test = nb.predict(x_test_input)
    print("Test accuracy-",round(metrics.accuracy_score(test_tweets_NB["Under 21"].values, y_pred_test),3))
    
    c_report = classification_report(y_true=test_tweets_NB["Under 21"].values,y_pred=y_pred_test,output_dict=True)
    cm = confusion_matrix(test_tweets_NB["Under 21"].values,y_pred_test)
    
    return c_report, cm

In [32]:
df_nb = df_2
screen_names_list = df_nb['Screen Name'].unique()

kf_NB = KFold(n_splits=5, shuffle = True, random_state=24)
c_report_list = []
cm_list = []
k=1

stop_words_vectorizer=CountVectorizer(stop_words='english')

for train_tweets_sn, test_tweets_sn in kf_NB.split(screen_names_list):
    
    print(f'K-fold {k}/{5}')
    print('-' * 20)
    
    train_tweets_sn = screen_names_list[train_tweets_sn]
    test_tweets_sn = screen_names_list[test_tweets_sn]
    
    train_tweets_NB = df_concat(train_tweets_sn,df_nb)
    test_tweets_NB = df_concat(test_tweets_sn,df_nb)
    
    nb = MultinomialNB()
    
    a,b = model_NB(train_tweets_NB, test_tweets_NB,nb,stop_words_vectorizer)
    
    c_report_list.append(a)
    cm_list.append(b)
    
    k+=1

K-fold 1/5
--------------------
Train accuracy- 0.987
Test accuracy- 0.646
K-fold 2/5
--------------------
Train accuracy- 0.992
Test accuracy- 0.716
K-fold 3/5
--------------------
Train accuracy- 0.988
Test accuracy- 0.703
K-fold 4/5
--------------------
Train accuracy- 0.991
Test accuracy- 0.62
K-fold 5/5
--------------------
Train accuracy- 0.989
Test accuracy- 0.655


In [33]:
print("time taken for NB Model-",round((time.time()-NB_start)/60.0,2),"minutes")

time taken for NB Model- 1.1 minutes


## Result

In [34]:
final_classification_report(c_report_list,cm_list)

Overall Accuracy- 0.668 

------(Age >= 21)------

Precision- 0.67
Recall- 0.932
F1- 0.779

------(Age < 21)------

Precision- 0.659
Recall- 0.228
F1- 0.335

Confusion Matrix-
 [[669.  49.]
 [331.  96.]]


# Logistic Regression

In [35]:
LR_start = time.time()

## TFI DF Feature Extraction

In [36]:
def Tfi_Df(train_tweets_NB, test_tweets_NB,tfidf):
    
    tfidf.fit(train_tweets_NB["Tweets"])
    
    x_input=tfidf.transform(train_tweets_NB["Tweets"])
    x_test_input=tfidf.transform(test_tweets_NB["Tweets"])
    
    return x_input, x_test_input

## Model and training

In [37]:
def model_LR(train_tweets_NB, test_tweets_NB,log,tfidf):
    
    x_input, x_test_input = Tfi_Df(train_tweets_NB, test_tweets_NB,tfidf)
    
    log.fit(x_input,train_tweets_NB["Under 21"])
    
    y_pred_train = log.predict(x_input)
    print("Train accuracy-",round(metrics.accuracy_score(train_tweets_NB["Under 21"].values, y_pred_train),3))
    
    y_pred_test = log.predict(x_test_input)
    print("Test accuracy-",round(metrics.accuracy_score(test_tweets_NB["Under 21"].values, y_pred_test),3))
    
    c_report = classification_report(y_true=test_tweets_NB["Under 21"].values,y_pred=y_pred_test,output_dict=True)
    cm = confusion_matrix(test_tweets_NB["Under 21"].values,y_pred_test)
    
    return c_report, cm

In [38]:
df_lr = df_2
screen_names_list = df_lr['Screen Name'].unique()

kf_LR = KFold(n_splits=5, shuffle = True, random_state=24)
c_report_list = []
cm_list = []
k=1

tfidf = TfidfVectorizer(stop_words='english')

for train_tweets_sn, test_tweets_sn in kf_LR.split(screen_names_list):
    
    print(f'K-fold {k}/{5}')
    print('-' * 20)
    
    train_tweets_sn = screen_names_list[train_tweets_sn]
    test_tweets_sn = screen_names_list[test_tweets_sn]
    
    train_tweets_NB = df_concat(train_tweets_sn,df_lr)
    test_tweets_NB = df_concat(test_tweets_sn,df_lr)
    
    log = LogisticRegression(class_weight = 'balanced')
    
    a,b = model_LR(train_tweets_NB, test_tweets_NB,log,tfidf)
    
    c_report_list.append(a)
    cm_list.append(b)
    
    k+=1

K-fold 1/5
--------------------
Train accuracy- 0.98
Test accuracy- 0.716
K-fold 2/5
--------------------
Train accuracy- 0.976
Test accuracy- 0.707
K-fold 3/5
--------------------
Train accuracy- 0.975
Test accuracy- 0.686
K-fold 4/5
--------------------
Train accuracy- 0.977
Test accuracy- 0.677
K-fold 5/5
--------------------
Train accuracy- 0.98
Test accuracy- 0.686


In [39]:
print("time taken for LR Model-",round((time.time()-LR_start)/60.0,2),"minutes")

time taken for LR Model- 1.11 minutes


## Result

In [40]:
final_classification_report(c_report_list,cm_list)

Overall Accuracy- 0.694 

------(Age >= 21)------

Precision- 0.742
Recall- 0.786
F1- 0.763

------(Age < 21)------

Precision- 0.601
Recall- 0.542
F1- 0.569

Confusion Matrix-
 [[564. 154.]
 [196. 231.]]


# BERT Model

## Dataset [Split dataset by users]

In [41]:
def df_BERT(tweets_sn, df):
    tweets_df = df[df["Screen Name"]==tweets_sn[0]]
    for x in tweets_sn[1:]:
        tweets_df = tweets_df.append(df[df["Screen Name"]==x])
    return tweets_df

## Oversampling

In [42]:
y = df_2['Under 21']

ros = RandomOverSampler(random_state=24)
df_resampled, y_ros = ros.fit_resample(df_2, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_ros))

df_resampled = pd.DataFrame(df_resampled, columns=df_2.columns)

Original dataset shape Counter({0: 67044, 1: 39270})
Resample dataset shape Counter({0: 67044, 1: 67044})


In [43]:
df_concat(screen_names_list,df_resampled)["Under 21"].value_counts()

0    718
1    427
Name: Under 21, dtype: int64

## Dataloader

In [44]:
class Tweet_Dataset(Dataset):
    def __init__(self,dataset,tokenizer,max_len):
        
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len    
        
    def __len__(self):
        return len(self.dataset["Tweet"])
    
    def __getitem__(self, index):
        
        tweet = str(self.dataset.iloc[index,0])
        label = self.dataset.iloc[index,2]
        
        encoding_input = self.tokenizer.encode_plus(tweet,max_length=self.max_len, add_special_tokens=True,\
                                               return_token_type_ids=False,pad_to_max_length=True, return_attention_mask=True,\
                                               return_tensors='pt',truncation=True)
        
        
        return {'tweet':tweet,'label':label,'input_ids':encoding_input['input_ids'].flatten(),\
                'attention_mask':encoding_input['attention_mask'].flatten()} 

In [45]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
token_lens = []
for txt in df_2["Tweet"]:
    tokens = tokenizer.encode(txt)
    token_lens.append(len(tokens))
print(max(token_lens))

150


## Creating a model

In [46]:
class Classifier(torch.nn.Module):
    def __init__(self):
        
        super(Classifier, self).__init__()
        
        self.bert_model=BertModel.from_pretrained("bert-base-cased")
        
        self.dropout = nn.Dropout(p=0.3)
        
        self.linear = nn.Linear(self.bert_model.config.hidden_size,2) 
        
    def forward(self,input_ids, attention_mask):
        
        last_hidden_layer,pooled_output = self.bert_model(input_ids=input_ids,attention_mask=attention_mask, return_dict=False)
        
        dropout_output = self.dropout(pooled_output)
        
        linear_output = self.linear(dropout_output)
        
        return linear_output

## Training and Testing

In [47]:
def train_loop(dataloader, model, loss_fn, optimizer,device, scheduler):
    
    size = len(dataloader.dataset)
    model=model.train()
    losses=0 
    accuracy=0 
    
    for d in dataloader:
        
        
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['label'].to(device) 
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids,attention_mask=attention_mask)

        _, preds = torch.max(outputs, dim=1)
        
        loss = loss_fn(outputs, targets)

        # Backpropagation
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        
        correct = (preds == targets).float()
        acc=torch.sum(correct)
        accuracy+=acc.item()  
        
        scheduler.step()
        
        losses+=loss.item()   
        
    return accuracy/size, losses/size

In [48]:
def test_loop(dataloader, model, device):
    
    model=model.eval()
    
    predictions = []
    
    with torch.no_grad():
        for d in dataloader:
            
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['label'].to(device)
        
            outputs = model(input_ids=input_ids,attention_mask=attention_mask)
            
            _, preds = torch.max(outputs, dim=1)
            
            predictions = predictions + preds.tolist()
    
    values, counts = np.unique(predictions, return_counts=True)
    ind = np.argmax(counts)
    final_pred = values[ind]
 
    return final_pred

In [49]:
def age_prediction(train_twitter_loader, test_tweets_df, test_tweets_sn, model, loss, optimizer, device, scheduler, \
                   epochs, tokenizer, max_len, batch_size):
    
    best_test_acc = 0
    c_report_best = None
    c_matrix_best = None
    
    for t in range(epochs):
        print(f'Epoch {t + 1}/{epochs}')
        print('-' * 10)

        start=time.time()
                    
        train_acc, train_loss = train_loop(train_twitter_loader, model, loss, optimizer, device, scheduler)

        correct_pred = 0
        
        predictions=[]
        target_values=[]
        
        for y in test_tweets_sn:
            
            test_dataset = Tweet_Dataset(test_tweets_df[test_tweets_df["Screen Name"]==y],tokenizer,max_len)
            test_twitter_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
            
            test_pred = test_loop(test_twitter_loader, model, device)
            
            test_label = test_tweets_df[test_tweets_df["Screen Name"]==y]["Under 21"].unique()
            
            if(test_pred==test_label[0]):
                    correct_pred+=1
            
            predictions.append(test_pred)
            target_values.append(test_label[0])
        
        end=time.time()
        print("time taken-",round((end-start)/60.0,2),"minutes")

        print("Train Loss {} | Train Accuracy: {}%".format(round(train_loss, 3), round(train_acc*100, 3)))
        
        test_acc = correct_pred/len(test_tweets_sn)
        print("Test Accuracy: {}%".format(round(test_acc*100, 3)))
        
        c_report = classification_report(y_true=target_values,y_pred=predictions,output_dict=True)
        c_matrix = confusion_matrix(target_values,predictions)
        
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            # Save the parameters of the model
            c_report_best = c_report
            c_matrix_best = c_matrix
            
    return c_report_best, c_matrix_best

In [50]:
def main(train_tweets_df, test_tweets_df, test_tweets_sn):
    
    learning_rate = 3.1e-5  
    epochs = 4
    
    MAX_LEN = 160  
    BATCH_SIZE = 64 

    model = Classifier()
    model = model.to(device)
    
    loss=nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=3.1e-6)
    
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    
    train_dataset = Tweet_Dataset(train_tweets_df,tokenizer,MAX_LEN)
    train_twitter_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    total_steps = len(train_twitter_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)
    
    return age_prediction(train_twitter_loader, test_tweets_df, test_tweets_sn, model, loss, optimizer, device, scheduler,\
                          epochs, tokenizer, MAX_LEN, BATCH_SIZE)

In [51]:
BERT_start = time.time()

In [52]:
df = df_resampled
screen_names_list = df['Screen Name'].unique()

kf = KFold(n_splits=5, shuffle = True, random_state=24)
c_report_list = []
cm_list = []
k=1

for train_tweets_sn, test_tweets_sn in kf.split(screen_names_list):
    print(f'K-fold {k}/{5}')
    print('-' * 20)
    
    train_tweets_sn = screen_names_list[train_tweets_sn]
    test_tweets_sn = screen_names_list[test_tweets_sn]

    train_tweets_df = df_BERT(train_tweets_sn, df)
    test_tweets_df = df_BERT(test_tweets_sn, df)
    c_report, c_matrix = main(train_tweets_df, test_tweets_df, test_tweets_sn)        
    
    c_report_list.append(c_report)
    cm_list.append(c_matrix)
    
    k=k+1

K-fold 1/5
--------------------
Epoch 1/4
----------
time taken- 16.37 minutes
Train Loss 0.01 | Train Accuracy: 59.652%
Test Accuracy: 72.052%
Epoch 2/4
----------
time taken- 16.38 minutes
Train Loss 0.008 | Train Accuracy: 72.094%
Test Accuracy: 71.616%
Epoch 3/4
----------
time taken- 16.38 minutes
Train Loss 0.006 | Train Accuracy: 82.275%
Test Accuracy: 67.686%
Epoch 4/4
----------
time taken- 16.38 minutes
Train Loss 0.004 | Train Accuracy: 87.201%
Test Accuracy: 69.869%
K-fold 2/5
--------------------
Epoch 1/4
----------
time taken- 16.39 minutes
Train Loss 0.01 | Train Accuracy: 60.133%
Test Accuracy: 69.432%
Epoch 2/4
----------
time taken- 16.41 minutes
Train Loss 0.009 | Train Accuracy: 71.601%
Test Accuracy: 70.306%
Epoch 3/4
----------
time taken- 16.4 minutes
Train Loss 0.006 | Train Accuracy: 81.778%
Test Accuracy: 71.616%
Epoch 4/4
----------
time taken- 16.4 minutes
Train Loss 0.004 | Train Accuracy: 87.05%
Test Accuracy: 72.926%
K-fold 3/5
--------------------
Epoch

In [53]:
print("time taken for BERT Model-",round((time.time()-BERT_start)/3600.0,2),"hours")

time taken for BERT Model- 5.49 hours


## Result

In [54]:
final_classification_report(c_report_list, cm_list)

Overall Accuracy- 0.696 

------(Age >= 21)------

Precision- 0.724
Recall- 0.835
F1- 0.773

------(Age < 21)------

Precision- 0.636
Recall- 0.46
F1- 0.526

Confusion Matrix-
 [[600. 118.]
 [230. 197.]]


## END

In [55]:
print("time taken for notebook-",round((time.time()-code_start)/3600.0,2),"hours")

time taken for notebook- 5.54 hours
