In [None]:
import os
import pandas as pd
import numpy as np
from xml.dom import minidom
import xml.etree.ElementTree as ET
from nltk.tokenize import WordPunctTokenizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import gensim.downloader as api
import torch
import torch.nn as nn
import torch.utils.data as Data

In [None]:
#file list
# Collect the per-author files of the English training set.
# `os.listdir` returns entries in arbitrary, platform-dependent order, so the list is
# sorted to make every later per-file loop reproducible across machines and re-runs.
path = './pan22-author-profiling-training-2022-03-29/en/'
f_list = os.listdir(path)
# Every entry except the ground-truth file is treated as an author file.
file_list = sorted(f for f in f_list if f != 'truth.txt')

In [None]:
# extracting labels
# Each non-empty line of truth.txt is split on ':::' into <name> and <label>.
info_file = path+'truth.txt'
rows = []
with open(info_file) as f:
    for line in f:
        # strip() removes the line terminator without assuming one is present; the previous
        # `[:-1]` slice cut the last character of the label when the final line had no newline.
        line = line.strip()
        if not line:
            continue
        l = line.split(':::')
        rows.append([l[0], l[1]])

# One row per labelled author; the row count follows the file instead of a hard-coded 420.
# reshape(-1, 2) keeps the array two-dimensional even when the file is empty.
labels = np.array(rows, dtype=object).reshape(-1, 2)
labels_df = pd.DataFrame(labels, columns = ['name','is'])

In [None]:
# Tokenizer and pretrained 'glove-twitter-100' word vectors (loaded through gensim's
# downloader API); both are used as module-level globals by the feature-extraction code.
tokenizer = WordPunctTokenizer()
model = api.load('glove-twitter-100')

In [None]:
#method 1 - one scalar per tweet: the mean over all embedding components of the tweet's known tokens
# Row layout of `tweets`: column 0 is the binary label ('NI' -> 0, anything else -> 1),
# columns 1..N_TWEETS hold one value per tweet.
N_TWEETS = 200

# Labels are looked up by author name rather than by position: the order of `file_list`
# comes from os.listdir and is not guaranteed to match the line order of truth.txt.
# NOTE(review): assumes each author file is named `<name>.<ext>` with <name> exactly as
# listed in truth.txt - confirm against the dataset. A mismatch raises KeyError.
label_by_name = dict(zip(labels_df['name'], labels_df['is']))

tweets = np.zeros((len(file_list), N_TWEETS + 1), dtype = float)
for f in range(len(file_list)):
    tree = ET.parse(path+file_list[f])
    root = tree.getroot()

    author = os.path.splitext(file_list[f])[0]
    tweets[f,0] = 0 if label_by_name[author] == 'NI' else 1

    # Tweets beyond N_TWEETS are ignored so they cannot index past the end of the row.
    for i in range(min(len(root[0]), N_TWEETS)):
        # An empty XML element has text None; treat it as an empty tweet.
        tweet = (root[0][i].text or '').lower()
        vectors = [model.get_vector(t) for t in tokenizer.tokenize(tweet) if t in model.key_to_index]
        # Average over every known token of the tweet. Previously the cell was overwritten
        # once per token, so only the last known token contributed.
        # Tweets without any known token keep the initial 0.
        if vectors:
            tweets[f,i+1] = np.mean(vectors)

In [None]:
# Wrap the method-1 matrix in a DataFrame with default integer column labels
# (column 0 = label, remaining columns = per-tweet values) and render it.
df = pd.DataFrame(data=tweets)
display(df)

In [None]:
#splitting the dataset into training and test set
# 60 rows are held out for testing. The split is seeded so results are reproducible
# across kernel restarts, and stratified on the label column (0) so both subsets keep
# the same class ratio.
train, test = train_test_split(df, test_size=60, random_state=42, stratify=df[0])

In [None]:
#extracting values and labels
# Column 0 carries the label; every remaining column is a feature.
y_train, y_test = train[0].to_numpy(), test[0].to_numpy()
x_train = train.iloc[:, 1:].to_numpy()
x_test = test.iloc[:, 1:].to_numpy()

In [None]:
# simple classifiers

#accuracy for test set
# The three baselines share one fit/predict/score loop instead of copy-pasted stanzas.
# - SVC: `probability=True` was dropped; only `predict` is called, and the flag just adds
#   an extra internal cross-validated calibration fit.
# - MLPClassifier: weight initialisation is random, so it is seeded for reproducible scores.
classifiers = [
    SVC(kernel='linear'),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    GaussianNB(),
]
for clf in classifiers:
    clf.fit(x_train, y_train)
    pre1 = clf.predict(x_test)
    acc = accuracy_score(y_test, pre1)
    # Print the estimator name next to the score so the three numbers are identifiable.
    print(type(clf).__name__, acc)

In [None]:
#accuracy for training set
# Same three baselines, scored on the data they were fitted on (an upper bound that
# shows how much each model overfits).
# - SVC: `probability=True` was dropped; only `predict` is called, and the flag just adds
#   an extra internal cross-validated calibration fit.
# - MLPClassifier: weight initialisation is random, so it is seeded for reproducible scores.
classifiers = [
    SVC(kernel='linear'),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    GaussianNB(),
]
for clf in classifiers:
    clf.fit(x_train, y_train)
    pre1 = clf.predict(x_train)
    acc = accuracy_score(y_train, pre1)
    # Print the estimator name next to the score so the three numbers are identifiable.
    print(type(clf).__name__, acc)

In [None]:
#LSTM model

# Default number of features per sample (the method-1 matrix has 200 value columns).
input_size = 200

class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear layer and a sigmoid.

    Every input row is treated as an independent sample forming a sequence of length
    one, so the model returns one sigmoid output vector per row.
    """

    def __init__(self, input_size=input_size, hidden_layer_size=100, output_size=1):
        """Build the layers.

        Args:
            input_size: number of features per sample after flattening.
            hidden_layer_size: size of the LSTM hidden state.
            output_size: number of sigmoid outputs per sample.
        """
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.lstm = nn.LSTM(input_size, hidden_layer_size)
        self.linear = nn.Linear(hidden_layer_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_x):
        """Return a tensor of shape (n_samples, output_size) with values in (0, 1).

        Args:
            input_x: tensor whose first dimension indexes samples; all remaining
                dimensions are flattened into the feature axis.
        """
        # nn.LSTM (batch_first=False) expects (seq_len, batch, features). Samples are put
        # on the *batch* axis with seq_len == 1. The previous layout (n_samples, 1, -1)
        # put them on the time axis, so a multi-row input was processed as one sequence
        # and each row's output depended on the rows before it. For a single-row input
        # both layouts are identical.
        n_samples = len(input_x)
        input_x = input_x.reshape(1, n_samples, -1)
        # No initial state is passed: nn.LSTM then starts from zeros for every sample.
        lstm_out, (h_n, h_c) = self.lstm(input_x)
        # lstm_out[-1] is the output at the only time step: (n_samples, hidden_layer_size).
        linear_out = self.linear(lstm_out[-1])
        predictions = self.sigmoid(linear_out)
        return predictions

In [None]:
# Seed torch so weight initialisation and batch shuffling are reproducible across runs.
torch.manual_seed(42)

x = torch.from_numpy(np.asarray(x_train, dtype=np.float32))
y = torch.from_numpy(np.asarray(y_train, dtype=np.float32))

train_loader = Data.DataLoader(
        dataset=Data.TensorDataset(x, y),
        batch_size=1,
        shuffle=True,
        # The data are already in-memory tensors; worker processes would only add
        # start-up cost on every pass over the loader.
        num_workers=0,
    )
lstm = LSTM()
loss_function = nn.BCELoss()  # loss
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001)
epochs = 10

lstm.train()
for i in range(epochs):
    # The batch target is named `target`, not `labels`, so the module-level `labels`
    # array built from truth.txt is not overwritten by a tensor.
    for seq, target in train_loader:
        optimizer.zero_grad()
        # Flatten prediction and target to 1-D so their shapes agree for BCELoss.
        y_pred = lstm(seq).reshape(-1)
        single_loss = loss_function(y_pred, target.reshape(-1))

        single_loss.backward()
        optimizer.step()

lstm.eval()

# Report summary metrics on the training data instead of printing every prediction.
# no_grad() avoids building autograd graphs during evaluation.
total_loss = 0.0
n_correct = 0
with torch.no_grad():
    for seq, target in train_loader:
        y_pred = lstm(seq).reshape(-1)
        target = target.reshape(-1)
        single_loss = loss_function(y_pred, target)
        total_loss += single_loss.item() * len(target)
        n_correct += ((y_pred > 0.5).float() == target).sum().item()
print('train loss:', total_loss / len(x), 'train accuracy:', n_correct / len(x))



In [None]:
x = torch.from_numpy(np.asarray(x_test, dtype=np.float32))

# Score each test row in its own forward pass, exactly as during training (batch_size=1).
# This keeps every prediction independent of the other rows regardless of how
# forward() lays out a multi-row input; a forward() that views the input as
# (len(x), 1, -1) would otherwise process the whole matrix as one sequence.
lstm.eval()
with torch.no_grad():
    y_test_pred = torch.cat([lstm(x[k:k+1]) for k in range(len(x))])

y_test_pred = y_test_pred.cpu().numpy()

#accuracy for LSTM classification
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
result = (y_test_pred > 0.5).astype(int).reshape(-1)
# accuracy_score takes (y_true, y_pred); the value is symmetric, the order is for clarity.
acc = accuracy_score(y_test.astype(int), result)
print(acc)

In [None]:
# Pre-processing method 2
def get_phrase_embedding(phrase):
    """Return the mean word vector of `phrase`.

    The phrase is lower-cased and split with the module-level `tokenizer`; tokens
    missing from the module-level `model` vocabulary are skipped.

    Args:
        phrase: text to embed.

    Returns:
        The element-wise mean of the vectors of all known tokens, or a float32
        zero vector of length `model.vector_size` when no token is known.
    """
    tokens = tokenizer.tokenize(phrase.lower())
    known = [model.get_vector(tok) for tok in tokens if tok in model.key_to_index.keys()]

    if not known:
        # Nothing to average: fall back to the all-zero vector.
        return np.zeros([model.vector_size], dtype='float32')

    return np.mean(np.array(known), axis=0)

def get_person_vector(phrase):
    """Concatenate the embeddings of all of one author's tweets into one flat vector.

    Args:
        phrase: root element of a parsed author file; the children of `phrase[0]`
            are iterated and each child's `.text` is embedded as one tweet.

    Returns:
        1-D numpy array holding the vectors returned by `get_phrase_embedding`
        for each child, laid end to end in document order.
    """
    data = []
    for element in phrase[0]:
        # An empty element has text None; str(None) would embed the literal word "None",
        # so it is mapped to the empty string instead.
        vector = get_phrase_embedding(element.text or '')
        data.append(vector)
    # Flatten to 1-D. The length follows the data (n_tweets * vector length) instead of
    # the previously hard-coded 20000, which is still the result for 200 tweets x 100 dims.
    data1 = np.array(data).reshape(-1)
    return data1

In [None]:
# Build one [feature_vector, label] record per author file (method 2).
# Labels are looked up by author name rather than by position: the order of `file_list`
# comes from os.listdir and is not guaranteed to match the row order of `labels_df`.
# NOTE(review): assumes each author file is named `<name>.<ext>` with <name> exactly as
# listed in truth.txt - confirm against the dataset. A mismatch raises KeyError.
label_by_name = dict(zip(labels_df['name'], labels_df['is']))

data = []

for f in range(len(file_list)):
    file = file_list[f]
    tree = ET.parse(os.path.join(path, file))
    root = tree.getroot()
    vector = get_person_vector(root)
    is_is = label_by_name[os.path.splitext(file)[0]]
    # Encoding used here: 'I' -> 0, everything else -> 1.
    # NOTE(review): this is the opposite polarity of the method-1 cell ('NI' -> 0);
    # accuracy is unaffected, but confirm the difference is intended.
    if is_is =='I':
        data.append([vector,0])
    else:
        data.append([vector,1])

In [None]:
# Two-column frame for method 2: column 0 holds each author's feature vector,
# column 1 the label. Rendered for inspection.
df = pd.DataFrame(data=data)
display(df)

In [None]:
# Hold out 60 rows for testing. The split is seeded so results are reproducible across
# kernel restarts, and stratified on the label column (1) so both subsets keep the same
# class ratio.
train, test = train_test_split(df, test_size=60, random_state=42, stratify=df[1])
# Column 0 holds one feature vector per author, column 1 the label.
x_train = list(train[0].values)
x_test = list(test[0].values)
y_train = train[1].values
y_test = test[1].values

In [None]:
# simple classifiers

#accuracy for test set
# The three baselines share one fit/predict/score loop instead of copy-pasted stanzas.
# - SVC: `probability=True` was dropped; only `predict` is called, and the flag just adds
#   an extra internal cross-validated calibration fit.
# - MLPClassifier: weight initialisation is random, so it is seeded for reproducible scores.
classifiers = [
    SVC(kernel='linear'),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    GaussianNB(),
]
for clf in classifiers:
    clf.fit(x_train, y_train)
    pre1 = clf.predict(x_test)
    acc = accuracy_score(y_test, pre1)
    # Print the estimator name next to the score so the three numbers are identifiable.
    print(type(clf).__name__, acc)

In [None]:
#accuracy for training set
# Same three baselines, scored on the data they were fitted on (an upper bound that
# shows how much each model overfits).
# - SVC: `probability=True` was dropped; only `predict` is called, and the flag just adds
#   an extra internal cross-validated calibration fit.
# - MLPClassifier: weight initialisation is random, so it is seeded for reproducible scores.
classifiers = [
    SVC(kernel='linear'),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    GaussianNB(),
]
for clf in classifiers:
    clf.fit(x_train, y_train)
    pre1 = clf.predict(x_train)
    acc = accuracy_score(y_train, pre1)
    # Print the estimator name next to the score so the three numbers are identifiable.
    print(type(clf).__name__, acc)

In [None]:
#LSTM

# NOTE: this redefines the `LSTM` class declared in an earlier cell; from here on the
# name refers to this version (default input_size=20000).
class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear layer and a sigmoid.

    Every input row is treated as an independent sample forming a sequence of length
    one, so the model returns one sigmoid output vector per row.
    """

    def __init__(self, input_size=20000, hidden_layer_size=100, output_size=1):
        """Build the layers.

        Args:
            input_size: number of features per sample after flattening.
            hidden_layer_size: size of the LSTM hidden state.
            output_size: number of sigmoid outputs per sample.
        """
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.lstm = nn.LSTM(input_size, hidden_layer_size)
        self.linear = nn.Linear(hidden_layer_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_x):
        """Return a tensor of shape (n_samples, output_size) with values in (0, 1).

        Args:
            input_x: tensor whose first dimension indexes samples; all remaining
                dimensions are flattened into the feature axis.
        """
        # nn.LSTM (batch_first=False) expects (seq_len, batch, features). Samples are put
        # on the *batch* axis with seq_len == 1. The previous layout (n_samples, 1, -1)
        # put them on the time axis, so a multi-row input was processed as one sequence
        # and each row's output depended on the rows before it. For a single-row input
        # both layouts are identical.
        n_samples = len(input_x)
        input_x = input_x.reshape(1, n_samples, -1)
        # No initial state is passed: nn.LSTM then starts from zeros for every sample.
        lstm_out, (h_n, h_c) = self.lstm(input_x)
        # lstm_out[-1] is the output at the only time step: (n_samples, hidden_layer_size).
        linear_out = self.linear(lstm_out[-1])
        predictions = self.sigmoid(linear_out)
        return predictions

In [None]:
# Seed torch so weight initialisation and batch shuffling are reproducible across runs.
torch.manual_seed(42)

# Train on the training split only. Fitting on all of `df` (as before) also showed the
# model the held-out rows of `test`, which inflates the reported test accuracy.
b = train[0].values
# Stack the per-author vectors into an (n_authors, n_features) matrix. Sizes come from
# the data instead of the hard-coded 420/200/100; the model flattens everything after
# the first dimension anyway, so a 2-D layout feeds it the same values.
a = np.stack(b)

x = torch.from_numpy(a.astype(np.float32))
y = torch.from_numpy(np.asarray(train[1].values, dtype=np.float32))

train_loader = Data.DataLoader(
        dataset=Data.TensorDataset(x, y),
        batch_size=1,
        shuffle=True,
        # The data are already in-memory tensors; worker processes would only add
        # start-up cost on every pass over the loader.
        num_workers=0,
    )
lstm = LSTM()
loss_function = nn.BCELoss()  # loss
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001)
epochs = 10

lstm.train()
for i in range(epochs):
    # The batch target is named `target`, not `labels`, so the module-level `labels`
    # array built from truth.txt is not overwritten by a tensor.
    for seq, target in train_loader:
        optimizer.zero_grad()
        # Flatten prediction and target to 1-D so their shapes agree for BCELoss.
        y_pred = lstm(seq).reshape(-1)
        single_loss = loss_function(y_pred, target.reshape(-1))

        single_loss.backward()
        optimizer.step()

lstm.eval()

# Report summary metrics on the training data instead of printing every prediction.
# no_grad() avoids building autograd graphs during evaluation.
total_loss = 0.0
n_correct = 0
with torch.no_grad():
    for seq, target in train_loader:
        y_pred = lstm(seq).reshape(-1)
        target = target.reshape(-1)
        single_loss = loss_function(y_pred, target)
        total_loss += single_loss.item() * len(target)
        n_correct += ((y_pred > 0.5).float() == target).sum().item()
print('train loss:', total_loss / len(x), 'train accuracy:', n_correct / len(x))




In [None]:
# Evaluate the LSTM on the held-out rows; sizes follow `test` instead of a hard-coded 60.
b = test[0].values
a = np.stack(b)

x = torch.from_numpy(a.astype(np.float32))
# Targets come from the *test* split (they were previously taken from `train`).
y = torch.from_numpy(np.asarray(test[1].values, dtype=np.float32))

# Score each test row in its own forward pass, exactly as during training (batch_size=1).
# This keeps every prediction independent of the other rows regardless of how
# forward() lays out a multi-row input; a forward() that views the input as
# (len(x), 1, -1) would otherwise process the whole matrix as one sequence.
lstm.eval()
with torch.no_grad():
    y_test_pred = torch.cat([lstm(x[k:k+1]) for k in range(len(x))]).reshape(-1)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
result = (y_test_pred > 0.5).int().tolist()

acc = accuracy_score(test[1].values.astype(int), result)
print(acc)