# **Doc2Vec**

## Importing Necessary Libraries

In [None]:
import json
import pandas as pd
import numpy as np
import spacy
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import re
import seaborn as sns
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')


## Read Data

Read the provided dataset (`.jsonl` files) into pandas DataFrames, then display the first rows with `df.head()`. As shown below, the dataset has the columns `text`, `label`, `model`, `source` and `id`.

In [None]:
# NOTE(review): gensim is imported in the first cell, before this install
# runs — presumably it is already available in the runtime; confirm the
# intended cell order.
!pip install gensim
!pip install --upgrade gdown
# Fetch the shared Drive folder that holds the SubtaskA files.
!gdown --folder https://drive.google.com/drive/folders/1CAbb3DjrOPBNm0ozVBfhvrEh9P9rAppc
# Delete the multilingual files; only the monolingual ones are read below.
!rm -rf /content/SubtaskA/subtaskA_dev_multilingual.jsonl
!rm -rf /content/SubtaskA/subtaskA_train_multilingual.jsonl

In [None]:
def _read_jsonl(path):
    """Load a JSON Lines file into a DataFrame, one record per non-blank line.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding, and blank lines (for example a trailing
    newline at end of file) are skipped instead of crashing ``json.loads``.
    """
    with open(path, encoding='utf-8') as f:
        return pd.DataFrame([json.loads(line) for line in f if line.strip()])


# Training and development (validation) splits of the monolingual subtask.
df = _read_jsonl('/content/SubtaskA/subtaskA_train_monolingual.jsonl')
val_df = _read_jsonl('/content/SubtaskA/subtaskA_dev_monolingual.jsonl')

# Class balance of the training labels.
ax = sns.countplot(x="label", data=df)

In [None]:
# Preview the first rows of the training DataFrame.
df.head()

In [None]:
# English stopwords as a set: clean_text tests every token for membership,
# which is a constant-time lookup on a set but a linear scan on a list.
sw = set(stopwords.words('english'))
def clean_text(text, stop_words=None):
    """Normalize a raw document into lowercase, stopword-free text.

    Steps, in order: lowercase; remove URLs; remove HTML tags; replace every
    run of characters other than ASCII letters and ``? . ! , ¿`` with a single
    space; delete the punctuation characters listed below; drop stopwords.

    URLs and tags are removed *before* the character whitelist. The whitelist
    turns ``:``, ``/``, ``<`` and ``>`` into spaces, so if it runs first the
    URL and tag patterns have nothing left to match and their fragments
    (``http``, ``b``, ``div`` ...) leak into the output as tokens.

    No separate emoji pass is needed: the whitelist already discards every
    non-ASCII character except ``¿``.

    Args:
        text: Raw document string.
        stop_words: Optional iterable of lowercase words to drop. When
            omitted, the module-level ``sw`` collection is used.

    Returns:
        The cleaned text with tokens separated by single spaces. Commas and
        ``¿`` are kept, because they are whitelisted and are not in the
        punctuation list.
    """
    stop = frozenset(sw if stop_words is None else stop_words)

    text = text.lower()

    # Must happen while '://' and '<...>' are still intact (see docstring).
    text = re.sub(r"http\S+", "", text)
    # Only real tags (a letter right after '<' or '</'), so a bare '<' in
    # running text such as "a < b" is not mistaken for markup.
    text = re.sub(r"</?[a-z][^<>]*>", "", text)

    # Keep letters and basic sentence punctuation; everything else -> space.
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)

    # Delete (not space-replace) the listed punctuation in one pass.
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    text = text.translate(str.maketrans('', '', punctuations))

    # split() with no argument also collapses repeated whitespace.
    return " ".join(word for word in text.split() if word not in stop)

# df['text_clean'] = df['text'].apply(lambda x: clean_text(x))
# df.head()

Training and validation dataset cleaning

In [None]:
# Training set: clean the text, train Doc2Vec, then infer one vector per document.
# .copy() so the column assignment below writes to an independent frame
# rather than to a slice of df (avoids pandas' SettingWithCopyWarning).
base = df[['text', 'label']].copy()
base['text'] = base['text'].map(clean_text)
lista_base = list(base['text'])
tagged_trained_data = [
    TaggedDocument(words=word_tokenize(str(doc).lower()), tags=[str(i)])
    for i, doc in enumerate(lista_base)
]

model = gensim.models.doc2vec.Doc2Vec(vector_size=100, epochs=10)
model.build_vocab(tagged_trained_data)
model.train(tagged_trained_data, total_examples=model.corpus_count, epochs=model.epochs)
# Sanity check: nearest neighbours of the first training document.
# gensim 4 exposes the document vectors as `dv`; `docvecs` is deprecated.
similar_doc = model.dv.most_similar('0')
# print(similar_doc[0])

# One inferred vector (model.vector_size coordinates) per training document.
matrix = [model.infer_vector(nltk.word_tokenize(doc)) for doc in lista_base]
base_matriz = pd.DataFrame(matrix)

# Validation set: same cleaning, vectors inferred with the already-trained model.
# .copy() so the column assignment below does not write into a slice of val_df.
val_base = val_df[['text', 'label']].copy()
val_base['text'] = val_base['text'].map(clean_text)
lista_val_base = list(val_base['text'])

# One inferred vector (model.vector_size coordinates) per validation document.
val_matrix = [model.infer_vector(nltk.word_tokenize(doc)) for doc in lista_val_base]
val_base_matriz = pd.DataFrame(val_matrix)

# Rename the feature columns to dim<N>; the names are derived from the
# frame itself instead of assuming exactly 100 dimensions.
colunas = val_base_matriz.columns
nome_colunas = ['dim' + str(col) for col in colunas]

val_base_matriz.columns = nome_colunas
val_base_matriz['classe'] = val_base['label']

# Features are every dim<N> column; the target is the label column.
val_X = val_base_matriz.drop(columns='classe')
val_y = val_base_matriz['classe']


In [None]:
# NOTE(review): on the first run the training matrix is written before its
# columns are renamed and before the label column is added, whereas the
# validation matrix is written with dim<N> names and 'classe'. The two CSVs
# therefore have different schemas — confirm whether that is intended.
base_matriz.to_csv(r'matrix_100.csv')
val_base_matriz.to_csv(r'val_matrix_100.csv')

# Rename the feature columns to dim<N>. Only the first vector_size columns
# are renamed and anything after them is kept as is, so re-running this cell
# (when 'classe' already exists) neither fails nor mangles the names.
n_dims = model.vector_size
nome_colunas = ['dim' + str(i) for i in range(n_dims)]

base_matriz.columns = nome_colunas + list(base_matriz.columns[n_dims:])
base_matriz['classe'] = base.label


In [None]:
from sklearn.model_selection import KFold, cross_val_score, cross_validate, GridSearchCV, train_test_split

k_folds = KFold(n_splits = 5)

# Features are every column except the label; selecting by name avoids
# hard-coding the embedding size.
X = base_matriz.drop(columns='classe')
y = base_matriz['classe']

# Fixed seed: without it each run produces a different 70/30 split, so the
# scores and the tuned hyperparameters in the cells below are not reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(len(y_train))

## Logistic Regression

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Mean 5-fold cross-validation score of a default logistic regression,
# computed on the training split only.
modelo_RL = LogisticRegression()
scores = cross_val_score(estimator=modelo_RL, X=X_train, y=y_train, cv=5)
print(np.mean(scores))

In [None]:
# Fit the logistic regression on the training split.
modelo_RL.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred_RL = modelo_RL.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Logistic regression on the held-out test split: report, accuracy, confusion matrix.
cnf_matrix = confusion_matrix(y_test, y_pred_RL)
print(classification_report(y_test, y_pred_RL))
print(accuracy_score(y_test, y_pred_RL))
print(cnf_matrix)

# Annotate every heatmap cell with a quadrant name and its count.
# The 2x2 reshape assumes a binary problem.
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = list(map("{0:0.0f}".format, cnf_matrix.flatten()))
labels = np.asarray(
    ["\n".join(pair) for pair in zip(group_names, group_counts)]
).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');

### Validation Logistic Regression

In [None]:
# Logistic regression on the validation (dev) set: report, accuracy, confusion matrix.
y_pred_LR = modelo_RL.predict(val_X)
cnf_matrix = confusion_matrix(val_y, y_pred_LR)
print(classification_report(val_y, y_pred_LR))
print(accuracy_score(val_y, y_pred_LR))
print(cnf_matrix)

# Annotate every heatmap cell with a quadrant name and its count.
# The 2x2 reshape assumes a binary problem.
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = list(map("{0:0.0f}".format, cnf_matrix.flatten()))
labels = np.asarray(
    ["\n".join(pair) for pair in zip(group_names, group_counts)]
).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Grid-search the split criterion and maximum depth with 5-fold CV on the
# training split, then print the CV mean of the untuned tree for comparison.
modelo_DT = DecisionTreeClassifier(random_state=0)
para_DT = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150],
}
grid_DT = GridSearchCV(modelo_DT, para_DT, cv=5)
grid_DT.fit(X_train, y_train)
print(grid_DT.best_params_, grid_DT.best_score_)

dt_default_scores = cross_val_score(modelo_DT, X_train, y_train, cv=5)
print(dt_default_scores.mean())

In [None]:
# Final tree. random_state=0 matches the estimator used in the grid search,
# so repeated fits on the same data give the same tree.
# NOTE(review): criterion and max_depth are hard-coded rather than read from
# grid_DT.best_params_ — presumably copied from an earlier search; confirm
# they still match.
modelo_DT = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=0)
modelo_DT.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred_DT = modelo_DT.predict(X_test)

In [None]:
# Decision tree on the held-out test split: report, accuracy, confusion matrix.
cnf_matrix = confusion_matrix(y_test, y_pred_DT)
print(classification_report(y_test, y_pred_DT))
print(accuracy_score(y_test, y_pred_DT))
print(cnf_matrix)

# Annotate every heatmap cell with a quadrant name and its count.
# The 2x2 reshape assumes a binary problem.
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = list(map("{0:0.0f}".format, cnf_matrix.flatten()))
labels = np.asarray(
    ["\n".join(pair) for pair in zip(group_names, group_counts)]
).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');

### Validation Decision Tree

In [None]:
from sklearn.metrics import classification_report

# Decision tree on the validation (dev) set. The prediction uses val_X / val_y
# — not the training matrix X / y, which the tree was fitted on — and is
# stored under its own name so the logistic-regression predictions in
# y_pred_RL are not overwritten.
val_y_pred_DT = modelo_DT.predict(val_X)
cnf_matrix = confusion_matrix(val_y, val_y_pred_DT)
print(classification_report(val_y, val_y_pred_DT))
print(accuracy_score(val_y, val_y_pred_DT))
print(cnf_matrix)

# Annotate every heatmap cell with a quadrant name and its count.
# The 2x2 reshape assumes a binary problem.
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = ["{0:0.0f}".format(value) for value in cnf_matrix.flatten()]
labels = [f"{v1}\n{v2}" for v1, v2 in zip(group_names, group_counts)]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');


## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grid-search forest size, split criterion and depth with 5-fold CV on the
# training split.
modelo_RF = RandomForestClassifier()
para_RF = {
    'n_estimators': [50, 100, 200, 400],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
}
grid_RF = GridSearchCV(estimator=modelo_RF,
                       param_grid=para_RF,
                       cv=5)
grid_RF.fit(X_train, y_train)

In [None]:
# Best grid-search setting and its CV score, followed by the 5-fold CV mean
# of the untuned forest for comparison.
print(grid_RF.best_params_, grid_RF.best_score_)
print(cross_val_score(modelo_RF, X_train, y_train, cv=5).mean())

In [None]:
# Final forest, fitted on the training split.
# NOTE(review): the hyperparameters are hard-coded rather than read from
# grid_RF.best_params_ — presumably copied from an earlier search; confirm
# they still match.
modelo_RF = RandomForestClassifier(criterion='entropy', n_estimators=200, max_depth=10)
modelo_RF.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred_RF = modelo_RF.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Random forest on the held-out test split: report, accuracy, confusion matrix.
cnf_matrix = confusion_matrix(y_test, y_pred_RF)
print(classification_report(y_test, y_pred_RF))
print(accuracy_score(y_test, y_pred_RF))
print(cnf_matrix)

# Annotate every heatmap cell with a quadrant name and its count.
# The 2x2 reshape assumes a binary problem.
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = list(map("{0:0.0f}".format, cnf_matrix.flatten()))
labels = np.asarray(
    ["\n".join(pair) for pair in zip(group_names, group_counts)]
).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');

### Validation Random Forest Classifier

In [None]:
# Random forest on the validation (dev) set. The prediction comes from
# modelo_RF (not the logistic regression) and is stored under its own name
# so y_pred_LR keeps the logistic-regression predictions.
val_y_pred_RF = modelo_RF.predict(val_X)
cnf_matrix = confusion_matrix(val_y, val_y_pred_RF)
print(classification_report(val_y, val_y_pred_RF))
print(accuracy_score(val_y, val_y_pred_RF))
print(cnf_matrix)

# Annotate every heatmap cell with a quadrant name and its count.
# The 2x2 reshape assumes a binary problem.
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = ["{0:0.0f}".format(value) for value in cnf_matrix.flatten()]
labels = [f"{v1}\n{v2}" for v1, v2 in zip(group_names, group_counts)]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');

## KNN classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Grid-search the Minkowski power p and the neighbourhood size with 5-fold CV
# on the training split.
modelo_Knn = KNeighborsClassifier()
para_Knn = {'p': [1, 2], 'n_neighbors': [3, 5, 7, 9, 13, 21, 51, 61, 71, 81, 91, 101]}
grid_Knn = GridSearchCV(estimator=modelo_Knn,
                        param_grid=para_Knn,
                        cv=5)
grid_Knn.fit(X_train, y_train)

In [None]:
# Best grid-search setting and its CV score, then the final KNN model fitted
# on the training split.
# NOTE(review): n_neighbors and p are hard-coded rather than read from
# grid_Knn.best_params_ — presumably copied from an earlier search; confirm
# they still match.
print(grid_Knn.best_params_, grid_Knn.best_score_)
modelo_Knn = KNeighborsClassifier(n_neighbors=61, p=2)
modelo_Knn.fit(X_train, y_train)

In [None]:
# Cross-validate on the training split only, as the other models do; the full
# X / y also contains the rows held out in X_test.
print(cross_val_score(modelo_Knn, X_train, y_train, cv=5).mean())
# Predict labels for the held-out test split.
y_pred_Knn = modelo_Knn.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# KNN on the held-out test split: report, accuracy, confusion matrix.
cnf_matrix = confusion_matrix(y_test, y_pred_Knn)
print(classification_report(y_test, y_pred_Knn))
print(accuracy_score(y_test, y_pred_Knn))
print(cnf_matrix)

# Annotate every heatmap cell with a quadrant name and its count.
# The 2x2 reshape assumes a binary problem.
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = list(map("{0:0.0f}".format, cnf_matrix.flatten()))
labels = np.asarray(
    ["\n".join(pair) for pair in zip(group_names, group_counts)]
).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');

### Validation KNN

In [None]:
# KNN on the validation (dev) set. The prediction comes from modelo_Knn,
# not from the logistic regression.
val_y_pred_knn = modelo_Knn.predict(val_X)
cnf_matrix = confusion_matrix(val_y, val_y_pred_knn)
print(classification_report(val_y, val_y_pred_knn))
print(accuracy_score(val_y, val_y_pred_knn))
print(cnf_matrix)

# Annotate every heatmap cell with a quadrant name and its count.
# The 2x2 reshape assumes a binary problem.
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = ["{0:0.0f}".format(value) for value in cnf_matrix.flatten()]
labels = [f"{v1}\n{v2}" for v1, v2 in zip(group_names, group_counts)]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');

## SVM

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM and the C / gamma grid that the next cell searches.
modelo_svm = SVC()
parametros_svm = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

In [None]:
grid = GridSearchCV(modelo_svm, parametros_svm, refit = True, verbose = 3)

# Fit the grid search on the training split only: searching on the full X / y
# would let the rows in X_test influence hyperparameter selection and bias
# the test-set scores reported below.
grid.fit(X_train, y_train)

In [None]:
# Best grid-search setting and its CV score, then the final SVM fitted on the
# training split.
# NOTE(review): C and gamma are hard-coded rather than read from
# grid.best_params_ — presumably copied from an earlier search; confirm they
# still match.
print("Best Hyperparameters: ", grid.best_params_)
print("Best Score: ", grid.best_score_)
modelo_svm = SVC(C=10, gamma=0.001, kernel='rbf')
modelo_svm.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred_svm = modelo_svm.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# SVM on the held-out test split: report, accuracy, confusion matrix.
cnf_matrix = confusion_matrix(y_test, y_pred_svm)
print(classification_report(y_test, y_pred_svm))
print(accuracy_score(y_test, y_pred_svm))
print(cnf_matrix)

# Annotate every heatmap cell with a quadrant name and its count.
# The 2x2 reshape assumes a binary problem.
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = list(map("{0:0.0f}".format, cnf_matrix.flatten()))
labels = np.asarray(
    ["\n".join(pair) for pair in zip(group_names, group_counts)]
).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');

### Validation SVM

In [None]:
# SVM on the validation (dev) set. The prediction comes from modelo_svm,
# not from the logistic regression.
val_y_pred_svm = modelo_svm.predict(val_X)
cnf_matrix = confusion_matrix(val_y, val_y_pred_svm)
print(classification_report(val_y, val_y_pred_svm))
print(accuracy_score(val_y, val_y_pred_svm))
print(cnf_matrix)

# Annotate every heatmap cell with a quadrant name and its count.
# The 2x2 reshape assumes a binary problem.
group_names = ['TN', 'FP', 'FN', 'TP']
group_counts = ["{0:0.0f}".format(value) for value in cnf_matrix.flatten()]
labels = [f"{v1}\n{v2}" for v1, v2 in zip(group_names, group_counts)]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(cnf_matrix, annot=labels, fmt='', cmap='Blues');