### **Setup**

In [1]:
%%bash
pip install transformers -q
pip install gensim -q



### **Library Imports**

In [2]:
import os
import torch
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn.functional as F

from time import time
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier 

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report

from transformers import AutoTokenizer, AutoModel
from gensim.parsing.preprocessing import remove_stopwords

### **Constants and Utilities**

In [3]:
class CFG(object):
    def __init__(self,
                 seed: int = 42,
                 n_splits: int = 5,
                 show_info: bool = False,
                 ):

        self.seed = seed
        self.n_splits = n_splits
        self.show_info = show_info
        self.train_features_path = "../input/dgsp-analysis/sentence_embeddings.npy"
        self.train_data_path = "../input/detecting-generated-scientific-papers/fake_papers_train_part_public.csv"
        self.test_data_path = "../input/detecting-generated-scientific-papers/fake_papers_test_public.csv"
        self.model_save_path = "models"
        if not os.path.exists(self.model_save_path): os.makedirs(self.model_save_path)

            
cfg = CFG(seed=42, show_info=True)

In [4]:
def breaker(num: int=50, char: str="*") -> None:
    print("\n" + 50*"*" + "\n")


def get_model(model_id: str) -> tuple:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id)

    return tokenizer, model


def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] 
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def get_sentence_embeddings(model, tokenizer, query: list) -> torch.Tensor:
    if isinstance(query, list):
        # Convert all to lowercase and remove all the stopwords from the knowledge base
        query = [kb_item.lower() for kb_item in query]
        query = [remove_stopwords(kb_item) for kb_item in query]
    else:
        # Convert all to lowercase and remove all the stopwords from the query
        query = query.lower()
        query = remove_stopwords(query)

    # Tokenize sentences
    encoded_query = tokenizer(query, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad(): model_output = model(**encoded_query)

    # Perform pooling
    query_sentence_embeddings = mean_pooling(model_output, encoded_query['attention_mask'])

    # Normalize embeddings
    query_sentence_embeddings = F.normalize(query_sentence_embeddings, p=2, dim=1)

    # Return 
    return query_sentence_embeddings


def print_scores(accuracy: float, auc: float, precision: np.ndarray, recall: np.ndarray, f_score: np.ndarray) -> None:
    print(f"Accuracy  : {accuracy:.5f}")
    print(f"ROC-AUC   : {auc:.5f}")
    print(f"Precision : {precision}")
    print(f"Recall    : {recall}")
    print(f"F-Score   : {f_score}")
    

def get_scores(y_true: np.ndarray, y_pred: np.ndarray) -> tuple:
    accuracy = accuracy_score(y_pred, y_true)
    auc = roc_auc_score(y_pred, y_true)
    precision, recall, f_score, _ = precision_recall_fscore_support(y_pred, y_true)

    return accuracy, auc, precision, recall, f_score

### **Model**

In [5]:
class Pipelines(object):
    def __init__(self, model_name: str, preprocessor, seed: int):
        self.model_name = model_name

        if self.model_name == "lgr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", LogisticRegression(random_state=seed, max_iter=250)),
                ]
            )
        
        elif self.model_name == "knc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", KNeighborsClassifier()),
                ]
            )

        
        elif self.model_name == "dtc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", DecisionTreeClassifier(random_state=seed)),
                ]
            )

        elif self.model_name == "etc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", ExtraTreeClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "rfc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", RandomForestClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "gbc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", GradientBoostingClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "abc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", AdaBoostClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "etcs":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", ExtraTreesClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "gnb":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", GaussianNB()),
                ]
            )

### **Load Embeddings**

In [6]:
X = np.load(cfg.train_features_path)
X = X[1:]

df = pd.read_csv(cfg.train_data_path)
y = df.fake.copy().values

### **Train**

In [7]:
names = ["lgr", "knc", "gnb", "dtc", "etc", "abc", "gbc", "etcs", "rfc"]

if cfg.show_info:
    breaker()
    for val in set(df.fake):
        print(f"Class {val} count : {df[df.fake == val].shape[0]}")

features = [i for i in range(X.shape[1])]

feature_transformer = Pipeline(
    steps=[
        ("Simple_Imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("Standard_Scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("features", feature_transformer, features),
    ]
)

best_auc = 0.0
for name in names:
    fold = 1
    if cfg.show_info: breaker()
    for tr_idx, va_idx in KFold(n_splits=cfg.n_splits, random_state=cfg.seed, shuffle=True).split(X):
        X_train, X_valid, y_train, y_valid = X[tr_idx], X[va_idx], y[tr_idx], y[va_idx]
        my_pipeline = Pipelines(name, preprocessor, cfg.seed)
        my_pipeline.model.fit(X_train, y_train)

        y_pred = my_pipeline.model.predict(X_valid)
        if cfg.show_info:
            acc, auc, pre, rec, f1 = get_scores(y_valid, y_pred)
            print(f"{my_pipeline.model_name}, {fold}\n")
            print_scores(acc, auc, pre, rec, f1)
            print("")
        else:
            _, auc, _, _, _ = get_scores(y_valid, y_pred)

        if auc > best_auc:
            best_auc = auc
            model_fold_name = f"{name}_{fold}"
            
            with open(os.path.join(cfg.model_save_path, f"best_model.pkl"), "wb") as fp:
                pickle.dump(my_pipeline.model, fp)
        fold += 1
    

if cfg.show_info: 
    breaker()
    print(f"Best Model : {model_fold_name.split('_')[0]}, Best Fold : {model_fold_name.split('_')[1]}")

breaker()


**************************************************

Class 0 count : 1686
Class 1 count : 3664

**************************************************

lgr, 1

Accuracy  : 0.81215
ROC-AUC   : 0.78381
Precision : [0.71345029 0.85851648]
Recall    : [0.70317003 0.86445367]
F-Score   : [0.70827286 0.86147484]

lgr, 2

Accuracy  : 0.82897
ROC-AUC   : 0.80293
Precision : [0.66153846 0.90201342]
Recall    : [0.74652778 0.85933504]
F-Score   : [0.70146819 0.88015717]

lgr, 3

Accuracy  : 0.82150
ROC-AUC   : 0.79343
Precision : [0.67673716 0.88633288]
Recall    : [0.72727273 0.85958005]
F-Score   : [0.70109546 0.8727515 ]

lgr, 4

Accuracy  : 0.80561
ROC-AUC   : 0.78055
Precision : [0.67428571 0.86944444]
Recall    : [0.71515152 0.84594595]
F-Score   : [0.69411765 0.85753425]

lgr, 5

Accuracy  : 0.82243
ROC-AUC   : 0.79583
Precision : [0.69822485 0.87978142]
Recall    : [0.72839506 0.86327078]
F-Score   : [0.71299094 0.8714479 ]


**************************************************

knc, 1

Accura

### **Submission**

In [8]:
test_df = pd.read_csv(cfg.test_data_path)
tokenizer, model = get_model('sentence-transformers/all-MiniLM-L6-v2')

sentence_embeddings = torch.zeros(1, 384)

for text in test_df.text:
    sentence_embeddings = torch.cat((sentence_embeddings, get_sentence_embeddings(model, tokenizer, text)), dim=0)

sentence_embeddings = sentence_embeddings.detach().cpu().numpy()

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

In [9]:
model = pickle.load(open("models/best_model.pkl", "rb"))

y_pred = model.predict(sentence_embeddings[1:])

ss_df = pd.read_csv("../input/detecting-generated-scientific-papers/sample_submission.csv")
ss_df["fake"] = y_pred.astype("uint8")
ss_df.to_csv("submission.csv", index=False)