In [1]:
import pandas as pd
import numpy as np
import re
train_df = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/train_dataset.csv')
test_df = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/test_data.csv')

In [6]:
def preprocess_text(text: str, max_lines: int = 45) -> str:
    """
    Preprocesses a text by trimming it to a max number of lines
    and cleaning it to keep only English letters, numbers, and basic punctuation.
    """
    if not isinstance(text, str):
        return ""
    
    # 1. Limit the article to a maximum of 45-50 lines
    lines = text.splitlines()
    trimmed_lines = lines[:max_lines]
    trimmed_text = "\n".join(trimmed_lines)

    # 2. Keep only English letters, numbers, and specified marks
    # This regex pattern finds any character that is NOT a-z, A-Z, 0-9,
    # a whitespace character, or one of ? . ! ,
    # and replaces it with a space.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s?.!,]', ' ', trimmed_text)
    
    # Optional: Clean up extra whitespace created by the substitution
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text
train_df['text_1'] = train_df['text_1'].apply(preprocess_text)
train_df['text_2'] = train_df['text_2'].apply(preprocess_text)
test_df['text_1'] = test_df['text_1'].apply(preprocess_text)
test_df['text_2'] = test_df['text_2'].apply(preprocess_text)

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('brown')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /usr/share/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
text1 = train_df['text_1'].to_list()
text2 = train_df['text_2'].to_list()
test_text1 = test_df['text_1'].to_list()
test_text2 = test_df['text_2'].to_list()

In [4]:
import math
from collections import Counter

def calculate_shannon_entropy(text: str) -> float:
    """
    Calculates the Shannon entropy of a text at the character level.
    Higher values indicate more randomness or complexity.
    """
    if not text:
        return 0.0

    char_counts = Counter(text)
    total_chars = len(text)

    entropy = 0.0
    for count in char_counts.values():
        probability = count / total_chars
        entropy -= probability * math.log2(probability)
        
    return entropy

entropy_train_text = []
entropy_test_text = []
train_df['text_1'].fillna(' ',inplace=True)
train_df['text_2'].fillna(' ',inplace=True)
for _,row in train_df.iterrows():
    ent_1 = calculate_shannon_entropy(row['text_1'])
    entropy_train_text.append(ent_1)
for _,row in train_df.iterrows():
    ent_2 = calculate_shannon_entropy(row['text_2'])
    entropy_train_text.append(ent_2)
    

test_df['text_1'].fillna('',inplace=True)
test_df['text_2'].fillna('',inplace=True)

for _,row in test_df.iterrows():
    ent_1 = calculate_shannon_entropy(row['text_1'])
    entropy_test_text.append(ent_1)
for _,row in test_df.iterrows():
    ent_2 = calculate_shannon_entropy(row['text_2'])
    entropy_test_text.append(ent_2)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['text_1'].fillna(' ',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['text_2'].fillna(' ',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

In [5]:
len(entropy_test_text) , len(entropy_train_text)

(2136, 190)

In [6]:
!pip install pandas spacy textstat pyspellchecker
!python -m spacy download en_core_web_sm
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.10-py3-none-any.whl.metadata (15 kB)
Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of mkl-fft to determine which version is compatible with other requirements. This could take a while.
Collecting mkl_fft (from numpy>=1.23.2->pandas)
  Downloading mkl_fft-2.0.0-22-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (7.1 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
import pandas as pd
import spacy
import textstat
from spellchecker import SpellChecker
import re

nlp = spacy.load("en_core_web_sm")
spell = SpellChecker()

def extract_all_features(sentence_list):
    """
    Processes a list of sentences and extracts a comprehensive set of features.
    
    Args:
        sentence_list (list): A list of strings, where each string is a text to analyze.
        
    Returns:
        pandas.DataFrame: A DataFrame with the extracted features for each sentence.
    """
    feature_names = [
        'text', 'flesch_reading_ease', 'flesch_kincaid_grade', 
        'exclamation_count', 'misspelled_word_count', 
        'vocabulary_richness_ttr', 'named_entity_count', 'proper_noun_count'
    ]
    all_features = []

    for sentence in sentence_list:
        if not isinstance(sentence, str):
            default_features = {name: 0 for name in feature_names}
            default_features['text'] = str(sentence) 
            all_features.append(default_features)
            continue 
        doc = nlp(sentence)

        reading_ease = textstat.flesch_reading_ease(sentence)
        grade_level = textstat.flesch_kincaid_grade(sentence)

        exclamation_count = sentence.count('!')
        
        clean_tokens = [token.text.lower() for token in doc if token.is_alpha]
        misspelled_count = len(spell.unknown(clean_tokens))
        if len(clean_tokens) > 0:
            ttr = len(set(clean_tokens)) / len(clean_tokens)
        else:
            ttr = 0
            
        entity_count = len(doc.ents)
        proper_noun_count = sum(1 for token in doc if token.pos_ == 'PROPN')

        features = {
            'text': sentence,
            'flesch_reading_ease': reading_ease,
            'flesch_kincaid_grade': grade_level,
            'exclamation_count': exclamation_count,
            'misspelled_word_count': misspelled_count,
            'vocabulary_richness_ttr': ttr,
            'named_entity_count': entity_count,
            'proper_noun_count': proper_noun_count,
        }
        all_features.append(features)
    return pd.DataFrame(all_features)

In [9]:
feature_df1 = extract_all_features(text1)
feature_df2 = extract_all_features(text2)
feature_df_test_1 = extract_all_features(test_text1)
feature_df_test_2 = extract_all_features(test_text2)

In [11]:
perplexity_score_train = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/perplexity_score_features_train.csv')
perplexity_score_test = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/perplexity_score_features_test.csv')
llm_judge_train = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/llm_judge_feature_train_individual.csv')
llm_judge_test = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/llm_judge_feature_test_individual_.csv')
semantic_focus_train = np.load('/kaggle/input/fake-or-real-the-imposter-x-train/semantic_focus_train.npy')
semantic_focus_test = np.load('/kaggle/input/fake-or-real-the-imposter-x-train/semantic_focus_test.npy')
flow_coherence_train = np.load('/kaggle/input/fake-or-real-the-imposter-x-train/flow_coherence_train.npy')
flow_coherence_test = np.load('/kaggle/input/fake-or-real-the-imposter-x-train/flow_coherence_test.npy')

In [15]:
features_text1 = pd.DataFrame({
    'proper_noun_count': feature_df1['proper_noun_count'],
    'exclamation_count': feature_df1['exclamation_count'],
    'flesch_kincaid_grade': feature_df1['flesch_kincaid_grade'],
    'flesch_reading_ease':feature_df1['flesch_reading_ease'],
    'misspelled_word_count':feature_df1['misspelled_word_count'],
    'vocabulary_richness_ttr':feature_df1['vocabulary_richness_ttr']
})
features_text2 = pd.DataFrame({
    'proper_noun_count': feature_df2['proper_noun_count'],
    'exclamation_count': feature_df2['exclamation_count'],
    'flesch_kincaid_grade':feature_df2['flesch_kincaid_grade'],
    'flesch_reading_ease':feature_df2['flesch_reading_ease'],
    'misspelled_word_count':feature_df2['misspelled_word_count'],
    'vocabulary_richness_ttr':feature_df2['vocabulary_richness_ttr']
})
features_test_text1 = pd.DataFrame({
    'proper_noun_count': feature_df_test_1['proper_noun_count'],
    'exclamation_count': feature_df_test_1['exclamation_count'],
    'flesch_kincaid_grade': feature_df_test_1['flesch_kincaid_grade'],
    'flesch_reading_ease':feature_df_test_1['flesch_reading_ease'],
    'misspelled_word_count':feature_df_test_1['misspelled_word_count'],
    'vocabulary_richness_ttr':feature_df_test_1['vocabulary_richness_ttr']
})
features_test_text2 = pd.DataFrame({
    'proper_noun_count': feature_df_test_2['proper_noun_count'],
    'exclamation_count': feature_df_test_2['exclamation_count'],
    'flesch_kincaid_grade': feature_df_test_2['flesch_kincaid_grade'],
    'flesch_reading_ease':feature_df_test_2['flesch_reading_ease'],
    'misspelled_word_count':feature_df_test_2['misspelled_word_count'],
    'vocabulary_richness_ttr':feature_df_test_2['vocabulary_richness_ttr']
})
feature_df_train = pd.concat([features_text1,features_text2])
feature_df_test = pd.concat([features_test_text1,features_test_text2])
feature_df_train['target'] = np.concatenate([np.where(train_df['labels'] == 1, 1, 0),np.where(train_df['labels'] == 1, 0, 1)])

In [20]:
x_train_df = feature_df_train.copy()
x_final_test = feature_df_test.copy()
x_train_df = x_train_df.drop(['target'],axis=1)

In [21]:
x_train_df['perplexity_score'] = perplexity_score_train['perplexity_score']
x_train_df['llm_judge'] = llm_judge_train['llm_judge_verdict']
x_train_df['semantic_focus'] = semantic_focus_train
x_train_df['flow_coherence'] = flow_coherence_train
x_train_df['entropy_diff'] = entropy_train_text

x_final_test['perplexity_score'] = perplexity_score_test['perplexity_score']
x_final_test['llm_judge'] = llm_judge_test['llm_judge_verdict']
x_final_test['semantic_focus'] = semantic_focus_test
x_final_test['flow_coherence'] = flow_coherence_test
x_final_test['entropy_diff'] = entropy_test_text

In [50]:
x_final_test['perplexity_score'] = x_final_test['perplexity_score'].fillna(x_final_test['perplexity_score'].mean())
x_final_test['llm_judge'] = pd.to_numeric(x_final_test['llm_judge'],errors='coerce')
x_final_test['llm_judge'] = x_final_test['llm_judge'].fillna(0)

In [48]:
x_train_df['perplexity_score'] = x_train_df['perplexity_score'].fillna(x_train_df['perplexity_score'].mean())
x_train_df['llm_judge'] = pd.to_numeric(x_train_df['llm_judge'],errors='coerce')
x_train_df['llm_judge'] = x_train_df['llm_judge'].fillna(0)

In [22]:
selected_features = ['proper_noun_count','exclamation_count','flesch_kincaid_grade','flesch_reading_ease','perplexity_score',
                    'llm_judge','semantic_focus','flow_coherence','entropy_diff']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import math
sns.set_style("whitegrid")
features_to_plot = ['proper_noun_count', 'exclamation_count', 'flesch_kincaid_grade',
                    'flesch_reading_ease','perplexity_score' , 'llm_judge','semantic_focus','flow_coherence',
                    'entropy_diff'
                   ]
n_features = len(features_to_plot)
ncols = 3 
nrows = math.ceil(n_features / ncols)
fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 5 * nrows))

fig.suptitle('Distribution of Feature Differences by Class', fontsize=20)
for i, feature in enumerate(features_to_plot):
    ax = axes.flatten()[i] # Select the correct subplot
    sns.histplot(data=x_train_df, x=feature, hue='target', kde=True, ax=ax, palette='viridis')
    ax.set_title(f'Difference in "{feature}"')
    
    if x_train_df[feature].nunique() > 2:
      ax.axvline(x=0, color='red', linestyle='--', linewidth=1.5)
    
    # Clean up legends
    if i == 0:
      ax.legend(title='Is Text 1 Real?')
    else:
      if ax.get_legend() is not None:
        ax.get_legend().remove()

# --- Hide any unused subplots in the last row ---
for i in range(n_features, len(axes.flatten())):
    axes.flatten()[i].set_visible(False)

plt.tight_layout(rect=[0, 0, 1, 0.97]) # Adjust for suptitle
plt.show()
plt.savefig('all_feature_distribution_plot.png')

In [27]:
y_train_df = np.concatenate([np.where(train_df['labels'] == 1, 1, 0),np.where(train_df['labels'] == 1, 0, 1)])

In [51]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_train_df,
                                                 y_train_df,
                                                 random_state = 42,
                                                 test_size=0.15,
                                                 shuffle=True,
                                                 stratify=y_train_df)

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
parameters = {
    'classifier': [LogisticRegression(solver='liblinear', class_weight='balanced')],
    'classifier__C': [0.1, 1, 10, 100],
}
pipeline = Pipeline([
            ('classifier',LogisticRegression())
    ])
random_search = RandomizedSearchCV(
        pipeline,
        param_distributions=parameters,
        n_iter=30, 
        cv=5,    
        scoring='accuracy',
        verbose=1,
        random_state=42
    )
random_search.fit(x_train.to_numpy(), y_train)
print('Completed search...')
print(f"Best hyperparameters: {random_search.best_params_}")
print(f"Best cross-validation score: {random_search.best_score_:.4f}")
best_model = random_search.best_estimator_
best_model.score(x_test, y_test)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Completed search...
Best hyperparameters: {'classifier__C': 0.1, 'classifier': LogisticRegression(C=0.1, class_weight='balanced', solver='liblinear')}
Best cross-validation score: 0.8072




0.7241379310344828

In [53]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'classifier': [LogisticRegression(solver='liblinear', class_weight='balanced')],
    'classifier__C': [0.1, 1, 10, 100],
}
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5
)
grid_search.fit(x_train, y_train)
print(f'completed search...')
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
best_model = grid_search.best_estimator_
test_score = best_model.score(x_test, y_test)
print(f"Test set accuracy of the best model: {test_score:.4f}")

completed search...
Best hyperparameters: {'classifier': LogisticRegression(C=0.1, class_weight='balanced', solver='liblinear'), 'classifier__C': 0.1}
Best cross-validation score: 0.8072
Test set accuracy of the best model: 0.7241


In [55]:
train_set =  x_train_df
train_label = y_train_df

In [56]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
import gc
NFOLDS = 5
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
oof_classical = np.zeros(len(train_label))

for fold, (train_idx, val_idx) in enumerate(skf.split(train_set,train_label)):
    # clear_session()
    gc.collect()
    model = LogisticRegression(C=10, class_weight='balanced', solver='liblinear')
    print(f" \n========== FOLD {fold+1}/{NFOLDS} ==========")
    x_train_fold = train_set.iloc[train_idx]
    y_train_fold = train_label[train_idx]
    x_test_val = train_set.iloc[val_idx]
    model.fit(x_train_fold,y_train_fold)
    oof_classical[val_idx] = model.predict_proba(x_test_val)[:,1]

 
 
 
 
 


In [26]:
# x_final_test['perplexity_score'] = x_final_test['perplexity_score'].fillna(x_final_test['perplexity_score'].mean())

In [57]:
model = LogisticRegression(C=1, class_weight='balanced', solver='liblinear')
model.fit(x_train_df,y_train_df)
x_final_test['perplexity_score'].fillna(x_final_test['perplexity_score'].mean())
count_model_test_preds = model.predict_proba(x_final_test)[:,1]

In [None]:
# np.save('logreg_count_model_perplexity_llmverd_train.npy',oof_classical)
# np.save('logreg_count_model_perplexity_llmverd_test.npy',count_model_test_preds)

In [65]:
i = 0
count_model_test_preds[i]

0.00044593067874229683

In [81]:
count_model_test_preds
def label_preds(preds):
    pred_labels = []
    j = len(preds)/2
    for i in range(0,int(len(preds)/2)):
        if preds[int(i)] > preds[int(j+i)]:
            pred_labels.append(2)
        else:
            pred_labels.append(1)
    return pred_labels
final_labels = label_preds(count_model_test_preds)

In [28]:
# final_labels = np.where(np.round(count_model_test_preds)==1,1,2)

In [72]:
def make_submission_csv(results,name=None):
    df_results = pd.DataFrame(results)
    output_df = df_results.copy()
    output_df.columns = ['real_text_id']
    output_df.reset_index(inplace=True)
    output_df.rename(columns={'index': 'id'}, inplace=True)
    if name!=None:
        output_df.to_csv(name, index=False)
    return output_df

In [73]:
import pandas as pd
import numpy as np

def compare_submissions(df,prediction_col='real_text_id', id_col='id'):
    """
    Loads two Kaggle submission CSVs and compares their predictions.

    Args:
        df (pandas_dataframe): Dataframe of pandas
        prediction_col (str): The name of the column containing the predictions (1s and 2s).
        id_col (str): The name of the column containing the sample ID.
    """
    try:
        real_df = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/sample_submission_DEBERTA_ESEMBLED (1).csv')
        real_df = real_df.sort_values(by=id_col).reset_index(drop=True)
        
        df = df.sort_values(by=id_col).reset_index(drop=True)


        if len(df) != len(real_df):
            print(f"Error: Files have different numbers of rows ({len(df)} vs {len(real_df)}).")
            return


        preds1 = df[prediction_col]
        preds2 = real_df[prediction_col]

   
        num_agreements = (preds1 == preds2).sum()
        total_predictions = len(df)
        agreement_rate = num_agreements / total_predictions
        correct_ans_best = int(1068 * 0.88796)
        wrong_ans_best = 1068 - correct_ans_best
        worst_correct_ans_curr = (correct_ans_best * agreement_rate) / 1068
        best_correct_ans_curr = ((correct_ans_best * agreement_rate) + worst_correct_ans_curr) / 1068
        print(f"--- Comparison Report ---")
        print(f"Total Predictions: {total_predictions}")
        print(f"Number of Identical Predictions: {num_agreements}")
        print(f"Number of Changed Predictions: {total_predictions - num_agreements}")
        print(f"Agreement Rate: {agreement_rate:.2%}")
        print(f"Probable Score: {worst_correct_ans_curr:.4%} to {best_correct_ans_curr:.4%}")

    except FileNotFoundError:
        print(f"Error: Could not find one or both of the files. Please check the paths.")
    except KeyError:
        print(f"Error: One of the files is missing the required column '{prediction_col}' or '{id_col}'.")

In [82]:
soln_df = make_submission_csv(final_labels)

In [83]:
compare_submissions(soln_df)

--- Comparison Report ---
Total Predictions: 1068
Number of Identical Predictions: 199
Number of Changed Predictions: 869
Agreement Rate: 18.63%
Probable Score: 16.5394% to 16.5549%


# Meta Model

In [None]:
import numpy as np
import pandas as pd
train_df = pd.read_csv('/kaggle/input/fake-or-real-the-imposter-x-train/train_dataset.csv')
classical_preds_train = np.load('/kaggle/input/pred-probas-from-different-models-for-stacking/oof_classical.npy')
bert_preds_train = np.load('/kaggle/input/pred-probas-from-different-models-for-stacking/oof_bert_512.npy')
for_deberta_train = np.load('/kaggle/input/pred-probas-from-different-models-for-stacking/forsenic_deberta_tfidf_feature_model_trainProba.npy')
count_model_train = np.load('/kaggle/input/pred-probas-from-different-models-for-stacking/logreg_count_model_train.npy')
x_meta_train = np.column_stack([classical_preds_train,
                           bert_preds_train,
                           for_deberta_train[:93],
                           count_model_train])
y_meta_train = np.where(train_df['labels']==1,1,0)

In [None]:
classical_preds_test = np.load('/kaggle/input/pred-probas-from-different-models-for-stacking/classical_test_proba.npy')
bert_preds_test = np.load('/kaggle/input/pred-probas-from-different-models-for-stacking/final_bert_predictions_512_wt_decay.npy')
for_deberta_test = np.load('/kaggle/input/pred-probas-from-different-models-for-stacking/forsenic_deberta_tfidf_feature_model_testProba.npy')
count_model_preds_test = np.load('/kaggle/input/pred-probas-from-different-models-for-stacking/logreg_count_model_preds_test.npy')
x_meta_test = np.column_stack([classical_preds_test,
                               bert_preds_test,
                               for_deberta_test,
                               count_model_preds_test])

In [None]:

import xgboost as xgb
import lightgbm as lgbm
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint, uniform, loguniform
import warnings

warnings.filterwarnings('ignore')

X = x_meta_train 
y = y_meta_train

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


models_and_params = [
 
    (xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), {
        'n_estimators': randint(100, 500),
        'learning_rate': uniform(0.01, 0.2),
        'max_depth': randint(3, 10),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        'gamma': uniform(0, 0.5)
    }),

   
    (lgbm.LGBMClassifier(random_state=42,verbose=-1), {
        'n_estimators': randint(100, 500),
        'learning_rate': uniform(0.01, 0.2),
        'num_leaves': randint(20, 100),
        'max_depth': randint(3, 15),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
    }),


    (LogisticRegression(solver='liblinear', random_state=42), {
        'C': loguniform(1e-4, 1e2),
        'penalty': ['l1', 'l2']
    })
]


print("Starting Randomized Search for each model...")
print("-" * 50)

for model, param_dist in models_and_params:
    try:

        random_search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_dist,
            n_iter=20,  
            cv=5,     
            scoring='accuracy',
            n_jobs=-1, 
            random_state=42,
            verbose=0 
        )

        
        random_search.fit(X_train, y_train)

        
        print(f"Model: {type(model).__name__}")
        print(f"Best cross-validation score: {random_search.best_score_:.4f}")
        print(f"Best parameters found: {random_search.best_params_}")
        print("-" * 50)

    except Exception as e:
        print(f"An error occurred with model {type(model).__name__}: {e}")
        print("-" * 50)

print("Randomized Search completed for all models.")

In [None]:
random_search.best_estimator_

In [None]:
best_stack_model = LogisticRegression(C=0.0012606912518374083, random_state=42, solver='liblinear')
best_stack_model.fit(x_meta_train,y_meta_train)
count_model_test_preds = best_stack_model.predict_proba(x_meta_test)[:,1]

In [None]:
def make_submission_csv(results):
    df_results = pd.DataFrame(results)
    output_df = df_results.copy()
    output_df.columns = ['real_text_id']
    output_df.reset_index(inplace=True)
    output_df.rename(columns={'index': 'id'}, inplace=True)
    output_df.to_csv('logreg_count_model_perplexity_llmverdict_preds.csv', index=False)
    return output_df
make_submission_csv(final_labels)