In [None]:
import requests
import pandas as pd
import time
import os
import io
import re
import tempfile

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Read the cleaned dataset from disk
final_df = pd.read_csv('cleaned_data.csv')  # Update with your path

# Collect the six per-row model columns into one list per row, then drop
# missing entries, de-duplicate, and sort the remaining names
final_df['model_label'] = final_df[
    ['model1', 'model2', 'model3', 'model4', 'model5', 'model6']
].values.tolist()
final_df['model_label'] = final_df['model_label'].apply(
    lambda names: sorted({name for name in names if pd.notna(name)})
)

print(final_df['model_label'].tolist())

# Define deep model groupings.
# Maps a group (family) name to the specific model names that belong to it.
# Every model name must appear in exactly ONE group: the reverse lookup built
# from this dict keeps only one group per model, so a duplicate would be
# resolved silently by dict iteration order.
model_groups = {
    # Deep learning categories
    'CNN-based': [
        'CNN', 'AlexNet', 'VGG', 'VGG-16', 'VGG-19', 'ResNet', 'GoogleNet',
        'LeNet', 'DenseNet', 'ZFNet', 'Xception', 'DCNN',
        'ResNeXt', 'MobileNetV1', 'MobileNetV2', 'EfficientNet', 'U-Net', 'U-Net++'
    ],
    'RNN-based': [
        'RNN', 'LSTM', 'BiLSTM', 'GRU', 'ConvLSTM'
    ],
    'Transformer-based': [
        'Transformer', 'Transformer-XL', 'BERT', 'RoBERTa', 'ALBERT', 'GPT',
        'GPT-2', 'GPT-3', 'GPT-4', 'T5', 'XLNet', 'DistilBERT', 'ELECTRA'
    ],
    'GAN-based': [
        'GAN', 'DCGAN', 'StyleGAN', 'StyleGAN2', 'CycleGAN', 'BigGAN', 'BigGAN-Deep'
    ],
    'Autoencoder-based': [
        'Autoencoder', 'VAE', 'Beta-VAE', 'Sparse Autoencoder'
    ],
    'Object Detection': [
        'YOLO', 'YOLOv3', 'YOLOv4', 'Faster R-CNN', 'RCNN', 'RetinaNet'
    ],
    'Graph-based': ['GCN', 'GAT', 'GNN', 'GIN'],
    'Ensemble-based': ['Ensemble', 'AdaBoost', 'Bagging', 'Stacking'],
    'Other Deep Models': [
        'MLP', 'ANN', 'DBN', 'Deep Belief Network', 'ELMo', 'Seq2Seq',
        # 'CheXNet' used to be listed under 'CNN-based' as well. It is kept
        # only here because this later entry was the one that took effect in
        # the reverse lookup, so the resulting labels are unchanged.
        # NOTE(review): confirm this is the intended family for CheXNet.
        'CheXNet', 'Capsule Network'  # fits CNN family but often used independently
    ],

    # Each traditional ML model gets its own group
    'SVM': ['SVM'],
    'Logistic Regression': ['Logistic Regression'],
    'Linear Regression': ['Linear Regression'],
    'Decision Tree': ['Decision Tree'],
    'Naive Bayes': ['Naive Bayes'],
    'KNN': ['KNN'],
    'Gaussian Process': ['Gaussian Process'],

    # Other ML models
    'Q-Learning': ['Q-Learning'],
}


# Classical ML model names that are kept verbatim instead of being generalized
traditional_ml_models = ['SVM', 'Logistic Regression', 'Naive Bayes', 'KNN', 'Decision Tree', 'Linear Regression']

# Invert model_groups into a model-name -> group-name lookup.
# If a model name occurs in more than one group, the group that comes later
# in model_groups wins.
reverse_map = {
    model: group
    for group, models in model_groups.items()
    for model in models
}

# Generalization function
def generalize_models(model_list, mapping=None, passthrough=None):
    """Map specific model names to their generalized family labels.

    Args:
        model_list: Iterable of model-name strings for a single row.
        mapping: Optional model-name -> family-name dict. When omitted, the
            notebook-level ``reverse_map`` is used.
        passthrough: Optional collection of model names that are returned
            unchanged regardless of ``mapping``. When omitted, the
            notebook-level ``traditional_ml_models`` is used.

    Returns:
        A sorted list of unique labels. Names found in neither ``passthrough``
        nor ``mapping`` are kept as-is.
    """
    if mapping is None:
        mapping = reverse_map
    if passthrough is None:
        passthrough = traditional_ml_models

    keep_as_is = set(passthrough)
    generalized = {
        model if model in keep_as_is else mapping.get(model, model)
        for model in model_list
    }
    # Sort so the result does not depend on set iteration order, which can
    # differ between interpreter runs for strings (hash randomization).
    return sorted(generalized)

# Derive the family-level labels for every row
final_df['generalized_model_label'] = final_df['model_label'].apply(generalize_models)

# Multi-hot encode the per-row label lists
mlb = MultiLabelBinarizer()
one_hot = mlb.fit_transform(final_df['generalized_model_label'])
# Keep the fitted class order around; it is needed to decode the vectors later
model_family_classes = mlb.classes_
print(model_family_classes)

# Store each row of the encoded matrix as a plain list in a single column
final_df['model_family_vector'] = one_hot.tolist()
all_labels = [
    family
    for row_labels in final_df['generalized_model_label']
    for family in row_labels
]
print(all_labels)
# (Optional) Write the class order to its own file so vectors can be decoded later
pd.Series(model_family_classes).to_csv(
    'model_family_classes.csv',
    index=False,
    header=['class'],
)

# Persist the augmented DataFrame
final_df.to_csv('final_df_with_model_vector_column.csv', index=False)
print(final_df.columns)
# For new "discussion/findings" pipeline: keep only the text and its label vector
proper_df = final_df[['section', 'model_family_vector']].rename(
    columns={'section': 'text'}
)


[['GAN'], ['Logistic Regression'], ['Logistic Regression'], ['SVM'], ['Autoencoder'], ['SVM'], ['GAN'], ['GAN'], ['GAN'], ['Autoencoder', 'VAE'], ['Linear Regression'], ['Gaussian Process'], ['Autoencoder'], ['CNN', 'Faster R-CNN'], ['Autoencoder'], ['AlexNet', 'LeNet'], ['VAE'], ['LSTM', 'Seq2Seq'], ['LSTM'], ['Logistic Regression'], ['GRU'], ['GRU', 'LSTM'], ['GAN'], ['Logistic Regression'], ['Logistic Regression'], ['ResNet', 'VGG'], ['GAN'], ['VGG'], ['LSTM'], ['Logistic Regression'], ['Gaussian Process'], ['GoogleNet', 'LeNet'], ['LSTM'], ['Linear Regression'], ['Logistic Regression'], ['CNN', 'DCNN'], ['LSTM'], ['Gaussian Process'], ['GRU'], ['GAN'], ['LSTM', 'Seq2Seq'], ['Gaussian Process'], ['Linear Regression'], ['SVM'], ['SVM'], ['Logistic Regression'], ['GAN'], ['LSTM'], ['Gaussian Process'], ['SVM'], ['Gaussian Process'], ['Deep Belief Network'], ['SVM'], ['Gaussian Process'], ['Linear Regression'], ['GAN'], ['Linear Regression'], ['SqueezeNet'], ['Autoencoder'], ['SVM'], [

In [None]:
# Display the augmented frame as the cell output (pandas truncates the preview for long frames)
final_df

Unnamed: 0,paper_id,model1,model2,model3,model4,model5,model6,section,model_label,generalized_model_label,model_family_vector
0,on-enhancing-speech-emotion-recognition-using,GAN,,,,,,of a SVM trained on synthetic\nRealopenSMILE+i...,[GAN],[GAN-based],"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,snap-ml-a-hierarchical-framework-for-machine,Logistic Regression,,,,,,"InthisworkwehavedescribedSnapML,anewframeworkf...",[Logistic Regression],[Logistic Regression],"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,laplacian-smoothing-gradient-descent,Logistic Regression,,,,,,on both a quadratic function and a\nsimple fin...,[Logistic Regression],[Logistic Regression],"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
3,examining-the-use-of-neural-networks-for,SVM,,,,,,show tures used to classify the image datasets...,[SVM],[SVM],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
4,genesis-of-basic-and-multi-layer-echo-state,Autoencoder,,,,,,are. Feature extraction-based methods such as ...,[Autoencoder],[Autoencoder-based],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...
683,a-generative-model-to-synthesize-eeg-data-for,GAN,SVM,DCGAN,,,,show that the Epilepsy Society and Melbourne U...,"[DCGAN, GAN, SVM]","[GAN-based, SVM]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
684,using-machine-learning-to-calibrate-storm,Logistic Regression,,,,,,. Random forests (Breiman output focus on cohe...,[Logistic Regression],[Logistic Regression],"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
685,fit-a-fast-and-accurate-framework-for-solving,Autoencoder,,,,,,could be\n𝑁 𝑀 NumberofMonteCarlosamples.\nobta...,[Autoencoder],[Autoencoder-based],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
686,improving-protein-gamma-turn-prediction-using,Capsule Network,,,,,,were unsatisfactory with Matthew correlation c...,[Capsule Network],[Other Deep Models],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"


In [None]:
# Write the text / model_family_vector frame to CSV, omitting the index column
proper_df.to_csv('proper_df.csv', index=False)

In [1]:
%pip install wordninja

Collecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m541.6/541.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wordninja
  Building wheel for wordninja (setup.py) ... [?25l[?25hdone
  Created wheel for wordninja: filename=wordninja-2.0.0-py3-none-any.whl size=541530 sha256=e8a2f87dff945f4317d8bd70af863f624245e6b3e403fcc788df4f655becc6e3
  Stored in directory: /root/.cache/pip/wheels/e6/66/9c/712044a983337f5d44f90abcd244bd4b8ad28ee64750404b50
Successfully built wordninja
Installing collected packages: wordninja
Successfully installed wordninja-2.0.0


In [7]:
import pandas as pd
import wordninja

df = pd.read_csv("cleaned_data-1.csv")
# Re-segment run-together text into space-separated words with wordninja.
# Non-string cells (e.g. NaN produced by read_csv for empty fields) are left
# untouched instead of being passed to wordninja.split, which expects a string.
# Note: as the displayed output shows, the split also drops punctuation and can
# break up words that were already correct (e.g. "SVM" -> "SV M").
df["section"] = df["section"].apply(
    lambda x: ' '.join(wordninja.split(x)) if isinstance(x, str) else x
)
df

Unnamed: 0,paper_id,model1,model2,model3,model4,model5,model6,section
0,on-enhancing-speech-emotion-recognition-using,GAN,,,,,,of a SV M trained on synthetic Real open SMILE...
1,snap-ml-a-hierarchical-framework-for-machine,Logistic Regression,,,,,,In this work we have described Snap ML a new f...
2,laplacian-smoothing-gradient-descent,Logistic Regression,,,,,,on both a quadratic function and a simple fini...
3,examining-the-use-of-neural-networks-for,SVM,,,,,,show ture s used to classify the image dataset...
4,genesis-of-basic-and-multi-layer-echo-state,Autoencoder,,,,,,are Feature extraction based methods such as a...
...,...,...,...,...,...,...,...,...
683,a-generative-model-to-synthesize-eeg-data-for,GAN,SVM,DCGAN,,,,show that the Epilepsy Society and Melbourne U...
684,using-machine-learning-to-calibrate-storm,Logistic Regression,,,,,,Random forests Bre iman output focus on cohere...
685,fit-a-fast-and-accurate-framework-for-solving,Autoencoder,,,,,,could be Number of Monte Carlo samples obtaine...
686,improving-protein-gamma-turn-prediction-using,Capsule Network,,,,,,were unsatisfactory with Matthew correlation c...


In [6]:
# Persist the re-segmented frame to CSV, omitting the index column
df.to_csv("cleaned_file.csv", index=False)