In [2]:
import pandas as pd

# Read the cleaned per-paper model table and the abstracts table.
model_df = pd.read_csv("final_fully_cleaned_models.csv")
abstract_df = pd.read_csv("paper_abstracts.csv")

# Left-join abstracts onto the model rows (model_df.paper_id <-> abstract_df.id),
# so every model row is kept even when no abstract matches. Then add
# 'abstract_section': a copy of 'abstract' with missing values replaced by ''.
merged_df = (
    model_df
    .merge(abstract_df, how="left", left_on="paper_id", right_on="id")
    .assign(abstract_section=lambda frame: frame["abstract"].fillna(''))
)

# Persist the merged table (without the index) for the next step.
merged_df.to_csv("merged_with_abstract_section.csv", index=False)

In [7]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Load the merged dataset (model columns plus abstract_section).
final_df = pd.read_csv('merged_with_abstract_section.csv')

# read_csv parses empty fields as NaN, so the '' placeholders that were written
# for papers without an abstract come back as missing values. Restore them so
# abstract_section is always a string downstream.
final_df['abstract_section'] = final_df['abstract_section'].fillna('')

# Gather the per-paper model columns into one list per row, then drop the
# missing entries, de-duplicate and sort so each label list is canonical.
model_columns = ['model1', 'model2', 'model3', 'model4', 'model5', 'model6']
final_df['model_label'] = final_df[model_columns].values.tolist()
final_df['model_label'] = final_df['model_label'].apply(
    lambda names: sorted(set(filter(pd.notna, names)))
)

# Model-family groupings: family name -> concrete model names that belong to it.
# Each model name must appear in exactly ONE family; the reverse map below can
# only hold a single family per model.
model_groups = {
    'CNN-based': ['CNN', 'AlexNet', 'VGG', 'VGG-16', 'VGG-19', 'ResNet', 'GoogleNet',
                  'LeNet', 'DenseNet', 'ZFNet', 'Xception', 'DCNN', 'CheXNet',
                  'ResNeXt', 'MobileNetV1', 'MobileNetV2', 'EfficientNet', 'U-Net', 'U-Net++'],
    'RNN-based': ['RNN', 'LSTM', 'BiLSTM', 'GRU', 'ConvLSTM'],
    'Transformer-based': ['Transformer', 'Transformer-XL', 'BERT', 'RoBERTa', 'ALBERT', 'GPT',
                          'GPT-2', 'GPT-3', 'GPT-4', 'T5', 'XLNet', 'DistilBERT', 'ELECTRA'],
    'GAN-based': ['GAN', 'DCGAN', 'StyleGAN', 'StyleGAN2', 'CycleGAN', 'BigGAN', 'BigGAN-Deep'],
    'Autoencoder-based': ['Autoencoder', 'VAE', 'Beta-VAE', 'Sparse Autoencoder'],
    'Object Detection': ['YOLO', 'YOLOv3', 'YOLOv4', 'Faster R-CNN', 'RCNN', 'RetinaNet'],
    'Graph-based': ['GCN', 'GAT', 'GNN', 'GIN'],
    'Ensemble-based': ['Ensemble', 'AdaBoost', 'Bagging', 'Stacking'],
    # 'CheXNet' used to be listed here as well as under 'CNN-based'; the later
    # entry silently won in the reverse map. It is a DenseNet-style CNN, so it
    # now lives only in 'CNN-based'.
    'Other Deep Models': ['MLP', 'ANN', 'DBN', 'Deep Belief Network', 'ELMo', 'Seq2Seq',
                          'Capsule Network'],
    'SVM': ['SVM'],
    'Logistic Regression': ['Logistic Regression'],
    'Linear Regression': ['Linear Regression'],
    'Decision Tree': ['Decision Tree'],
    'Naive Bayes': ['Naive Bayes'],
    'KNN': ['KNN'],
    'Gaussian Process': ['Gaussian Process'],
    'Q-Learning': ['Q-Learning']
}

# Traditional ML models to preserve as-is (kept under their own name).
traditional_ml_models = ['SVM', 'Logistic Regression', 'Naive Bayes', 'KNN', 'Decision Tree', 'Linear Regression']

# Reverse mapping: concrete model name -> family name.
# Fail loudly if a model is assigned to two different families instead of
# letting the last assignment overwrite the earlier one.
reverse_map = {}
for group, models in model_groups.items():
    for model in models:
        existing_group = reverse_map.setdefault(model, group)
        if existing_group != group:
            raise ValueError(
                f"Model {model!r} is listed in both {existing_group!r} and {group!r}"
            )

# Generalize model labels
def generalize_models(model_list, *, family_lookup=None, passthrough=None):
    """Map concrete model names to their model-family labels.

    Parameters
    ----------
    model_list : iterable of str
        Model names for one paper.
    family_lookup : mapping, optional
        Model name -> family name. Defaults to the module-level ``reverse_map``.
    passthrough : container of str, optional
        Names that are kept unchanged. Defaults to the module-level
        ``traditional_ml_models``.

    Returns
    -------
    list of str
        The distinct family labels in sorted order. Names found in neither
        ``passthrough`` nor ``family_lookup`` are kept as-is.
    """
    # Resolve the defaults at call time so the function follows the current
    # module-level tables.
    if family_lookup is None:
        family_lookup = reverse_map
    if passthrough is None:
        passthrough = traditional_ml_models

    generalized = {
        model if model in passthrough else family_lookup.get(model, model)
        for model in model_list
    }
    # Sort instead of list(set): set iteration order for strings varies between
    # interpreter runs, which made the saved label column non-reproducible.
    return sorted(generalized)

# Collapse each paper's concrete model names into model-family labels.
final_df['generalized_model_label'] = final_df['model_label'].apply(generalize_models)

# Multi-hot encode the family labels: one 0/1 slot per family, in the order
# given by mlb.classes_.
mlb = MultiLabelBinarizer()
one_hot = mlb.fit_transform(final_df['generalized_model_label'])
model_family_classes = mlb.classes_
final_df['model_family_vector'] = one_hot.tolist()

# Write the class order to disk so the vector positions can be decoded later.
class_order = pd.Series(model_family_classes)
class_order.to_csv('model_family_classes.csv', index=False, header=['class'])

# Write the full dataset, including the new label and vector columns.
final_df.to_csv('final_df_with_model_vector_column.csv', index=False)

# Keep only the abstract text, the section text and the target vector.
proper_columns = ['abstract_section', 'section', 'model_family_vector']
proper_df = final_df[proper_columns]



In [8]:
# Show final_df as this cell's output to inspect the merged columns, the
# generalized labels and the encoded model_family_vector.
final_df

Unnamed: 0,paper_id,model1,model2,model3,model4,model5,model6,section,id,title,abstract,abstract_section,model_label,generalized_model_label,model_family_vector
0,on-enhancing-speech-emotion-recognition-using,GAN,,,,,,of a SVM trained on synthetic Real open SMILE ...,on-enhancing-speech-emotion-recognition-using,On Enhancing Speech Emotion Recognition using ...,Generative Adversarial Networks (GANs) have ga...,Generative Adversarial Networks (GANs) have ga...,[GAN],[GAN-based],"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,snap-ml-a-hierarchical-framework-for-machine,Logistic Regression,,,,,,In this work we have described Snap ML a new f...,snap-ml-a-hierarchical-framework-for-machine,Snap ML: A Hierarchical Framework for Machine ...,We describe a new software framework for fast ...,We describe a new software framework for fast ...,[Logistic Regression],[Logistic Regression],"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,laplacian-smoothing-gradient-descent,Logistic Regression,,,,,,on both a quadratic function and a simple fini...,laplacian-smoothing-gradient-descent,Laplacian Smoothing Gradient Descent,We propose a class of very simple modification...,We propose a class of very simple modification...,[Logistic Regression],[Logistic Regression],"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
3,examining-the-use-of-neural-networks-for,SVM,,,,,,show ture s used to classify the image dataset...,examining-the-use-of-neural-networks-for,Examining the Use of Neural Networks for Featu...,Neural networks in many varieties are touted a...,Neural networks in many varieties are touted a...,[SVM],[SVM],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
4,genesis-of-basic-and-multi-layer-echo-state,Autoencoder,,,,,,are Feature extraction based methods such as A...,genesis-of-basic-and-multi-layer-echo-state,Genesis of Basic and Multi-Layer Echo State Ne...,It is a widely accepted fact that data represe...,It is a widely accepted fact that data represe...,[Autoencoder],[Autoencoder-based],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,a-generative-model-to-synthesize-eeg-data-for,GAN,SVM,DCGAN,,,,show that the Epilepsy Society and Melbourne U...,a-generative-model-to-synthesize-eeg-data-for,A Generative Model to Synthesize EEG Data for ...,Prediction of seizure before they occur is vit...,Prediction of seizure before they occur is vit...,"[DCGAN, GAN, SVM]","[GAN-based, SVM]","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
684,using-machine-learning-to-calibrate-storm,Logistic Regression,,,,,,Random forests Bre iman output focus on cohere...,using-machine-learning-to-calibrate-storm,Using Machine Learning to Calibrate Storm-Scal...,A primary goal of the National Oceanic and Atm...,A primary goal of the National Oceanic and Atm...,[Logistic Regression],[Logistic Regression],"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
685,fit-a-fast-and-accurate-framework-for-solving,Autoencoder,,,,,,could be Number of Monte Carlo samples obtaine...,fit-a-fast-and-accurate-framework-for-solving,BSODA: A Bipartite Scalable Framework for Onli...,A growing number of people are seeking healthc...,A growing number of people are seeking healthc...,[Autoencoder],[Autoencoder-based],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
686,improving-protein-gamma-turn-prediction-using,Capsule Network,,,,,,were unsatisfactory with Matthew correlation c...,improving-protein-gamma-turn-prediction-using,Improving Protein Gamma-Turn Prediction Using ...,Protein gamma-turn prediction is useful in pro...,Protein gamma-turn prediction is useful in pro...,[Capsule Network],[Other Deep Models],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"


In [9]:
# Show proper_df as this cell's output: abstract_section, section and
# model_family_vector only.
proper_df

Unnamed: 0,abstract_section,section,model_family_vector
0,Generative Adversarial Networks (GANs) have ga...,of a SVM trained on synthetic Real open SMILE ...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,We describe a new software framework for fast ...,In this work we have described Snap ML a new f...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
2,We propose a class of very simple modification...,on both a quadratic function and a simple fini...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
3,Neural networks in many varieties are touted a...,show ture s used to classify the image dataset...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
4,It is a widely accepted fact that data represe...,are Feature extraction based methods such as A...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...
683,Prediction of seizure before they occur is vit...,show that the Epilepsy Society and Melbourne U...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
684,A primary goal of the National Oceanic and Atm...,Random forests Bre iman output focus on cohere...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
685,A growing number of people are seeking healthc...,could be Number of Monte Carlo samples obtaine...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
686,Protein gamma-turn prediction is useful in pro...,were unsatisfactory with Matthew correlation c...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"


In [11]:
# Write the reduced dataset to disk without the index column.
# NOTE: list-valued cells (model_family_vector) are stored as their text form
# in a CSV, so they must be parsed again when the file is read back.
proper_df.to_csv(path_or_buf='proper_df.csv', index=False)