<a href="https://colab.research.google.com/github/mapcrafter2048/Literature-Review-Generator-ML-17/blob/main/segmented_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the training CSV from the Colab working directory.
csv_file_path = '/content/train.csv'
df = pd.read_csv(csv_file_path)

# Check pandas version
# print(pd.__version__)

# Upgrade pandas if necessary
# !pip install --upgrade pandas

# Fallback for malformed rows (quoting=3 is csv.QUOTE_NONE; bad lines are skipped):
# df = pd.read_csv(csv_file_path, quoting=3, on_bad_lines='skip')
# NOTE: `on_bad_lines` was added in pandas 1.3.0 and replaces the older
# `error_bad_lines` flag, which was removed in pandas 2.0.

# print(df.head())
# Print (rows, columns) as a quick sanity check on the load.
print(df.shape)

(1992, 5)


In [None]:
import pandas as pd

# Try each candidate encoding in turn until one of them loads the file.
# NOTE: 'latin-1' and 'iso-8859-1' are aliases of the same codec, and that
# codec can decode any byte sequence, so 'utf-16' is only reached when the
# earlier attempts fail with a parser error.
encodings_to_try = ['utf-8', 'latin-1', 'iso-8859-1', 'utf-16']

for encoding in encodings_to_try:
    try:
        df = pd.read_csv('/content/dblp-v10.csv', on_bad_lines='skip', encoding=encoding, quoting=3)
    except (UnicodeError, pd.errors.ParserError) as e:
        # A wrong encoding surfaces as UnicodeDecodeError (a UnicodeError
        # subclass), not ParserError; catching both lets the loop move on to
        # the next candidate instead of aborting on the first mismatch.
        print(f"Error with encoding {encoding}: {e}")
    else:
        print(f"Success with encoding: {encoding}")
        break  # Stop if successful
else:
    # Reached only when no candidate succeeded; `df` was not (re)assigned here.
    print("Could not read the file with any of the candidate encodings")

Success with encoding: utf-8


In [None]:
import spacy
import pandas as pd

# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced again in the visible notebook —
# confirm whether this load is still needed.
nlp = spacy.load('en_core_web_sm')

# Cue phrases for each section. Matching is case-insensitive, so the
# capitalisation used below is only for readability.
intro_keywords = [
    'This study', 'We investigate', 'We examine', 'The objective', 'Our aim', 'Purpose',
    'Motivation', 'Background', 'Context', 'Research focus', 'The problem', 'In this paper',
    'Previous studies', 'Prior research', 'Current understanding', 'Existing methods',
    'Literature review',
]

findings_keywords = [
    'Results', 'Findings', 'Observations', 'Data shows', 'We find', 'Our analysis',
    'Significant', 'Increase', 'Decrease', 'Evidence', 'Measurements', 'Quantitative',
    'Qualitative', 'Comparison', 'Calculated', 'Measured', 'Estimated', 'Determined',
    'Observed', 'Analyzed', 'Revealed', 'Demonstrated', 'Quantified', 'Assessed', 'Found',
]

conclusion_keywords = [
    'Conclusion', 'In summary', 'To conclude', 'Implications', 'Significance',
    'Importance', 'Summary', 'Overall', 'Therefore', 'As a result', 'Consequently',
    'The findings suggest', 'Future work', 'Recommendations', 'Impact', 'Contribution',
    'Insights', 'Applications', 'Broader implications', 'Interpretation',
]


def _mentions_any(sentence, keywords):
    """Return True if *sentence* contains any of *keywords*, ignoring case."""
    lowered = sentence.lower()
    return any(keyword.lower() in lowered for keyword in keywords)


# Function to segment abstract
def segment_target(target):
    """Split an abstract into introduction, findings and conclusion text.

    The text is cut on ``'. '`` and each sentence is assigned to the first
    section (introduction, then findings, then conclusion) whose keyword list
    it mentions. Sentences that mention no keyword are dropped.

    Args:
        target: The abstract. NaN/None yields three empty strings; any other
            non-string value is converted with ``str()``.

    Returns:
        A tuple ``(introduction, findings, conclusion)`` of strings, each the
        space-joined sentences assigned to that section ('' when none).
    """
    intro, findings, conclusion = [], [], []

    if pd.isna(target):
        return "", "", ""

    text = target if isinstance(target, str) else str(target)

    pieces = text.split('. ')
    final_index = len(pieces) - 1
    for index, piece in enumerate(pieces):
        # split() consumed the '. ' delimiter; restore the period on every
        # piece except the last so joined sentences stay punctuated.
        if index < final_index:
            piece += '.'
        sent_text = piece.strip()

        if _mentions_any(sent_text, intro_keywords):
            intro.append(sent_text)
        elif _mentions_any(sent_text, findings_keywords):
            findings.append(sent_text)
        elif _mentions_any(sent_text, conclusion_keywords):
            conclusion.append(sent_text)

    return " ".join(intro), " ".join(findings), " ".join(conclusion)

# Read the raw training data.
df = pd.read_csv('/content/train.csv')

# Run the segmenter over every abstract, then transpose the per-row
# (introduction, findings, conclusion) tuples into three column-wise tuples.
segmented_rows = df['target'].apply(segment_target)
intro_column, findings_column, conclusion_column = zip(*segmented_rows)
df['Introduction'] = intro_column
df['Findings'] = findings_column
df['Conclusion'] = conclusion_column

# Write the augmented table to disk without the index column.
df.to_csv('segmented_targets.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '/content/train.csv'

In [None]:
from transformers import T5Config, T5Tokenizer, TFAutoModelForSeq2SeqLM, AdamWeightDecay
import tensorflow as tf
import pandas as pd

# Load the segmented dataset written by the segmentation cell.
# NOTE(review): pandas reads empty CSV fields back as NaN by default, so rows
# whose segment came out empty will hold NaN here rather than ''.
df = pd.read_csv('segmented_targets.csv')

# Prepare the data for TensorFlow model
class targetDataset(tf.data.Dataset):
    """Factory that turns the segmented dataframe into a batched tf.data pipeline.

    Calling ``targetDataset(df, tokenizer)`` does not create an instance of
    this class: ``__new__`` builds and returns a shuffled, batched
    ``tf.data.Dataset`` of ``(features, labels)`` pairs.

    Args:
        dataframe: Table with 'source', 'Introduction', 'Findings' and
            'Conclusion' columns.
        tokenizer: Tokenizer used for both inputs and targets; its
            ``pad_token_id`` is used as the decoder start token (this is the
            T5 convention).
        max_len: Every sequence is truncated/padded to this many tokens.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` where
        ``features`` holds 'input_ids', 'attention_mask', 'decoder_input_ids'
        and 'decoder_attention_mask'.
    """

    def __new__(cls, dataframe, tokenizer, max_len=256, batch_size=2):
        def column_as_text(name):
            # Empty segments are read back from CSV as NaN; render them as ''
            # so the literal text "nan" never ends up in the training data.
            return ['' if pd.isna(value) else str(value) for value in dataframe[name]]

        def encode(texts):
            # One batched call gives (rows, max_len) tensors directly.
            return tokenizer(texts, max_length=max_len, truncation=True,
                             padding='max_length', return_tensors='tf')

        input_texts = [f"source: {source}" for source in column_as_text('source')]
        target_texts = [
            f"Introduction: {intro} Findings: {findings} Conclusion: {conclusion}"
            for intro, findings, conclusion in zip(
                column_as_text('Introduction'),
                column_as_text('Findings'),
                column_as_text('Conclusion'),
            )
        ]

        inputs = encode(input_texts)
        targets = encode(target_texts)
        labels = targets['input_ids']

        # Teacher forcing: the decoder must see the target shifted one step to
        # the right (start token first), otherwise position t is fed the very
        # token it is asked to predict and the model only learns to copy.
        start_tokens = tf.fill(
            [tf.shape(labels)[0], 1],
            tf.cast(tokenizer.pad_token_id, labels.dtype),
        )
        decoder_input_ids = tf.concat([start_tokens, labels[:, :-1]], axis=1)

        features = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'decoder_input_ids': decoder_input_ids,
            'decoder_attention_mask': targets['attention_mask'],
        }
        dataset = tf.data.Dataset.from_tensor_slices((features, labels))

        # Shuffle across the whole table, then batch.
        return dataset.shuffle(buffer_size=len(dataframe)).batch(batch_size)

# Load the pre-trained 't5-base' tokenizer.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Load the 't5-base' configuration and set its dropout rate explicitly.
# NOTE(review): 0.1 appears to be the stock T5 dropout already — confirm this
# override is needed.
config = T5Config.from_pretrained('t5-base')
config.dropout_rate = 0.1

# Instantiate the TensorFlow seq2seq model with that configuration.
model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-base', config=config)

# Build the training pipeline from the segmented dataframe.
dataset = targetDataset(df, tokenizer)

# AdamW-style optimizer with a fixed learning rate of 1e-5.
optimizer = AdamWeightDecay(learning_rate=1e-5)

# Loss used by model.compile below
def custom_loss(y_true, y_pred):
    """Sparse categorical cross-entropy on logits, skipping label id 0.

    Positions where ``y_true`` equals 0 are removed from both the labels and
    the logits before the loss is computed.

    Args:
        y_true: Integer label ids.
        y_pred: Unnormalised logits aligned with ``y_true``.

    Returns:
        The cross-entropy over the remaining (non-zero-label) positions.
    """
    keep = tf.not_equal(y_true, 0)
    kept_labels = tf.boolean_mask(y_true, keep)
    kept_logits = tf.boolean_mask(y_pred, keep)
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return cross_entropy(kept_labels, kept_logits)

# Compile the model with the optimizer and the padding-aware custom loss.
model.compile(optimizer=optimizer, loss=custom_loss)

# Step-decay schedule: start at 1e-5 and multiply by 0.1 every 10 epochs.
# NOTE: with epochs=3 below, `epoch // 10` is always 0, so the rate stays at
# 1e-5 for the whole run.
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 1e-5 * (0.1 ** (epoch // 10))
)

# Train for three passes over the dataset; adjust the number of epochs as needed.
model.fit(dataset, epochs=3, callbacks=[lr_scheduler])

# Save the fine-tuned model and its tokenizer side by side in the local
# 'segmentation_model' directory.
model.save_pretrained('segmentation_model')
tokenizer.save_pretrained('segmentation_model')


FileNotFoundError: [Errno 2] No such file or directory: 'segmented_targets.csv'

In [None]:
from transformers import T5Tokenizer, TFAutoModelForSeq2SeqLM

# Reload the tokenizer and TensorFlow model from the local 'segmentation_model'
# directory. This raises OSError when that directory does not exist, i.e. when
# the training cell has not been run (and saved) in this session.
tokenizer = T5Tokenizer.from_pretrained('segmentation_model')
model = TFAutoModelForSeq2SeqLM.from_pretrained('segmentation_model')

# Function to generate summary
def generate_summary(source):
    """Generate text for *source* with the module-level `model` and `tokenizer`.

    Args:
        source: The abstract (or other text) to feed to the model.

    Returns:
        The decoded top beam, with special tokens removed.
    """
    # The fine-tuned segmentation model was trained on inputs of the form
    # "source: <text>", so inference must use the same prefix; the stock
    # "summarize: " prefix is a prompt this model was never fine-tuned on.
    input_text = f"source: {source}"
    input_ids = tokenizer.encode(input_text, return_tensors='tf')  # Use TensorFlow tensors

    # Beam search, capped at 150 generated tokens.
    outputs = model.generate(
        input_ids,
        max_length=150,  # Adjust based on the expected length of summaries
        num_beams=5,     # Beam search for better results
        early_stopping=True
    )

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Example usage: run the model on a sample climate/agriculture abstract and
# print the generated text.
source = "Climate change, driven by anthropogenic greenhouse gas emissions, poses a significant threat to global agricultural productivity. This study investigates the effects of rising temperatures, altered precipitation patterns, and increased frequency of extreme weather events on key crops such as wheat, rice, and maize. Our findings indicate that a 2°C rise in global temperatures could reduce wheat yields by up to 15%, rice yields by 10%, and maize yields by 8%, primarily due to accelerated crop maturation and impaired grain filling. Altered precipitation patterns, including more intense droughts and floods, particularly affect rain-fed agriculture in regions like Sub-Saharan Africa and South Asia, causing crop failures and waterlogging issues. Extreme weather events such as hurricanes, heatwaves, and frosts further exacerbate these impacts by destroying crops and disrupting agricultural schedules. The research highlights significant regional disparities, with developed countries better equipped to adapt compared to developing nations that lack resources. To mitigate these adverse effects, we recommend developing climate-resilient crop varieties, improving water management practices, and investing in agricultural research and extension services. Policymakers must prioritize climate adaptation in agricultural planning and support vulnerable regions through financial and technical assistance. Proactive measures and international collaboration are essential to safeguarding agricultural productivity and ensuring global food security in the face of a changing climate."
summary = generate_summary(source)
print(summary)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: segmentation_model is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
# from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import T5Tokenizer, TFAutoModelForSeq2SeqLM

# Load the trained model and tokenizer, specifying the TensorFlow variant
# model = TFAutoModelForSeq2SeqLM.from_pretrained('segmentation_model')
# tokenizer = T5Tokenizer.from_pretrained('segmentation_model')


# Function to generate structured summary
def generate_summary(source):
    """Generate a structured summary of *source*.

    Uses the module-level `tokenizer` and `model`; their loading lines in this
    cell are commented out, so they must already be defined by an earlier cell.

    Args:
        source: The abstract (or other text) to feed to the model.

    Returns:
        The decoded top beam, with special tokens removed.
    """
    input_text = f"source: {source}"
    # The notebook loads its models through TFAutoModelForSeq2SeqLM, which
    # needs TensorFlow tensors; 'pt' would hand it PyTorch tensors instead.
    input_ids = tokenizer.encode(input_text, return_tensors='tf')
    outputs = model.generate(input_ids, max_length=512, num_beams=5, early_stopping=True)
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Example usage
# title = "Your title here"
# authors = "Author names here"
# The sample text previously carried leaked list separators (.', ') and doubled
# spaces from a pasted Python list; it is now plain running prose.
source = "Incremental class learning involves sequentially learning classes in bursts of examples from the same class. This violates the assumptions that underlie methods for training standard deep neural networks, and will cause them to suffer from catastrophic forgetting. Arguably, the best method for incremental class learning is iCaRL, but it requires storing training examples for each class, making it challenging to scale. Here, we propose FearNet for incremental class learning."

summary = generate_summary(source)
print(summary)


True


In [None]:
from transformers import T5Tokenizer, TFAutoModelForSeq2SeqLM

# Load the stock pre-trained 't5-small' checkpoint and its tokenizer (not the
# fine-tuned 'segmentation_model'), using the TensorFlow model class.
model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Summarise a text with the module-level `model` and `tokenizer`
def generate_summary(source):
    """Return a beam-search summary of *source*.

    The text is prefixed with "summarize: ", encoded to TensorFlow tensors,
    passed through ``model.generate`` and the first returned sequence is
    decoded with special tokens stripped.
    """
    prompt = f"summarize: {source}"
    encoded_prompt = tokenizer.encode(prompt, return_tensors='tf')

    generation_options = {
        'max_length': 150,
        'num_beams': 5,
        'early_stopping': True,
    }
    generated = model.generate(encoded_prompt, **generation_options)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Example usage: summarise a sample abstract on incremental class learning and
# print the result.
source = "Incremental class learning involves sequentially learning classes in bursts of examples from the same class. This violates the assumptions that underlie methods for training standard deep neural networks, and will cause them to suffer from catastrophic forgetting. Arguably, the best method for incremental class learning is iCaRL, but it requires storing training examples for each class, making it challenging to scale. Here, we propose FearNet for incremental class learning."

summary = generate_summary(source)
print(summary)


In [None]:
Title: Calculation of prompt diphoton production cross sections at Tevatron and LHC energies
Authors: C. Balázs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan
Abstract: In this study, we investigate the prompt diphoton production cross sections at the Tevatron and LHC energies. Our analysis shows significant discrepancies between the theoretical predictions and the experimental data. The results indicate that the next-to-leading order calculations are necessary for accurate predictions. In conclusion, this work highlights the importance of higher-order corrections in diphoton production cross sections.


In [None]:
Title: Calculation of prompt diphoton production cross sections at Tevatron and LHC energies
Authors: C. Balázs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan
Introduction: In this study, we investigate the prompt diphoton production cross sections at the Tevatron and LHC energies.
Findings: Our analysis shows significant discrepancies between the theoretical predictions and the experimental data. The results indicate that the next-to-leading order calculations are necessary for accurate predictions.
Conclusion: In conclusion, this work highlights the importance of higher-order corrections in diphoton production cross sections.
