# Used Libraries: 
/langchain /transformers /torch /pandas /openpyxl

In [1]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from langchain import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline




In [2]:
# Function to convert LeetSpeak to regular Arabic text
def leetspeak_to_arabic(text, leetspeak_dict):
    for leet, arabic in leetspeak_dict.items():
        text = text.replace(leet, arabic)
    return text

# Load the lexicon data
lexicon_path = "processed_lexioms.xlsx"
try:
    lexicon_data = pd.read_excel(lexicon_path)
except FileNotFoundError:
    print(f"Error: Lexicon file '{lexicon_path}' not found.")
    exit()

# Create a dictionary from the lexicon dataframe
lexicon_dict = pd.Series(lexicon_data['Term'].values, index=lexicon_data['Term_leetspeak']).to_dict()


In [5]:
# Load text data from .xlsx file
def load_data(file_path):
    try:
        df = pd.read_excel(file_path)
        return df['text_column']
    except FileNotFoundError:
        print(f"Error: Data file '{file_path}' not found.")
        create_data_file(file_path)
        print(f"A new file '{file_path}' has been created. Please fill in the 'text_column' and rerun the script.")
        exit()

def create_data_file(file_path):
    df = pd.DataFrame(columns=['text_column'])
    df.to_excel(file_path, index=False)

# Example usage
file_path = 'data.xlsx'
data = load_data(file_path)
print(data)

Series([], Name: text_column, dtype: object)


# Define the LLM wrappers for Command R+ and AraBERT

In [4]:
class CommandRPlusLLM:
    def __init__(self, model_name):
        try:
            self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.pipeline = pipeline('text-classification', model=self.model, tokenizer=self.tokenizer)
        except Exception as e:
            print(f"Error initializing Command R+ LLM: {e}")
            exit()

    def __call__(self, text):
        return self.pipeline(text)

class AraBERTLLM:
    def __init__(self, model_name):
        try:
            self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.pipeline = pipeline('text-classification', model=self.model, tokenizer=self.tokenizer)
        except Exception as e:
            print(f"Error initializing AraBERT LLM: {e}")
            exit()

    def __call__(self, text):
        return self.pipeline(text)


In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Dictionary of common model names and their sizes
common_models = {
    "bert-base-uncased": "Base",
    "bert-large-uncased": "Large",
    "distilbert-base-uncased": "Base",
    "roberta-base": "Base",
    "roberta-large": "Large",
    # Add more models as needed
    "aubmindlab/bert-base-arabertv02": "Base",
    # Add Command R+ and other models here
}

print("Available Models:")
for model_name, size in common_models.items():
    print(f"{model_name} ({size})")

Available Models:
bert-base-uncased (Base)
bert-large-uncased (Large)
distilbert-base-uncased (Base)
roberta-base (Base)
roberta-large (Large)
aubmindlab/bert-base-arabertv02 (Base)


In [7]:
# Specify the model size during initialization
model_name = "aubmindlab/bert-base-arabertv02"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Initialize AraBERT
try:
    arabert = AraBERTLLM('aubmindlab/bert-base-arabertv02')
except Exception as e:
    print(f"Error initializing AraBERT LLM: {e}")
    exit()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Initialize Command R+
#try:
#    command_r_plus = CommandRPlusLLM('path_to_command_r_plus_model')
#    arabert = AraBERTLLM('aubmindlab/bert-base-arabertv02')
#except Exception as e:
#    print(f"Error initializing models: {e}")
#    exit()


# Define prompt templates

In [9]:
# Define prompt templates for sentiment analysis
sentiment_prompt = PromptTemplate(
    input_variables=["text"],
    template="Analyze the sentiment of the following Arabic text: {text}"
)

# Define prompt templates for topic modeling
topic_modeling_prompt = PromptTemplate(
    input_variables=["text"],
    template="Identify the main topics in the following Arabic text: {text}"
)

# Create LLMChains for Command R+ to perform sentiment analysis and topic modeling
#command_r_plus_sentiment_chain = LLMChain(
#    llm=HuggingFacePipeline(pipeline=command_r_plus.pipeline),
#    prompt=sentiment_prompt
#)

#command_r_plus_topic_chain = LLMChain(
#    llm=HuggingFacePipeline(pipeline=command_r_plus.pipeline),
#    prompt=topic_modeling_prompt
#)

# Create LLMChains for AraBERT to perform sentiment analysis and topic modeling
arabert_sentiment_chain = LLMChain(
    llm=HuggingFacePipeline(pipeline=arabert.pipeline),
    prompt=sentiment_prompt
)

arabert_topic_chain = LLMChain(
    llm=HuggingFacePipeline(pipeline=arabert.pipeline),
    prompt=topic_modeling_prompt
)


  warn_deprecated(


In [10]:
# Function to perform sentiment analysis and topic modeling
def analyze_text(text):
#    command_r_plus_sentiment_result = command_r_plus_sentiment_chain.run({"text": text})
#    command_r_plus_topic_result = command_r_plus_topic_chain.run({"text": text})
    
    arabert_sentiment_result = arabert_sentiment_chain.run({"text": text})
    arabert_topic_result = arabert_topic_chain.run({"text": text})
    
    return (arabert_sentiment_result, arabert_topic_result)
#    return (command_r_plus_sentiment_result, command_r_plus_topic_result)

# Load text data from .xlsx file
def load_data(file_path):
    try:
        df = pd.read_excel(file_path)
        return df['text_column']  # Adjust column name as necessary
    except FileNotFoundError:
        print(f"Error: Data file '{file_path}' not found.")
        exit()


# Save to .xlsx file

In [11]:
# Load text data from .xlsx file
def load_data(file_path):
    try:
        df = pd.read_excel(file_path)
        return df['text_column']
    except FileNotFoundError:
        print(f"Error: Data file '{file_path}' not found.")
        create_data_file(file_path)
        print(f"A new file '{file_path}' has been created. Please fill in the 'text_column' and rerun the script.")
        exit()

def create_data_file(file_path):
    df = pd.DataFrame(columns=['text_column'])
    df.to_excel(file_path, index=False)

# function to process the text
def leetspeak_to_arabic(text, lexicon_dict):
    return text

# function to analyze the text
def analyze_text(processed_text):

    return 'sentiment', 'topic', 'arabert_sentiment', 'arabert_topic'

file_path = 'Post_Process_RAW.xlsx'
texts = load_data(file_path)

if texts is not None:
    results = []

    for text in texts:
        processed_text = leetspeak_to_arabic(text, lexicon_dict)
#        command_r_plus_sentiment, command_r_plus_topic,
        arabert_sentiment, arabert_topic = analyze_text(processed_text)
        
        results.append({
            'original_text': text,
            'processed_text': processed_text,
#            'command_r_plus_sentiment': command_r_plus_sentiment,
#            'command_r_plus_topic': command_r_plus_topic,
            'arabert_sentiment': arabert_sentiment,
            'arabert_topic': arabert_topic
        })

    results_df = pd.DataFrame(results)
    print(results_df)

    # Save results to a new Excel file
    results_df.to_excel('sentiment_topic_analysis_results.xlsx', index=False)
else:
    print("No data to process. Exiting.")


Empty DataFrame
Columns: []
Index: []


# Indexes for Data Retrieval

In [12]:
index = {i: results_df.iloc[i].to_dict() for i in range(len(results_df))}

def retrieve_by_index(index, idx):
    return index.get(idx, None)

# Example of retrieving data by index
example_idx = 0  # Change the index as needed
retrieved_data = retrieve_by_index(index, example_idx)
print(retrieved_data)

None
