In [11]:
import pandas as pd
import json
import re
import numpy as np

# Function to clean Arabic text
def clean_text(text):
    """Normalise a tweet for downstream use.

    Steps, in order: drop URLs, drop HTML-like tags, replace every character
    that is not a word character, whitespace, comma, period or Arabic
    diacritic with a space, collapse whitespace runs, and trim the ends.

    Args:
        text: The raw tweet. Any non-string value (e.g. NaN) is treated as
            missing.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove emojis and other symbols. Arabic diacritics (U+064B-U+065F,
    # U+0670) are combining marks, which `\w` does not match; they are kept
    # explicitly so a vocalised word is not broken into space-separated pieces.
    text = re.sub(r'[^\w\s,.\u064B-\u065F\u0670]', ' ', text)
    # Collapse whitespace runs, then trim leading/trailing spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Load the datasets, choosing the delimiter from what was actually parsed
def _read_dataset(path):
    """Read a delimited file, preferring tab and falling back to comma.

    ``pd.read_csv`` does not raise when the separator is wrong: it returns a
    frame with a single column holding each whole line. Success is therefore
    judged by the number of parsed columns rather than by an exception.

    Args:
        path: Path of the file to read.

    Returns:
        The tab-parsed frame when it has more than one column; otherwise the
        comma-parsed frame when that has more than one column; otherwise the
        single-column tab-parsed frame, so a later step can still split it
        manually.
    """
    tab_df = pd.read_csv(path, sep='\t')
    if tab_df.shape[1] > 1:
        return tab_df
    try:
        comma_df = pd.read_csv(path)
    except pd.errors.ParserError:
        # Rows with an inconsistent number of commas: keep the raw lines.
        return tab_df
    return comma_df if comma_df.shape[1] > 1 else tab_df

train_df = _read_dataset('train-data-final.csv')
test_df = _read_dataset('testing-data-final.csv')

# Display the columns to check if they were parsed correctly
print("Training data columns:", train_df.columns.tolist())
print("Testing data columns:", test_df.columns.tolist())

Training data columns: ['tweet,sarcasm,sentiment,dialect']
Testing data columns: ['tweet,sarcasm,sentiment,dialect']


In [12]:
# If the data is still not parsed correctly (only one column),
# let's fix the parsing issue
def fix_parsing(df):
    """Split a frame that was read as one comma-joined column.

    The repair applies only when ``df`` has exactly one column and that
    column is named 'tweet,sarcasm,sentiment,dialect'. Each cell is split
    from the right, so commas inside the tweet stay in the first field and
    the trailing fields become the labels. Rows with too few fields are
    right-padded with empty strings; missing cells become a row of empty
    strings.

    Args:
        df: The frame to inspect.

    Returns:
        A new DataFrame with one column per header name (fresh default
        index), or ``df`` itself when no repair is needed.
    """
    if len(df.columns) == 1 and 'tweet,sarcasm,sentiment,dialect' in df.columns:
        # The header is in the column name, and data is not properly split
        column_names = df.columns[0].split(',')
        n_cols = len(column_names)

        rows = []
        # Iterate the only column positionally; integer lookup on a row
        # Series (`row[0]`) is label-based and deprecated for positions.
        for value in df.iloc[:, 0]:
            line = '' if pd.isna(value) else str(value)
            # Split from the right so commas inside the tweet are preserved
            parts = line.rsplit(',', n_cols - 1)
            rows.append(parts + [''] * (n_cols - len(parts)))

        return pd.DataFrame(rows, columns=column_names)
    return df

# Repair both splits (a no-op for frames that were already parsed correctly)
train_df, test_df = fix_parsing(train_df), fix_parsing(test_df)

# Confirm the column layout after the repair
print("Training data columns after fix:", train_df.columns.tolist())
print("Testing data columns after fix:", test_df.columns.tolist())

# Preview the first two rows of each split
print("\nTraining data sample:", train_df.head(2), sep="\n")
print("\nTesting data sample:", test_df.head(2), sep="\n")

  parts = row[0].split(',')


Training data columns after fix: ['tweet', 'sarcasm', 'sentiment', 'dialect']
Testing data columns after fix: ['tweet', 'sarcasm', 'sentiment', 'dialect']

Training data sample:
                                               tweet sarcasm sentiment dialect
0  د محمودالعلايليأرى أن الفريق أحمدشفيق رقم مهم ...   FALSE       NEU     msa
1                          مع فيدرر يا آجا والكبار     FALSE       NEU     msa

Testing data sample:
                                               tweet sarcasm sentiment dialect
0  اخوي حانق يالغلا وشفيك معصب؟ عادي تراهم بشر يف...   FALSE       NEG     msa
1                            اف مو متعوده عليهم سته     TRUE       NEG     msa


In [13]:
# Run every tweet of both splits through clean_text
train_df['tweet'] = train_df['tweet'].map(clean_text)
test_df['tweet'] = test_df['tweet'].map(clean_text)

# Convert boolean strings to actual boolean values
def convert_boolean(value):
    """Convert a sarcasm label to a plain ``bool``.

    Args:
        value: A string such as 'TRUE'/'FALSE' (case-insensitive, surrounding
            whitespace ignored), a missing value, or anything truthy/falsy.

    Returns:
        True only for the string 'TRUE' (after trimming and upper-casing) or
        a truthy non-string; False for every other string and for missing
        values.
    """
    if isinstance(value, str):
        # Trim first so labels such as ' TRUE' are still recognised
        return value.strip().upper() == 'TRUE'
    # bool(float('nan')) is True, so a missing label must be handled before
    # the generic fallback or it would be counted as sarcastic.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    return bool(value)

# Turn the sarcasm labels of both splits into real booleans
train_df['sarcasm'] = train_df['sarcasm'].map(convert_boolean)
test_df['sarcasm'] = test_df['sarcasm'].map(convert_boolean)

# Report null counts per column.
# NOTE(review): by this point clean_text has mapped non-string tweets to ""
# and convert_boolean has produced booleans, so these two columns cannot
# report nulls here.
print("Missing values in training data:\n", train_df.isnull().sum())
print("Missing values in testing data:\n", test_df.isnull().sum())

# Replace any remaining nulls with a per-column default
train_df = train_df.fillna(
    {'tweet': '', 'sarcasm': False, 'sentiment': 'NEU', 'dialect': 'msa'}
)
test_df = test_df.fillna(
    {'tweet': '', 'sarcasm': False, 'sentiment': 'NEU', 'dialect': 'msa'}
)

# Final sizes of both splits
print("\nTraining data shape:", train_df.shape)
print("Testing data shape:", test_df.shape)

Missing values in training data:
 tweet        0
sarcasm      0
sentiment    0
dialect      0
dtype: int64
Missing values in testing data:
 tweet        0
sarcasm      0
sentiment    0
dialect      0
dtype: int64

Training data shape: (12549, 4)
Testing data shape: (3000, 4)


In [15]:
# Function to convert DataFrame to JSON format suitable for LLM fine-tuning
def create_json_for_finetuning(df, output_file):
    """Serialise a labelled tweet frame to a JSON list of examples.

    Each row of ``df`` becomes one dict holding the tweet, its three labels,
    one fixed instruction string per task, and three empty prediction
    placeholders. The list is written to ``output_file`` as UTF-8 JSON
    (non-ASCII kept as-is, 2-space indent) and a summary line is printed.

    Args:
        df: Frame with 'tweet', 'sarcasm', 'sentiment' and 'dialect' columns.
        output_file: Path of the JSON file to write.

    Returns:
        The list of example dicts that was written.
    """
    # Fields that are identical for every example; the prompts pin a strict
    # answer format for each task.
    shared_fields = {
        "sarcasm_instruction": "Analyze if the given Arabic text contains sarcasm. Return ONLY 'TRUE' or 'FALSE' with no additional text or explanation.",
        "sentiment_instruction": "Classify the sentiment of the given Arabic text. Return ONLY one of these labels: 'POS', 'NEG', or 'NEU' with no additional text or explanation.",
        "dialect_instruction": "Identify the Arabic dialect in the given text. Return ONLY one of these labels: 'msa', 'egypt', 'gulf', 'levant', or 'magreb' with no additional text or explanation.",
        "predicted_sarcasm": "",
        "predicted_sentiment": "",
        "predicted_dialect": "",
    }

    # Per-row fields come first so the key order of each example is stable
    json_data = [
        {
            "input": record['tweet'],
            "is_sarcastic": bool(record['sarcasm']),
            "sentiment": record['sentiment'],
            "dialect": record['dialect'],
            **shared_fields,
        }
        for _, record in df.iterrows()
    ]

    # Write to JSON file
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(json_data, handle, ensure_ascii=False, indent=2)

    print(f"Saved {len(json_data)} examples to {output_file}")
    return json_data

# Export both splits to JSON files
train_json = create_json_for_finetuning(train_df, 'arsarcasm_train.json')
test_json = create_json_for_finetuning(test_df, 'arsarcasm_test.json')

# Print the first training example as a format preview
print(
    "\nExample JSON format:",
    json.dumps(train_json[0], ensure_ascii=False, indent=2),
    sep="\n",
)

Saved 12549 examples to arsarcasm_train.json
Saved 3000 examples to arsarcasm_test.json

Example JSON format:
{
  "input": "د محمودالعلايليأرى أن الفريق أحمدشفيق رقم مهم في المعادلة السياسية المصرية ولا يمكن إغفالههل ترى أن هذا صحيح أربعةزائدواحد",
  "is_sarcastic": false,
  "sentiment": "NEU",
  "dialect": "msa",
  "sarcasm_instruction": "Analyze if the given Arabic text contains sarcasm. Return ONLY 'TRUE' or 'FALSE' with no additional text or explanation.",
  "sentiment_instruction": "Classify the sentiment of the given Arabic text. Return ONLY one of these labels: 'POS', 'NEG', or 'NEU' with no additional text or explanation.",
  "dialect_instruction": "Identify the Arabic dialect in the given text. Return ONLY one of these labels: 'msa', 'egypt', 'gulf', 'levant', or 'magreb' with no additional text or explanation.",
  "predicted_sarcasm": "",
  "predicted_sentiment": "",
  "predicted_dialect": ""
}
Saved 3000 examples to arsarcasm_test.json

Example JSON format:
{
  "input": "د م