In [1]:
!pip install pyspark transformers torch tqdm scikit-learn sparknlp huggingface_hub fasttext

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting sparknlp
  Downloading sparknlp-1.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting spark-nlp (from sparknlp)
  Downloading spark_nlp-5.3.3-py2.py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading sparknlp-1.0.0-py3-none-any.whl (1.4 kB)
Downloading spark_nlp-5.3.3-py2.py3-none-any.whl (568 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.4/568.4 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493

# LOGISTIC REGRESSION

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from huggingface_hub import hf_hub_download
import fasttext
import fasttext.util

model_path = hf_hub_download(repo_id="facebook/fasttext-ml-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

# # Directory containing your CSV files
directory = '/kaggle/input/malayalam-tweets/'

# # List to store DataFrames from each CSV file
dfs = []

# # Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        a_df = pd.read_csv(file_path)

        if "datetimee" in a_df.columns:
            # print("has datetimeee")
            a_df = a_df.rename(columns={"datetimee": "datetime"})
            # Append the DataFrame to the list
            dfs.append(a_df)

# # Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

df = df.dropna(subset=['clean_content'])

df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Step 3: Tokenization and Feature Extraction
def extract_features(text):
    tokens = text.split()  # Tokenize text into words
    vector_sum = sum(model.get_sentence_vector(token) for token in tokens)  # Get FastText vector for each word and sum them
    return vector_sum / len(tokens)  # Average the word vectors to get text vector

# Apply feature extraction to each text in the dataset
df['text_vector'] = df['clean_content'].apply(extract_features)

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(df['text_vector'], df['sentiment'], test_size=0.2, random_state=42)

# Step 5: Initialize Logistic Regression Model
logistic_regression_model = LogisticRegression(max_iter=1000)

# Step 6: Train Model in Epochs
epochs = 1
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    tqdm_bar = tqdm(total=len(X_train), position=0, leave=True)  # Create tqdm progress bar
    
    # Train the model
    logistic_regression_model.fit(X_train.tolist(), y_train)
    
    # Calculate training metrics
    train_loss = None  # Logistic regression does not have a loss attribute
    train_accuracy = accuracy_score(y_train, logistic_regression_model.predict(X_train.tolist()))
    train_precision = precision_score(y_train, logistic_regression_model.predict(X_train.tolist()), average='weighted')
    train_recall = recall_score(y_train, logistic_regression_model.predict(X_train.tolist()), average='weighted')
    train_f1_score = f1_score(y_train, logistic_regression_model.predict(X_train.tolist()), average='weighted')
    
    # Calculate validation metrics
    val_accuracy = accuracy_score(y_test, logistic_regression_model.predict(X_test.tolist()))
    val_precision = precision_score(y_test, logistic_regression_model.predict(X_test.tolist()), average='weighted')
    val_recall = recall_score(y_test, logistic_regression_model.predict(X_test.tolist()), average='weighted')
    val_f1_score = f1_score(y_test, logistic_regression_model.predict(X_test.tolist()), average='weighted')
    
    tqdm_bar.close()  # Close tqdm progress bar
    
    # Print metrics
    print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1-score: {train_f1_score}")
    print(f"Validation Accuracy: {val_accuracy}, Validation Precision: {val_precision}, Validation Recall: {val_recall}, Validation F1-score: {val_f1_score}")




Epoch 1/1


  0%|          | 0/7171 [00:00<?, ?it/s]

Train Loss: None, Train Accuracy: 0.6739645795565472, Train Precision: 0.6817086427052726, Train Recall: 0.6739645795565472, Train F1-score: 0.6576976785981656
Validation Accuracy: 0.637479085331846, Validation Precision: 0.6396109859134467, Validation Recall: 0.637479085331846, Validation F1-score: 0.617633474977311





# RANDOM FOREST

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import fasttext.util

model_path = hf_hub_download(repo_id="facebook/fasttext-ml-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

# # Directory containing your CSV files
directory = '/kaggle/input/malayalam-tweets/'

# # List to store DataFrames from each CSV file
dfs = []

# # Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        a_df = pd.read_csv(file_path)

        if "datetimee" in a_df.columns:
            # print("has datetimeee")
            a_df = a_df.rename(columns={"datetimee": "datetime"})
            # Append the DataFrame to the list
            dfs.append(a_df)

# # Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

df = df.dropna(subset=['clean_content'])

df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Step 3: Tokenization and Feature Extraction
def extract_features(text):
    tokens = text.split()  # Tokenize text into words
    vector_sum = sum(model.get_sentence_vector(token) for token in tokens)  # Get FastText vector for each word and sum them
    return vector_sum / len(tokens)  # Average the word vectors to get text vector

# Apply feature extraction to each text in the dataset
df['text_vector'] = df['clean_content'].apply(extract_features)

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(df['text_vector'], df['sentiment'], test_size=0.2, random_state=42)

# Step 5: Initialize Logistic Regression Model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 6: Train Model in Epochs
epochs = 1
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    tqdm_bar = tqdm(total=len(X_train), position=0, leave=True)  # Create tqdm progress bar
    
    # Train the model
    random_forest_model.fit(X_train.tolist(), y_train)
    
    # Calculate training metrics
    train_loss = None  # Logistic regression does not have a loss attribute
    train_accuracy = accuracy_score(y_train, random_forest_model.predict(X_train.tolist()))
    train_precision = precision_score(y_train, random_forest_model.predict(X_train.tolist()), average='weighted')
    train_recall = recall_score(y_train, random_forest_model.predict(X_train.tolist()), average='weighted')
    train_f1_score = f1_score(y_train, random_forest_model.predict(X_train.tolist()), average='weighted')
    
    # Calculate validation metrics
    val_accuracy = accuracy_score(y_test, random_forest_model.predict(X_test.tolist()))
    val_precision = precision_score(y_test, random_forest_model.predict(X_test.tolist()), average='weighted')
    val_recall = recall_score(y_test, random_forest_model.predict(X_test.tolist()), average='weighted')
    val_f1_score = f1_score(y_test, random_forest_model.predict(X_test.tolist()), average='weighted')
    
    tqdm_bar.close()  # Close tqdm progress bar
    
    # Print metrics
    print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1-score: {train_f1_score}")
    print(f"Validation Accuracy: {val_accuracy}, Validation Precision: {val_precision}, Validation Recall: {val_recall}, Validation F1-score: {val_f1_score}")




Epoch 1/1


  0%|          | 0/7171 [00:11<?, ?it/s]

Train Loss: None, Train Accuracy: 0.9995816483056756, Train Precision: 0.9995819474761473, Train Recall: 0.9995816483056756, Train F1-score: 0.999581616854672
Validation Accuracy: 0.6296709425543782, Validation Precision: 0.6450759085951593, Validation Recall: 0.6296709425543782, Validation F1-score: 0.5991409452797823





# SVM

In [6]:
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import fasttext

model_path = hf_hub_download(repo_id="facebook/fasttext-ml-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

# # Directory containing your CSV files
directory = '/kaggle/input/malayalam-tweets/'

# # List to store DataFrames from each CSV file
dfs = []

# # Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        a_df = pd.read_csv(file_path)

        if "datetimee" in a_df.columns:
            # print("has datetimeee")
            a_df = a_df.rename(columns={"datetimee": "datetime"})
            # Append the DataFrame to the list
            dfs.append(a_df)

# # Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

df = df.dropna(subset=['clean_content'])

df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Step 3: Tokenization and Feature Extraction
def extract_features(text):
    tokens = text.split()  # Tokenize text into words
    vector_sum = sum(model.get_sentence_vector(token) for token in tokens)  # Get FastText vector for each word and sum them
    return vector_sum / len(tokens)  # Average the word vectors to get text vector

# Apply feature extraction to each text in the dataset
df['text_vector'] = df['clean_content'].apply(extract_features)

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(df['text_vector'], df['sentiment'], test_size=0.2, random_state=42)

# Step 5: Initialize Logistic Regression Model
svm_model = SVC(kernel='linear', random_state=42)

# Step 6: Train Model in Epochs
epochs = 1
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    tqdm_bar = tqdm(total=len(X_train), position=0, leave=True)  # Create tqdm progress bar
    
    # Train the model
    svm_model.fit(X_train.tolist(), y_train)
    
    # Calculate training metrics
    train_loss = None  # Logistic regression does not have a loss attribute
    train_accuracy = accuracy_score(y_train, svm_model.predict(X_train.tolist()))
    train_precision = precision_score(y_train, svm_model.predict(X_train.tolist()), average='weighted')
    train_recall = recall_score(y_train, svm_model.predict(X_train.tolist()), average='weighted')
    train_f1_score = f1_score(y_train, svm_model.predict(X_train.tolist()), average='weighted')
    
    # Calculate validation metrics
    val_accuracy = accuracy_score(y_test, svm_model.predict(X_test.tolist()))
    val_precision = precision_score(y_test, svm_model.predict(X_test.tolist()), average='weighted')
    val_recall = recall_score(y_test, svm_model.predict(X_test.tolist()), average='weighted')
    val_f1_score = f1_score(y_test, svm_model.predict(X_test.tolist()), average='weighted')
    
    tqdm_bar.close()  # Close tqdm progress bar
    
    # Print metrics
    print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1-score: {train_f1_score}")
    print(f"Validation Accuracy: {val_accuracy}, Validation Precision: {val_precision}, Validation Recall: {val_recall}, Validation F1-score: {val_f1_score}")




Epoch 1/1


  0%|          | 0/7171 [01:00<?, ?it/s]
  0%|          | 0/7171 [00:31<?, ?it/s]

Train Loss: None, Train Accuracy: 0.6584855668665458, Train Precision: 0.6789439172896629, Train Recall: 0.6584855668665458, Train F1-score: 0.6297087537970312
Validation Accuracy: 0.6291132180702733, Validation Precision: 0.6349227133713751, Validation Recall: 0.6291132180702733, Validation F1-score: 0.5983744133620498





# BIDIRECTIONAL LSTM WITH ATTENTION LAYER

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Activation, Lambda, RepeatVector, Permute, Flatten
import tensorflow.keras.backend as K
import os
import torch

def clear_gpu_memory():
  """Frees memory allocated on the GPU."""
  if torch.cuda.is_available():
    torch.cuda.empty_cache()

# # Directory containing your CSV files
directory = '/kaggle/input/malayalam-tweets/'

# # List to store DataFrames from each CSV file
dfs = []

# # Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        a_df = pd.read_csv(file_path)

        if "datetimee" in a_df.columns:
            # print("has datetimeee")
            a_df = a_df.rename(columns={"datetimee": "datetime"})
            # Append the DataFrame to the list
        dfs.append(a_df)

# # Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# print(df.size)

df = df.dropna(subset=['clean_content'])

print(df.size)
df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Preprocessing
maxlen = 10000  # Maximum sequence length
max_words = 200000  # Maximum number of words in vocabulary

# Tokenize the text
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['content'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df['content'])

# Pad sequences to maxlen
X = pad_sequences(sequences, maxlen=maxlen)

# Label encoding for sentiments (assuming you have 'sentiment' column)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['sentiment'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Bidirectional LSTM with Attention Model
input_seq = Input(shape=(maxlen,))
embedding = Embedding(max_words, 128, input_length=maxlen)(input_seq)
lstm = Bidirectional(LSTM(64, return_sequences=True))(embedding)

# Attention Mechanism
attention = Dense(1, activation='tanh')(lstm)
attention = Flatten()(attention)
attention = Activation('softmax', name='attention_weights')(attention)
attention = RepeatVector(128 * 2)(attention)
attention = Permute([2, 1])(attention)

sent_representation = Concatenate(axis=-1)([lstm, attention])
sent_representation = Lambda(lambda x: K.sum(x, axis=1))(sent_representation)

output = Dense(1, activation='sigmoid')(sent_representation)

model = Model(inputs=input_seq, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

clear_gpu_memory()

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# Print training and validation metrics
print("Training Loss:", history.history['loss'])
print("Training Accuracy:", history.history['accuracy'])
print("Validation Loss:", history.history['val_loss'])
print("Validation Accuracy:", history.history['val_accuracy'])

64680
64659




Epoch 1/10
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 573ms/step - accuracy: 0.4114 - loss: -25902.7051 - val_accuracy: 0.4154 - val_loss: -68951.6797
Epoch 2/10
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 576ms/step - accuracy: 0.4128 - loss: -81250.4688 - val_accuracy: 0.4154 - val_loss: -120757.0312
Epoch 3/10
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 578ms/step - accuracy: 0.4089 - loss: -157105.2656 - val_accuracy: 0.4154 - val_loss: -192876.8438
Epoch 4/10
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 577ms/step - accuracy: 0.4001 - loss: -311555.7812 - val_accuracy: 0.4154 - val_loss: -268831.4688
Epoch 5/10
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 577ms/step - accuracy: 0.4020 - loss: -418399.0312 - val_accuracy: 0.4154 - val_loss: -341662.7812
Epoch 6/10
[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 577ms/step - accuracy: 0.4063 - lo

# BERT

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.optim import AdamW
from torch.optim.lr_scheduler import ExponentialLR
import os
import pandas as pd
import torch

def clear_gpu_memory():
    """Frees memory allocated on the GPU."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Load Malayalam BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('l3cube-pune/malayalam-bert')

# Define dataset class
class MalayalamDataset(Dataset):
    def __init__(self, dataframe, max_len=128):
        self.data = dataframe
        self.max_len = max_len
        self.texts = self.data.content.tolist()
        self.targets = self.data.sentiment.tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        target = self.targets[index]

        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'target': torch.tensor(target, dtype=torch.long)
        }

# Load and preprocess dataset
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    return df

def create_malayalam_dataset(content_col, sentiment_col, max_len=128):
    tokenizer = BertTokenizer.from_pretrained('l3cube-pune/malayalam-bert')

    def tokenize_text(content, sentiment):
        inputs = tokenizer.encode_plus(
            content,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': content,
            'input_ids': inputs['input_ids'].flatten().tolist(),
            'attention_mask': inputs['attention_mask'].flatten().tolist(),
            'target': int(sentiment)
        }

    return udf(tokenize_text, StructType([
        StructField('text', StringType(), True),
        StructField('input_ids', ArrayType(IntegerType()), True),
        StructField('attention_mask', ArrayType(IntegerType()), True),
        StructField('target', IntegerType(), True)
    ]))

# Fine-tune BERT for sentiment analysis
def train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total_targets = 0
        for batch in tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            correct += (preds == targets.cpu().numpy()).sum()
            total_targets += len(targets)

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = correct / total_targets
        print(f"Average training loss for epoch {epoch+1}: {avg_train_loss}")
        print(f"Training Accuracy for epoch {epoch+1}: {train_accuracy}")

        val_loss, val_acc, val_report = evaluate_model(model, val_loader, device)
        print(f"Validation loss: {val_loss}, Accuracy: {val_acc}")
        print("Validation Report:")
        print(val_report)

        scheduler.step()

# Evaluate the model
def evaluate_model(model, val_loader, device):
    model.eval()
    val_targets = []
    val_outputs = []

    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()

            val_targets.extend(targets.cpu().numpy())
            val_outputs.extend(preds)

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_targets, val_outputs)
    val_report = classification_report(val_targets, val_outputs, target_names=["Negative", "Neutral", "Positive"])
    return avg_val_loss, val_accuracy, val_report

# Main function to train and evaluate
def main():
    tokenizer.save_pretrained("bert-tokenizer")

    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("Malayalam Tweets Analysis") \
        .getOrCreate()

    #   return
    # # Directory containing your CSV files
    directory = '/kaggle/input/malayalam-tweets/'

    # # List to store DataFrames from each CSV file
    dfs = []

    # # Loop through each file in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".csv") and "total" not in filename:
            file_path = os.path.join(directory, filename)
            # Read the CSV file into a DataFrame
            a_df = pd.read_csv(file_path)

            if "datetimee" in a_df.columns:
                # print("has datetimeee")
                a_df = a_df.rename(columns={"datetimee": "datetime"})
            # Append the DataFrame to the list
            dfs.append(a_df)

    # # Combine all DataFrames into a single DataFrame
    df = pd.concat(dfs, ignore_index=True)

    df = df.dropna(subset=['clean_content'])

    df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})
    
    df_spark = spark.createDataFrame(df)

    # Define UDF to create Malayalam Dataset
    create_dataset_udf = create_malayalam_dataset("clean_content", "sentiment")

    # Apply UDF to create train_dataset and val_dataset
    df_spark = df_spark.withColumn("malayalam_dataset", create_dataset_udf(df_spark["clean_content"], df_spark["sentiment"]))
    
    # Split dataset into train and validation
    train_df, val_df = df_spark.randomSplit([0.9, 0.1], seed=42)

    # Convert Spark DataFrame to pandas DataFrame (optional)
    train_df_pandas = train_df.select("malayalam_dataset").toPandas()
    val_df_pandas = val_df.select("malayalam_dataset").toPandas()

    # Convert pandas DataFrames to datasets
    train_dataset = [row.malayalam_dataset for _, row in train_df_pandas.iterrows()]
    val_dataset = [row.malayalam_dataset for _, row in val_df_pandas.iterrows()]
    
    # Split dataset into train and validation
    # train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

    # Create datasets and dataloaders
    # train_dataset = MalayalamDataset(train_df)
    # val_dataset = MalayalamDataset(val_df)

    batch_size = 16
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Load pre-trained BERT model for sequence classification
    model = BertForSequenceClassification.from_pretrained('l3cube-pune/malayalam-bert', num_labels=3)

    # Send model to GPU, if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Create optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5)

    # Create scheduler
    scheduler = ExponentialLR(optimizer, gamma=0.9)

    # Train the model
    train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=9)

    clear_gpu_memory()

    # Save the trained model
    model.save_pretrained("malayalam_sentiment_model")

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/951M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/malayalam-bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/9: 100%|██████████| 520/520 [03:20<00:00,  2.60it/s]


Average training loss for epoch 1: 0.8310751753930862
Training Accuracy for epoch 1: 0.6555996631781547
Validation loss: 0.693876336874633, Accuracy: 0.7132034632034632
Validation Report:
              precision    recall  f1-score   support

    Negative       0.75      0.38      0.51       149
     Neutral       0.71      0.62      0.66       397
    Positive       0.71      0.94      0.81       378

    accuracy                           0.71       924
   macro avg       0.72      0.65      0.66       924
weighted avg       0.72      0.71      0.70       924



Epoch 2/9: 100%|██████████| 520/520 [03:23<00:00,  2.56it/s]


Average training loss for epoch 2: 0.5039584225473496
Training Accuracy for epoch 2: 0.8046433297245279
Validation loss: 0.5500391087141531, Accuracy: 0.762987012987013
Validation Report:
              precision    recall  f1-score   support

    Negative       0.65      0.68      0.67       149
     Neutral       0.74      0.75      0.74       397
    Positive       0.84      0.81      0.82       378

    accuracy                           0.76       924
   macro avg       0.74      0.75      0.74       924
weighted avg       0.77      0.76      0.76       924



Epoch 3/9: 100%|██████████| 520/520 [03:23<00:00,  2.56it/s]


Average training loss for epoch 3: 0.292569544682136
Training Accuracy for epoch 3: 0.9042463611211355
Validation loss: 0.597180958066521, Accuracy: 0.7727272727272727
Validation Report:
              precision    recall  f1-score   support

    Negative       0.76      0.66      0.71       149
     Neutral       0.74      0.75      0.74       397
    Positive       0.81      0.84      0.83       378

    accuracy                           0.77       924
   macro avg       0.77      0.75      0.76       924
weighted avg       0.77      0.77      0.77       924



Epoch 4/9: 100%|██████████| 520/520 [03:23<00:00,  2.56it/s]


Average training loss for epoch 4: 0.1573878629777867
Training Accuracy for epoch 4: 0.958739323950439
Validation loss: 0.6981892177257044, Accuracy: 0.7683982683982684
Validation Report:
              precision    recall  f1-score   support

    Negative       0.71      0.68      0.70       149
     Neutral       0.72      0.78      0.75       397
    Positive       0.85      0.79      0.82       378

    accuracy                           0.77       924
   macro avg       0.76      0.75      0.76       924
weighted avg       0.77      0.77      0.77       924



Epoch 5/9: 100%|██████████| 520/520 [03:23<00:00,  2.56it/s]


Average training loss for epoch 5: 0.09801629768469586
Training Accuracy for epoch 5: 0.9770239384097197
Validation loss: 0.7760323874074323, Accuracy: 0.7857142857142857
Validation Report:
              precision    recall  f1-score   support

    Negative       0.74      0.68      0.71       149
     Neutral       0.76      0.76      0.76       397
    Positive       0.83      0.85      0.84       378

    accuracy                           0.79       924
   macro avg       0.78      0.77      0.77       924
weighted avg       0.78      0.79      0.79       924



Epoch 6/9: 100%|██████████| 520/520 [03:23<00:00,  2.56it/s]


Average training loss for epoch 6: 0.07370285351509945
Training Accuracy for epoch 6: 0.9829183207025142
Validation loss: 0.8403855656948069, Accuracy: 0.7640692640692641
Validation Report:
              precision    recall  f1-score   support

    Negative       0.64      0.80      0.71       149
     Neutral       0.74      0.72      0.73       397
    Positive       0.86      0.80      0.83       378

    accuracy                           0.76       924
   macro avg       0.75      0.77      0.76       924
weighted avg       0.77      0.76      0.77       924



Epoch 7/9: 100%|██████████| 520/520 [03:23<00:00,  2.56it/s]


Average training loss for epoch 7: 0.0568852247374777
Training Accuracy for epoch 7: 0.9878503548658727
Validation loss: 0.8888104273455923, Accuracy: 0.7619047619047619
Validation Report:
              precision    recall  f1-score   support

    Negative       0.72      0.64      0.68       149
     Neutral       0.71      0.77      0.74       397
    Positive       0.84      0.80      0.82       378

    accuracy                           0.76       924
   macro avg       0.76      0.74      0.75       924
weighted avg       0.76      0.76      0.76       924



Epoch 8/9: 100%|██████████| 520/520 [03:23<00:00,  2.56it/s]


Average training loss for epoch 8: 0.04368877311040146
Training Accuracy for epoch 8: 0.9909779862865391
Validation loss: 0.930901533185408, Accuracy: 0.7738095238095238
Validation Report:
              precision    recall  f1-score   support

    Negative       0.66      0.81      0.72       149
     Neutral       0.77      0.70      0.73       397
    Positive       0.84      0.84      0.84       378

    accuracy                           0.77       924
   macro avg       0.75      0.78      0.76       924
weighted avg       0.78      0.77      0.77       924



Epoch 9/9: 100%|██████████| 520/520 [03:23<00:00,  2.56it/s]


Average training loss for epoch 9: 0.02881202327553183
Training Accuracy for epoch 9: 0.9941056177072056
Validation loss: 0.9129899114627262, Accuracy: 0.7835497835497836
Validation Report:
              precision    recall  f1-score   support

    Negative       0.72      0.70      0.71       149
     Neutral       0.75      0.76      0.76       397
    Positive       0.84      0.84      0.84       378

    accuracy                           0.78       924
   macro avg       0.77      0.77      0.77       924
weighted avg       0.78      0.78      0.78       924



In [8]:
from pyspark.sql.functions import udf

def create_malayalam_dataset(content_col, sentiment_col, max_len=128):
    tokenizer = BertTokenizer.from_pretrained('l3cube-pune/malayalam-bert')

    def tokenize_text(content, sentiment):
        inputs = tokenizer.encode_plus(
            content,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': content,
            'input_ids': inputs['input_ids'].flatten().tolist(),
            'attention_mask': inputs['attention_mask'].flatten().tolist(),
            'target': int(sentiment)
        }

    return udf(tokenize_text, StructType([
        StructField('text', StringType(), True),
        StructField('input_ids', ArrayType(IntegerType()), True),
        StructField('attention_mask', ArrayType(IntegerType()), True),
        StructField('target', IntegerType(), True)
    ]))

def main():
    # Create a SparkSession
    spark = SparkSession.builder \
        .appName("Malayalam Tweets Analysis") \
        .getOrCreate()

    # Read the CSV files into a Spark DataFrame
    dfs_spark = []
    directory = '/kaggle/input/malayalam-tweets/'
    for filename in os.listdir(directory):
        if filename.endswith(".csv") and "total" not in filename:
            file_path = os.path.join(directory, filename)
            df_spark = spark.read.csv(file_path, header=True, inferSchema=True)
            dfs_spark.append(df_spark)

    # Combine all Spark DataFrames into a single Spark DataFrame
    df_spark = reduce(lambda df1, df2: df1.union(df2), dfs_spark)
    
    df_spark = spark.createDataFrame(df)

    # Define UDF to create Malayalam Dataset
    create_dataset_udf = create_malayalam_dataset("clean_content", "sentiment")

    # Apply UDF to create train_dataset and val_dataset
    df_spark = df_spark.withColumn("malayalam_dataset", create_dataset_udf(df_spark["clean_content"], df_spark["sentiment"]))

    # Split dataset into train and validation
    train_df, val_df = df_spark.randomSplit([0.9, 0.1], seed=42)

    # Convert Spark DataFrame to pandas DataFrame (optional)
    train_df_pandas = train_df.select("malayalam_dataset").toPandas()
    val_df_pandas = val_df.select("malayalam_dataset").toPandas()

    # Convert pandas DataFrames to datasets
    train_dataset = [row.malayalam_dataset for _, row in train_df_pandas.iterrows()]
    val_dataset = [row.malayalam_dataset for _, row in val_df_pandas.iterrows()]

    # Other steps (not included in this snippet)
    # Initialize model, optimizer, scheduler
    # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # val_loader = DataLoader(val_dataset, batch_size=batch_size)
    # train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=10)
