<a href="https://www.kaggle.com/code/mohamedarish/with-dataset-as-input?scriptVersionId=175022245" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install pyspark transformers torch tqdm scikit-learn sparknlp huggingface_hub fasttext

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting sparknlp
  Downloading sparknlp-1.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting spark-nlp (from sparknlp)
  Downloading spark_nlp-5.3.3-py2.py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Downloading sparknlp-1.0.0-py3-none-any.whl (1.4 kB)
Downloading spark_nlp-5.3.3-py2.py3-none-any.whl (568 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.4/568.4 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493

# LOGISTIC REGRESSION

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from huggingface_hub import hf_hub_download
import fasttext
import fasttext.util

model_path = hf_hub_download(repo_id="facebook/fasttext-ml-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

# # Directory containing your CSV files
directory = '/kaggle/input/malayalam-tweets/'

# # List to store DataFrames from each CSV file
dfs = []

# # Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        a_df = pd.read_csv(file_path)

        if "datetimee" in a_df.columns:
            # print("has datetimeee")
            a_df = a_df.rename(columns={"datetimee": "datetime"})
            # Append the DataFrame to the list
            dfs.append(a_df)

# # Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

df = df.dropna(subset=['clean_content'])

df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Step 3: Tokenization and Feature Extraction
def extract_features(text):
    tokens = text.split()  # Tokenize text into words
    vector_sum = sum(model.get_sentence_vector(token) for token in tokens)  # Get FastText vector for each word and sum them
    return vector_sum / len(tokens)  # Average the word vectors to get text vector

# Apply feature extraction to each text in the dataset
df['text_vector'] = df['clean_content'].apply(extract_features)

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(df['text_vector'], df['sentiment'], test_size=0.2, random_state=42)

# Step 5: Initialize Logistic Regression Model
logistic_regression_model = LogisticRegression(max_iter=1000)

# Step 6: Train Model in Epochs
epochs = 10
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    tqdm_bar = tqdm(total=len(X_train), position=0, leave=True)  # Create tqdm progress bar
    
    # Train the model
    logistic_regression_model.fit(X_train.tolist(), y_train)
    
    # Calculate training metrics
    train_loss = None  # Logistic regression does not have a loss attribute
    train_accuracy = accuracy_score(y_train, logistic_regression_model.predict(X_train.tolist()))
    train_precision = precision_score(y_train, logistic_regression_model.predict(X_train.tolist()), average='weighted')
    train_recall = recall_score(y_train, logistic_regression_model.predict(X_train.tolist()), average='weighted')
    train_f1_score = f1_score(y_train, logistic_regression_model.predict(X_train.tolist()), average='weighted')
    
    # Calculate validation metrics
    val_accuracy = accuracy_score(y_test, logistic_regression_model.predict(X_test.tolist()))
    val_precision = precision_score(y_test, logistic_regression_model.predict(X_test.tolist()), average='weighted')
    val_recall = recall_score(y_test, logistic_regression_model.predict(X_test.tolist()), average='weighted')
    val_f1_score = f1_score(y_test, logistic_regression_model.predict(X_test.tolist()), average='weighted')
    
    tqdm_bar.close()  # Close tqdm progress bar
    
    # Print metrics
    print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1-score: {train_f1_score}")
    print(f"Validation Accuracy: {val_accuracy}, Validation Precision: {val_precision}, Validation Recall: {val_recall}, Validation F1-score: {val_f1_score}")




Epoch 1/10


  0%|          | 0/7171 [00:35<?, ?it/s]
  0%|          | 0/7171 [00:00<?, ?it/s]


Train Loss: None, Train Accuracy: 0.6739645795565472, Train Precision: 0.6817086427052726, Train Recall: 0.6739645795565472, Train F1-score: 0.6576976785981656
Validation Accuracy: 0.637479085331846, Validation Precision: 0.6396109859134467, Validation Recall: 0.637479085331846, Validation F1-score: 0.617633474977311
Epoch 2/10


  0%|          | 0/7171 [00:00<?, ?it/s]


Train Loss: None, Train Accuracy: 0.6739645795565472, Train Precision: 0.6817086427052726, Train Recall: 0.6739645795565472, Train F1-score: 0.6576976785981656
Validation Accuracy: 0.637479085331846, Validation Precision: 0.6396109859134467, Validation Recall: 0.637479085331846, Validation F1-score: 0.617633474977311
Epoch 3/10


  0%|          | 0/7171 [00:00<?, ?it/s]


Train Loss: None, Train Accuracy: 0.6739645795565472, Train Precision: 0.6817086427052726, Train Recall: 0.6739645795565472, Train F1-score: 0.6576976785981656
Validation Accuracy: 0.637479085331846, Validation Precision: 0.6396109859134467, Validation Recall: 0.637479085331846, Validation F1-score: 0.617633474977311
Epoch 4/10


  0%|          | 0/7171 [00:00<?, ?it/s]


Train Loss: None, Train Accuracy: 0.6739645795565472, Train Precision: 0.6817086427052726, Train Recall: 0.6739645795565472, Train F1-score: 0.6576976785981656
Validation Accuracy: 0.637479085331846, Validation Precision: 0.6396109859134467, Validation Recall: 0.637479085331846, Validation F1-score: 0.617633474977311
Epoch 5/10


  0%|          | 0/7171 [00:00<?, ?it/s]


Train Loss: None, Train Accuracy: 0.6739645795565472, Train Precision: 0.6817086427052726, Train Recall: 0.6739645795565472, Train F1-score: 0.6576976785981656
Validation Accuracy: 0.637479085331846, Validation Precision: 0.6396109859134467, Validation Recall: 0.637479085331846, Validation F1-score: 0.617633474977311
Epoch 6/10


  0%|          | 0/7171 [00:00<?, ?it/s]


Train Loss: None, Train Accuracy: 0.6739645795565472, Train Precision: 0.6817086427052726, Train Recall: 0.6739645795565472, Train F1-score: 0.6576976785981656
Validation Accuracy: 0.637479085331846, Validation Precision: 0.6396109859134467, Validation Recall: 0.637479085331846, Validation F1-score: 0.617633474977311
Epoch 7/10


  0%|          | 0/7171 [00:00<?, ?it/s]


Train Loss: None, Train Accuracy: 0.6739645795565472, Train Precision: 0.6817086427052726, Train Recall: 0.6739645795565472, Train F1-score: 0.6576976785981656
Validation Accuracy: 0.637479085331846, Validation Precision: 0.6396109859134467, Validation Recall: 0.637479085331846, Validation F1-score: 0.617633474977311
Epoch 8/10


  0%|          | 0/7171 [00:00<?, ?it/s]

KeyboardInterrupt: 

# RANDOM FOREST

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import fasttext.util

model_path = hf_hub_download(repo_id="facebook/fasttext-ml-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

# # Directory containing your CSV files
directory = '/kaggle/input/malayalam-tweets/'

# # List to store DataFrames from each CSV file
dfs = []

# # Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        a_df = pd.read_csv(file_path)

        if "datetimee" in a_df.columns:
            # print("has datetimeee")
            a_df = a_df.rename(columns={"datetimee": "datetime"})
            # Append the DataFrame to the list
            dfs.append(a_df)

# # Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

df = df.dropna(subset=['clean_content'])

df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Step 3: Tokenization and Feature Extraction
def extract_features(text):
    tokens = text.split()  # Tokenize text into words
    vector_sum = sum(model.get_sentence_vector(token) for token in tokens)  # Get FastText vector for each word and sum them
    return vector_sum / len(tokens)  # Average the word vectors to get text vector

# Apply feature extraction to each text in the dataset
df['text_vector'] = df['clean_content'].apply(extract_features)

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(df['text_vector'], df['sentiment'], test_size=0.2, random_state=42)

# Step 5: Initialize Logistic Regression Model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 6: Train Model in Epochs
epochs = 10
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    tqdm_bar = tqdm(total=len(X_train), position=0, leave=True)  # Create tqdm progress bar
    
    # Train the model
    random_forest_model.fit(X_train.tolist(), y_train)
    
    # Calculate training metrics
    train_loss = None  # Logistic regression does not have a loss attribute
    train_accuracy = accuracy_score(y_train, random_forest_model.predict(X_train.tolist()))
    train_precision = precision_score(y_train, random_forest_model.predict(X_train.tolist()), average='weighted')
    train_recall = recall_score(y_train, random_forest_model.predict(X_train.tolist()), average='weighted')
    train_f1_score = f1_score(y_train, random_forest_model.predict(X_train.tolist()), average='weighted')
    
    # Calculate validation metrics
    val_accuracy = accuracy_score(y_test, random_forest_model.predict(X_test.tolist()))
    val_precision = precision_score(y_test, random_forest_model.predict(X_test.tolist()), average='weighted')
    val_recall = recall_score(y_test, random_forest_model.predict(X_test.tolist()), average='weighted')
    val_f1_score = f1_score(y_test, random_forest_model.predict(X_test.tolist()), average='weighted')
    
    tqdm_bar.close()  # Close tqdm progress bar
    
    # Print metrics
    print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1-score: {train_f1_score}")
    print(f"Validation Accuracy: {val_accuracy}, Validation Precision: {val_precision}, Validation Recall: {val_recall}, Validation F1-score: {val_f1_score}")


# SVM

In [None]:
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import fasttext

model_path = hf_hub_download(repo_id="facebook/fasttext-ml-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

# # Directory containing your CSV files
directory = '/kaggle/input/malayalam-tweets/'

# # List to store DataFrames from each CSV file
dfs = []

# # Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        a_df = pd.read_csv(file_path)

        if "datetimee" in a_df.columns:
            # print("has datetimeee")
            a_df = a_df.rename(columns={"datetimee": "datetime"})
            # Append the DataFrame to the list
            dfs.append(a_df)

# # Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

df = df.dropna(subset=['clean_content'])

df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Step 3: Tokenization and Feature Extraction
def extract_features(text):
    tokens = text.split()  # Tokenize text into words
    vector_sum = sum(model.get_sentence_vector(token) for token in tokens)  # Get FastText vector for each word and sum them
    return vector_sum / len(tokens)  # Average the word vectors to get text vector

# Apply feature extraction to each text in the dataset
df['text_vector'] = df['clean_content'].apply(extract_features)

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(df['text_vector'], df['sentiment'], test_size=0.2, random_state=42)

# Step 5: Initialize Logistic Regression Model
svm_model = SVC(kernel='linear', random_state=42)

# Step 6: Train Model in Epochs
epochs = 10
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    tqdm_bar = tqdm(total=len(X_train), position=0, leave=True)  # Create tqdm progress bar
    
    # Train the model
    svm_model.fit(X_train.tolist(), y_train)
    
    # Calculate training metrics
    train_loss = None  # Logistic regression does not have a loss attribute
    train_accuracy = accuracy_score(y_train, svm_model.predict(X_train.tolist()))
    train_precision = precision_score(y_train, svm_model.predict(X_train.tolist()), average='weighted')
    train_recall = recall_score(y_train, svm_model.predict(X_train.tolist()), average='weighted')
    train_f1_score = f1_score(y_train, svm_model.predict(X_train.tolist()), average='weighted')
    
    # Calculate validation metrics
    val_accuracy = accuracy_score(y_test, svm_model.predict(X_test.tolist()))
    val_precision = precision_score(y_test, svm_model.predict(X_test.tolist()), average='weighted')
    val_recall = recall_score(y_test, svm_model.predict(X_test.tolist()), average='weighted')
    val_f1_score = f1_score(y_test, svm_model.predict(X_test.tolist()), average='weighted')
    
    tqdm_bar.close()  # Close tqdm progress bar
    
    # Print metrics
    print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1-score: {train_f1_score}")
    print(f"Validation Accuracy: {val_accuracy}, Validation Precision: {val_precision}, Validation Recall: {val_recall}, Validation F1-score: {val_f1_score}")


# BIDIRECTIONAL LSTM WITH ATTENTION LAYER

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Activation, Lambda, RepeatVector, Permute, Flatten
import tensorflow.keras.backend as K
import os
import torch

def clear_gpu_memory():
  """Frees memory allocated on the GPU."""
  if torch.cuda.is_available():
    torch.cuda.empty_cache()

# # Directory containing your CSV files
directory = '/kaggle/input/malayalam-tweets/'

# # List to store DataFrames from each CSV file
dfs = []

# # Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        a_df = pd.read_csv(file_path)

        if "datetimee" in a_df.columns:
            # print("has datetimeee")
            a_df = a_df.rename(columns={"datetimee": "datetime"})
            # Append the DataFrame to the list
        dfs.append(a_df)

# # Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

print(df.size)

df = df.dropna(subset=['clean_content'])

print(df.size)
df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Preprocessing
maxlen = 10000  # Maximum sequence length
max_words = 200000  # Maximum number of words in vocabulary

# Tokenize the text
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['content'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df['content'])

# Pad sequences to maxlen
X = pad_sequences(sequences, maxlen=maxlen)

# Label encoding for sentiments (assuming you have 'sentiment' column)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['sentiment'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Bidirectional LSTM with Attention Model
input_seq = Input(shape=(maxlen,))
embedding = Embedding(max_words, 128, input_length=maxlen)(input_seq)
lstm = Bidirectional(LSTM(64, return_sequences=True))(embedding)

# Attention Mechanism
attention = Dense(1, activation='tanh')(lstm)
attention = Flatten()(attention)
attention = Activation('softmax', name='attention_weights')(attention)
attention = RepeatVector(128 * 2)(attention)
attention = Permute([2, 1])(attention)

sent_representation = Concatenate(axis=-1)([lstm, attention])
sent_representation = Lambda(lambda x: K.sum(x, axis=1))(sent_representation)

output = Dense(1, activation='sigmoid')(sent_representation)

model = Model(inputs=input_seq, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
history = model.fit(X_train, y_train, epochs=40, batch_size=32, validation_split=0.1)

clear_gpu_memory()

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# Print training and validation metrics
print("Training Loss:", history.history['loss'])
print("Training Accuracy:", history.history['accuracy'])
print("Validation Loss:", history.history['val_loss'])
print("Validation Accuracy:", history.history['val_accuracy'])

# BERT

In [7]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.optim import AdamW
from torch.optim.lr_scheduler import ExponentialLR
import os
import pandas as pd
import torch

def clear_gpu_memory():
  """Frees memory allocated on the GPU."""
  if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load Malayalam BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Define dataset class
class MalayalamDataset(Dataset):
    def __init__(self, dataframe, max_len=128):
        self.data = dataframe
        self.max_len = max_len
        self.texts = self.data.content.tolist()
        self.targets = self.data.sentiment.tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        target = self.targets[index]

        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'target': torch.tensor(target, dtype=torch.long)
        }

# Load and preprocess dataset
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    return df

# Fine-tune BERT for sentiment analysis
def train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total_targets = 0
        for batch in tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            correct += (preds == targets.cpu().numpy()).sum()
            total_targets += len(targets)

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = correct / total_targets
        print(f"Average training loss for epoch {epoch+1}: {avg_train_loss}")
        print(f"Training Accuracy for epoch {epoch+1}: {train_accuracy}")

        val_loss, val_acc, val_report = evaluate_model(model, val_loader, device)
        print(f"Validation loss: {val_loss}, Accuracy: {val_acc}")
        print("Validation Report:")
        print(val_report)

        scheduler.step()

# Evaluate the model
def evaluate_model(model, val_loader, device):
    model.eval()
    val_targets = []
    val_outputs = []

    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()

            val_targets.extend(targets.cpu().numpy())
            val_outputs.extend(preds)

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_targets, val_outputs)
    val_report = classification_report(val_targets, val_outputs, target_names=["Negative", "Neutral", "Positive"])
    return avg_val_loss, val_accuracy, val_report
# Main function to train and evaluate
def main():
  tokenizer.save_pretrained("bert-tokenizer")

#   return

  # # Directory containing your CSV files
  directory = '/kaggle/input/malayalam-tweets/'

  # # List to store DataFrames from each CSV file
  dfs = []

  # # Loop through each file in the directory
  for filename in os.listdir(directory):
    if filename.endswith(".csv") and "total" not in filename:
      file_path = os.path.join(directory, filename)
      # Read the CSV file into a DataFrame
      a_df = pd.read_csv(file_path)

      if "datetimee" in a_df.columns:
        # print("has datetimeee")
        a_df = a_df.rename(columns={"datetimee": "datetime"})
      # Append the DataFrame to the list
      dfs.append(a_df)

  # # Combine all DataFrames into a single DataFrame
  df = pd.concat(dfs, ignore_index=True)

  df = df.dropna(subset=['clean_content'])

  df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

  # Split dataset into train and validation
  train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

  # Create datasets and dataloaders
  train_dataset = MalayalamDataset(train_df)
  val_dataset = MalayalamDataset(val_df)

  batch_size = 16
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  # Load pre-trained BERT model for sequence classification
  model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

  # Send model to GPU, if available
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  # Create optimizer
  optimizer = AdamW(model.parameters(), lr=2e-5)

  # Create scheduler
  scheduler = ExponentialLR(optimizer, gamma=0.9)

  # Train the model
  train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=10)

  clear_gpu_memory()

  # Save the trained model
  model.save_pretrained("malayalam_sentiment_model")

if __name__ == "__main__":
  main()

In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.optim import AdamW
from torch.optim.lr_scheduler import ExponentialLR
import os
import pandas as pd
import torch

def clear_gpu_memory():
  """Frees memory allocated on the GPU."""
  if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load Malayalam BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('l3cube-pune/malayalam-bert')

# Define dataset class
class MalayalamDataset(Dataset):
    def __init__(self, dataframe, max_len=128):
        self.data = dataframe
        self.max_len = max_len
        self.texts = self.data.content.tolist()
        self.targets = self.data.sentiment.tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        target = self.targets[index]

        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'target': torch.tensor(target, dtype=torch.long)
        }

# Load and preprocess dataset
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    return df

# Fine-tune BERT for sentiment analysis
def train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total_targets = 0
        for batch in tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            correct += (preds == targets.cpu().numpy()).sum()
            total_targets += len(targets)

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = correct / total_targets
        print(f"Average training loss for epoch {epoch+1}: {avg_train_loss}")
        print(f"Training Accuracy for epoch {epoch+1}: {train_accuracy}")

        val_loss, val_acc, val_report = evaluate_model(model, val_loader, device)
        print(f"Validation loss: {val_loss}, Accuracy: {val_acc}")
        print("Validation Report:")
        print(val_report)

        scheduler.step()

# Evaluate the model
def evaluate_model(model, val_loader, device):
    model.eval()
    val_targets = []
    val_outputs = []

    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()

            val_targets.extend(targets.cpu().numpy())
            val_outputs.extend(preds)

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_targets, val_outputs)
    val_report = classification_report(val_targets, val_outputs, target_names=["Negative", "Neutral", "Positive"])
    return avg_val_loss, val_accuracy, val_report
# Main function to train and evaluate
def main():
  tokenizer.save_pretrained("bert-tokenizer")

#   return
  # # Directory containing your CSV files
  directory = '/kaggle/input/malayalam-tweets/'

  # # List to store DataFrames from each CSV file
  dfs = []

  # # Loop through each file in the directory
  for filename in os.listdir(directory):
    if filename.endswith(".csv") and "total" not in filename:
      file_path = os.path.join(directory, filename)
      # Read the CSV file into a DataFrame
      a_df = pd.read_csv(file_path)

      if "datetimee" in a_df.columns:
        # print("has datetimeee")
        a_df = a_df.rename(columns={"datetimee": "datetime"})
      # Append the DataFrame to the list
      dfs.append(a_df)

  # # Combine all DataFrames into a single DataFrame
  df = pd.concat(dfs, ignore_index=True)

  df = df.dropna(subset=['clean_content'])

  df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

  # Split dataset into train and validation
  train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

  # Create datasets and dataloaders
  train_dataset = MalayalamDataset(train_df)
  val_dataset = MalayalamDataset(val_df)

  batch_size = 16
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  # Load pre-trained BERT model for sequence classification
  model = BertForSequenceClassification.from_pretrained('l3cube-pune/malayalam-bert', num_labels=3)

  # Send model to GPU, if available
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  # Create optimizer
  optimizer = AdamW(model.parameters(), lr=2e-5)

  # Create scheduler
  scheduler = ExponentialLR(optimizer, gamma=0.9)

  # Train the model
  train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=9)

  clear_gpu_memory()

  # Save the trained model
  model.save_pretrained("malayalam_sentiment_model")

if __name__ == "__main__":
  main()

tokenizer_config.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/951M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/malayalam-bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/9: 100%|██████████| 520/520 [03:20<00:00,  2.59it/s]


Average training loss for epoch 1: 0.8334679492964194
Training Accuracy for epoch 1: 0.6507879225309756
Validation loss: 0.6741909456664118, Accuracy: 0.7067099567099567
Validation Report:
              precision    recall  f1-score   support

    Negative       0.54      0.79      0.64       149
     Neutral       0.67      0.70      0.69       397
    Positive       0.88      0.68      0.77       378

    accuracy                           0.71       924
   macro avg       0.70      0.72      0.70       924
weighted avg       0.74      0.71      0.71       924



Epoch 2/9: 100%|██████████| 520/520 [03:18<00:00,  2.62it/s]


Average training loss for epoch 2: 0.5040055001584384
Training Accuracy for epoch 2: 0.8011548177553229
Validation loss: 0.5185192160565277, Accuracy: 0.7716450216450217
Validation Report:
              precision    recall  f1-score   support

    Negative       0.68      0.72      0.70       149
     Neutral       0.77      0.70      0.73       397
    Positive       0.81      0.87      0.84       378

    accuracy                           0.77       924
   macro avg       0.75      0.76      0.76       924
weighted avg       0.77      0.77      0.77       924



Epoch 3/9: 100%|██████████| 520/520 [03:18<00:00,  2.62it/s]


Average training loss for epoch 3: 0.30103175736104065
Training Accuracy for epoch 3: 0.9007578491519307
Validation loss: 0.569028909478722, Accuracy: 0.762987012987013
Validation Report:
              precision    recall  f1-score   support

    Negative       0.68      0.68      0.68       149
     Neutral       0.75      0.71      0.73       397
    Positive       0.81      0.85      0.83       378

    accuracy                           0.76       924
   macro avg       0.74      0.75      0.75       924
weighted avg       0.76      0.76      0.76       924



Epoch 4/9: 100%|██████████| 520/520 [03:18<00:00,  2.62it/s]


Average training loss for epoch 4: 0.17391317379302704
Training Accuracy for epoch 4: 0.9506796583664141
Validation loss: 0.7287946420496908, Accuracy: 0.7727272727272727
Validation Report:
              precision    recall  f1-score   support

    Negative       0.67      0.68      0.68       149
     Neutral       0.74      0.76      0.75       397
    Positive       0.85      0.82      0.83       378

    accuracy                           0.77       924
   macro avg       0.75      0.76      0.75       924
weighted avg       0.77      0.77      0.77       924



Epoch 5/9: 100%|██████████| 520/520 [03:18<00:00,  2.63it/s]


Average training loss for epoch 5: 0.11596146999106098
Training Accuracy for epoch 5: 0.9704077950198484
Validation loss: 0.7401616664539123, Accuracy: 0.7640692640692641
Validation Report:
              precision    recall  f1-score   support

    Negative       0.71      0.66      0.69       149
     Neutral       0.74      0.72      0.73       397
    Positive       0.80      0.85      0.83       378

    accuracy                           0.76       924
   macro avg       0.75      0.74      0.75       924
weighted avg       0.76      0.76      0.76       924



Epoch 6/9: 100%|██████████| 520/520 [03:18<00:00,  2.63it/s]


Average training loss for epoch 6: 0.07799256583675743
Training Accuracy for epoch 6: 0.9823168531216168
Validation loss: 0.8601820849781406, Accuracy: 0.7683982683982684
Validation Report:
              precision    recall  f1-score   support

    Negative       0.73      0.59      0.65       149
     Neutral       0.72      0.78      0.75       397
    Positive       0.84      0.83      0.83       378

    accuracy                           0.77       924
   macro avg       0.76      0.73      0.74       924
weighted avg       0.77      0.77      0.77       924



Epoch 7/9: 100%|██████████| 520/520 [03:17<00:00,  2.63it/s]


Average training loss for epoch 7: 0.05464559578014394
Training Accuracy for epoch 7: 0.9885721159629496
Validation loss: 0.9335898223879009, Accuracy: 0.7597402597402597
Validation Report:
              precision    recall  f1-score   support

    Negative       0.71      0.66      0.69       149
     Neutral       0.69      0.82      0.75       397
    Positive       0.88      0.73      0.80       378

    accuracy                           0.76       924
   macro avg       0.76      0.74      0.75       924
weighted avg       0.77      0.76      0.76       924



Epoch 8/9: 100%|██████████| 520/520 [03:18<00:00,  2.63it/s]


Average training loss for epoch 8: 0.046070847681795174
Training Accuracy for epoch 8: 0.9906171057380008
Validation loss: 0.9735342007258843, Accuracy: 0.762987012987013
Validation Report:
              precision    recall  f1-score   support

    Negative       0.74      0.58      0.65       149
     Neutral       0.69      0.84      0.76       397
    Positive       0.88      0.76      0.81       378

    accuracy                           0.76       924
   macro avg       0.77      0.72      0.74       924
weighted avg       0.77      0.76      0.76       924



Epoch 9/9: 100%|██████████| 520/520 [03:18<00:00,  2.63it/s]


Average training loss for epoch 9: 0.035382372439982226
Training Accuracy for epoch 9: 0.9927823890292313
Validation loss: 1.0158813428776017, Accuracy: 0.7575757575757576
Validation Report:
              precision    recall  f1-score   support

    Negative       0.70      0.60      0.64       149
     Neutral       0.70      0.80      0.74       397
    Positive       0.86      0.78      0.82       378

    accuracy                           0.76       924
   macro avg       0.75      0.72      0.74       924
weighted avg       0.76      0.76      0.76       924



In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.optim import AdamW
from torch.optim.lr_scheduler import ExponentialLR
import os
import pandas as pd
import torch

def clear_gpu_memory():
  """Frees memory allocated on the GPU."""
  if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load Malayalam BERT tokenizer
tokenizer = RobertaTokenizer.from_pretrained('RahulRaman/Malayalam-LM-RoBERTa')

# Define dataset class
class MalayalamDataset(Dataset):
    def __init__(self, dataframe, max_len=128):
        self.data = dataframe
        self.max_len = max_len
        self.texts = self.data.content.tolist()
        self.targets = self.data.sentiment.tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        target = self.targets[index]

        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'target': torch.tensor(target, dtype=torch.long)
        }

# Load and preprocess dataset
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    return df

# Fine-tune BERT for sentiment analysis
def train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total_targets = 0
        for batch in tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
            correct += (preds == targets.cpu().numpy()).sum()
            total_targets += len(targets)

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = correct / total_targets
        print(f"Average training loss for epoch {epoch+1}: {avg_train_loss}")
        print(f"Training Accuracy for epoch {epoch+1}: {train_accuracy}")

        val_loss, val_acc, val_report = evaluate_model(model, val_loader, device)
        print(f"Validation loss: {val_loss}, Accuracy: {val_acc}")
        print("Validation Report:")
        print(val_report)

        scheduler.step()

# Evaluate the model
def evaluate_model(model, val_loader, device):
    model.eval()
    val_targets = []
    val_outputs = []

    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()

            val_targets.extend(targets.cpu().numpy())
            val_outputs.extend(preds)

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_targets, val_outputs)
    val_report = classification_report(val_targets, val_outputs, target_names=["Negative", "Neutral", "Positive"])
    return avg_val_loss, val_accuracy, val_report
# Main function to train and evaluate
def main():
  tokenizer.save_pretrained("bert-tokenizer")

#   return
  # # Directory containing your CSV files
  directory = '/kaggle/input/malayalam-tweets/'

  # # List to store DataFrames from each CSV file
  dfs = []

  # # Loop through each file in the directory
  for filename in os.listdir(directory):
    if filename.endswith(".csv") and "total" not in filename:
      file_path = os.path.join(directory, filename)
      # Read the CSV file into a DataFrame
      a_df = pd.read_csv(file_path)

      if "datetimee" in a_df.columns:
        # print("has datetimeee")
        a_df = a_df.rename(columns={"datetimee": "datetime"})
      # Append the DataFrame to the list
      dfs.append(a_df)

  # # Combine all DataFrames into a single DataFrame
  df = pd.concat(dfs, ignore_index=True)

  df = df.dropna(subset=['clean_content'])

  df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

  # Split dataset into train and validation
  train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

  # Create datasets and dataloaders
  train_dataset = MalayalamDataset(train_df)
  val_dataset = MalayalamDataset(val_df)

  batch_size = 16
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  # Load pre-trained BERT model for sequence classification
  model = RobertaForSequenceClassification.from_pretrained('RahulRaman/Malayalam-LM-RoBERTa', num_labels=7)

  # Send model to GPU, if available
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  # Create optimizer
  optimizer = AdamW(model.parameters(), lr=2e-5)

  # Create scheduler
  scheduler = ExponentialLR(optimizer, gamma=0.9)

  # Train the model
  train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=6)

  clear_gpu_memory()

  # Save the trained model
  model.save_pretrained("malayalam_sentiment_model")

if __name__ == "__main__":
  main()

vocab.json:   0%|          | 0.00/728k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/513k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/708 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/273M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at RahulRaman/Malayalam-LM-RoBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 520/520 [01:26<00:00,  6.03it/s]


Average training loss for epoch 1: 0.9113095859495494
Training Accuracy for epoch 1: 0.5694695055936485
Validation loss: 0.8002225474036974, Accuracy: 0.6212121212121212
Validation Report:
              precision    recall  f1-score   support

    Negative       0.41      0.43      0.42       149
     Neutral       0.66      0.51      0.58       397
    Positive       0.67      0.81      0.73       378

    accuracy                           0.62       924
   macro avg       0.58      0.58      0.58       924
weighted avg       0.62      0.62      0.61       924



Epoch 2/10: 100%|██████████| 520/520 [01:28<00:00,  5.85it/s]


Average training loss for epoch 2: 0.702523318792765
Training Accuracy for epoch 2: 0.6861542162877421
Validation loss: 0.7496802503692692, Accuracy: 0.6536796536796536
Validation Report:
              precision    recall  f1-score   support

    Negative       0.47      0.36      0.41       149
     Neutral       0.61      0.73      0.66       397
    Positive       0.79      0.69      0.73       378

    accuracy                           0.65       924
   macro avg       0.62      0.59      0.60       924
weighted avg       0.66      0.65      0.65       924



Epoch 3/10: 100%|██████████| 520/520 [01:28<00:00,  5.86it/s]


Average training loss for epoch 3: 0.435906805169697
Training Accuracy for epoch 3: 0.8289426199927824
Validation loss: 0.859563747870511, Accuracy: 0.6525974025974026
Validation Report:
              precision    recall  f1-score   support

    Negative       0.44      0.51      0.47       149
     Neutral       0.67      0.57      0.61       397
    Positive       0.73      0.80      0.76       378

    accuracy                           0.65       924
   macro avg       0.61      0.63      0.62       924
weighted avg       0.66      0.65      0.65       924



Epoch 4/10: 100%|██████████| 520/520 [01:28<00:00,  5.86it/s]


Average training loss for epoch 4: 0.1363291963773708
Training Accuracy for epoch 4: 0.9578972693371828
Validation loss: 1.2854431194478069, Accuracy: 0.6461038961038961
Validation Report:
              precision    recall  f1-score   support

    Negative       0.46      0.34      0.39       149
     Neutral       0.65      0.62      0.63       397
    Positive       0.69      0.79      0.74       378

    accuracy                           0.65       924
   macro avg       0.60      0.59      0.59       924
weighted avg       0.64      0.65      0.64       924



Epoch 5/10: 100%|██████████| 520/520 [01:28<00:00,  5.87it/s]


Average training loss for epoch 5: 0.03447052451217762
Training Accuracy for epoch 5: 0.9909779862865391
Validation loss: 1.5608019253303265, Accuracy: 0.6352813852813853
Validation Report:
              precision    recall  f1-score   support

    Negative       0.41      0.54      0.47       149
     Neutral       0.69      0.52      0.59       397
    Positive       0.70      0.80      0.75       378

    accuracy                           0.64       924
   macro avg       0.60      0.62      0.60       924
weighted avg       0.65      0.64      0.63       924



Epoch 6/10: 100%|██████████| 520/520 [01:28<00:00,  5.86it/s]


Average training loss for epoch 6: 0.010315783748340506
Training Accuracy for epoch 6: 0.9986767713220257
Validation loss: 1.6579756700787052, Accuracy: 0.6536796536796536
Validation Report:
              precision    recall  f1-score   support

    Negative       0.45      0.47      0.46       149
     Neutral       0.65      0.63      0.64       397
    Positive       0.75      0.75      0.75       378

    accuracy                           0.65       924
   macro avg       0.61      0.62      0.61       924
weighted avg       0.66      0.65      0.65       924



Epoch 7/10: 100%|██████████| 520/520 [01:28<00:00,  5.86it/s]


Average training loss for epoch 7: 0.004184326890390366
Training Accuracy for epoch 7: 0.9992782389029231
Validation loss: 1.7631581240686878, Accuracy: 0.6439393939393939
Validation Report:
              precision    recall  f1-score   support

    Negative       0.41      0.53      0.46       149
     Neutral       0.66      0.58      0.62       397
    Positive       0.74      0.75      0.75       378

    accuracy                           0.64       924
   macro avg       0.61      0.62      0.61       924
weighted avg       0.65      0.64      0.65       924



Epoch 8/10:  73%|███████▎  | 382/520 [01:05<00:23,  5.85it/s]


KeyboardInterrupt: 

This would've been used for getting files from gDrive but now it's a kaggle dataset
!gdown --folder https://drive.google.com/drive/folders/1_-3L_E3MxkijO8rLd-fsPCYZv5bZFh2m?usp=sharing

In [None]:
from huggingface_hub import hf_hub_download
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import os
import pandas as pd
from pyspark.ml.linalg import Vectors, VectorUDT
import numpy as np
import fasttext

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Sentiment Analysis with FastText") \
    .config("spark.executor.memory", "6g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.memoryOverhead", "1g") \
    .config("spark.shuffle.memoryFraction", "0.4") \
    .config("spark.broadcast.blockSize", "512m") \
    .getOrCreate()

# # Directory containing your CSV files
directory = '/kaggle/input/malayalam-tweets/'

# # List to store DataFrames from each CSV file
dfs = []

# # Loop through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        a_df = pd.read_csv(file_path)

        if "datetimee" in a_df.columns:
            # print("has datetimeee")
            a_df = a_df.rename(columns={"datetimee": "datetime"})
            # Append the DataFrame to the list
            dfs.append(a_df)

# # Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

df = df.dropna(subset=['clean_content'])

df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

model_path = hf_hub_download(repo_id="facebook/fasttext-ml-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

# Precompute document vectors
document_vectors = []
for text in df["clean_content"]:
    vector = model.get_sentence_vector(text)
    print(text)
    vector_list = vector.tolist()
    document_vectors.append(vector_list)

# Add document vectors to DataFrame
df["document_vector"] = document_vectors

print(f'model cleared {df.shape}')

embedding_dim = len(model.words)  # Dimension of the fastText word vectors

vocab_size = len(model.words)

model = None

# Load data
data = spark.createDataFrame(df)

print("loaded dataset")

batch_size = 32

# Download and load FastText model

# def generate_doc_vector(text):
#     vector = model.get_sentence_vector(text)
#     print(vector)
#     return Vectors.dense(vector)

# print(model_path)

# broadcast_model = spark.sparkContext.broadcast(model)

# # Function to generate document vectors using FastText model
def generate_doc_vector(text):
#     model = broadcast_model.value
#     vector = model.get_sentence_vector(text)
    return Vectors.dense(text)

# Register UDF
generate_doc_vector_udf = udf(generate_doc_vector, VectorUDT())

# Generate document vectors for Malayalam text data
data = data.withColumn("vector", generate_doc_vector_udf("document_vector"))

print("vectorized")

# Convert sentiment labels to numeric indices
indexer = StringIndexer(inputCol="sentiment", outputCol="label")
data = indexer.fit(data).transform(data)

print("Stirign indexed labels")

# # Assemble features into a vector
# assembler = VectorAssembler(inputCols=["vector"], outputCol="features")
# data = assembler.transform(data)

# Split data into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=123)

print("train text split done")

# # Define logistic regression model
# lr = LogisticRegression(featuresCol="vector", labelCol="label")

# print("spark mem bad")

# # sample_data = train_data.sample(fraction=0.5, seed=123)
# # Specify number of epochs
# num_epochs = 10

# for epoch in range(num_epochs):
#     print(f"Epoch {epoch + 1}/{num_epochs}")

#     # Train the model
#     lr_model = lr.fit(train_data)
    
#     print("WooHoo YAY WOW MEME BIG BOY COWABANGA THIS IS AWESOME")

#     # Make predictions on training data
#     train_predictions = lr_model.transform(train_data)
    
#     print("MODEL TRANSFORMATION DONE YOOHOO WOOHOO FORTNITE MOUNTAIN DEW AND DORITOS MON+M LET ME STAY UP LATE TO PLAY FORTNITE")
    

#     # Evaluate model performance on training data
#     train_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
#     train_accuracy = train_evaluator.evaluate(train_predictions)
#     print(f"Training Accuracy: {train_accuracy}")

#     # Make predictions on testing data
#     test_predictions = lr_model.transform(test_data)

#     # Evaluate model performance on testing data
#     test_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
#     test_accuracy = test_evaluator.evaluate(test_predictions)
#     print(f"Testing Accuracy: {test_accuracy}")

train_df_pandas = train_data.toPandas()
test_df_pandas = test_data.toPandas()

print("Converted to pandas")

# Convert text data to numerical sequences (e.g., using tokenization and padding)
# Preprocess labels (e.g., convert to numerical format if necessary)

# TensorFlow model building and training
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Attention
from tensorflow.keras.models import Model

print("Imports done")

# Split the data into features (document vectors) and labels (sentiment)
X_train = train_df_pandas["document_vector"].values
y_train = train_df_pandas["sentiment"].values

X_test = test_df_pandas["document_vector"].values
y_test = test_df_pandas["sentiment"].values

print("Split inott x and Y")

# Define the maximum sequence length and other parameters
max_sequence_length = 8092  # Example value, adjust based on your data
lstm_units = 64  # Example value, adjust based on your requirements
num_classes = 3  # Number of sentiment classes (0, 1, 2)

# Perform the train-test split using sklearn
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print("More split?")

# Convert the data to numpy arrays
X_train = np.array(X_train.tolist())
X_test = np.array(X_test.tolist())
y_train = np.array(y_train)
y_test = np.array(y_test)

print("FRom numpy to array done")

# Define model architecture
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
lstm_layer = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(embedding_layer)
attention_layer = Attention()([lstm_layer, lstm_layer])
output_layer = Dense(num_classes, activation='softmax')(attention_layer)

print("LSTM DEFINED")

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

print("MODEL COMPILED")

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=10)


print("MODEL FITTED")

# Evaluate the model
loss, accuracy = model.evaluate(test_sequences, test_labels)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

# Stop SparkSession
spark.stop()