<a href="https://www.kaggle.com/code/mohamedarish/with-dataset-as-input?scriptVersionId=175059668" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install pyspark transformers torch tqdm scikit-learn sparknlp huggingface_hub fasttext

# LOGISTIC REGRESSION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
from huggingface_hub import hf_hub_download
import fasttext
import fasttext.util

import os  # `os` is used below but is not imported by this cell

# Download (or reuse the cached copy of) the Malayalam fastText vectors and load them.
model_path = hf_hub_download(repo_id="facebook/fasttext-ml-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

# Directory containing the tweet CSV files
directory = '/kaggle/input/malayalam-tweets/'

# One DataFrame per CSV file
dfs = []

# Read every CSV except files with "total" in the name
# (presumably an aggregate of the others -- confirm against the dataset).
# The listing is sorted so the row order, and therefore the seeded
# train/test split, is the same on every run.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        a_df = pd.read_csv(file_path)

        # Some files misspell the timestamp column; normalise the name so
        # the columns line up when the frames are concatenated.
        if "datetimee" in a_df.columns:
            a_df = a_df.rename(columns={"datetimee": "datetime"})

        # Keep every file. Appending only inside the rename branch silently
        # dropped all files whose column was already spelled correctly.
        dfs.append(a_df)

# Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Rows without cleaned text cannot be embedded
df = df.dropna(subset=['clean_content'])

# Shift labels from {-1, 0, 1} to {0, 1, 2}
df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Step 3: Tokenization and Feature Extraction
def extract_features(text):
    """Return the mean fastText vector of the whitespace-separated tokens in *text*.

    Each token is embedded with ``model.get_sentence_vector`` (the
    module-level fastText model) and the per-token vectors are averaged.

    Args:
        text: The text to embed.

    Returns:
        A 1-D vector of length ``model.get_dimension()``. Text with no
        tokens (empty or whitespace-only) yields an all-zero vector instead
        of raising ``ZeroDivisionError``.
    """
    import numpy as np  # local import: numpy is not imported by this cell

    tokens = text.split()
    if not tokens:
        # Nothing to average; return a neutral vector of the right width.
        return np.zeros(model.get_dimension(), dtype=np.float32)

    vector_sum = sum(model.get_sentence_vector(token) for token in tokens)
    return vector_sum / len(tokens)

# Apply feature extraction to each text in the dataset
df['text_vector'] = df['clean_content'].apply(extract_features)

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(df['text_vector'], df['sentiment'], test_size=0.2, random_state=42)

# Each Series element is one feature vector; convert to lists of vectors once
# instead of on every fit/predict call.
train_features = X_train.tolist()
test_features = X_test.tolist()

# Step 5: Initialize Logistic Regression Model
logistic_regression_model = LogisticRegression(max_iter=1000)

# Step 6: Train Model
# A single fit() call runs the solver to convergence (bounded by max_iter).
# Without warm_start, every extra fit() restarts from scratch, so the former
# 10-"epoch" loop only repeated identical work and printed identical metrics.
logistic_regression_model.fit(train_features, y_train)

# Predict once per split and reuse the predictions for every metric.
train_predictions = logistic_regression_model.predict(train_features)
test_predictions = logistic_regression_model.predict(test_features)

# Calculate training metrics
train_loss = None  # no loss value is computed here
train_accuracy = accuracy_score(y_train, train_predictions)
train_precision = precision_score(y_train, train_predictions, average='weighted')
train_recall = recall_score(y_train, train_predictions, average='weighted')
train_f1_score = f1_score(y_train, train_predictions, average='weighted')

# Calculate validation metrics (on the held-out 20% split)
val_accuracy = accuracy_score(y_test, test_predictions)
val_precision = precision_score(y_test, test_predictions, average='weighted')
val_recall = recall_score(y_test, test_predictions, average='weighted')
val_f1_score = f1_score(y_test, test_predictions, average='weighted')

# Print metrics
print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1-score: {train_f1_score}")
print(f"Validation Accuracy: {val_accuracy}, Validation Precision: {val_precision}, Validation Recall: {val_recall}, Validation F1-score: {val_f1_score}")


# RANDOM FOREST

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import fasttext.util

# `os` and `hf_hub_download` are used below but are not imported by this cell.
import os
from huggingface_hub import hf_hub_download

# Download (or reuse the cached copy of) the Malayalam fastText vectors and load them.
model_path = hf_hub_download(repo_id="facebook/fasttext-ml-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

# Directory containing the tweet CSV files
directory = '/kaggle/input/malayalam-tweets/'

# One DataFrame per CSV file
dfs = []

# Read every CSV except files with "total" in the name
# (presumably an aggregate of the others -- confirm against the dataset).
# Sorting the listing keeps the row order, and so the seeded split, stable.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        a_df = pd.read_csv(file_path)

        # Normalise the misspelled timestamp column so the frames align.
        if "datetimee" in a_df.columns:
            a_df = a_df.rename(columns={"datetimee": "datetime"})

        # Keep every file, not only the ones that needed the rename;
        # appending inside the branch above silently discarded the rest.
        dfs.append(a_df)

# Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Rows without cleaned text cannot be embedded
df = df.dropna(subset=['clean_content'])

# Shift labels from {-1, 0, 1} to {0, 1, 2}
df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Step 3: Tokenization and Feature Extraction
def extract_features(text):
    """Average the fastText vectors of the whitespace-separated tokens in *text*.

    Every token is embedded via the module-level ``model`` using
    ``get_sentence_vector``; the result is the element-wise mean.

    Args:
        text: The text to embed.

    Returns:
        A 1-D vector of length ``model.get_dimension()``. When *text* has no
        tokens (empty or only whitespace) a zero vector is returned rather
        than failing with ``ZeroDivisionError``.
    """
    import numpy as np  # local import: numpy is not imported by this cell

    tokens = text.split()
    if not tokens:
        # No tokens to average; fall back to a neutral vector.
        return np.zeros(model.get_dimension(), dtype=np.float32)

    vector_sum = sum(model.get_sentence_vector(token) for token in tokens)
    return vector_sum / len(tokens)

# Apply feature extraction to each text in the dataset
df['text_vector'] = df['clean_content'].apply(extract_features)

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(df['text_vector'], df['sentiment'], test_size=0.2, random_state=42)

# Each Series element is one feature vector; convert to lists of vectors once
# instead of on every fit/predict call.
train_features = X_train.tolist()
test_features = X_test.tolist()

# Step 5: Initialize Random Forest Model
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 6: Train Model
# The forest is built in a single fit() call. With a fixed random_state and
# no warm_start, refitting rebuilds the exact same trees, so the former
# 10-"epoch" loop only repeated identical work and printed identical metrics.
random_forest_model.fit(train_features, y_train)

# Predict once per split and reuse the predictions for every metric.
train_predictions = random_forest_model.predict(train_features)
test_predictions = random_forest_model.predict(test_features)

# Calculate training metrics
train_loss = None  # no loss value is computed here
train_accuracy = accuracy_score(y_train, train_predictions)
train_precision = precision_score(y_train, train_predictions, average='weighted')
train_recall = recall_score(y_train, train_predictions, average='weighted')
train_f1_score = f1_score(y_train, train_predictions, average='weighted')

# Calculate validation metrics (on the held-out 20% split)
val_accuracy = accuracy_score(y_test, test_predictions)
val_precision = precision_score(y_test, test_predictions, average='weighted')
val_recall = recall_score(y_test, test_predictions, average='weighted')
val_f1_score = f1_score(y_test, test_predictions, average='weighted')

# Print metrics
print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1-score: {train_f1_score}")
print(f"Validation Accuracy: {val_accuracy}, Validation Precision: {val_precision}, Validation Recall: {val_recall}, Validation F1-score: {val_f1_score}")


# SVM

In [None]:
from sklearn.svm import SVC
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import fasttext

# `os` and `hf_hub_download` are used below but are not imported by this cell.
import os
from huggingface_hub import hf_hub_download

# Download (or reuse the cached copy of) the Malayalam fastText vectors and load them.
model_path = hf_hub_download(repo_id="facebook/fasttext-ml-vectors", filename="model.bin")
model = fasttext.load_model(model_path)

# Directory containing the tweet CSV files
directory = '/kaggle/input/malayalam-tweets/'

# One DataFrame per CSV file
dfs = []

# Read every CSV except files with "total" in the name
# (presumably an aggregate of the others -- confirm against the dataset).
# Sorting the listing keeps the row order, and so the seeded split, stable.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        a_df = pd.read_csv(file_path)

        # Normalise the misspelled timestamp column so the frames align.
        if "datetimee" in a_df.columns:
            a_df = a_df.rename(columns={"datetimee": "datetime"})

        # Append unconditionally: doing so only inside the rename branch
        # dropped every file that already used the correct column name.
        dfs.append(a_df)

# Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Rows without cleaned text cannot be embedded
df = df.dropna(subset=['clean_content'])

# Shift labels from {-1, 0, 1} to {0, 1, 2}
df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Step 3: Tokenization and Feature Extraction
def extract_features(text):
    """Embed *text* as the mean of its tokens' fastText vectors.

    The text is split on whitespace, each token is passed to
    ``model.get_sentence_vector`` (module-level fastText model), and the
    resulting vectors are averaged element-wise.

    Args:
        text: The text to embed.

    Returns:
        A 1-D vector of length ``model.get_dimension()``. Empty or
        whitespace-only text returns a zero vector; previously it raised
        ``ZeroDivisionError``.
    """
    import numpy as np  # local import: numpy is not imported by this cell

    tokens = text.split()
    if not tokens:
        # Avoid 0 / 0 when there is nothing to average.
        return np.zeros(model.get_dimension(), dtype=np.float32)

    vector_sum = sum(model.get_sentence_vector(token) for token in tokens)
    return vector_sum / len(tokens)

# Apply feature extraction to each text in the dataset
df['text_vector'] = df['clean_content'].apply(extract_features)

# Step 4: Split Data
X_train, X_test, y_train, y_test = train_test_split(df['text_vector'], df['sentiment'], test_size=0.2, random_state=42)

# Each Series element is one feature vector; convert to lists of vectors once
# instead of on every fit/predict call.
train_features = X_train.tolist()
test_features = X_test.tolist()

# Step 5: Initialize SVM Model
svm_model = SVC(kernel='linear', random_state=42)

# Step 6: Train Model
# SVC solves its optimisation problem inside one fit() call and has no
# incremental mode; refitting simply solves the same problem again, so the
# former 10-"epoch" loop multiplied training time by ten for identical output.
svm_model.fit(train_features, y_train)

# Predict once per split and reuse the predictions for every metric.
train_predictions = svm_model.predict(train_features)
test_predictions = svm_model.predict(test_features)

# Calculate training metrics
train_loss = None  # no loss value is computed here
train_accuracy = accuracy_score(y_train, train_predictions)
train_precision = precision_score(y_train, train_predictions, average='weighted')
train_recall = recall_score(y_train, train_predictions, average='weighted')
train_f1_score = f1_score(y_train, train_predictions, average='weighted')

# Calculate validation metrics (on the held-out 20% split)
val_accuracy = accuracy_score(y_test, test_predictions)
val_precision = precision_score(y_test, test_predictions, average='weighted')
val_recall = recall_score(y_test, test_predictions, average='weighted')
val_f1_score = f1_score(y_test, test_predictions, average='weighted')

# Print metrics
print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}, Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1-score: {train_f1_score}")
print(f"Validation Accuracy: {val_accuracy}, Validation Precision: {val_precision}, Validation Recall: {val_recall}, Validation F1-score: {val_f1_score}")


# BIDIRECTIONAL LSTM WITH ATTENTION LAYER

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Activation, Lambda, RepeatVector, Permute, Flatten
import tensorflow.keras.backend as K
import os
import torch

def clear_gpu_memory():
    """Release PyTorch's cached CUDA memory; do nothing when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# Directory containing the tweet CSV files
directory = '/kaggle/input/malayalam-tweets/'

# One DataFrame per CSV file
dfs = []

# Read every CSV except files with "total" in the name, matching the other
# cells of this notebook. NOTE(review): the "total" file presumably
# aggregates the per-file data; loading it alongside the others would
# duplicate rows across the train/test split -- confirm against the dataset.
# Sorting the listing keeps the row order, and so the seeded split, stable.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".csv") and "total" not in filename:
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        a_df = pd.read_csv(file_path)

        # Normalise the misspelled timestamp column so the frames align.
        if "datetimee" in a_df.columns:
            a_df = a_df.rename(columns={"datetimee": "datetime"})

        # Append the DataFrame to the list
        dfs.append(a_df)

# Combine all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# DataFrame.size is rows * columns (total cell count), not the row count.
print(df.size)

# Drop rows without cleaned text
df = df.dropna(subset=['clean_content'])

print(df.size)
# Shift labels from {-1, 0, 1} to {0, 1, 2}
df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

# Preprocessing
maxlen = 10000  # Upper bound on sequence length; longer texts are truncated
max_words = 200000  # Maximum number of words in vocabulary

# Tokenize the text
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['content'])

# Convert text to sequences of integer word ids
sequences = tokenizer.texts_to_sequences(df['content'])

# Shrink maxlen to the longest sequence actually present (never above the
# bound, never below 1). Padding every row to the full 10000-token bound
# makes the input matrix and the LSTM's time dimension that large even when
# every text is far shorter; truncation behaviour is unchanged.
longest_sequence = max((len(seq) for seq in sequences), default=1)
maxlen = max(1, min(maxlen, longest_sequence))

# Pad sequences to maxlen
X = pad_sequences(sequences, maxlen=maxlen)

# Encode the sentiment labels as consecutive integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['sentiment'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Bidirectional LSTM with Attention Model
from tensorflow.keras.layers import Multiply  # not in this cell's import list

lstm_units = 64
# One output unit per sentiment class found by the label encoder.
num_classes = len(label_encoder.classes_)

input_seq = Input(shape=(maxlen,))
# Sequence length comes from the Input layer; the deprecated `input_length`
# argument is not needed.
embedding = Embedding(max_words, 128)(input_seq)
# Bidirectional concatenates the forward and backward outputs by default,
# so each timestep has 2 * lstm_units features.
lstm = Bidirectional(LSTM(lstm_units, return_sequences=True))(embedding)

# Attention Mechanism
attention = Dense(1, activation='tanh')(lstm)            # (batch, maxlen, 1) score per timestep
attention = Flatten()(attention)                         # (batch, maxlen)
attention = Activation('softmax', name='attention_weights')(attention)  # weights sum to 1 over time
# Broadcast each timestep's weight across the feature axis. The repeat count
# must equal the BiLSTM feature width (2 * lstm_units = 128); the previous
# value of 128 * 2 did not match it.
attention = RepeatVector(lstm_units * 2)(attention)      # (batch, 2 * lstm_units, maxlen)
attention = Permute([2, 1])(attention)                   # (batch, maxlen, 2 * lstm_units)

# Weight the LSTM outputs by their attention and sum over time. The previous
# Concatenate merely stacked the weights next to the features, so the
# "attention" never scaled the LSTM outputs.
sent_representation = Multiply()([lstm, attention])
sent_representation = Lambda(lambda x: K.sum(x, axis=1))(sent_representation)

# Three-way classification: softmax over the classes. A single sigmoid unit
# with binary cross-entropy cannot represent integer labels 0/1/2.
output = Dense(num_classes, activation='softmax')(sent_representation)

model = Model(inputs=input_seq, outputs=output)

# Compile the model; labels are integer class ids, hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model, holding out 10% of the training data for validation
history = model.fit(X_train, y_train, epochs=40, batch_size=32, validation_split=0.1)

# Release PyTorch's cached CUDA memory before evaluating.
# NOTE(review): the model here is Keras/TensorFlow -- confirm this call has any effect on it.
clear_gpu_memory()

# Score the trained model on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

# Per-epoch curves recorded by model.fit()
training_log = history.history
for label, key in (
    ("Training Loss:", 'loss'),
    ("Training Accuracy:", 'accuracy'),
    ("Validation Loss:", 'val_loss'),
    ("Validation Accuracy:", 'val_accuracy'),
):
    print(label, training_log[key])

# BERT

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.optim import AdamW
from torch.optim.lr_scheduler import ExponentialLR
import os
import pandas as pd
import torch

def clear_gpu_memory():
  """Empty PyTorch's CUDA cache if a CUDA device is available."""
  cuda = torch.cuda
  if cuda.is_available():
    cuda.empty_cache()

# Load Malayalam BERT tokenizer.
# Kept at module level: MalayalamDataset.__getitem__ encodes text with this
# global, and main() saves it to disk.
tokenizer = BertTokenizer.from_pretrained('l3cube-pune/malayalam-bert')

# Dataset wrapper feeding tweets to BERT
class MalayalamDataset(Dataset):
    """Torch dataset pairing each row's ``content`` text with its ``sentiment`` label.

    Items are encoded on access with the module-level ``tokenizer``, padded
    or truncated to ``max_len`` tokens.
    """

    def __init__(self, dataframe, max_len=128):
        """Keep the frame and snapshot its text and label columns as lists."""
        self.data = dataframe
        self.max_len = max_len
        self.texts = dataframe.content.tolist()
        self.targets = dataframe.sentiment.tolist()

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the raw text, token ids, attention mask and label for one row."""
        text = str(self.texts[index])

        encode_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'padding': 'max_length',
            'return_token_type_ids': False,
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = tokenizer.encode_plus(text, **encode_options)

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten so the DataLoader can stack items itself.
        item = {'text': text}
        item['input_ids'] = encoded['input_ids'].flatten()
        item['attention_mask'] = encoded['attention_mask'].flatten()
        item['target'] = torch.tensor(self.targets[index], dtype=torch.long)
        return item

# Load a dataset from CSV
def load_dataset(file_path):
    """Read the CSV at *file_path* into a pandas DataFrame and return it."""
    return pd.read_csv(file_path)

# Fine-tune BERT for sentiment analysis
def train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=10):
    """Fine-tune *model* for *epochs* epochs, validating after each one.

    Args:
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``target`` tensors.
        val_loader: Validation batches in the same format, passed to
            ``evaluate_model``.
        model: Model called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per epoch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over *train_loader*.

    Prints the average training loss, training accuracy and the validation
    loss, accuracy and report for every epoch. Returns ``None``.
    """
    for epoch in range(epochs):
        # evaluate_model() switches the model to eval mode at the end of each
        # epoch. Re-enter training mode here; otherwise every epoch after the
        # first would train with dropout disabled.
        model.train()

        total_loss = 0
        correct = 0
        total_targets = 0
        for batch in tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_loss += loss.item()

            # Running accuracy: compare on-device and pull back only the count.
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == targets).sum().item()
            total_targets += len(targets)

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = correct / total_targets
        print(f"Average training loss for epoch {epoch+1}: {avg_train_loss}")
        print(f"Training Accuracy for epoch {epoch+1}: {train_accuracy}")

        val_loss, val_acc, val_report = evaluate_model(model, val_loader, device)
        print(f"Validation loss: {val_loss}, Accuracy: {val_acc}")
        print("Validation Report:")
        print(val_report)

        # Update the learning rate once per epoch
        scheduler.step()

# Evaluate the model
def evaluate_model(model, val_loader, device):
    """Run *model* over *val_loader* without gradients and summarise the results.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        val_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``target`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_val_loss, val_accuracy, val_report)``: the mean per-batch
        loss, the overall accuracy, and a text classification report.

    Note:
        The model is left in eval mode; callers that continue training must
        call ``model.train()`` again.
    """
    model.eval()
    val_targets = []
    val_outputs = []

    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            total_val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).detach().cpu().numpy()

            val_targets.extend(targets.cpu().numpy())
            val_outputs.extend(preds)

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = accuracy_score(val_targets, val_outputs)
    # Pin the label set explicitly. Without `labels`, the report infers the
    # classes from the data and raises when the three target names do not
    # match, e.g. if one class never occurs in this validation split.
    val_report = classification_report(
        val_targets,
        val_outputs,
        labels=[0, 1, 2],
        target_names=["Negative", "Neutral", "Positive"],
        zero_division=0,
    )
    return avg_val_loss, val_accuracy, val_report
# Entry point: load the tweets, fine-tune BERT, save the result
def main():
    """Load the tweet CSVs, fine-tune the Malayalam BERT classifier and save it.

    Saves the tokenizer to ``bert-tokenizer`` and the trained model to
    ``malayalam_sentiment_model``.
    """
    tokenizer.save_pretrained("bert-tokenizer")

    # Collect every CSV in the input directory, skipping "total" files
    directory = '/kaggle/input/malayalam-tweets/'
    frames = []
    for filename in os.listdir(directory):
        if not filename.endswith(".csv") or "total" in filename:
            continue

        frame = pd.read_csv(os.path.join(directory, filename))
        # Normalise the misspelled timestamp column so the frames align
        if "datetimee" in frame.columns:
            frame = frame.rename(columns={"datetimee": "datetime"})
        frames.append(frame)

    # Merge, drop rows lacking cleaned text, shift labels {-1, 0, 1} -> {0, 1, 2}
    df = pd.concat(frames, ignore_index=True)
    df = df.dropna(subset=['clean_content'])
    df['sentiment'] = df['sentiment'].map({-1: 0, 0: 1, 1: 2})

    # 90/10 train/validation split
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

    # Wrap both splits in datasets and loaders; only training data is shuffled
    batch_size = 8
    train_loader = DataLoader(MalayalamDataset(train_df), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(MalayalamDataset(val_df), batch_size=batch_size, shuffle=False)

    # Pre-trained BERT with a 3-way classification head, on GPU when available
    model = BertForSequenceClassification.from_pretrained('l3cube-pune/malayalam-bert', num_labels=3)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # AdamW with an exponentially decaying learning rate
    optimizer = AdamW(model.parameters(), lr=2e-5)
    scheduler = ExponentialLR(optimizer, gamma=0.9)

    train_model(train_loader, val_loader, model, optimizer, scheduler, device, epochs=9)

    clear_gpu_memory()

    # Persist the fine-tuned weights
    model.save_pretrained("malayalam_sentiment_model")


if __name__ == "__main__":
    main()