In [None]:
# @title Google Drive Link
from google.colab import drive
# Mount Google Drive at /content/drive; the other cells read and write files under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
# @title Read csv file
# Keywords whose CSV files have already been processed. A file counts as done when
# any of these strings occurs in its path.
# Do not add an empty string here: "" is a substring of every path, so it would
# mark every file as done.
done = [
    "അതിഗംഭീര",
    "അഭിനന്ദന",
    "അഭിവൃദ്ധി",
    "ആത്മവിശ്വാസം",
    "ആരോപണ",
    "ഇഷ്ട",
    "ഉത്തമ",
    "കഷ്ട",
    "പരാജയ",
    "പ്രവൃത്തി",
    "വിശ്വാസം",
    "സമാധാനം",
    "സാഹചര്യം",
    "സ്നേഹ",
    "ആഘാത",
    "അഭിമാന",
    "ഉന്മേഷം",
    "വിജയം",
    "ശാന്തം",
    "സ്ത്രീ",
    "ബാധകം",
    "വിജ്ഞാനം",
    "കഴിവും",
    "സാക്ഷരത",
    # "അനുഭൂതി", # This is invalid
    "അംഗീക",
    "നേരായ",
    # "സ്വാതന്ത്ര്യം", # This is invalid
]

import glob


def find_not_done(files, done_keywords):
    """Return the entries of ``files`` that contain none of ``done_keywords``.

    Args:
        files: iterable of path strings.
        done_keywords: iterable of substrings marking a path as already processed.

    Returns:
        A list with the paths, in their original order, that match no keyword.

    Empty keywords are ignored, because ``"" in path`` is true for every path.
    """
    keywords = [keyword for keyword in done_keywords if keyword]
    return [
        path for path in files
        if not any(keyword in path for keyword in keywords)
    ]


g = glob.glob("/content/drive/MyDrive/Dataset/new data/*.csv")

# Files that still need to be processed.
not_done = find_not_done(g, done)

print(not_done)

In [None]:
import pandas as pd

# Base name (without extension) of the CSV to process.
filename = "സ്വാതന്ത്ര്യം"

# Read that file from the "new data" folder into a DataFrame.
csv_path = f"/content/drive/MyDrive/Dataset/new data/{filename}.csv"
csv_df = pd.read_csv(csv_path)

In [None]:
# @title Remove duplicates
# Per-column non-null counts before de-duplication.
count_before = csv_df.count()
print("old count", count_before)

# Keep only the first row for each distinct value of the 'content' column.
csv_df = csv_df.drop_duplicates(subset=['content'], keep='first')

# Per-column non-null counts after de-duplication.
count_after = csv_df.count()
print("new count", count_after)

In [None]:
# @title Clean the data
import re

def remove_usertags_hashtags_and_urls(tweet):
    """Strip mentions, hashtags, links and scraping leftovers from a tweet.

    Returns the cleaned text with surrounding whitespace removed. Anything that
    is not a string yields an empty string.
    """
    if not isinstance(tweet, str):
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        # The two-character sequence backslash + "n" becomes a space
        # (a real line-break character is not matched by this pattern).
        (r'\\n', " "),
        # A "Show more" marker at the very end, with any whitespace after it.
        (r'Show more\s*$', ""),
        # @user mentions.
        (r'@[^\s]+', ""),
        # #hashtags.
        (r'#[^\s]+', ""),
        # A "#" that is not directly followed by text never formed a hashtag;
        # those stray characters are removed as well.
        (r'#', ""),
        # http:// and https:// links.
        (r'https?://[^\s]+', ""),
    )

    cleaned = tweet
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Run the cleaner on every value of 'content' and store the result in a new 'clean_content' column.
csv_df['clean_content'] = csv_df['content'].apply(remove_usertags_hashtags_and_urls)


In [None]:
# @title Translate the tweets and store it in english column
import requests, uuid, json

def translate_tweet(tweet):
    """Translate one Malayalam tweet to English with the Azure Translator REST API.

    Args:
        tweet: the text to translate.

    Returns:
        The translated text; ``""`` when there is nothing to translate or the
        service returns no translation; the sentinel ``"rate limited"`` when
        the request fails or the service answers with an error object, so the
        row can be retried later.
    """
    # Function-scope import so this block does not depend on edits to the import cell.
    import os

    # Nothing to translate: skip the network round trip and the quota it costs.
    if not isinstance(tweet, str) or not tweet.strip():
        return ""

    # The subscription key is taken from the AZURE_TRANSLATOR_KEY environment
    # variable when it is set.
    # NOTE(review): the embedded fallback is kept only so the notebook keeps
    # running unchanged; a key committed to a notebook should be rotated and
    # this default removed.
    key = os.environ.get("AZURE_TRANSLATOR_KEY", "9c0bfabc26264af690254eef3ff6a81a")
    endpoint = "https://api.cognitive.microsofttranslator.com/"

    # location, also known as region.
    # required if you're using a multi-service or regional (not global) resource. It can be found in the Azure portal on the Keys and Endpoint page.
    location = "southeastasia"

    path = '/translate'
    constructed_url = endpoint + path

    params = {
        'api-version': '3.0',
        'from': 'ml',
        'to': ['en']
    }

    headers = {
        'Ocp-Apim-Subscription-Key': key,
        # location required if you're using a multi-service or regional (not global) resource.
        'Ocp-Apim-Subscription-Region': location,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }

    # You can pass more than one object in body.
    body = [{
        'text': tweet
    }]

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        reply = requests.post(constructed_url, params=params, headers=headers, json=body, timeout=30)
        response = reply.json()
    except (requests.RequestException, ValueError) as error:
        # Network failure or a body that is not JSON: report it with the same
        # sentinel the retry cell looks for.
        print(f"translation request failed: {error}")
        return "rate limited"

    # A successful call returns a list; a dict is treated as an error payload.
    if isinstance(response, dict):
        print(response)
        return "rate limited"

    if len(response) > 0 and response[0]:
        translations = response[0].get("translations") or []
        if len(translations) > 0:
            return translations[0]["text"]

    return ""

# Translate every cleaned tweet (one translate_tweet call per row) and store the result in a new 'english' column.
csv_df['english'] = csv_df['clean_content'].apply(translate_tweet)

In [None]:
# @title Fix if any translations have been missed
# Retry every row whose translation is missing ("") or failed ("rate limited").
# Rows yielded by DataFrame.iterrows() are copies, so assigning to them never
# reaches csv_df; the result is written back through csv_df.at instead.
needs_retry = csv_df['english'].isin(["rate limited", ""])

for index in csv_df.index[needs_retry]:
    csv_df.at[index, 'english'] = translate_tweet(csv_df.at[index, 'clean_content'])

In [None]:
!pip install transformers[sentencepiece] torch

In [None]:
# @title Get sentiment
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the model and tokenizer
# The same model_name is passed to both from_pretrained() calls, so the tokenizer
# and the sequence-classification model come from the same checkpoint.
model_name = "MarieAngeA13/Sentiment-Analysis-BERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_sentiment(english_tweet):
    """Classify an English tweet and return its sentiment score.

    Args:
        english_tweet: the text to classify.

    Returns:
        1 for Positive, 0 for Neutral, -1 for Negative. The predicted label is
        also printed.
    """
    # Truncate over-long text instead of passing it to the model unshortened.
    # The limit comes from the model config when it exposes one; with
    # max_length=None the tokenizer falls back to its own configured maximum.
    max_length = getattr(model.config, "max_position_embeddings", None)
    encoded_text = tokenizer(
        english_tweet,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded_text).logits

    # Index of the highest-scoring class.
    predicted_class = torch.argmax(logits, dim=-1).item()

    # Class index -> (label, numeric score).
    # NOTE(review): the index order is taken from the original code; confirm it
    # against model.config.id2label.
    sentiment_by_class = {
        0: ("Negative", -1),
        1: ("Neutral", 0),
        2: ("Positive", 1),
    }
    predicted_sentiment, score = sentiment_by_class[predicted_class]

    print(f"Predicted sentiment: {predicted_sentiment}")

    return score

# Score every English translation (one get_sentiment call per row) and store the result in a new 'sentiment' column.
csv_df['sentiment'] = csv_df['english'].apply(get_sentiment)

In [None]:
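# @title Create a new csv with the joined content and all new columns; the file is saved in the scored folder under the source file's name, Eg: scored/<filename>.csv (the month_day naming, Eg: Feb_24, is kept only as a commented-out alternative)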
from datetime import datetime

# Date parts for the alternative "<Mon>_<dd>.csv" name; only the commented-out call below uses them.
today = datetime.today()
month_abbreviation = f"{today:%b}"
day = f"{today:%d}"

# csv_df.to_csv(f"/content/drive/MyDrive/Dataset/new data/{month_abbreviation}_{day}.csv", index=False)

# Write the frame, without its index, to the "scored" folder under the source file's name.
scored_path = f"/content/drive/MyDrive/Dataset/scored/{filename}.csv"
csv_df.to_csv(scored_path, index=False)