In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [37]:
# Read every line of the comments file; the context manager closes it afterwards.
with open("finnish.txt", "r", encoding="utf-8") as source_file:
    lines = list(source_file)

# Echo each line with its surrounding whitespace (including the newline) trimmed.
for raw_line in lines:
    print(raw_line.strip())

1. Hyvää huomenta kaikille! Toivottavasti teillä on ihana päivä! ☀️ #hyväähuomenta #uusipäivä
2. Illalla suunnitelmissa leffa-ilta sohvan nurkassa viltin alla. 🍿🎥 #leffailta #koti-ilta
3. Tänään on herkkujen päivä! Mansikkakakku maistuu taivaalliselta! 🍰🍓 #herkkupäivä #mansikkakakku
4. Syksyinen sää houkuttelee kävelylle metsään. 🍂🍁 #syksy #metsäkävely
5. Rakastan viikonloppuja! Rentoutumista ja hauskanpitoa luvassa. 🎉🍻 #viikonloppu #rentoutuminen
6. Tänään pääsen viimeinkin tapaamaan vanhoja ystäviäni. Odotan innolla! 👭🥰 #ystävät #tapaaminen
7. Olen niin kiitollinen perheestäni. He ovat aina tukenani. ❤️ #kiitollisuus #perhe
8. Uusi kirja löytynyt! Odotan malttamattomana lukuhetkeä. 📚🤓 #kirja #lukuhetki
9. Aamulenkillä tuli vastaan kaunis auringonnousu. Luonto on upea! 🌅🏃 #aamulenkillä #auringonnousu
10. Tervetuloa uudet seuraajat! Kiitos, kun jaatte matkaani kanssani. 🙏🌟 #uudetseuraajat #kiitos
11. Aika laittaa musiikki soimaan ja siivota koti viikonlopun kunniaksi! 🎶🧹 #siivous #musi

In [38]:
# Copy the lines read from the text file into a new list
finnish_list = list(lines)

# One row per line, held in a single 'finnish_comments' column
finnish_df = pd.DataFrame({"finnish_comments": finnish_list})

In [39]:
import re
# Function to remove numbers and punctuations using regular expression
 
def remove_numbers_and_punctuation(text):
    """Delete ASCII digits and a fixed set of punctuation symbols from ``text``.

    Every character matched by the class ``[0-9!@#$%^&*(),.?":{}|<>]`` is
    removed. Anything else (letters, whitespace, hyphens, emojis, ...) is
    returned unchanged, so removing a leading "1." leaves the following space.
    """
    unwanted_chars = re.compile(r'[0-9!@#$%^&*(),.?":{}|<>]')
    return unwanted_chars.sub('', text)

# Clean every raw comment and keep the result in a new 'Cleaned_Text' column
finnish_df['Cleaned_Text'] = finnish_df['finnish_comments'].map(remove_numbers_and_punctuation)

#removeemoji
import emoji
import unicodedata

# Function to remove emojis from text
def remove_emojis(text):
    """Return ``text`` after passing it through ``emoji.demojize``.

    Despite the name, emojis are not dropped: each one is replaced by its
    textual name, which is why names such as ``red_heart`` show up in the
    later English text.
    """
    return emoji.demojize(text)

# Replace the emojis in the already-cleaned text, overwriting 'Cleaned_Text'
finnish_df['Cleaned_Text'] = finnish_df['Cleaned_Text'].map(remove_emojis)


In [40]:
from googletrans import Translator
import time

In [41]:
# Function to translate text from Finnish to English with a retry mechanism
def translate_to_english(text, max_retry=5, sleep_duration=1.0):
    """Translate ``text`` from Finnish to English, retrying on failure.

    Args:
        text: The string to translate. Non-string values (``None``, NaN, ...)
            are never sent to the translator.
        max_retry: Maximum number of translation attempts.
        sleep_duration: Seconds to wait between two attempts.

    Returns:
        The translated string, or ``None`` when ``text`` is not a string or
        every attempt failed.
    """
    # Nothing translatable: return right away instead of burning all retries
    # (and their sleeps) on a call that cannot succeed.
    if not isinstance(text, str):
        return None

    retry_count = 0
    translator = Translator()
    while retry_count < max_retry:
        try:
            return translator.translate(text, src='fi', dest='en').text
        except Exception as e:
            # Deliberately broad (best-effort translation), but report the
            # cause so repeated failures can be diagnosed.
            retry_count += 1
            print(f"Translation failed ({e!r}). Retrying... ({retry_count}/{max_retry})")
            # No point in waiting once the last attempt has been used up.
            if retry_count < max_retry:
                time.sleep(sleep_duration)
    print("Translation failed after maximum retries.")
    return None

# Translate the cleaned Finnish text into a new 'English_Text' column
finnish_df['English_Text'] = finnish_df['Cleaned_Text'].map(translate_to_english)

# Keep only the English text from here on
finnish_df = finnish_df.drop(columns=["finnish_comments", 'Cleaned_Text'])
finnish_df

Unnamed: 0,English_Text
0,Good morning to everyone hope you have a wonde...
1,"In the evening, planned from a movie in the co..."
2,Today is a delicious day with a strawberry cak...
3,Autumn weather attracts a walk to the woods: F...
4,I love weekends relaxing and having fun coming...
5,Today I can finally meet my old friends lookin...
6,I am so grateful for my family they always sup...
7,New Book Found I look forward to reading: Book...
8,Morning jog came across a beautiful sunrise na...
9,Welcome to the new Followers thanks for sharin...


In [42]:
# Download the stopwords for English from NLTK
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# to remove english stopwords
!pip install textblob
from textblob import TextBlob
from nltk.corpus import stopwords # get stopwords from NLTK library
from nltk.tokenize import word_tokenize # to create word tokens
from nltk.stem import WordNetLemmatizer # to reduce words to orginal form
from nltk.corpus import words # Get all words in english language

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\johns\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\johns\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!




In [43]:
# Function to remove English stopwords from text
def remove_english_stopwords(text):
    """Drop English stopwords from ``text``.

    Values that ``pd.notna`` reports as missing (``None``, NaN) are returned
    unchanged. Otherwise the text is split with ``word_tokenize``, every token
    whose lowercase form is in NLTK's English stopword list is discarded, and
    the remaining tokens are joined with single spaces.
    """
    if not pd.notna(text):
        return text  # missing value: hand it back untouched

    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() not in english_stopwords:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip the stopwords from the translated text, overwriting 'English_Text'
finnish_df['English_Text'] = finnish_df['English_Text'].map(remove_english_stopwords)

In [44]:
#sentiment analysis
# Function to perform sentiment analysis using TextBlob
def get_sentiment(text):
    """Classify ``text`` as 'Positive', 'Negative' or 'Neutral' with TextBlob.

    Args:
        text: The English text to analyse. Anything that is not a string
            (``None``, NaN, ...) is treated as having no sentiment.

    Returns:
        'Positive' when the TextBlob polarity is above zero, 'Negative' when it
        is below zero, and 'Neutral' otherwise (including non-string input).
    """
    # A failed translation can reach this point as None or NaN; only real
    # strings are handed to TextBlob.
    if not isinstance(text, str):
        return 'Neutral'

    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'


# Label every translated comment with its TextBlob sentiment class
finnish_df['Sentiment'] = finnish_df['English_Text'].map(get_sentiment)

# Gather the text and its label in a separate results frame
sentiment_df = pd.DataFrame(finnish_df, columns=['English_Text', 'Sentiment'])

# Plain-text view of the results table
print(sentiment_df)

                                         English_Text Sentiment
0   Good morning everyone hope wonderful day : Sun...  Positive
1   evening , planned movie corner sofa blanket : ...   Neutral
2   Today delicious day strawberry cake tastes hea...  Positive
3   Autumn weather attracts walk woods : Fallen Le...   Neutral
4   love weekends relaxing fun coming : Party_Popp...  Positive
5   Today finally meet old friends looking forward...  Positive
6   grateful family always support : red_heart : g...   Neutral
7   New Book Found look forward reading : Books : ...  Positive
8   Morning jog came across beautiful sunrise natu...  Positive
9   Welcome new Followers thanks sharing trip : Fo...  Positive
10  Time make music ring clean home celebrate week...  Positive
11  Thank congratulations birthday moved love rece...  Positive
12  Beautiful Autumn Evening tempting enjoy peacef...  Positive
13  Today day relaxation enjoy good book tea : Ope...  Positive
14  Relaxed Sunday morning bed 'm going 

In [45]:
# Tally the TextBlob labels once, keep the tally for later, and display it
finnishsentiment_stored = sentiment_df['Sentiment'].value_counts()
finnishsentiment_stored

Sentiment
Positive    22
Neutral      7
Negative     1
Name: count, dtype: int64

In [46]:
# Back translation and sentiment analysis start here

# The TextBlob labels are not needed on finnish_df for this part
finnish_df = finnish_df.drop(columns=["Sentiment"])

In [47]:
#translate back into finnish
# Function to translate text from english to finnish with a retry mechanism
def translate_to_finnish(text, max_retry=5, sleep_duration=1.0):
    """Translate ``text`` from English to Finnish, retrying on failure.

    Args:
        text: The string to translate. Non-string values (``None``, NaN, ...)
            are never sent to the translator.
        max_retry: Maximum number of translation attempts.
        sleep_duration: Seconds to wait between two attempts.

    Returns:
        The translated string, or ``None`` when ``text`` is not a string or
        every attempt failed.
    """
    # A missing input (e.g. None left by an earlier failed translation) cannot
    # be translated: return right away instead of retrying and sleeping.
    if not isinstance(text, str):
        return None

    retry_count = 0
    translator = Translator()
    while retry_count < max_retry:
        try:
            return translator.translate(text, src='en', dest='fi').text
        except Exception as e:
            # Deliberately broad (best-effort translation), but report the
            # cause so repeated failures can be diagnosed.
            retry_count += 1
            print(f"Translation failed ({e!r}). Retrying... ({retry_count}/{max_retry})")
            # No point in waiting once the last attempt has been used up.
            if retry_count < max_retry:
                time.sleep(sleep_duration)
    print("Translation failed after maximum retries.")
    return None
# Translate the English text back into a new 'Finnish_Translation' column
finnish_df['Finnish_Translation'] = finnish_df['English_Text'].map(translate_to_finnish)

# The English column is no longer needed once the back translation exists
finnish_df = finnish_df.drop(columns=["English_Text"])
finnish_df

Unnamed: 0,Finnish_Translation
0,"Hyvää huomenta, kaikki toivovat upea päivä: au..."
1,"Ilta, suunniteltu elokuvan nurkka -sohva viltt..."
2,Tänään herkullinen päivä mansikkakakku maistuu...
3,Syksyn sää houkuttelee Walk Woods: Fallen Leaf...
4,Rakkausviikonloppuja rentouttava hauska Tule: ...
5,"Tänään tapaa vihdoin vanhoja ystäviä, jotka od..."
6,kiitollinen perhe aina tuki: Red_heart: kiitol...
7,Uusi kirja Löydetty LUKUMINEN: KIRJAT :: Nerd ...
8,Aamu -lenkkeily tuli kauniin auringonnousun lu...
9,Tervetuloa uudet seuraajat kiitos jakamismatka...


In [48]:
#to perform sentiment analysis

!pip install transformers 
!pip install torch




In [49]:

from transformers import BertTokenizer, BertForSequenceClassification
import torch


In [50]:
# Load the FinBERT tokenizer and model
# NOTE(review): the load warning printed by this cell reports 'classifier.weight' and
# 'classifier.bias' as newly initialized, i.e. this checkpoint has no trained
# classification head. Predictions made with `model` are therefore not meaningful
# sentiment labels until the model is fine-tuned on a sentiment task (or replaced by a
# checkpoint already fine-tuned for Finnish sentiment).
tokenizer = BertTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")
model = BertForSequenceClassification.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at TurkuNLP/bert-base-finnish-cased-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# Define a function to perform sentiment analysis
def perform_sentiment_analysis(texts):
    """Predict one class index per text with the notebook-level ``tokenizer`` and ``model``.

    Args:
        texts: The texts to classify together as one padded, truncated batch.

    Returns:
        A list with one integer class index per input text; an empty list for
        an empty batch.

    NOTE(review): what each index means depends on how the model's classifier
    head was trained. The "0: Negative, 1: Neutral, 2: Positive" reading used
    elsewhere in this notebook is an assumption -- confirm it against the
    model before interpreting the result.
    """
    # An empty batch has nothing to classify; skip the tokenizer and the model.
    if not isinstance(texts, str) and len(texts) == 0:
        return []

    # Tokenize the texts and convert them to tensors
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        logits = model(**inputs).logits

    # Turn the logits into probabilities and keep the most probable class per text
    probabilities = torch.softmax(logits, dim=1)
    predicted_classes = torch.max(probabilities, dim=1).indices
    return predicted_classes.tolist()

# Work on a clean 0..n-1 index so the predictions can be aligned by row label
finnish_df.reset_index(drop=True, inplace=True)

# Only rows that actually have a back translation can be scored
has_text = finnish_df['Finnish_Translation'].notna()
texts = finnish_df.loc[has_text, 'Finnish_Translation'].tolist()

# Perform sentiment analysis for each batch of texts (batch size = 10)
predicted_sentiments = []
batch_size = 10
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    batch_predictions = perform_sentiment_analysis(batch_texts)
    predicted_sentiments.extend(batch_predictions)

# Attach each prediction to the row it was computed from. Rows without text get <NA>
# instead of a padded 0: padding at the end both shifted the predictions onto the wrong
# rows and reported a made-up class for the missing ones.
finnish_df["Sentiment_Score"] = pd.Series(
    predicted_sentiments,
    index=finnish_df.index[has_text.to_numpy()],
    dtype="Int64",
)


In [52]:
# Show the back-translated comments next to their predicted class index
finnish_df

Unnamed: 0,Finnish_Translation,Sentiment_Score
0,"Hyvää huomenta, kaikki toivovat upea päivä: au...",0
1,"Ilta, suunniteltu elokuvan nurkka -sohva viltt...",0
2,Tänään herkullinen päivä mansikkakakku maistuu...,0
3,Syksyn sää houkuttelee Walk Woods: Fallen Leaf...,0
4,Rakkausviikonloppuja rentouttava hauska Tule: ...,0
5,"Tänään tapaa vihdoin vanhoja ystäviä, jotka od...",0
6,kiitollinen perhe aina tuki: Red_heart: kiitol...,0
7,Uusi kirja Löydetty LUKUMINEN: KIRJAT :: Nerd ...,1
8,Aamu -lenkkeily tuli kauniin auringonnousun lu...,0
9,Tervetuloa uudet seuraajat kiitos jakamismatka...,1


In [53]:
# Recall the TextBlob label counts stored earlier, for comparison with the model's scores
print(finnishsentiment_stored)

Sentiment
Positive    22
Neutral      7
Negative     1
Name: count, dtype: int64


In [55]:
# Count how many comments received each predicted class index.
# NOTE(review): the reading "0: Negative, 1: Neutral, 2: Positive" is assumed, not
# established in this notebook -- the model-load warning says the classifier head was
# newly initialized -- so confirm the mapping before comparing with the TextBlob counts.
finnish_df['Sentiment_Score'].value_counts()

Sentiment_Score
0    23
1     7
Name: count, dtype: int64