In [34]:
from google.colab import files
uploaded = files.upload()

Saving multi_language_sentiment_50.xlsx to multi_language_sentiment_50.xlsx


In [24]:
pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=a86712da62b9042e857216424f7c91f4e5d85fdbb77ed611f7e8c8b546a668c5
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [37]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
from deep_translator import GoogleTranslator
from langdetect import detect

# Download necessary resources
nltk.download('stopwords')

# Load dataset
file_path = '/content/multi_language_sentiment_50.xlsx'
df = pd.read_excel(file_path)

# Ensure necessary columns exist
if 'Text' not in df.columns or 'Emotion' not in df.columns:
    raise ValueError("Dataset must contain 'Text' and 'Emotion' columns")

# Detect language
df['Language'] = df['Text'].astype(str).apply(lambda x: detect(x) if isinstance(x, str) else 'unknown')
language_counts = df['Language'].value_counts()
print("Languages in Dataset:")
print(language_counts)

# Translate text to English
df['Translated_Text'] = df['Text'].astype(str).apply(lambda x: GoogleTranslator(source='auto', target='en').translate(x))

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

# Apply preprocessing
df['Cleaned_Text'] = df['Translated_Text'].apply(preprocess_text)

# Encode labels
df['Emotion_Label'] = df['Emotion'].astype('category').cat.codes

# Check if dataset is sufficient for training
if df['Cleaned_Text'].nunique() < 2 or df['Emotion_Label'].nunique() < 2:
    raise ValueError("Dataset must have at least two unique texts and labels")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned_Text'], df['Emotion_Label'], test_size=0.2, random_state=42, stratify=df['Emotion_Label']
)

# Convert text to numerical features
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train model
model = LogisticRegression(max_iter=500)
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Calculate percentage of each sentiment
emotion_counts = df['Emotion'].value_counts(normalize=True) * 100
print("Sentiment Distribution (%):\n", emotion_counts)

# Save model and vectorizer
joblib.dump(model, "sentiment_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Languages in Dataset:
Language
fr    11
hi    10
de    10
en     9
es     9
tl     1
Name: count, dtype: int64
Model Accuracy: 0.6
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.50      1.00      0.67         1
           2       1.00      0.50      0.67         2
           3       0.50      0.50      0.50         2
           4       0.33      1.00      0.50         1
           5       1.00      0.50      0.67         2

    accuracy                           0.60        10
   macro avg       0.72      0.67      0.61        10
weighted avg       0.78      0.60      0.62        10

Sentiment Distribution (%):
 Emotion
Happy      22.0
Sad        16.0
Angry      16.0
Excited    16.0
Fearful    16.0
Neutral    14.0
Name: proportion, dtype: float64


['tfidf_vectorizer.pkl']

In [38]:
import pandas as pd

file_path = '/content/multi_language_sentiment_50.xlsx'

# Read the Excel file
df = pd.read_excel(file_path)

# Display the first few rows
print(df.head())


   ID                                      Text Language  Emotion
0   1                 I am feeling great today!  English    Happy
1   2                आज मेरा दिन बहुत अच्छा है!    Hindi    Happy
2   3                   Estoy muy triste ahora.  Spanish      Sad
3   4              Je suis tellement en colère!   French    Angry
4   5  Ich habe heute keine Lust auf irgendwas.   German  Neutral
