# **SENTIMENT ANALYSIS OF TELUGU PRODUCT REVIEWS WITH MACHINE LEARNING**

## **Importing libraries**

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from sklearn.utils import shuffle
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import string
import re
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## **Importing the dataset**



A part of Sentiraama corpus (created by IIIT Hyderabad) is used for this project. specifically electronical product reviews are taken and both positive and negative reviews are mixed and stored in a dataframe.

In [None]:
def read_file_sections(filename):
    sections = []
    current_section = ""

    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() == "__________________________":
                if current_section:
                    sections.append(current_section.strip())
                    current_section = ""
            else:
                current_section += line

    if current_section:
        sections.append(current_section.strip())

    return sections


In [None]:
positive = read_file_sections(r"/content/product_pos.txt")

In [None]:
negative = read_file_sections(r"/content/product_neg.txt")

In [None]:
df_negative = pd.DataFrame({'text': negative, 'label': 0})
df_positive = pd.DataFrame({'text': positive, 'label': 1})

# Concatenate and shuffle
df = pd.concat([df_negative, df_positive], ignore_index=True)
df = shuffle(df).reset_index(drop=True)


In [None]:
df

Unnamed: 0,text,label
0,ఇది ఉపయోగించడానికి భయంకరమైన కెమెరా ఈ కెమెరా న...,0
1,LG43LH600T LEDTV పిక్చర్ నాణ్యత అస్సలు బాగోలే...,0
2,బడ్జెట్ ఫోన్గా నేటి మధ్యతరగతి ప్రజలకు అందుబాటు...,1
3,HD రెడీ LED TV బ్లాక్ 5 లో 4 3 మొత్తం 16 కస...,0
4,సామ్సంగ్ ఒక ప్రపంచ ఉనికిని కలిగిన దక్షిణ ఖొరియ...,1
...,...,...
195,హలో ఫ్రెండ్స్ నేను ఈ ఫోన్ ను 4 నెలల నుండి ఉపయో...,0
196,ఈ కెమెరా ఉపయోగించడానికి మంచిది కాదు దాని కెమెర...,0
197,సామ్సంగ్ LED చాలా మంచి టెలివిజన్ మీ ఎంపిక ప్రక...,1
198,ఈ ప్రోడక్ట్ లోని సౌండ్ నాణ్యత పిక్చర్ నాణ్యతత...,1


## **Get n-grams**

 n-grams (bi-grams and tri-grams) are used to generate features from text data, aiding in understanding local context and dependencies within reviews.

In [None]:
def get_ngrams(text, n):
  """
  Extracts n-grams from a text string.

  Args:
      text (str): Text string.
      n (int): Number of words for n-grams.

  Returns:
      list: List of n-grams.
  """

  tokens = text.split()
  ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
  return ngrams

## **Feature selection**

Feature selection is primarily achieved through the generation of n-grams, specifically bi-grams and tri-grams, from the text data. These n-grams capture local context and dependencies within the reviews, providing valuable information for sentiment analysis. By extracting features at the word sequence level, the model gains insights into how certain phrases or combinations of words influence sentiment, enhancing its ability to accurately classify text based on sentiment.

In [None]:
def create_features_custom(df):
  """
  Creates a list of lists containing n-grams for each row.

  Args:
      df (pandas.DataFrame): DataFrame containing a text column ('text' assumed).

  Returns:
      pandas.DataFrame: DataFrame with a new column containing lists of n-grams.
  """

  features = []
  for index, row in df.iterrows():
      text = row['text']
      unigrams = get_ngrams(text, 1)
      bigrams = get_ngrams(text, 2)
      trigrams = get_ngrams(text, 3)

      # Combine n-grams into a single list
      features.append(unigrams + bigrams + trigrams)

  # Add a new column containing the list of n-grams for each row
  df['n_grams'] = features

  return df


In [None]:
df = create_features_custom(df.copy())

In [None]:
df.head()

Unnamed: 0,text,label,n_grams
0,ఇది ఉపయోగించడానికి భయంకరమైన కెమెరా ఈ కెమెరా న...,0,"[(ఇది,), (ఉపయోగించడానికి,), (భయంకరమైన,), (కెమె..."
1,LG43LH600T LEDTV పిక్చర్ నాణ్యత అస్సలు బాగోలే...,0,"[(LG43LH600T,), (LEDTV,), (పిక్చర్,), (నాణ్యత,..."
2,బడ్జెట్ ఫోన్గా నేటి మధ్యతరగతి ప్రజలకు అందుబాటు...,1,"[(బడ్జెట్,), (ఫోన్గా,), (నేటి,), (మధ్యతరగతి,),..."
3,HD రెడీ LED TV బ్లాక్ 5 లో 4 3 మొత్తం 16 కస...,0,"[(HD,), (రెడీ,), (LED,), (TV,), (బ్లాక్,), (5,..."
4,సామ్సంగ్ ఒక ప్రపంచ ఉనికిని కలిగిన దక్షిణ ఖొరియ...,1,"[(సామ్సంగ్,), (ఒక,), (ప్రపంచ,), (ఉనికిని,), (క..."


Loading a pre-trained Telugu BERT model and tokenizer, a function to generate embeddings for Telugu text using the model, and applying this function to each row of a DataFrame to compute embeddings for the text in the 'text' column, storing the results in a new column named 'text_embeddings' in the DataFrame.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load pre-trained Telugu BERT model and tokenizer
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to get embeddings for Telugu text
def get_telugu_embeddings(text):
    # Tokenize the text
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Forward pass through the model
    with torch.no_grad():
        outputs = model(**tokens)
    # Extract embeddings from the output
    embeddings = torch.mean(outputs.last_hidden_state, dim=1)  # Taking mean of token embeddings
    return embeddings

# Assuming df is your DataFrame with the column 'text'
# Apply the function to each row of the DataFrame
df['text_embeddings'] = df['text'].apply(get_telugu_embeddings)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
df.head()

Unnamed: 0,text,label,n_grams,text_embeddings
0,ఇది ఉపయోగించడానికి భయంకరమైన కెమెరా ఈ కెమెరా న...,0,"[(ఇది,), (ఉపయోగించడానికి,), (భయంకరమైన,), (కెమె...","[[tensor(0.3579), tensor(-0.2924), tensor(0.10..."
1,LG43LH600T LEDTV పిక్చర్ నాణ్యత అస్సలు బాగోలే...,0,"[(LG43LH600T,), (LEDTV,), (పిక్చర్,), (నాణ్యత,...","[[tensor(0.3046), tensor(-0.4699), tensor(0.01..."
2,బడ్జెట్ ఫోన్గా నేటి మధ్యతరగతి ప్రజలకు అందుబాటు...,1,"[(బడ్జెట్,), (ఫోన్గా,), (నేటి,), (మధ్యతరగతి,),...","[[tensor(0.4140), tensor(-0.6023), tensor(-0.0..."
3,HD రెడీ LED TV బ్లాక్ 5 లో 4 3 మొత్తం 16 కస...,0,"[(HD,), (రెడీ,), (LED,), (TV,), (బ్లాక్,), (5,...","[[tensor(0.1105), tensor(-0.4242), tensor(-0.0..."
4,సామ్సంగ్ ఒక ప్రపంచ ఉనికిని కలిగిన దక్షిణ ఖొరియ...,1,"[(సామ్సంగ్,), (ఒక,), (ప్రపంచ,), (ఉనికిని,), (క...","[[tensor(0.1758), tensor(-0.5474), tensor(0.00..."


## **Data Splitting**

In [None]:
# Split data into training and testing sets (assuming you still need this)
X_train, X_test, y_train, y_test = train_test_split(df.drop(['label','text'], axis=1), df['label'], test_size=0.2, random_state=42)

In [None]:
print(X_train)

                                               n_grams  \
79   [(ఇది,), (నేను,), (ఇప్పటివరకు,), (చూసిన,), (చె...   
197  [(సామ్సంగ్,), (LED,), (చాలా,), (మంచి,), (టెలివ...   
38   [(నేను,), (ఈ,), (కెమెరాని,), (ఉపయోగించాను,), (...   
24   [(హలో,), (ఫ్రెండ్స్,), (నేను,), (నేడు,), (ఒక,)...   
122  [(నికోన్,), (D70,), (నా,), (దగ్గర,), (ఉన్న,), ...   
..                                                 ...   
106  [(హాయ్,), (గైస్,), (ఈ,), (కోర్యో,), (lcd,), (T...   
14   [(ఇది,), (మంచి,), (TV,), (కాదు,), (రంగు,), (నా...   
92   [(హలో,), (ఫ్రెండ్స్,), (జనవరి,), (28,), (2004,...   
179  [(జియోమి,), (రెడ్మి,), (నోట్,), (3,), (మంచి,),...   
102  [(ఈ,), (శ్రేణిలో,), (ఇది,), (చాలా,), (మంచి,), ...   

                                       text_embeddings  
79   [[tensor(0.1961), tensor(-0.2596), tensor(0.04...  
197  [[tensor(0.3107), tensor(-0.3924), tensor(0.00...  
38   [[tensor(0.1761), tensor(-0.3943), tensor(0.07...  
24   [[tensor(0.4012), tensor(-0.2932), tensor(0.06...  
122  [[tensor(0.27

## **Model Selection**

In [None]:
# Model Building
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Model Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
print(X_train)

                                               n_grams  \
79   [(ఇది,), (నేను,), (ఇప్పటివరకు,), (చూసిన,), (చె...   
197  [(సామ్సంగ్,), (LED,), (చాలా,), (మంచి,), (టెలివ...   
38   [(నేను,), (ఈ,), (కెమెరాని,), (ఉపయోగించాను,), (...   
24   [(హలో,), (ఫ్రెండ్స్,), (నేను,), (నేడు,), (ఒక,)...   
122  [(నికోన్,), (D70,), (నా,), (దగ్గర,), (ఉన్న,), ...   
..                                                 ...   
106  [(హాయ్,), (గైస్,), (ఈ,), (కోర్యో,), (lcd,), (T...   
14   [(ఇది,), (మంచి,), (TV,), (కాదు,), (రంగు,), (నా...   
92   [(హలో,), (ఫ్రెండ్స్,), (జనవరి,), (28,), (2004,...   
179  [(జియోమి,), (రెడ్మి,), (నోట్,), (3,), (మంచి,),...   
102  [(ఈ,), (శ్రేణిలో,), (ఇది,), (చాలా,), (మంచి,), ...   

                                       text_embeddings  
79   [[tensor(0.1961), tensor(-0.2596), tensor(0.04...  
197  [[tensor(0.3107), tensor(-0.3924), tensor(0.00...  
38   [[tensor(0.1761), tensor(-0.3943), tensor(0.07...  
24   [[tensor(0.4012), tensor(-0.2932), tensor(0.06...  
122  [[tensor(0.27

In [None]:
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": BernoulliNB()
}

X_train_concatenated = np.concatenate(X_train['text_embeddings'].values).reshape(len(X_train), -1)
X_test_concatenated = np.concatenate(X_test['text_embeddings'].values).reshape(len(X_test), -1)

trained_models = {}  # Dictionary to store trained models

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train_concatenated, y_train)
    y_pred = model.predict(X_test_concatenated)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print()

    # Save the trained model in the dictionary
    trained_models[name] = model



Decision Tree Accuracy: 0.50
              precision    recall  f1-score   support

           0       0.39      0.60      0.47        15
           1       0.65      0.44      0.52        25

    accuracy                           0.50        40
   macro avg       0.52      0.52      0.50        40
weighted avg       0.55      0.50      0.51        40

[[ 9  6]
 [14 11]]

Logistic Regression Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.63      0.80      0.71        15
           1       0.86      0.72      0.78        25

    accuracy                           0.75        40
   macro avg       0.74      0.76      0.74        40
weighted avg       0.77      0.75      0.75        40

[[12  3]
 [ 7 18]]

SVC Accuracy: 0.42
              precision    recall  f1-score   support

           0       0.39      1.00      0.57        15
           1       1.00      0.08      0.15        25

    accuracy                           0.42        40
   ma

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Accuracy: 0.57
              precision    recall  f1-score   support

           0       0.47      0.93      0.62        15
           1       0.90      0.36      0.51        25

    accuracy                           0.57        40
   macro avg       0.68      0.65      0.57        40
weighted avg       0.74      0.57      0.55        40

[[14  1]
 [16  9]]

Naive Bayes Accuracy: 0.57
              precision    recall  f1-score   support

           0       0.45      0.67      0.54        15
           1       0.72      0.52      0.60        25

    accuracy                           0.57        40
   macro avg       0.59      0.59      0.57        40
weighted avg       0.62      0.57      0.58        40

[[10  5]
 [12 13]]



## **Hyper-paramter tuning**

## **Hyper-parameter tuning for RandomForest**

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf_model = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train_concatenated, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters:", best_params)
print("Best score:", best_score)


Best parameters: {'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Best score: 0.71875


## **Hyper-parameter tuning for Logistic regression**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid for logistic regression
param_grid_logistic = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],               # Penalty norm
    'solver': ['liblinear', 'saga']        # Optimization algorithm
}

# Instantiate the Logistic Regression model
logistic_model = LogisticRegression()

# Create a GridSearchCV object
grid_search_logistic = GridSearchCV(estimator=logistic_model, param_grid=param_grid_logistic, cv=5, scoring='accuracy')

# Perform the grid search
grid_search_logistic.fit(X_train_concatenated, y_train)




In [None]:
# Get the best parameters and best score
best_params_logistic = grid_search_logistic.best_params_
best_score_logistic = grid_search_logistic.best_score_

# Print the best parameters and best score
print("Best parameters for Logistic Regression:", best_params_logistic)
print("Best score for Logistic Regression:", best_score_logistic)

Best parameters for Logistic Regression: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best score for Logistic Regression: 0.80625


## **Best Model**

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=300, max_depth=5, min_samples_split=2,
                                  min_samples_leaf=2, max_features='log2')

In [None]:
rf_model.fit(X_train_concatenated, y_train)

In [None]:
y_pred_rf_best = rf_model.predict(X_test_concatenated)
accuracy_rf_best = accuracy_score(y_test, y_pred_rf_best)
print(f"Random Forest after Hyperparameter tuning Accuracy: {accuracy_rf_best :.2f}")
print(classification_report(y_test, y_pred_rf_best))
print(confusion_matrix(y_test, y_pred_rf_best))

Random Forest after Hyperparameter tuning Accuracy: 0.57
              precision    recall  f1-score   support

           0       0.46      0.80      0.59        15
           1       0.79      0.44      0.56        25

    accuracy                           0.57        40
   macro avg       0.62      0.62      0.57        40
weighted avg       0.66      0.57      0.57        40

[[12  3]
 [14 11]]


## Logistic regression

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Instantiate Logistic Regression model with the best parameters
best_logistic_model = LogisticRegression(**best_params_logistic)


In [None]:
# Train the Logistic Regression model on the training data
best_logistic_model.fit(X_train_concatenated, y_train)


In [None]:
# Predict labels for the test data
y_pred_logistic_best = best_logistic_model.predict(X_test_concatenated)

# Calculate accuracy
accuracy_logistic_best = accuracy_score(y_test, y_pred_logistic_best)

# Print accuracy and other metrics
print(f"Logistic Regression after Hyperparameter tuning Accuracy: {accuracy_logistic_best:.2f}")
print(classification_report(y_test, y_pred_logistic_best))
print(confusion_matrix(y_test, y_pred_logistic_best))

Logistic Regression after Hyperparameter tuning Accuracy: 0.72
              precision    recall  f1-score   support

           0       0.62      0.67      0.65        15
           1       0.79      0.76      0.78        25

    accuracy                           0.73        40
   macro avg       0.71      0.71      0.71        40
weighted avg       0.73      0.72      0.73        40

[[10  5]
 [ 6 19]]


# **Model Prediction**

In [None]:
import torch

def get_text_embeddings(text, model_name, tokenizer_name="bert-base-uncased"):
    """
    Gets text embeddings for a new text input using a pre-trained model.

    Args:
        text (str): The new text input.
        model_name (str): The name of the pre-trained model (e.g., "ai4bharat/indic-bert", "bert-base-uncased").
        tokenizer_name (str, optional): The name of the tokenizer (defaults to "bert-base-uncased").
        device (str or torch.device, optional): The device to use for computations (CPU or GPU). Defaults to None (CPU).

    Returns:
        torch.Tensor: The text embedding vector.
    """

    from transformers import AutoTokenizer, AutoModel

    # Load tokenizer and model based on provided names
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    model = AutoModel.from_pretrained(model_name)

    # Tokenize the text
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)


    # Forward pass through the model (without gradients)
    with torch.no_grad():
        outputs = model(**tokens)

    # Extract embeddings (consider different pooling strategies)
    embeddings = torch.mean(outputs.last_hidden_state, dim=1)  # Averaging

    return embeddings

In [None]:
new_review = "ఈ ఫోన్ డి అద్భుతం అయినా డిజైన్ మంచి బాటరీ లేటెస్ట్ ఆండ్రాయిడ్ కూడా ఉండటం ప్లస్ అని చెపొచ్చు"
model_name = "ai4bharat/indic-bert"
emb = get_text_embeddings(new_review,model_name)
unigrams_new = get_ngrams(new_review, 1)
bigrams_new = get_ngrams(new_review, 2)
trigrams_new = get_ngrams(new_review, 3)

n_gram_new_review = unigrams_new + bigrams_new + trigrams_new

rf_prediction = rf_model.predict(emb.cpu().numpy().reshape(1, -1))
lr_prediction = best_logistic_model.predict(emb.cpu().numpy().reshape(1, -1))
print(f"Random Forest prediction{rf_prediction}")
print(f"Logistic Regression prediction{lr_prediction}")


Random Forest prediction[1]
Logistic Regression prediction[1]


In [None]:
import pickle

with open('rf_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

with open('best_logistic_model.pkl', 'wb') as f:
    pickle.dump(best_logistic_model, f)
