# Imports

In [1]:
import pandas as pd

import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import keras
from keras.preprocessing.text import Tokenizer
from keras.layers import SpatialDropout1D
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import TextVectorization
from keras import layers

import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans
from sklearn import metrics

from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt

# Import Data

In [2]:
# Seed NumPy's global RNG; the train_test_split calls below pass no random_state,
# so their shuffles draw from this global state.
np.random.seed(44)
# Fetch the NLTK stopwords corpus.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Runtime switch: the next cell mounts Google Drive only when env == colab.
colab = 'colab'
local = 'local'
env = colab
#env = local

In [4]:
if env == colab:
  # Mount Google Drive so the CSVs and the stop-word list can be read from it.
  from google.colab import drive
  drive.mount('/content/drive')
  data_path = '/content/drive/My Drive/eece5644_final_project/'
else:
  # Without this branch every later cell fails with NameError on data_path.
  # Assumes the data files sit in the working directory for local runs — adjust as needed.
  data_path = './'

Mounted at /content/drive


In [5]:
# Load the Tamil review CSVs from data_path.
train_df = pd.read_csv(data_path + 'tamil_movie_reviews_train.csv')
test_df = pd.read_csv(data_path + 'tamil_movie_reviews_test.csv')

In [6]:
# Drop exact duplicate rows and renumber the index from 0.
train_df = train_df.drop_duplicates().reset_index(drop=True)
test_df = test_df.drop_duplicates().reset_index(drop=True)

# Preprocess

In [7]:
#binary split
def binary_split(y):
  '''
  Collapse ratings into two classes.

  y: ratings (pandas Series or NumPy array); left unmodified
  returns: a copy of y in which every value <= 7 is replaced by 0 and
           every other value by 1
  '''
  mid_point = 7
  y_mini = y.copy()
  # Work on positions, not labels: y[i] on a Series is a label lookup and fails
  # (or reads the wrong row) when the index is not 0..n-1, e.g. after a shuffle.
  y_mini[:] = np.where(np.asarray(y) <= mid_point, 0, 1)
  return y_mini

In [8]:
def tertiary_split(y):
  '''
  Collapse ratings into three classes.

  y: ratings (pandas Series or NumPy array); left unmodified
  returns: a copy of y with values <= 6 replaced by 0, values in (6, 8]
           replaced by 1 and values > 8 replaced by 2
  '''
  lower_bound = 6
  upper_bound = 8
  y_mini = y.copy()
  values = np.asarray(y)
  # Positional and vectorized: y[i] on a Series is a label lookup and breaks on
  # any index other than 0..n-1. np.select takes the first matching condition;
  # values matching none (e.g. NaN) are left as they are.
  y_mini[:] = np.select(
      [values <= lower_bound, values <= upper_bound, values > upper_bound],
      [0, 1, 2],
      default=values)
  return y_mini

In [9]:
def polarize(x, y):
  '''
  Keep only clearly negative or clearly positive samples.

  x: samples (array-like; rows are samples when 2-D)
  y: ratings aligned with x by position
  returns: (x_new, y_mini_new) where x_new holds the samples whose rating is
           <= 6 or > 8, in their original order, and y_mini_new is a float
           array with 0 for the former and 1 for the latter
  '''
  lower_bound = 6
  upper_bound = 8
  ratings = np.asarray(y)
  positive = ratings > upper_bound
  keep = (ratings <= lower_bound) | positive
  # A boolean mask keeps each sample intact; appending rows one at a time with
  # np.append would flatten 2-D feature rows into one long 1-D array.
  x_new = np.asarray(x)[keep]
  y_mini_new = np.where(positive[keep], 1.0, 0.0)
  return x_new, y_mini_new

In [10]:
# Ratings are given in increments of 0.25, so let's just normalize them
# Pool the de-duplicated train and test frames into one frame with a fresh 0..n-1 index.
df = pd.concat([train_df, test_df], ignore_index=True)

# 4 * rating - 4 turns 0.25-steps into unit steps; then store the result as int.
df.Rating = df.Rating * 4 - 4
df.Rating = df.Rating.astype(int)

In [11]:
# Cite ChatGPT
def read_stop_words(filename):
    """Read a UTF-8 text file and return its lines as a list of stop words.

    Each line is stripped of surrounding whitespace; lines are returned in
    file order and blank lines come back as empty strings.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

# Load the Tamil stop-word list stored next to the data files.
filename = data_path + "TamilStopWords.txt"
stop_words_list = read_stop_words(filename)

In [12]:
def create_vectorizer(stop_words):
  '''
  Build a TF-IDF vectorizer.

  stop_words: forwarded unchanged as TfidfVectorizer's stop_words argument
  returns: the new, unfitted TfidfVectorizer
  '''
  tfidf = TfidfVectorizer(stop_words=stop_words)
  return tfidf

In [13]:
def vectorize_tamil_data():
  '''
  Turn the global `df` into a TF-IDF feature matrix and a rating target.

  Reads the Tamil stop-word file under `data_path`, fits a new TF-IDF
  vectorizer on every row of df.ReviewInTamil and densifies the result.

  returns: (X, y) where X is the dense TF-IDF array and y is df.Rating
  '''
  tamil_stop_words = read_stop_words(data_path + "TamilStopWords.txt")
  tfidf = create_vectorizer(stop_words = tamil_stop_words)

  # The vectorizer is fitted on all rows of df, before any train/test split.
  features = tfidf.fit_transform(df.ReviewInTamil).toarray()
  ratings = df.Rating

  return features, ratings


In [14]:
def vectorize_english_data():
  '''Placeholder for the English counterpart of vectorize_tamil_data; does nothing and returns None.'''
  return None

In [15]:
# NOTE(review): this English-stop-word vectorizer does not appear to be used below;
# vectorize_tamil_data() builds its own vectorizer from the Tamil stop-word list.
vectorizer = create_vectorizer(stop_words = 'english')

# vectorizer = TfidfVectorizer(stop_words = 'english')
#vectorizer = TextVectorization()

In [16]:
# X: dense TF-IDF matrix of the Tamil reviews; y: integer-rescaled ratings.
X, y = vectorize_tamil_data()



In [17]:
# Number of TF-IDF columns, i.e. the vocabulary size after stop-word removal.
num_features = X.shape[1]
print("Total number of features:", num_features)

Total number of features: 2589


In [18]:
# 80/20 split on the raw integer ratings. No random_state is passed, so the
# shuffle depends on the state of NumPy's global RNG when this cell runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20)

In [19]:
# Vocabulary and sequence-length limits used by the embedding layers and the IMDB loader further down.
max_features = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review

# Evaluation



In [20]:
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, multilabel_confusion_matrix, precision_recall_curve, precision_recall_fscore_support, roc_auc_score, roc_curve, ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
import time

def evaluate_model(y_true, x_test, clf, round=False, dim=3):
  '''
  Print and plot evaluation metrics for a fitted classifier.

  y_true: true class labels for test data
  x_test: test data features
  clf: model to evaluate; only its predict() method is used
  round: if True, round the predictions to the nearest integer before scoring
  dim: number of classes; one-vs-rest curves are drawn for classes 0..dim-1

  Shows the classification report, a confusion matrix, one precision/recall
  curve and one ROC curve per class, and a box plot of the time taken by
  predict() on each test sample individually.
  '''
  y_pred = clf.predict(x_test)
  if(round):
    y_pred = np.round(y_pred)
  print(classification_report(y_true, y_pred))
  ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
  for c in range(0, dim):
    # One-vs-rest encoding with class c as the positive label (1), so the
    # curves labelled "class c" describe how well class c itself is detected.
    predictions = [1 if label == c else 0 for label in y_pred]
    gold = [1 if label == c else 0 for label in y_true]
    PrecisionRecallDisplay.from_predictions(gold, predictions, name = f'Precison/Recall, class {str(c)}')
    RocCurveDisplay.from_predictions(gold, predictions, name = f'ROC, class {str(c)}')
  # Initialised outside the class loop so it exists even when dim == 0.
  inference_times = []
  for sample in x_test:
    temp_array = [sample]  # predict() expects a batch, so wrap the single sample
    # perf_counter is a monotonic high-resolution clock, suited to short intervals.
    start_time = time.perf_counter()
    clf.predict(temp_array)
    inference_times.append(time.perf_counter() - start_time)
  fig = px.box(inference_times)
  fig.show()

def compute_inference_times(y_true, x_test, clf, round=False, dim=3):
  '''
  Box-plot the time clf.predict() takes on each test sample individually.

  y_true: true class labels for test data (unused; kept for a signature parallel to evaluate_model)
  x_test: test data features
  clf: model to evaluate
  round, dim: unused; kept for a signature parallel to evaluate_model
  '''
  inference_times = []
  for sample in x_test:
    temp_array = [sample]  # predict() expects a batch, so wrap the single sample
    # perf_counter is a monotonic high-resolution clock, suited to short intervals.
    start_time = time.perf_counter()
    clf.predict(temp_array)
    inference_times.append(time.perf_counter() - start_time)
  fig = px.box(inference_times)
  fig.show()

# Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns

# --- Three-class (tertiary_split) model ---
model = GaussianNB()

y_mini = tertiary_split(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)

model.fit(X_train, y_train)

tertiary_bayes_model = model

y_pred = model.predict(X_test)

# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# --- Two-class (binary_split) model ---
# Use a fresh estimator: refitting `model` in place would also overwrite
# tertiary_bayes_model, because both names would refer to the same object.
model = GaussianNB()

y_mini = binary_split(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)

model.fit(X_train, y_train)

binary_bayes_model = model

y_pred = model.predict(X_test)

# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Inspection only: dumps the stop-word list and the review column.
print(stop_words_list)
print(df.ReviewInTamil)

# k-NN


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# load data from csv files
train_data = pd.read_csv(data_path + 'tamil_movie_reviews_train.csv')
test_data = pd.read_csv(data_path + 'tamil_movie_reviews_test.csv')

# drop duplicate rows, matching the preprocessing used for the other models
train_data = train_data.drop_duplicates().reset_index(drop=True)
test_data = test_data.drop_duplicates().reset_index(drop=True)

# merge train and test data together
df = pd.concat([train_data, test_data], ignore_index = True)

# ratings come in steps of 0.25; rescale them to consecutive integers
df.Rating = df.Rating * 4 - 4
df.Rating = df.Rating.astype(int)

# convert text data into TF-IDF features (vectorize_tamil_data reads the global df)
X, y = vectorize_tamil_data()

## Ternary Model
# convert ratings into categorical labels: negative, neutral, and positive
neg_bound = 6
pos_bound = 8
# Build the string labels in one vectorized step. Writing strings one element
# at a time into a copy of the integer Series relies on silent dtype upcasting,
# which newer pandas versions deprecate.
ratings = y.to_numpy()
y_categories = pd.Series(
    np.select([ratings <= neg_bound, ratings <= pos_bound],
              ['Negative', 'Neutral'],
              default='Positive'),
    index=y.index, name=y.name)
print(y_categories.value_counts())

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_categories, test_size=0.2, random_state = 0)

# neighbor values to check
neighbors = list(range(1, 51))
optimal_accuracy = 0
optimal_neighbors = 0

# loop through different neighbor values and choose value with highest accuracy
# NOTE(review): k is selected by accuracy on X_test, the same split the report
# below is computed on, so the reported scores are likely optimistic.
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors = n)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # strict '>' keeps the smallest k among ties
    if accuracy > optimal_accuracy:
        optimal_accuracy = accuracy
        optimal_neighbors = n

# train KNN model with the selected k
knn = KNeighborsClassifier(n_neighbors = optimal_neighbors)
knn.fit(X_train, y_train)

knn_tertiary_model = knn

# predict sentiment labels from test data
y_pred = knn.predict(X_test)
y_pred_series = pd.Series(y_pred)
print(y_pred_series.value_counts())

# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

## Binary Model
# convert ratings into categorical labels: negative and positive
bound = 7
# Vectorized labelling; avoids writing strings element by element into a copy
# of the integer Series, which relies on deprecated silent dtype upcasting.
y_categories = pd.Series(
    np.where(y.to_numpy() <= bound, 'Negative', 'Positive'),
    index=y.index, name=y.name)
print(y_categories.value_counts())

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_categories, test_size=0.2, random_state = 0)

# neighbor values to check
neighbors = list(range(1, 51))
optimal_accuracy = 0
optimal_neighbors = 0

# loop through different neighbor values and choose value with highest accuracy
# NOTE(review): k is selected by accuracy on X_test, the same split the report
# below is computed on, so the reported scores are likely optimistic.
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors = n)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # strict '>' keeps the smallest k among ties
    if accuracy > optimal_accuracy:
        optimal_accuracy = accuracy
        optimal_neighbors = n

# train KNN model with the selected k
knn = KNeighborsClassifier(n_neighbors = optimal_neighbors)
knn.fit(X_train, y_train)

knn_binary_model = knn


# predict sentiment labels from test data
y_pred = knn.predict(X_test)
y_pred_series = pd.Series(y_pred)
print(y_pred_series.value_counts())

# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# SVM - Linear Kernel

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

# Read the data
train_df = pd.read_csv(data_path + 'tamil_movie_reviews_train.csv')
test_df = pd.read_csv(data_path + 'tamil_movie_reviews_test.csv')

# Remove duplicates from the data
train_df = train_df.drop_duplicates().reset_index(drop=True)
test_df = test_df.drop_duplicates().reset_index(drop=True)

# Ratings are given in increments of 0.25, so let's just normalize them
df = pd.concat([train_df, test_df], ignore_index=True)

# Rescale to consecutive integers; a single cast is enough.
df.Rating = df.Rating * 4 - 4
df.Rating = df.Rating.astype(int)

# TF-IDF Vectorization (vectorize_tamil_data reads the global df)
X, y = vectorize_tamil_data()


# --- Three-class model ---
y_mini = tertiary_split(y)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)
# Initialize and train the SVM classifier with Linear kernel
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

tertiary_linear_svm_classifier = svm_classifier
# Predict the labels for the test set
y_pred = svm_classifier.predict(X_test)

# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()



# --- Two-class model (a new SVC instance, so the three-class model above is kept) ---
y_mini = binary_split(y)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)
# Initialize and train the SVM classifier with Linear kernel
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

binary_linear_svm_classifier = svm_classifier


# Predict the labels for the test set
y_pred = svm_classifier.predict(X_test)

# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# SVM - Polynomial Kernel

In [None]:
#polynomial Kernel
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

# Set random seed for reproducibility
np.random.seed(44)

# Download stopwords from NLTK
nltk.download('stopwords')

# Read the data
train_df = pd.read_csv(data_path + 'tamil_movie_reviews_train.csv')
test_df = pd.read_csv(data_path + 'tamil_movie_reviews_test.csv')

# Remove duplicates from the data
train_df = train_df.drop_duplicates().reset_index(drop=True)
test_df = test_df.drop_duplicates().reset_index(drop=True)

# Concatenate train and test data for preprocessing
df = pd.concat([train_df, test_df], ignore_index=True)

# Rescale the 0.25-step ratings to consecutive integers
df.Rating = df.Rating * 4 - 4
df.Rating = df.Rating.astype(int)

# TF-IDF Vectorization (vectorize_tamil_data reads the global df)
X, y = vectorize_tamil_data()

# --- Three-class model ---
y_mini = tertiary_split(y)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)

# Initialize and train the SVM classifier with polynomial kernel
svm_classifier = SVC(kernel='poly', degree=3)  # Set degree to the desired degree of the polynomial
svm_classifier.fit(X_train, y_train)

# Kept for later use; the three-class model is not evaluated in this cell.
tertiary_poly_svm_classifier = svm_classifier

# --- Two-class model ---
y_mini = binary_split(y)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)
# Initialize and train the SVM classifier with polynomial kernel (no degree passed, so SVC's default applies)
svm_classifier = SVC(kernel='poly')
svm_classifier.fit(X_train, y_train)

binary_poly_svm_classifier = svm_classifier

# Predict the labels for the test set (two-class model only)
y_pred = svm_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# SVM - RBF Kernel

In [None]:
#RBF Kernel
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

# Set random seed for reproducibility
np.random.seed(44)

# Download stopwords from NLTK
nltk.download('stopwords')

# Read the data
train_df = pd.read_csv(data_path + 'tamil_movie_reviews_train.csv')
test_df = pd.read_csv(data_path + 'tamil_movie_reviews_test.csv')

# Remove duplicates from the data
train_df = train_df.drop_duplicates().reset_index(drop=True)
test_df = test_df.drop_duplicates().reset_index(drop=True)

# Concatenate train and test data for preprocessing
df = pd.concat([train_df, test_df], ignore_index=True)

# Rescale the 0.25-step ratings to consecutive integers
df.Rating = df.Rating * 4 - 4
df.Rating = df.Rating.astype(int)

# TF-IDF Vectorization (vectorize_tamil_data reads the global df)
X, y = vectorize_tamil_data()

# --- Three-class model ---
y_mini = tertiary_split(y)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)

# Initialize and train the SVM classifier with RBF kernel
svm_classifier = SVC(kernel='rbf')
svm_classifier.fit(X_train, y_train)

# Kept for later use; the three-class model is not evaluated in this cell.
tertiary_rbf_svm_classifier = svm_classifier

# --- Two-class model ---
y_mini = binary_split(y)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)
# Initialize and train the SVM classifier with RBF kernel
svm_classifier = SVC(kernel='rbf')
svm_classifier.fit(X_train, y_train)

binary_rbf_svm_classifier = svm_classifier

# Predict the labels for the test set (two-class model only)
y_pred = svm_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# SVM - Sigmoid Kernel

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

# Set random seed for reproducibility
np.random.seed(44)

# Download stopwords from NLTK
nltk.download('stopwords')

# Read the data
train_df = pd.read_csv(data_path + 'tamil_movie_reviews_train.csv')
test_df = pd.read_csv(data_path + 'tamil_movie_reviews_test.csv')

# Remove duplicates from the data
train_df = train_df.drop_duplicates().reset_index(drop=True)
test_df = test_df.drop_duplicates().reset_index(drop=True)

# Concatenate train and test data for preprocessing
df = pd.concat([train_df, test_df], ignore_index=True)

# Rescale the 0.25-step ratings to consecutive integers
df.Rating = df.Rating * 4 - 4
df.Rating = df.Rating.astype(int)

# TF-IDF Vectorization (vectorize_tamil_data reads the global df)
X, y = vectorize_tamil_data()

# --- Three-class model ---
y_mini = tertiary_split(y)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)

# Initialize and train the SVM classifier with Sigmoid kernel
svm_classifier = SVC(kernel='sigmoid')
svm_classifier.fit(X_train, y_train)

# Kept for later use; the three-class model is not evaluated in this cell.
tertiary_sigmoid_svm_classifier = svm_classifier

# --- Two-class model ---
y_mini = binary_split(y)
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)
# Initialize and train the SVM classifier with Sigmoid kernel
svm_classifier = SVC(kernel='sigmoid')
svm_classifier.fit(X_train, y_train)

binary_sigmoid_svm_classifier = svm_classifier

# Predict the labels for the test set (two-class model only)
y_pred = svm_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# PCA

In [None]:
# Rebuild the TF-IDF features from whatever `df` currently holds, then split 80/20.
X, y = vectorize_tamil_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20)

In [None]:
# Histogram of the integer ratings; the figure is built but not displayed while fig.show() stays commented out.
fig = px.histogram(y)
# fig.show()

In [None]:
# Three-class labels (0/1/2) and their frequencies.
y_mini = tertiary_split(y)
y_mini.value_counts()

In [None]:
# Histogram of the class labels; the figure is built but not displayed while fig.show() stays commented out.
fig = px.histogram(y_mini)
# fig.show()

In [None]:
# Project the TF-IDF matrix onto its first three principal components.
pca = PCA(n_components=3)
components = pca.fit_transform(X)
X_pca = components  # alias consumed by the clustering section

In [None]:
# Axis labels keyed by component index (as a string), each showing the
# percentage of variance that component explains.
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

In [None]:
# 3-D scatter of the three principal components, coloured by class label.
fig = px.scatter_3d(
    components, x=0, y=1, z=2, color=y_mini,
    title=f'pca',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
)
fig.show()

In [None]:
# Pairwise scatter plots of the principal components, coloured by class label.
fig = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(0, 3),
    color=y_mini
)
# Hide the diagonal panels.
fig.update_traces(diagonal_visible=False)
fig.show()

# Cluster

In [None]:
# Seed before splitting so both the split and the K-means initialisations are
# reproducible (same order as in the binary clustering section).
np.random.seed(40)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_mini, test_size=.20)
distortions = []
inertias = []
mapping1 = {}  # k -> distortion
mapping2 = {}  # k -> inertia
K = range(1, 10)

for k in K:
    # Build and fit the model; one fit per k is enough.
    kmeanModel = KMeans(n_clusters=k).fit(X_train)

    # Distortion: mean Euclidean distance from each sample to its nearest centroid.
    distortion = np.min(cdist(X_train, kmeanModel.cluster_centers_,
                              'euclidean'), axis=1).mean()
    distortions.append(distortion)
    # Inertia: sum of squared distances to the nearest centroid, as reported by KMeans.
    inertias.append(kmeanModel.inertia_)

    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [None]:
# Distortion per number of clusters k.
for key, val in mapping1.items():
    print(f'{key} : {val}')

In [None]:
# Elbow plot: distortion against the number of clusters.
plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('The Elbow Method using Distortion')
plt.show()

In [None]:
# Inertia per number of clusters k.
for key, val in mapping2.items():
    print(f'{key} : {val}')

In [None]:
# Elbow plot: inertia against the number of clusters.
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()

In [None]:
# Specify the number of clusters (k); 3 matches the number of tertiary_split classes
k = 3

# Initialize K-means clustering and fit it on the PCA-reduced training split
kmeans = KMeans(n_clusters=k)
kmeans.fit(X_train)

# Get the cluster labels (one per training sample) and centroids
cluster_labels = kmeans.labels_
centroids = kmeans.cluster_centers_

In [None]:
# Create a DataFrame with cluster labels and 3D coordinates
# (the three "Feature" columns are the three PCA components of the training split)
df_plot = pd.DataFrame(X_train, columns=['Feature 1', 'Feature 2', 'Feature 3'])
df_plot['Cluster'] = cluster_labels.astype(str)  # Convert cluster labels to string for categorical coloring

# Create a 3D scatter plot using Plotly Express
fig = px.scatter_3d(df_plot, x='Feature 1', y='Feature 2', z='Feature 3', color='Cluster',
                    title='K-means Clustering in 3D', opacity=0.7)


# Set layout parameters for the 3D plot
fig.update_layout(scene=dict(
                    xaxis_title='Feature 1',
                    yaxis_title='Feature 2',
                    zaxis_title='Feature 3'))

# Show the 3D plot
fig.show()


In [None]:
# K-means cluster ids are arbitrary, so they cannot be compared with class
# labels directly. Map each cluster to the most frequent true class among its
# training members first, then score the mapped predictions.
train_clusters = kmeans.predict(X_train)
cluster_to_class = pd.crosstab(train_clusters, np.asarray(y_train)).idxmax(axis=1)
score = metrics.accuracy_score(y_train, cluster_to_class.loc[train_clusters].to_numpy())
score

In [None]:
# NOTE(review): kmeans.predict returns cluster ids, not class labels; the metrics
# below are only meaningful if the ids happen to coincide with the classes — confirm.
evaluate_model(y_test, X_test, kmeans)

# PCA (Binary)

In [None]:
# Rebuild the TF-IDF features from whatever `df` currently holds, then split 80/20.
X, y = vectorize_tamil_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20)

In [None]:
# Histogram of the integer ratings; the figure is built but not displayed while fig.show() stays commented out.
fig = px.histogram(y)
# fig.show()

In [None]:
# Two-class labels (0/1) and their frequencies.
y_mini = binary_split(y)
y_mini.value_counts()

In [None]:
# Histogram of the class labels; the figure is built but not displayed while fig.show() stays commented out.
fig = px.histogram(y_mini)
# fig.show()

In [None]:
# Project the TF-IDF matrix onto its first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(X)
X_pca = components  # alias consumed by the clustering section

In [None]:
# Axis labels keyed by component index (as a string), each showing the
# percentage of variance that component explains.
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

In [None]:
# 2-D scatter of the two principal components, coloured by class label.
fig = px.scatter(
    components, x=0, y=1, color=y_mini,
    title=f'pca',
    labels={'0': 'PC 1', '1': 'PC 2'}
)
fig.show()

In [None]:
# Pairwise scatter plots of the principal components, coloured by class label.
fig = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(0, 2),
    color=y_mini
)
# Hide the diagonal panels.
fig.update_traces(diagonal_visible=False)
fig.show()

# Cluster (Binary)

In [None]:
# Seed first so both the split and the K-means initialisations are reproducible.
np.random.seed(40)

X_train, X_test, y_train, y_test = train_test_split(X_pca, y_mini, test_size=.20)
distortions = []
inertias = []
mapping1 = {}  # k -> distortion
mapping2 = {}  # k -> inertia
K = range(1, 10)

for k in K:
    # Build and fit the model; one fit per k is enough.
    kmeanModel = KMeans(n_clusters=k).fit(X_train)

    # Distortion: mean Euclidean distance from each sample to its nearest centroid.
    distortion = np.min(cdist(X_train, kmeanModel.cluster_centers_,
                              'euclidean'), axis=1).mean()
    distortions.append(distortion)
    # Inertia: sum of squared distances to the nearest centroid, as reported by KMeans.
    inertias.append(kmeanModel.inertia_)

    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [None]:
# Distortion per number of clusters k.
for key, val in mapping1.items():
    print(f'{key} : {val}')

In [None]:
# Elbow plot: distortion against the number of clusters.
plt.plot(K, distortions, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Distortion')
plt.title('The Elbow Method using Distortion')
plt.show()

In [None]:
# Inertia per number of clusters k.
for key, val in mapping2.items():
    print(f'{key} : {val}')

In [None]:
# Elbow plot: inertia against the number of clusters.
plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()

In [None]:
# Specify the number of clusters (k); 2 matches the number of binary_split classes
k = 2

# Initialize K-means clustering and fit it on the PCA-reduced training split
kmeans = KMeans(n_clusters=k)
kmeans.fit(X_train)

# Get the cluster labels (one per training sample) and centroids
cluster_labels = kmeans.labels_
centroids = kmeans.cluster_centers_

In [None]:
# Create a DataFrame with cluster labels and 2D coordinates
# (the two "Feature" columns are the two PCA components of the training split)
df_plot = pd.DataFrame(X_train, columns=['Feature 1', 'Feature 2'])
df_plot['Cluster'] = cluster_labels.astype(str)  # Convert cluster labels to string for categorical coloring

# Create a 2D scatter plot using Plotly Express
fig = px.scatter(df_plot, x='Feature 1', y='Feature 2', color='Cluster',
                    title='K-means Clustering in 2D', opacity=0.7)


# Set the axis titles; a 2D figure takes them on the layout itself
# (`scene` only applies to 3D plots)
fig.update_layout(xaxis_title='Feature 1',
                  yaxis_title='Feature 2')

# Show the 2D plot
fig.show()


In [None]:
# K-means cluster ids are arbitrary, so they cannot be compared with class
# labels directly. Map each cluster to the most frequent true class among its
# training members, then score the mapped predictions on the test split.
cluster_to_class = pd.crosstab(kmeans.predict(X_train), np.asarray(y_train)).idxmax(axis=1)
score = metrics.accuracy_score(y_test, cluster_to_class.loc[kmeans.predict(X_test)].to_numpy())
score

In [None]:
# Two classes here, so restrict the per-class curves to classes 0 and 1
# (the default dim=3 would also plot a class 2 that does not exist).
# NOTE(review): kmeans.predict returns cluster ids, not class labels; the metrics
# are only meaningful if the ids happen to coincide with the classes — confirm.
evaluate_model(y_test, X_test, kmeans, dim=2)

# LSTM | Neural Net

In [None]:
# Vocabulary and sequence-length limits for the embedding layers and the IMDB loader below.
max_features = 20000  # Only consider the top 20k words
maxlen = 200  # Only consider the first 200 words of each movie review

In [None]:
def save(model, path='/content/drive/My Drive/eece5644_final_project/lstm_model.keras'):
  '''
  Save a model to disk.

  model: object exposing save(path), e.g. a Keras model
  path: destination file; defaults to the project's lstm_model.keras on Drive
  '''
  model.save(path)

def load(path='/content/drive/My Drive/eece5644_final_project/lstm_model.keras'):
  '''
  Load a saved Keras model.

  path: file to read; defaults to the project's lstm_model.keras on Drive
  returns: the model returned by keras.models.load_model
  '''
  return keras.models.load_model(path)

In [None]:
def eval(model, x_test, y_test, batch_size=32):
  '''
  Print a classification report and plot a confusion matrix for a neural model.

  model: fitted model whose predict(x, batch_size=...) returns class scores
         (Keras-style) or integer class labels
  x_test: test features
  y_test: true class labels
  batch_size: batch size forwarded to model.predict
  '''
  scores = np.asarray(model.predict(x_test, batch_size=batch_size))
  # classification_report and crosstab need hard class labels, not scores.
  if np.issubdtype(scores.dtype, np.integer):
    y_pred = scores.ravel()
  elif scores.ndim > 1 and scores.shape[-1] > 1:
    # one score per class: pick the highest-scoring class
    y_pred = scores.argmax(axis=-1)
  else:
    # a single sigmoid output: threshold at 0.5
    y_pred = (scores.ravel() > 0.5).astype(int)

  # classification report
  report = classification_report(y_test, y_pred)
  print("Classification Report:")
  print(report)

  # confusion matrix
  plt.figure(figsize=(8, 6))
  conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
  sns.heatmap(conf_matrix, annot=True, fmt='d')
  plt.title('Confusion Matrix')
  plt.show()


In [None]:
# Code based off of this: https://keras.io/examples/nlp/bidirectional_lstm_imdb/

# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier: three classes need three softmax outputs
outputs = layers.Dense(3, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

y_mini = tertiary_split(y)
# NOTE(review): X is the TF-IDF matrix from vectorize_tamil_data, while the
# Embedding layer expects integer token ids — confirm the intended input.
x_train, x_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)
# tertiary_split yields integer labels 0/1/2, so use the sparse variant of the loss.
# NOTE(review): learning_rate=0.1 is far above Adam's default of 0.001 — confirm it is intended.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.1), loss="sparse_categorical_crossentropy", metrics=['accuracy'])

In [None]:
# PLEASE DON'T RUN COSTS GPU
# Train for 10 epochs, validating on the held-out split, then save the model to Drive.
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_test, y_test))
model.save('/content/drive/My Drive/eece5644_final_project/tertiary_lstm_model.keras')

In [None]:
# Code based off of this: https://keras.io/examples/nlp/bidirectional_lstm_imdb/

# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier: one sigmoid unit for the two classes
# (softmax over a single unit always outputs 1.0, so the model could not learn)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()

y_mini = binary_split(y)
# NOTE(review): X is the TF-IDF matrix from vectorize_tamil_data, while the
# Embedding layer expects integer token ids — confirm the intended input.
x_train, x_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)
# NOTE(review): learning_rate=0.1 is far above Adam's default of 0.001 — confirm it is intended.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.1), loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# PLEASE DON'T RUN COSTS GPU
# Train for 10 epochs, validating on the held-out split, then save the model to Drive.
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_test, y_test))
model.save('/content/drive/My Drive/eece5644_final_project/binary_lstm_model.keras')

# English Preprocessing

In [None]:
# https://keras.io/api/datasets/imdb/

# Load the IMDB review dataset, limited to the max_features most frequent words.
# Note: this rebinds x_train / y_train / x_test / y_test to the English data.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(
    num_words=max_features
)
# Use pad_sequence to standardize sequence length:
# this will truncate sequences longer than 200 words and zero-pad sequences shorter than 200 words.
x_train = keras.utils.pad_sequences(x_train, maxlen=maxlen)
x_test = keras.utils.pad_sequences(x_test, maxlen=maxlen)

# English Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns

# Gaussian naive Bayes fitted directly on the padded sequences of word indices.
model = GaussianNB()

model.fit(x_train, y_train)

english_bayes_model = model

y_pred = model.predict(x_test)

# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# k-NN on English Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# neighbor values to check
neighbors = list(range(1, 51))
optimal_accuracy = 0
optimal_neighbors = 0

# loop through different neighbor values and choose value with highest accuracy
# NOTE(review): k is selected by accuracy on x_test, the same split the report
# below is computed on, so the reported scores are likely optimistic.
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors = n)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)

    # strict '>' keeps the smallest k among ties
    if accuracy > optimal_accuracy:
        optimal_accuracy = accuracy
        optimal_neighbors = n

# train KNN model with the selected k
knn = KNeighborsClassifier(n_neighbors = optimal_neighbors)
knn.fit(x_train, y_train)

english_knn_classifier = knn

# predict sentiment labels from test data
y_pred = knn.predict(x_test)
y_pred_series = pd.Series(y_pred)
print(y_pred_series.value_counts())

# classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# confusion matrix
plt.figure(figsize=(8, 6))
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# SVM on English Data

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fit a support-vector classifier with a LINEAR kernel on the English training
# split; both names below refer to the same fitted estimator.
english_svm_classifier = SVC(kernel='linear').fit(x_train, y_train)
svm_classifier = english_svm_classifier

# Labels predicted for the held-out split
y_pred = svm_classifier.predict(x_test)

# Text summary of the per-class metrics
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")

# Confusion matrix: actual labels as rows, predicted labels as columns
conf_matrix = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual Label'],
    colnames=['Predicted Label'],
)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()

# LSTM on English Data

In [None]:
# CODE TAKEN FROM: https://keras.io/examples/nlp/bidirectional_lstm_imdb/
#
# Builds a binary classifier: embedding -> two stacked bidirectional LSTMs ->
# single sigmoid unit. `max_features` (the embedding's input dimension) is
# defined elsewhere in the notebook.

# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs; the first returns the full sequence so the second
# LSTM has a sequence to consume
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# NOTE(review): looks like leftover debugging output (prints the symbolic tensor)
print(x)
# Add a classifier
outputs = layers.Dense(1, activation="sigmoid")(x)
# NOTE: rebinds the notebook-level name `model` to this Keras model
model = keras.Model(inputs, outputs)
model.summary()

In [None]:
# Adam optimiser with binary cross-entropy loss; accuracy is tracked as a metric
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for two epochs in mini-batches of 32; the test split is passed as the
# validation data reported after each epoch.
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=2,
    validation_data=(x_test, y_test),
)

In [None]:
# Persist the trained model to the project folder on the mounted drive
model.save(
    '/content/drive/My Drive/eece5644_final_project/english_lstm_model.keras'
)

# PCA | English

In [None]:
# Load the IMDB review dataset with the vocabulary capped via num_words=max_features
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=max_features)
# Standardise sequence length with pad_sequences: every review in both splits
# is padded / truncated to `maxlen` entries.
x_train, x_test = (
    keras.utils.pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)

In [None]:
# Project the padded training sequences onto their first two principal
# components; `X_pca` and `components` are two names for the same array.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(x_train)
components = X_pca

In [None]:
# Axis labels keyed by component index (as a string), each annotated with the
# percentage of variance that component explains.
labels = dict(
    (str(i), f"PC {i+1} ({var:.1f}%)")
    for i, var in enumerate(100 * pca.explained_variance_ratio_)
)

In [None]:
# 2-D scatter of the two principal components, coloured by the training labels
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=y_train,
    title='pca',
    labels={'0': 'PC 1', '1': 'PC 2'},
)
fig.show()

In [None]:
# Pairwise scatter matrix over both principal components, coloured by label;
# the diagonal panels are hidden.
fig = px.scatter_matrix(components, labels=labels, dimensions=range(2), color=y_train)
fig.update_traces(diagonal_visible=False)
fig.show()

# K-Means | English

In [None]:
# Seed the global NumPy RNG BEFORE the split so that both the train/test split
# and the K-means initialisations below are reproducible.
np.random.seed(40)

# NOTE: this rebinds `y_train` to the training part of the split, so the cell
# cannot be re-run without first re-running the cell that loads the data.
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_train, test_size=.20)

distortions = []   # mean distance of each point to its nearest centroid, per k
inertias = []      # K-means inertia, per k
mapping1 = {}      # k -> distortion
mapping2 = {}      # k -> inertia
K = range(1, 10)

for k in K:
    # Build and fit the model once for this k
    kmeanModel = KMeans(n_clusters=k).fit(X_train)

    # Distance from every training point to its closest centroid, averaged
    nearest_centroid_dist = np.min(
        cdist(X_train, kmeanModel.cluster_centers_, 'euclidean'), axis=1
    )
    distortion = nearest_centroid_dist.sum() / X_train.shape[0]

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)

    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [None]:
# One "k : distortion" line per candidate number of clusters
for key in mapping1:
    val = mapping1[key]
    print(f'{key} : {val}')

In [None]:
# Elbow curve: distortion as a function of the number of clusters
plt.plot(K, distortions, 'bx-')
plt.gca().set(
    xlabel='Values of K',
    ylabel='Distortion',
    title='The Elbow Method using Distortion',
)
plt.show()

In [None]:
# One "k : inertia" line per candidate number of clusters
for key in mapping2:
    val = mapping2[key]
    print(f'{key} : {val}')

In [None]:
# Elbow curve: inertia as a function of the number of clusters
plt.plot(K, inertias, 'bx-')
plt.gca().set(
    xlabel='Values of K',
    ylabel='Inertia',
    title='The Elbow Method using Inertia',
)
plt.show()

In [None]:
# Number of clusters to fit
k = 2

# Create the K-means estimator and fit it on the training split in one step
kmeans = KMeans(n_clusters=k).fit(X_train)

# Per-sample cluster assignments and the fitted cluster centres
cluster_labels, centroids = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Create a DataFrame with the two feature columns and the cluster assignment
df_plot = pd.DataFrame(X_train, columns=['Feature 1', 'Feature 2'])
df_plot['Cluster'] = cluster_labels.astype(str)  # Convert cluster labels to string for categorical coloring

# Create a 2D scatter plot using Plotly Express (the data has only two
# feature columns, so the title must not claim a 3D view)
fig = px.scatter(df_plot, x='Feature 1', y='Feature 2', color='Cluster',
                 title='K-means Clustering in 2D', opacity=0.7)

# Axis titles for a 2D figure are set on the layout directly; `scene` only
# applies to 3D plots and would be ignored here.
fig.update_layout(xaxis_title='Feature 1', yaxis_title='Feature 2')

# Show the plot
fig.show()


In [None]:
# K-means cluster ids are arbitrary (cluster 0 need not correspond to label 0),
# so comparing them to the class labels directly is meaningless. Map every
# cluster to the majority true label among its training members first, then
# score the mapped predictions.
train_clusters = kmeans.predict(X_train)
true_labels = np.asarray(y_train)
cluster_to_label = (
    pd.Series(true_labels)
    .groupby(train_clusters)
    .agg(lambda members: members.mode().iloc[0])
)
score = metrics.accuracy_score(true_labels, cluster_to_label.loc[train_clusters].to_numpy())
score

In [None]:
# Run the notebook's evaluate_model helper (defined elsewhere) on the held-out
# part of the split with the fitted K-means model.
# NOTE(review): if evaluate_model compares raw cluster ids against the labels,
# the result depends on the arbitrary cluster numbering — confirm.
evaluate_model(y_test, X_test, kmeans)

# Evaluation


In [None]:
# Build the feature matrix and ratings with the notebook's vectorize_tamil_data helper
X, y = vectorize_tamil_data()

## Ternary Model
# convert ratings into categorical labels: negative, neutral, and positive
neg_bound = 6
pos_bound = 8
ratings = np.asarray(y)
# Work on an object-dtype copy and assign with boolean masks: writing strings
# element by element into a numeric Series relies on implicit dtype upcasting.
# Entries matching none of the masks are left unchanged, as before.
y_categories_tertiary = pd.Series(y, copy=True).astype(object)
y_categories_tertiary[ratings <= neg_bound] = 'Negative'
y_categories_tertiary[(ratings > neg_bound) & (ratings <= pos_bound)] = 'Neutral'
y_categories_tertiary[ratings > pos_bound] = 'Positive'

# Numeric class ids for the models evaluated on them below
y_mini = tertiary_split(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)

# NOTE(review): the models below were fitted elsewhere in the notebook; if they
# were trained on a different random split, these test rows may overlap with
# their training data — confirm.
print("Evaluating tertiary bayes model")
evaluate_model(y_test, X_test, tertiary_bayes_model)
print("Evaluating tertiary SVM model")
evaluate_model(y_test, X_test, tertiary_linear_svm_classifier)
print("Evaluating tertiary kNN model")
# The kNN model is evaluated against the string category labels, so re-split
X_train, X_test, y_train, y_test = train_test_split(X, y_categories_tertiary, test_size=.20)
evaluate_model(y_test, X_test, knn_tertiary_model)

In [None]:
# Evaluate the saved tertiary LSTM on a fresh 80/20 split of the Tamil data.
X, y = vectorize_tamil_data()
# Map the raw ratings to class ids with the notebook's tertiary_split helper
y_mini = tertiary_split(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)
# Load the previously saved model from the project folder on the mounted drive
model = keras.models.load_model('/content/drive/My Drive/eece5644_final_project/tertiary_lstm_model.keras')

# This generates plots but throws an error if not all classes contain values
# NOTE(review): the meaning of the positional `True` and of `dim` is defined by
# evaluate_model elsewhere in the notebook — confirm against its signature.
evaluate_model(y_test, X_test, model, True, dim=3)

In [None]:
# Evaluate the saved binary LSTM on a fresh 80/20 split of the Tamil data.
X, y = vectorize_tamil_data()
# Map the raw ratings to 0/1 with binary_split (ratings <= 7 -> 0, otherwise 1)
y_mini = binary_split(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)
# Load the previously saved model from the project folder on the mounted drive
model = keras.models.load_model('/content/drive/My Drive/eece5644_final_project/binary_lstm_model.keras')

# This generates plots but throws an error if not all classes contain values
# NOTE(review): the meaning of the positional `True` and of `dim` is defined by
# evaluate_model elsewhere in the notebook — confirm against its signature.
evaluate_model(y_test, X_test, model, True, dim=2)

In [None]:
# Reload the IMDB data (vocabulary capped via num_words=max_features) and
# pad / truncate both splits to `maxlen` entries.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=max_features)
x_train, x_test = (
    keras.utils.pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)

# Restore the English LSTM that an earlier cell saved to the mounted drive
model = keras.models.load_model(
    '/content/drive/My Drive/eece5644_final_project/english_lstm_model.keras'
)

# This generates plots but throws an error if not all classes contain values
# evaluate_model(y_test, x_test, model, True, dim=2)


In [None]:
## Binary Model
# convert ratings into categorical labels: negative and positive
bound = 7
ratings = np.asarray(y)
# Build the labels on `y_categories_binary` itself: the previous loop created
# this copy but then read and wrote `y_categories`, which is not defined in
# this cell. Boolean masks on an object-dtype copy avoid writing strings into
# a numeric Series; entries matching neither mask are left unchanged.
y_categories_binary = pd.Series(y, copy=True).astype(object)
y_categories_binary[ratings <= bound] = 'Negative'
y_categories_binary[ratings > bound] = 'Positive'
print(y_categories_binary.value_counts())

y_mini = binary_split(y)
X_train, X_test, y_train, y_test = train_test_split(X, y_mini, test_size=.20)



# NOTE(review): every other call site in this notebook passes
# (y_test, X_test, model, ...) to evaluate_model; these single-argument calls
# probably need the matching test split for each model (binary / tertiary /
# English) — confirm against evaluate_model's signature before relying on them.
evaluate_model(binary_bayes_model)

evaluate_model(knn_tertiary_model)
evaluate_model(knn_binary_model)

evaluate_model(tertiary_linear_svm_classifier)
evaluate_model(binary_linear_svm_classifier)

evaluate_model(english_bayes_model)
evaluate_model(english_knn_classifier)
# evaluate_model(english_svm_classifier)

In [None]:
# Seed the global NumPy RNG BEFORE the split so that both the train/test split
# and the K-means initialisations below are reproducible.
np.random.seed(40)

# NOTE(review): `X_pca` was last assigned from the English (IMDB) PCA while
# `y_mini` comes from the Tamil ratings; the two must have the same number of
# rows for this split to work — confirm which PCA projection is intended here.
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_mini, test_size=.20)

distortions = []   # mean distance of each point to its nearest centroid, per k
inertias = []      # K-means inertia, per k
mapping1 = {}      # k -> distortion
mapping2 = {}      # k -> inertia
K = range(1, 10)

for k in K:
    # Build and fit the model once for this k
    kmeanModel = KMeans(n_clusters=k).fit(X_train)

    # Distance from every training point to its closest centroid, averaged
    nearest_centroid_dist = np.min(
        cdist(X_train, kmeanModel.cluster_centers_, 'euclidean'), axis=1
    )
    distortion = nearest_centroid_dist.sum() / X_train.shape[0]

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)

    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_

In [None]:
# One "k : distortion" line per candidate number of clusters
for key in mapping1:
    val = mapping1[key]
    print(f'{key} : {val}')

In [None]:
# Elbow curve: distortion as a function of the number of clusters
plt.plot(K, distortions, 'bx-')
plt.gca().set(
    xlabel='Values of K',
    ylabel='Distortion',
    title='The Elbow Method using Distortion',
)
plt.show()

In [None]:
# One "k : inertia" line per candidate number of clusters
for key in mapping2:
    val = mapping2[key]
    print(f'{key} : {val}')

In [None]:
# Elbow curve: inertia as a function of the number of clusters
plt.plot(K, inertias, 'bx-')
plt.gca().set(
    xlabel='Values of K',
    ylabel='Inertia',
    title='The Elbow Method using Inertia',
)
plt.show()

In [None]:
# Number of clusters to fit
k = 3

# Create the K-means estimator and fit it on the training split in one step
kmeans = KMeans(n_clusters=k).fit(X_train)

# Per-sample cluster assignments and the fitted cluster centres
cluster_labels, centroids = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Tabulate the three feature columns together with the cluster assignment.
# Cluster ids are stringified so Plotly treats them as categories.
# NOTE(review): assumes X_train has exactly three columns — confirm the PCA
# feeding this section was fitted with n_components=3.
df_plot = pd.DataFrame(
    X_train, columns=['Feature 1', 'Feature 2', 'Feature 3']
).assign(Cluster=cluster_labels.astype(str))

# 3D scatter of the clustered points via Plotly Express
fig = px.scatter_3d(
    df_plot,
    x='Feature 1',
    y='Feature 2',
    z='Feature 3',
    color='Cluster',
    title='K-means Clustering in 3D',
    opacity=0.7,
)

# Axis titles of a 3D figure live under the layout's `scene`
fig.update_layout(
    scene={
        'xaxis_title': 'Feature 1',
        'yaxis_title': 'Feature 2',
        'zaxis_title': 'Feature 3',
    }
)

# Render the figure
fig.show()


In [None]:
# K-means cluster ids are arbitrary and here there are more clusters (k) than
# there need be classes, so comparing ids to the class labels directly is
# meaningless. Map every cluster to the majority true label among its training
# members first, then score the mapped predictions.
train_clusters = kmeans.predict(X_train)
true_labels = np.asarray(y_train)
cluster_to_label = (
    pd.Series(true_labels)
    .groupby(train_clusters)
    .agg(lambda members: members.mode().iloc[0])
)
score = metrics.accuracy_score(true_labels, cluster_to_label.loc[train_clusters].to_numpy())
score

In [None]:
# Run the notebook's evaluate_model helper (defined elsewhere) on the held-out
# part of the split with the fitted K-means model.
# NOTE(review): if evaluate_model compares raw cluster ids against the labels,
# the result depends on the arbitrary cluster numbering — confirm.
evaluate_model(y_test, X_test, kmeans)