In [None]:
import os
import re
import string

import pandas as pd
import psycopg2

In [None]:
# Load the Indonesian/Vietnamese table from PostgreSQL into `df`.
#
# The connection string embeds the database password, so it is read from the
# DATABASE_URL environment variable rather than being stored in the notebook.
database_url = os.environ.get("DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the DATABASE_URL environment variable to the PostgreSQL connection string"
    )

query = "SELECT * FROM indonesian_vietnamese_words"
conn = psycopg2.connect(database_url)
try:
    # Build the frame in chunks of 1000 rows; adjust chunksize based on memory.
    chunks = pd.read_sql(query, conn, chunksize=1000)
    df = pd.concat(chunks, ignore_index=True)
finally:
    # Release the connection even if the query or the concat fails.
    conn.close()

# Analyze data 


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch
# Source text columns, taken from the DataFrame loaded from the database above
indonesian_texts = df['Indonesian']
vietnamese_texts = df['Vietnamese']


def _token_count(sentence):
    """Return the number of whitespace-separated tokens in one sentence."""
    return len(sentence.split())


# Per-row length features: word counts first, then character counts, so the
# new columns are appended to `df` in that order.
df['indo_word_count'] = indonesian_texts.map(_token_count)
df['viet_word_count'] = vietnamese_texts.map(_token_count)
df['indo_char_count'] = indonesian_texts.map(len)
df['viet_char_count'] = vietnamese_texts.map(len)

# describe() summaries of the length features, one table per unit
word_count_columns = ['indo_word_count', 'viet_word_count']
char_count_columns = ['indo_char_count', 'viet_char_count']
word_stats = df[word_count_columns].describe()
char_stats = df[char_count_columns].describe()

# Common words using CountVectorizer.
#
# Each language gets its own vectorizer: fit_transform() replaces the learned
# vocabulary, so a shared instance would pair the Indonesian counts with the
# Vietnamese feature names. No stop-word list is applied because the built-in
# 'english' list does not describe either language (this also matches the
# settings used by the plotting cell).
vectorizer_indo = CountVectorizer(max_features=10, stop_words=None)
indo_word_freq = vectorizer_indo.fit_transform(indonesian_texts).toarray().sum(axis=0)
indo_common_words = dict(zip(vectorizer_indo.get_feature_names_out(), indo_word_freq))

vectorizer_viet = CountVectorizer(max_features=10, stop_words=None)
viet_word_freq = vectorizer_viet.fit_transform(vietnamese_texts).toarray().sum(axis=0)
viet_common_words = dict(zip(vectorizer_viet.get_feature_names_out(), viet_word_freq))

# Sentence Pair Similarity using BERT
BERT_MODEL_NAME = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
model = BertModel.from_pretrained(BERT_MODEL_NAME)


def bert_embedding(text):
    """Embed `text` with the multilingual BERT model loaded above.

    Args:
        text: Text to encode; it is tokenized with truncation and padding enabled.

    Returns:
        The model's pooler output as a NumPy array (one row per encoded input).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    # NOTE(review): pooler_output is kept as the sentence vector to preserve the
    # original behaviour; confirm it is the intended representation for similarity.
    return outputs.pooler_output.numpy()


ind_embeddings = np.vstack([bert_embedding(text) for text in indonesian_texts])
viet_embeddings = np.vstack([bert_embedding(text) for text in vietnamese_texts])

# Cosine similarity of each aligned pair (row i against row i). Computing it
# row-wise avoids building the full n x n similarity matrix just to read its
# diagonal.
pair_dots = (ind_embeddings * viet_embeddings).sum(axis=1)
pair_norms = np.linalg.norm(ind_embeddings, axis=1) * np.linalg.norm(viet_embeddings, axis=1)
# A zero-length vector yields similarity 0 instead of a division by zero.
similarities = np.divide(
    pair_dots, pair_norms, out=np.zeros_like(pair_dots), where=pair_norms > 0
)

# Collapse the per-pair similarity scores into mean / min / max
similarity_stats = {
    name + '_similarity': reducer(similarities)
    for name, reducer in (('mean', np.mean), ('min', np.min), ('max', np.max))
}

# Print every result computed in this cell, one labelled entry per line group
report_sections = (
    ("Word Count Statistics:", word_stats),
    ("Character Count Statistics:", char_stats),
    ("Top Indonesian Words:", indo_common_words),
    ("Top Vietnamese Words:", viet_common_words),
    ("Translation Similarity Statistics:", similarity_stats),
)
for heading, payload in report_sections:
    print(heading, payload)


# Plot data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# The dataset is already loaded into `df` by the database cell above.

# Number of whitespace-separated words in every sentence, per language
df['indo_word_count'] = df['Indonesian'].map(lambda sentence: len(sentence.split()))
df['viet_word_count'] = df['Vietnamese'].map(lambda sentence: len(sentence.split()))

# Overlay both word-count distributions on one set of axes, using one bin per
# integer word count.
fig, ax = plt.subplots(figsize=(12, 6))
for count_column, language in (('indo_word_count', "Indonesian"), ('viet_word_count', "Vietnamese")):
    word_counts = df[count_column]
    ax.hist(word_counts, bins=range(1, max(word_counts) + 2), alpha=0.5, label=language)
ax.set_title("Word Count Frequency Histogram")
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
ax.legend(loc="upper right")
plt.show()

# Find top 10 words for each language.
# The text columns are read from `df`, the frame loaded from the database;
# each language has its own vectorizer so feature names and counts line up.
vectorizer_indo = CountVectorizer(max_features=10, stop_words=None)
indo_word_freq = vectorizer_indo.fit_transform(df['Indonesian']).toarray().sum(axis=0)
indo_common_words = dict(zip(vectorizer_indo.get_feature_names_out(), indo_word_freq))

vectorizer_viet = CountVectorizer(max_features=10, stop_words=None)
viet_word_freq = vectorizer_viet.fit_transform(df['Vietnamese']).toarray().sum(axis=0)
viet_common_words = dict(zip(vectorizer_viet.get_feature_names_out(), viet_word_freq))

def _plot_top_words(word_counts, title, color):
    """Show a bar chart of a word -> frequency mapping.

    Args:
        word_counts: Mapping from word to its frequency.
        title: Title displayed above the chart.
        color: Bar colour passed to matplotlib.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(list(word_counts.keys()), list(word_counts.values()), color=color)
    ax.set_title(title)
    ax.set_xlabel("Words")
    ax.set_ylabel("Frequency")
    plt.show()


# One chart per language, Indonesian first
_plot_top_words(indo_common_words, "Top 10 Most Frequent Words in Indonesian", 'skyblue')
_plot_top_words(viet_common_words, "Top 10 Most Frequent Words in Vietnamese", 'lightcoral')
