In [None]:
import csv

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the saved reviews page from disk.
file_path = '5_REVIEWS.html'
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(content, 'html.parser')

# Each review becomes one [name, mark, review_text] row. A field that cannot be
# located is stored as 'N/A' so that every row keeps the same width.
# NOTE(review): the class names below look build-generated; the selectors may
# need updating if the page markup changes - confirm against the saved HTML.
data = []
review_list_container = soup.find('div', {'class': 'Responsesstyled__StyledList-sc-150koqm-5 ca-dtrb'})

if review_list_container:
    # Only the container's direct <div> children are treated as reviews.
    review_items = review_list_container.find_all('div', recursive=False)

    for review in review_items:
        # Name: text of the link tagged data-test="link-text".
        name_tag = review.find('a', {'data-test': 'link-text'})
        name = name_tag.text.strip() if name_tag else 'N/A'

        # Mark: read from the grade element's `value` attribute. `.get()` is used
        # instead of ['value'] so a grade tag without that attribute yields 'N/A'
        # rather than raising KeyError and aborting the whole scrape.
        mark_tag = review.find('div', {'class': 'Grade__sc-m0t12o-0 gJnnyh'})
        mark = mark_tag.get('value', 'N/A') if mark_tag else 'N/A'

        # Review text: the link carrying a data-gtm-click attribute inside the
        # item-text element.
        review_text_tag = review.find('div', {'class': 'Responsesstyled__StyledItemText-sc-150koqm-3 iPpiJn'})
        review_text_link = review_text_tag.find('a', {'data-gtm-click': True}) if review_text_tag else None
        review_text = review_text_link.text.strip() if review_text_link else 'N/A'

        data.append([name, mark, review_text])


In [None]:
# Save the scraped rows to a CSV file.
csv_file_path = '5reviews_data.csv'
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # Every row in `data` is [name, mark, review_text], so the header needs three
    # labels; with only two, the header and the rows have different widths.
    writer.writerow(['Name', 'Mark', 'Review'])
    writer.writerows(data)

print(f"Data saved to {csv_file_path}")

In [None]:
import pandas as pd

# Load the individual review CSV files.
dataset1 = pd.read_csv('negative_reviews_data.csv')
dataset2 = pd.read_csv('2reviews_data.csv')
dataset3 = pd.read_csv('3reviews_data.csv')
dataset4 = pd.read_csv('4reviews_data.csv')
# Spelled exactly like the file this notebook writes ('5reviews_data.csv');
# the upper-case variant is a different path on case-sensitive filesystems.
dataset5 = pd.read_csv('5reviews_data.csv')

# Stack the datasets on top of each other.
combined_dataset = pd.concat([dataset1, dataset2, dataset3, dataset4, dataset5])

# Save the combined dataset to a new CSV file (the index is not written out).
combined_dataset.to_csv('combined_reviews.csv', index=False)



In [None]:
# Read the combined reviews file back in and preview its first rows.
df = pd.read_csv('combined_reviews.csv')
df.head()

In [None]:
# Load the semicolon-separated validation file; this rebinds `df`.
df = pd.read_csv('validation_data2.csv', sep=';')

df.head()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Count how many rows fall into each (Mark, Sentiment, model_mark) combination
# and expose the tally as a flat table with a 'Count' column.
group_keys = ['Mark', 'Sentiment', 'model_mark']
group_sizes = df.groupby(group_keys).size()
grouped = group_sizes.reset_index(name='Count')

# Print the results
print(grouped)

In [None]:
# Drop every row that has a missing value in any column, then re-check the
# frame summary. `df` is rebound to the filtered frame.
df = df.dropna()
df.info()

In [None]:
# Cast the two columns being compared ('Sentiment' and 'human_mark') to the
# categorical dtype.
df['Sentiment'] = df['Sentiment'].astype('category')
df['human_mark'] = df['human_mark'].astype('category')

# 'human_mark' is used as the reference (y_true) and 'Sentiment' as the
# prediction (y_pred) in every metric below.
y_true = df['human_mark']
y_pred = df['Sentiment']

# Calculate accuracy, plus support-weighted precision, recall, and F1-score
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Print the classification report
# NOTE(review): target_names are display names applied to the labels in their
# sorted order, so ['2', '0', '1'] is only correct if the sorted labels map to
# those codes in exactly that order - confirm against the column values.
report = classification_report(y_true, y_pred, target_names=['2', '0', '1'])
print(report)

# Print overall metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')

In [None]:
# `grouped` holds one row per (Mark, Sentiment, model_mark). The chart only
# splits by Mark and model_mark, so total the counts per bar first; otherwise
# barplot's default estimator would draw the mean of several rows, not a count.
plot_counts = grouped.groupby(['Mark', 'model_mark'], as_index=False)['Count'].sum()

plt.figure(figsize=(12, 8))
sns.set(style="whitegrid")
barplot = sns.barplot(x='Mark', y='Count', hue='model_mark', data=plot_counts, palette='viridis')

# Add annotations to the bars
for p in barplot.patches:
    height = p.get_height()
    # Skip patches without a positive height (NaN or 0); labelling them would
    # only add stray 'nan' / '0' texts to the figure.
    if not height > 0:
        continue
    barplot.annotate(format(height, '.0f'),
                     (p.get_x() + p.get_width() / 2., height),
                     ha='center', va='center',
                     xytext=(0, 9), textcoords='offset points')

# Customize the plot
plt.title('Count of Each Model Sentiment per Banki.ru Review Mark')
plt.xlabel('Mark (Star)')
plt.ylabel('Count')

# Place the legend outside the plot
plt.legend(title='Model sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

# tight_layout keeps the outside legend from being clipped in the saved image.
plt.tight_layout()
output_plot_path = 'bankiru_mark_VS_sentiment.png'

# Save before show(): the figure is written to disk while it is still current.
plt.savefig(output_plot_path)
plt.show()

print(f"Plot saved to {output_plot_path}")


In [None]:
# Store the star mark as integers, then display the frame.
df['Mark'] = df['Mark'].astype(int)
df

In [None]:

# Fixed colour per sentiment label, taken from the first three "hls" colours.
hls_colors = sns.color_palette("hls", 8)
custom_palette = {
    'negative': hls_colors[0],  # Red
    'neutral': hls_colors[1],   # Yellow
    'positive': hls_colors[2]   # Green
}

# `grouped` holds one row per (Mark, Sentiment, model_mark). The chart only
# splits by Mark and model_mark, so total the counts per bar first; otherwise
# barplot's default estimator would draw the mean of several rows, not a count.
plot_counts = grouped.groupby(['Mark', 'model_mark'], as_index=False)['Count'].sum()

# Create the plot
plt.figure(figsize=(12, 8))
sns.set(style="whitegrid")
barplot = sns.barplot(x='Mark', y='Count', hue='model_mark', data=plot_counts, palette=custom_palette)

# Add annotations to the bars
for p in barplot.patches:
    height = p.get_height()
    # Skip patches without a positive height (NaN or 0); labelling them would
    # only add stray 'nan' / '0' texts to the figure.
    if not height > 0:
        continue
    barplot.annotate(format(height, '.0f'),
                     (p.get_x() + p.get_width() / 2., height),
                     ha='center', va='center',
                     xytext=(0, 9), textcoords='offset points')

# Customize the plot
plt.title('Count of Each Model Sentiment per Banki.ru Review Mark(star)')
plt.xlabel('Number of Stars given by the User')
plt.ylabel('Count')

# Place the legend outside the plot
plt.legend(title='Model sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

# tight_layout keeps the outside legend from being clipped in the saved image.
plt.tight_layout()
output_plot_path = 'bankiru_mark_VS_sentiment777.png'

# Save before show(): the figure is written to disk while it is still current.
plt.savefig(output_plot_path)
plt.show()

print(f"Plot saved to {output_plot_path}")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

def expected_sentiment(star):
    """Map a star mark to the sentiment label expected for it.

    Args:
        star: The star mark of a review (compared by value, so 3 and 3.0 match).

    Returns:
        'negative' for 1-2 stars, 'neutral' for 3 stars, 'positive' for 4-5
        stars, and None for any other value.
    """
    # 3 must not be listed with the negative marks: doing so makes the neutral
    # branch unreachable.
    if star in (1, 2):
        return 'negative'
    if star == 3:
        # A plain string, like the other labels (not a one-element list).
        return 'neutral'
    if star in (4, 5):
        return 'positive'
    return None
    
# Derive the star-based reference label for every row.
df['expected_sentiment'] = df['Mark'].apply(expected_sentiment)

# Reference labels come from the star mark; predictions are the model's labels.
reference_labels = df['expected_sentiment']
model_labels = df['model_mark']

# Accuracy, followed by support-weighted precision, recall and F1 (in that order).
accuracy = accuracy_score(reference_labels, model_labels)
precision, recall, f1 = (
    weighted_scorer(reference_labels, model_labels, average='weighted')
    for weighted_scorer in (precision_score, recall_score, f1_score)
)

# Print overall metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')
