In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio

# Carga de dados e pré-processamento

In [None]:
# Load the book metadata table and preview its first rows.
data = pd.read_csv(filepath_or_buffer='books_data.csv')
data.head(n=5)

In [None]:
# Inspect the dtype pandas inferred for each column of the book metadata
data.dtypes

In [None]:
# Percentage (0-100) of missing values in each column of the book metadata
missing_percent = data.isna().sum().div(len(data)).mul(100)
missing_percent

In [None]:
# Load the reviews table, report how many rows it has, and preview it.
ratings = pd.read_csv(filepath_or_buffer='Books_rating.csv')
print(ratings.shape[0])
ratings.head(n=5)

In [None]:
# Inspect the dtype pandas inferred for each column of the reviews table
ratings.dtypes

In [None]:
# Percentage (0-100) of missing values in each column of the reviews table
missing_percent = ratings.isna().sum().div(len(ratings)).mul(100)
missing_percent

In [None]:
# Inner-join the book metadata with the reviews on the shared "Title" column.
# pandas matches null join keys against each other, so rows with a missing
# Title are dropped from both sides first; otherwise every untitled review
# would be paired with every untitled book.
df = pd.merge(
    data.dropna(subset=['Title']),
    ratings.dropna(subset=['Title']),
    on='Title',
)
df.shape

In [None]:
# Remove fully duplicated rows and report the resulting (rows, columns) shape
df = df.drop_duplicates()
df.shape

In [None]:
# Preview the merged, de-duplicated table
df.head()

In [None]:
# Turn the review timestamp (Unix epoch seconds) into a formatted
# "YYYY-MM-DD HH:MM:SS" string. The column holds strings afterwards, so this
# cell is meant to run once per load (re-running feeds strings to to_datetime).
df['time'] = (
    pd.to_datetime(df['time'], unit='s')
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)

In [None]:
# Fix encoding issue (step 1): make every summary a string.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['summary'] selection: that chained in-place
# form warns in recent pandas and no longer updates df under Copy-on-Write.
df['summary'] = df['summary'].fillna("")
# Preview a couple of rows whose summary still contains the escaped
# double-quote entity. regex=False searches for the literal '&quot;' so
# ordinary words such as "quote" are not matched.
df[df['summary'].str.contains('&quot;', regex=False)].head(2)

In [None]:
# Decode the HTML entity for a double quote inside the review fields.
# Series.replace only swaps cells whose *entire* value equals '&quot;', so
# entities embedded in longer text were left untouched. The substring-level
# .str.replace is used instead; regex=False treats the pattern literally.
# Missing values stay missing.
df['summary'] = df['summary'].str.replace('&quot;', '"', regex=False)
df['text'] = df['text'].str.replace('&quot;', '"', regex=False)

# Análise exploratória

## Distribuição dos scores

In [None]:
# Histogram of the review scores (nbins=5)
fig = px.histogram(
    data_frame=df,
    x="score",
    nbins=5,
    title="Distribucao dos scores dos livros",
)
fig.update_xaxes(title_text="Score")
fig.update_yaxes(title_text="Cantidad")
fig.show()

## Quantidade de reviews e média de score por usuário

In [None]:
# Per-user statistics: number of reviews (non-null titles) and mean score
user_stats = df.groupby('User_id').agg(
    num_reviews=('Title', 'count'),
    mean_rating=('score', 'mean'),
)

# One point per user; hovering shows the user id taken from the index
fig = px.scatter(
    user_stats,
    x='num_reviews',
    y='mean_rating',
    hover_name=user_stats.index,
    title='User Stats',
)
fig.show()

## Classificação de usuários: uma proposta

In [None]:
# Users with more than 50 reviews whose mean rating is strictly between 2 and 4
interesting_users = user_stats.loc[
    lambda stats: (stats['num_reviews'] > 50)
    & (stats['mean_rating'] > 2)
    & (stats['mean_rating'] < 4)
]
interesting_users.index.unique()

In [None]:
# Users with more than 10 reviews and a mean rating below 2.5
haters = user_stats.loc[
    lambda stats: (stats['num_reviews'] > 10) & (stats['mean_rating'] < 2.5)
]
haters

In [None]:
# Users with more than 50 reviews and a mean rating above 4.5
lovers = user_stats.loc[
    lambda stats: (stats['num_reviews'] > 50) & (stats['mean_rating'] > 4.5)
]
lovers

## Autores com melhores e piores scores

In [None]:
# Mean review score per author, limited to authors with more than 1000
# reviews (rows with a non-null Title).
mask = df.groupby('authors')['Title'].count() > 1000
average_scores_by_author = df.groupby('authors')['score'].mean().loc[mask]

# Ten best-rated authors, reversed so the highest score is drawn at the top
top_authors = (
    average_scores_by_author
    .sort_values(ascending=False)
    .iloc[:10]
    .iloc[::-1]
)
ax = top_authors.plot.barh(figsize=(7, 7))
ax.set_title('Top 10 Autores')
ax.set_xlabel('Avg score')
ax.set_ylabel('Autores')
plt.show()

In [None]:
# Ten worst-rated authors among those kept in average_scores_by_author,
# reversed so the lowest score is drawn at the top of the chart.
bottom_10_authors = average_scores_by_author.sort_values().iloc[:10].iloc[::-1]
ax = bottom_10_authors.plot.barh(figsize=(7, 7))
ax.set_title('Top 10 Autores (lowest scores)', fontsize=15)
ax.set_xlabel('Average Ratings')
ax.set_ylabel('Autores')
# Fix the x axis to the 0-5 range
ax.set_xlim(0, 5)
plt.show()

## Performance por gênero

In [None]:
# Mean score per category, keeping only categories with more than 200
# scored reviews. Note that `mask` holds the per-category counts here.
numeric_data = df.dropna(subset=['score'])
mask = numeric_data.groupby('categories')['score'].count()
avg_cat_rating = numeric_data.groupby('categories')['score'].mean().loc[mask > 200]

# Ten best categories (reversed, best drawn last) and ten worst categories
top = avg_cat_rating.sort_values(ascending=False).iloc[:10].iloc[::-1]
bottom = avg_cat_rating.sort_values().iloc[:10]

# Draw both groups as horizontal bars on a single axes
ax = plt.figure(figsize=(10, 10)).add_subplot(111)

ax.barh(bottom.index, bottom, color='red', label='Piores 10')
ax.barh(top.index, top, color='blue', label='Melhores 10')

ax.set_title('Gêneros melhores e piores rankeados', fontsize=15)
ax.set_xlabel('Score')
ax.legend()
plt.show()

## Performance por editora

In [None]:
# TODO: analyze review scores per publisher (not implemented yet)

# Dataset para tarefas de NLP

In [None]:
# Dataset for NLP related tasks: review summary and review body joined by a
# newline, paired with the review score.
df['all'] = df['summary'] + '\n' + df['text']
nlp = df[['all', 'score']]
# Keep a 1% random sample. The fixed random_state makes the sample, and the
# file exported from it, identical across kernel restarts.
nlp = nlp.sample(frac=0.01, random_state=42)

In [None]:
# Write the sampled NLP table to disk; with to_csv's defaults the DataFrame
# index is written as the first column.
nlp.to_csv('book_texts.csv')

In [None]:
# Preview the sampled NLP table
nlp.head()