In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# Read the books and the ratings data
books = pd.read_csv("./Resources/Books.csv", low_memory = False)
ratings = pd.read_csv("./Resources/Ratings.csv", low_memory = False)

#Combine both into a single DataFrame
book_ratings = books.merge(ratings, how = "outer", on = "ISBN")

# Display data for preview
book_ratings.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,,
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8.0,5.0
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676.0,8.0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544.0,8.0
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866.0,9.0


In [3]:
book_ratings.dtypes

ISBN                    object
Book-Title              object
Book-Author             object
Year-Of-Publication     object
Publisher               object
User-ID                float64
Book-Rating            float64
dtype: object

In [4]:
book_ratings.count()

ISBN                   555195
Book-Title             505366
Book-Author            505365
Year-Of-Publication    505366
Publisher              505364
User-ID                433671
Book-Rating            433671
dtype: int64

In [5]:
book_ratings = book_ratings.dropna(how = "any")

In [6]:
book_ratings.count()

ISBN                   383839
Book-Title             383839
Book-Author            383839
Year-Of-Publication    383839
Publisher              383839
User-ID                383839
Book-Rating            383839
dtype: int64

In [7]:
book_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8.0,5.0
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676.0,8.0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544.0,8.0
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866.0,9.0
5,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,123629.0,9.0
...,...,...,...,...,...,...,...
505352,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,275318.0,10.0
505358,1845170423,Cocktail Classics,David Biggs,2004,Connaught,275970.0,7.0
505360,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,276313.0,5.0
505361,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463.0,7.0


In [8]:
book_ratings["User-ID"] = pd.to_numeric(book_ratings["User-ID"], errors='coerce')

In [9]:
book_ratings["User-ID"] = book_ratings["User-ID"].astype(int)

In [10]:
book_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5.0
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8.0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544,8.0
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866,9.0
5,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,123629,9.0
...,...,...,...,...,...,...,...
505352,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,275318,10.0
505358,1845170423,Cocktail Classics,David Biggs,2004,Connaught,275970,7.0
505360,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,276313,5.0
505361,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463,7.0


In [15]:
# !pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
     ---------------------------------------- 0.0/55.1 kB ? eta -:--:--
     ---------------------------------------- 55.1/55.1 kB 1.4 MB/s eta 0:00:00
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Obtaining dependency information for hstspreload from https://files.pythonhosted.org/packages/b5/9f/83329ebd2808e04f2564051e4c4a880a1e2e67bd6410899f728096d0e22f/hstspreload-2024.2.1-py3-none-any.whl.metadata
  Downloading hstspreload-2024.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
     ---------------------------------------- 0.0/133.4 kB ? eta -:--:--
     --

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 2.1.1 requires sentencepiece, which is not installed.


In [None]:
from googletrans import Translator

title = book_ratings["Book-Title"]

def translate_to_english(title, translator):
    try:
        # Translate the title to English
        translation = translator.translate(title, dest='en')
        return translation.text
    except Exception as e:
        # Handle any potential errors during translation
#         print(f"Error translating '{title}': {str(e)}")
        return title

# Create a Translator object
translator = Translator()

# Apply translation to the 'Title' column
book_ratings['Translated Title'] = book_ratings["Book-Title"].apply(lambda x: translate_to_english(x, translator))

# Display the DataFrame with translated titles
print(book_ratings)

In [12]:
import unicodedata

def normalize_text(input_text, form='NFD'):
    normalized_text = unicodedata.normalize(form, input_text)
    return normalized_text

# Example usage
original_title = "Thérè àrè sòmè spéciâl chäräctérs"
normalized_title = normalize_text(original_title)

print("Original Title:", original_title)
print("Normalized Title:", normalized_title)

Original Title: Thérè àrè sòmè spéciâl chäräctérs
Normalized Title: Thérè àrè sòmè spéciâl chäräctérs


In [13]:
def convert_special_characters(title):
    # Define a mapping of special characters to their English alphabet equivalents
    special_characters_mapping = {
        'à': 'a', 'á': 'a', 'â': 'a', 'ä': 'a',
        'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e',
        'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i',
        'ò': 'o', 'ó': 'o', 'ô': 'o', 'ö': 'o',
        'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u',
        # Add more mappings as needed
    }

    # Apply the mapping to the title
    converted_title = ''.join(special_characters_mapping.get(char, char) for char in title)

    return converted_title

# Example usage
original_title = "Thérè àrè sòmè spéciâl chäräctérs"
converted_title = convert_special_characters(original_title)

In [14]:
converted_title

'There are some special characters'

In [11]:
book_ratings = book_ratings.reset_index()

In [12]:
book_ratings.nunique()

index                  383839
ISBN                   149833
Book-Title             135565
Book-Author             62112
Year-Of-Publication       106
Publisher               11574
User-ID                 68091
Book-Rating                10
dtype: int64

In [13]:
# Use Label Encoding from scikit-learn to encode ISBNs which are categorical
from sklearn.preprocessing import LabelEncoder

isbn_encoder = LabelEncoder()
book_ratings["ISBN_encoded"] = isbn_encoder.fit_transform(book_ratings["ISBN"])

# Reverse the encoding - we will use this to reverse the encoding later if needed
# book_ratings['ISBN_decoded'] = isbn_encoder.inverse_transform(book_ratings['ISBN_encoded'])

In [34]:
book_ratings.dtypes

index                    int64
ISBN                    object
Book-Title              object
Book-Author             object
Year-Of-Publication     object
Publisher               object
User-ID                  int32
Book-Rating            float64
ISBN_encoded             int32
dtype: object

In [30]:
display(book_ratings.loc(["ISBN_encoded"] == "074322678X"))


<pandas.core.indexing._LocIndexer at 0x16759b1dfe0>

In [15]:
book_ratings.to_csv("Output/book_ratings.csv", index = False)

In [None]:
n_users = book_ratings["User-ID"].unique().shape[0]
n_items = book_ratings["ISBN_encoded"].unique().shape[0]
n_items = book_ratings["ISBN_encoded"].max()
A = np.zeros((n_users, n_items))

for line in dataset.itertuples():
    A[line[1] - 1, line[2] - 1] = line[3]


In [32]:
non_integer_rows = book_ratings[~book_ratings['ISBN_encoded'].apply(lambda x: isinstance(x, int))]

In [33]:
non_integer_rows

Unnamed: 0,index,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,ISBN_encoded
