# install Dependencies

In [None]:
!pip install scikit-learn==1.8.0
!pip install category-encoders==2.9.0
!pip install sentence-transformers==5.2.0
!pip install beautifulsoup4==4.14.3



# Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from category_encoders import TargetEncoder
import torch
from sentence_transformers import SentenceTransformer
import re
from bs4 import BeautifulSoup

  from .autonotebook import tqdm as notebook_tqdm


# Read Data

In [4]:
data = pd.read_csv(r"/home/mohamedelawakey/Desktop/Programming Books Recommendation System /ml/data/featured/v1/books_with_rating_percentages.csv")
data.head()

Unnamed: 0,Name,Authors,ISBN,Rating,PublishYear,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,...,Description,tech_score,Pages,weighted_rating,average_rating_5,average_rating_4,average_rating_2,average_rating_1,average_high_rating,average_low_rating
0,Between Therapists: The Processing of Transfer...,Arthur Robbins,1853028320,5.0,1999,Jessica Kingsley Publishers,3,0,0,0,...,Arthur Robbins demonstrates how important coun...,1,240.0,5.0,100.0,0.0,0.0,0.0,100.0,0.0
1,Last Word: Media Coverage of the Supreme Court...,Florian Sauvageau,774812435,5.0,2005,University of British Columbia Press,1,0,0,0,...,Media coverage of the Supreme Court of Canada ...,4,272.0,5.0,100.0,0.0,0.0,0.0,100.0,0.0
2,Autumn Wisdom: A Book of Readings,Richard L. Morgan,1556354843,5.0,2007,Wipf & Stock Publishers,1,0,0,0,...,"""""Compelling . . . a rare find . . . a very sp...",3,191.0,5.0,100.0,0.0,0.0,0.0,100.0,0.0
3,James Denney (1856-1917),James M. Gordon,1597527831,5.0,2006,Wipf & Stock Publishers,1,0,0,0,...,"James Denney is now best known, though in incr...",3,286.0,5.0,100.0,0.0,0.0,0.0,100.0,0.0
4,Cautious Rebel: A Biography of Susan Clay Smitzky,Lindsey Apple,873385799,5.0,1997,Kent State University Press,1,0,0,0,...,"""Willa Cather wrote that 'the history of every...",2,322.0,5.0,100.0,0.0,0.0,0.0,100.0,0.0


In [5]:
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 415226 entries, 0 to 415225
Data columns (total 23 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Name                 415226 non-null  str    
 1   Authors              415226 non-null  str    
 2   ISBN                 415226 non-null  str    
 3   Rating               415226 non-null  float64
 4   PublishYear          415226 non-null  int64  
 5   Publisher            415226 non-null  str    
 6   RatingDist5          415226 non-null  int64  
 7   RatingDist4          415226 non-null  int64  
 8   RatingDist3          415226 non-null  int64  
 9   RatingDist2          415226 non-null  int64  
 10  RatingDist1          415226 non-null  int64  
 11  RatingDistTotal      415226 non-null  int64  
 12  CountsOfReview       415226 non-null  float64
 13  Description          415226 non-null  str    
 14  tech_score           415226 non-null  int64  
 15  Pages                415226 

In [6]:
# convert the types of the columns to save the memory

int_columns = data.select_dtypes(include='int64').columns
float_columns = data.select_dtypes(include='float64').columns

for col in int_columns:
    data[col] = data[col].astype('int16')

for col in float_columns:
    data[col] = data[col].astype('float16')
    
data['PublishYear'] = data['PublishYear'].astype('int16')

In [7]:
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 415226 entries, 0 to 415225
Data columns (total 23 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Name                 415226 non-null  str    
 1   Authors              415226 non-null  str    
 2   ISBN                 415226 non-null  str    
 3   Rating               415226 non-null  float16
 4   PublishYear          415226 non-null  int16  
 5   Publisher            415226 non-null  str    
 6   RatingDist5          415226 non-null  int16  
 7   RatingDist4          415226 non-null  int16  
 8   RatingDist3          415226 non-null  int16  
 9   RatingDist2          415226 non-null  int16  
 10  RatingDist1          415226 non-null  int16  
 11  RatingDistTotal      415226 non-null  int16  
 12  CountsOfReview       415226 non-null  float16
 13  Description          415226 non-null  str    
 14  tech_score           415226 non-null  int16  
 15  Pages                415226 

In [8]:
data.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,Name,Authors,ISBN,Rating,PublishYear,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,...,Description,tech_score,Pages,weighted_rating,average_rating_5,average_rating_4,average_rating_2,average_rating_1,average_high_rating,average_low_rating
0,Between Therapists: The Processing of Transfer...,Arthur Robbins,1853028320,5.0,1999,Jessica Kingsley Publishers,3,0,0,0,...,Arthur Robbins demonstrates how important coun...,1,240.0,5.0,100.0,0.0,0.0,0.0,100.0,0.0
1,Last Word: Media Coverage of the Supreme Court...,Florian Sauvageau,774812435,5.0,2005,University of British Columbia Press,1,0,0,0,...,Media coverage of the Supreme Court of Canada ...,4,272.0,5.0,100.0,0.0,0.0,0.0,100.0,0.0
2,Autumn Wisdom: A Book of Readings,Richard L. Morgan,1556354843,5.0,2007,Wipf & Stock Publishers,1,0,0,0,...,"""""Compelling . . . a rare find . . . a very sp...",3,191.0,5.0,100.0,0.0,0.0,0.0,100.0,0.0
3,James Denney (1856-1917),James M. Gordon,1597527831,5.0,2006,Wipf & Stock Publishers,1,0,0,0,...,"James Denney is now best known, though in incr...",3,286.0,5.0,100.0,0.0,0.0,0.0,100.0,0.0
4,Cautious Rebel: A Biography of Susan Clay Smitzky,Lindsey Apple,873385799,5.0,1997,Kent State University Press,1,0,0,0,...,"""Willa Cather wrote that 'the history of every...",2,322.0,5.0,100.0,0.0,0.0,0.0,100.0,0.0


In [9]:
for col in data.columns:
    print(col, ':', data[col].dtype)

Name : str
Authors : str
ISBN : str
Rating : float16
PublishYear : int16
Publisher : str
RatingDist5 : int16
RatingDist4 : int16
RatingDist3 : int16
RatingDist2 : int16
RatingDist1 : int16
RatingDistTotal : int16
CountsOfReview : float16
Description : str
tech_score : int16
Pages : float16
weighted_rating : float16
average_rating_5 : float16
average_rating_4 : float16
average_rating_2 : float16
average_rating_1 : float16
average_high_rating : float16
average_low_rating : float16


In [10]:
"""
we need to determine:
 - which columns need to be encoded 
 - which columns need to be scaled 
 - which columns need to be embedded
"""

"""
Name feature --> need to be embedded cause when user search about any think,
the query after embedded it will compared with the Name feature using similarity

Authors feature --> it's a relative matter, we can use embedding, and we can use
target or frequency encoding (i will use target encoding)

ISBN feature --> it's only an ID not useful so i'll delete it

Rating feature --> it done't need to any thing, cause the range between 0, 5

PublishYear feature --> it's a relative matter, we can make scaling or no, 
but i'll do a scaling 

Publisher feature --> need to encoded using target or frequency encoding but i'll use
target encoding

RatingDist1 to RatingDist5 feature --> it done't need to any thing, cause the numbers small enough

RatingDistTotal feature --> it's a relative matter, we can do scaling or no, but i'll do it

CountsOfReview feature --> need to scaling, so i'll will use MinMaxScaler

Description feature --> it's very important feature like name feature, so i'll do embedding fo it

tech_score feature --> it's a relative matter, so i will do scaling to it

Pages feature --> it's need scaling, cause the range between 10, 2000, it's so hight, so we 
need to apply scaling

weighted_rating feature --> it done't need to any thing, cause the range between 1, 5

average_rating_1 to average_rating_5 feature --> it done't need to any thing, it's a percentage 
and its's static

average_low_rating feature --> it done't need to any thing, it's a percentage and its's static

average_high_rating feature --> it done't need to any thing, it's a percentage and its's static
"""

"\nName feature --> need to be embedded cause when user search about any think,\nthe query after embedded it will compared with the Name feature using similarity\n\nAuthors feature --> it's a relative matter, we can use embedding, and we can use\ntarget or frequency encoding (i will use target encoding)\n\nISBN feature --> it's only an ID not useful so i'll delete it\n\nRating feature --> it done't need to any thing, cause the range between 0, 5\n\nPublishYear feature --> it's a relative matter, we can make scaling or no, \nbut i'll do a scaling \n\nPublisher feature --> need to encoded using target or frequency encoding but i'll use\ntarget encoding\n\nRatingDist1 to RatingDist5 feature --> it done't need to any thing, cause the numbers small enough\n\nRatingDistTotal feature --> it's a relative matter, we can do scaling or no, but i'll do it\n\nCountsOfReview feature --> need to scaling, so i'll will use MinMaxScaler\n\nDescription feature --> it's very important feature like name fe

In [11]:
"""
summary: 
- Name + Description --> Embedding
- Publisher, Authors --> Encoding (Target Encoding)
- Pages, CountsOfReview, PublishYear, RatingDistTotal, tech_score, --> Normalization + scaling using (MinMaxScaler)
    CountsOfReview after log
- ISBN --> delete it 
- Rating, RatingDist1 to RatingDist5, weighted_rating, average_rating_1 to average_rating_5, average_low_rating, average_high_rating --> it done't need to any thing
"""

"\nsummary: \n- Name + Description --> Embedding\n- Publisher, Authors --> Encoding (Target Encoding)\n- Pages, CountsOfReview, PublishYear, RatingDistTotal, tech_score, --> Normalization + scaling using (MinMaxScaler)\n    CountsOfReview after log\n- ISBN --> delete it \n- Rating, RatingDist1 to RatingDist5, weighted_rating, average_rating_1 to average_rating_5, average_low_rating, average_high_rating --> it done't need to any thing\n"

# preprocessing the data

In [12]:
# drop the ISBN feature
data = data.drop(['ISBN'], axis=1)

In [13]:
# Encoding the Publisher, Authors features using Target Encoding

# TargetEncoder make NAN values so we will try the Frequency Encoding
"""
encoder = TargetEncoder(
    cols=['Publisher', 'Authors'],
    smoothing=50,
    min_samples_leaf=2,
    handle_unknown='value',
    
)

data['weighted_rating'] = data['weighted_rating'].fillna(data['weighted_rating'].mean())

data[['Publisher_target', 'Authors_target']] = encoder.fit_transform(
    data[['Publisher', 'Authors']],
    data['weighted_rating']
)

# fill them if there any NAN values
global_mean = data['weighted_rating'].mean()

data['Publisher_target'] = data['Publisher_target'].fillna(global_mean)
data['Authors_target'] = data['Authors_target'].fillna(global_mean)

"""

# Encoding the Publisher, Authors features using Frequency Encoding
Publisher_freq = data['Publisher'].value_counts(normalize=True)
Authors_freq = data['Authors'].value_counts(normalize=True)

data['Publisher_frequently'] = data['Publisher'].map(Publisher_freq).fillna(0)
data['Authors_frequently'] = data['Authors'].map(Authors_freq).fillna(0)

In [14]:
# Normalization + Scaling the Pages, CountsOfReview, PublishYear, RatingDistTotal, tech_score features

scaler = MinMaxScaler()
data['Pages_scaled'] = scaler.fit_transform(data[['Pages']])
data['PublishYear_scaled'] = scaler.fit_transform(data[['PublishYear']])
data['RatingDistTotal_scaled'] = scaler.fit_transform(data[['RatingDistTotal']])
data['tech_score_scaled'] = scaler.fit_transform(data[['tech_score']])

# we need to apply the log fot the CountsOfReview feature
data['CountsOfReview_log'] = np.log1p(data['CountsOfReview'])
data['CountsOfReview_scaled'] = scaler.fit_transform(data[['CountsOfReview_log']])

In [15]:
data.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,Name,Authors,Rating,PublishYear,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,...,average_high_rating,average_low_rating,Publisher_frequently,Authors_frequently,Pages_scaled,PublishYear_scaled,RatingDistTotal_scaled,tech_score_scaled,CountsOfReview_log,CountsOfReview_scaled
0,Between Therapists: The Processing of Transfer...,Arthur Robbins,5.0,1999,Jessica Kingsley Publishers,3,0,0,0,0,...,100.0,0.0,0.001187,5e-06,0.115601,0.09042,0.500168,0.0,0.0,0.0
1,Last Word: Media Coverage of the Supreme Court...,Florian Sauvageau,5.0,2005,University of British Columbia Press,1,0,0,0,0,...,100.0,0.0,0.000328,2e-06,0.131714,0.090722,0.500137,0.04918,0.0,0.0
2,Autumn Wisdom: A Book of Readings,Richard L. Morgan,5.0,2007,Wipf & Stock Publishers,1,0,0,0,0,...,100.0,0.0,0.000689,5e-06,0.091003,0.090822,0.500137,0.032787,0.0,0.0
3,James Denney (1856-1917),James M. Gordon,5.0,2006,Wipf & Stock Publishers,1,0,0,0,0,...,100.0,0.0,0.000689,2e-06,0.138794,0.090772,0.500137,0.032787,0.0,0.0
4,Cautious Rebel: A Biography of Susan Clay Smitzky,Lindsey Apple,5.0,1997,Kent State University Press,1,0,0,0,0,...,100.0,0.0,0.000419,2e-06,0.15686,0.09032,0.500137,0.016393,0.0,0.0


In [16]:
# check the text is clean or not before embedding
n_rows = 30

print(data[['Description']].head(n_rows).to_string(index=True))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [17]:
print(data[['Name']].head(n_rows).to_string(index=True))

                                                                                                                           Name
0                                               Between Therapists: The Processing of Transference/Countertransference Material
1                                                                      Last Word: Media Coverage of the Supreme Court of Canada
2                                                                                             Autumn Wisdom: A Book of Readings
3                                                                                                      James Denney (1856-1917)
4                                                                             Cautious Rebel: A Biography of Susan Clay Smitzky
5                                                                                                    Selected Poetry And Essays
6                                                                        Windows 95 Unleashed/Book and C

In [18]:
print(data[['Name', 'Description']].sample(n=20, random_state=42).to_string(index=True))

                                                                                                           Name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

In [19]:
longest = data.assign(desc_len=data['Description'].str.len()).nlargest(10, 'desc_len')[['Name', 'Description', 'desc_len']]
print(longest.to_string(index=True))

                                                                                                        Name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [20]:
# the text deed to clean

def clean_description_name(text: str) -> str:
    if not isinstance(text, str):
        return ''
    
    # remove html tags
    text = BeautifulSoup(text, 'html.parser').get_text(separator='')
    
    # remove links or domains (if found)
    pattern_domain = r'https?://\S+|www\.\S+'
    text = re.sub(pattern_domain, ' ', text)
    
    # remove page numbers like [1], p.45, ...
    pattern_page_numbers = r'\[\d+\]|\(p\.\s*\d+\)'
    text = re.sub(pattern_page_numbers, ' ', text)

    # replace more line or space with only one space
    pattern_more_space_line_removal = r'\s+'
    text = re.sub(pattern_more_space_line_removal, ' ', text)
    
    # remove punctuation that repeated
    pattern_punctuation_end_sen = r'([.!?])\1+'
    pattern_punctuation_lines_repeated = r'[-_*/]{2,}'
    
    text = re.sub(pattern_punctuation_end_sen, r'\1', text)
    text = re.sub(pattern_punctuation_lines_repeated, ' ', text)
    
    # remove spaces from start and end
    text = text.strip()
    
    return text    

In [21]:
# edit the Name and the Description with the correct text

data['Name_cleaned'] = data['Name'].fillna('').apply(clean_description_name)
data['Description_cleaned'] = data['Description'].fillna('').apply(clean_description_name)

data['text_for_embedding'] = (
    data['Name_cleaned'] + ' ' +
    data['Description_cleaned']
).str.strip()

In [22]:
data.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,Name,Authors,Rating,PublishYear,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,...,Authors_frequently,Pages_scaled,PublishYear_scaled,RatingDistTotal_scaled,tech_score_scaled,CountsOfReview_log,CountsOfReview_scaled,Name_cleaned,Description_cleaned,text_for_embedding
0,Between Therapists: The Processing of Transfer...,Arthur Robbins,5.0,1999,Jessica Kingsley Publishers,3,0,0,0,0,...,5e-06,0.115601,0.09042,0.500168,0.0,0.0,0.0,Between Therapists: The Processing of Transfer...,Arthur Robbins demonstrates how important coun...,Between Therapists: The Processing of Transfer...
1,Last Word: Media Coverage of the Supreme Court...,Florian Sauvageau,5.0,2005,University of British Columbia Press,1,0,0,0,0,...,2e-06,0.131714,0.090722,0.500137,0.04918,0.0,0.0,Last Word: Media Coverage of the Supreme Court...,Media coverage of the Supreme Court of Canada ...,Last Word: Media Coverage of the Supreme Court...
2,Autumn Wisdom: A Book of Readings,Richard L. Morgan,5.0,2007,Wipf & Stock Publishers,1,0,0,0,0,...,5e-06,0.091003,0.090822,0.500137,0.032787,0.0,0.0,Autumn Wisdom: A Book of Readings,"""""Compelling . . . a rare find . . . a very sp...","Autumn Wisdom: A Book of Readings """"Compelling..."
3,James Denney (1856-1917),James M. Gordon,5.0,2006,Wipf & Stock Publishers,1,0,0,0,0,...,2e-06,0.138794,0.090772,0.500137,0.032787,0.0,0.0,James Denney (1856-1917),"James Denney is now best known, though in incr...",James Denney (1856-1917) James Denney is now b...
4,Cautious Rebel: A Biography of Susan Clay Smitzky,Lindsey Apple,5.0,1997,Kent State University Press,1,0,0,0,0,...,2e-06,0.15686,0.09032,0.500137,0.016393,0.0,0.0,Cautious Rebel: A Biography of Susan Clay Smitzky,"""Willa Cather wrote that 'the history of every...",Cautious Rebel: A Biography of Susan Clay Smit...


In [23]:
# Embedding the features of Name, Description features

model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = model.encode(
    data['text_for_embedding'].tolist(),
    batch_size=128,
    show_progress_bar=True,
    convert_to_numpy=True
)

Batches: 100%|██████████| 3244/3244 [3:20:45<00:00,  3.71s/it]  


In [24]:
# convert to data frame
emb_col = [f'emb{i}' for i in range(embeddings.shape[1])]
emb_df = pd.DataFrame(embeddings, columns=emb_col, index=data.index)

In [25]:
# marge all data
data = pd.concat([data.reset_index(drop=True), emb_df.reset_index(drop=True)], axis=1)

In [26]:
data.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,Name,Authors,Rating,PublishYear,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,...,emb374,emb375,emb376,emb377,emb378,emb379,emb380,emb381,emb382,emb383
0,Between Therapists: The Processing of Transfer...,Arthur Robbins,5.0,1999,Jessica Kingsley Publishers,3,0,0,0,0,...,0.080532,0.01215,0.032719,0.01712,-0.087718,0.072108,0.046356,0.13046,-0.033658,-0.037323
1,Last Word: Media Coverage of the Supreme Court...,Florian Sauvageau,5.0,2005,University of British Columbia Press,1,0,0,0,0,...,0.026014,0.049502,0.051143,0.057057,-0.05134,0.050311,0.005685,0.018655,0.004255,0.060309
2,Autumn Wisdom: A Book of Readings,Richard L. Morgan,5.0,2007,Wipf & Stock Publishers,1,0,0,0,0,...,-0.024516,0.018459,-0.025158,0.031864,-0.049244,-0.088706,0.041197,-0.036828,-0.031374,-0.031394
3,James Denney (1856-1917),James M. Gordon,5.0,2006,Wipf & Stock Publishers,1,0,0,0,0,...,0.02387,0.062438,0.008619,0.072061,-0.063944,0.024584,0.032356,-0.020164,0.008109,-0.042561
4,Cautious Rebel: A Biography of Susan Clay Smitzky,Lindsey Apple,5.0,1997,Kent State University Press,1,0,0,0,0,...,-0.047381,-0.054794,0.028854,0.044863,0.041969,-0.036443,0.000149,-0.049355,0.003883,-0.016954


# Save Data In CSV File

In [28]:
data.to_csv("/home/mohamedelawakey/Desktop/Programming Books Recommendation System /ml/data/processed/v1/book_backend_full_features.csv", index=False, encoding='utf-8')
print('saved is successfully')

saved is successfully


# Drop Columns un useful for model

In [29]:
columns_to_drop = [
    'Name', 'Description',
    'Name_cleaned', 'Description_cleaned', 'text_for_embedding',
    'Pages', 'CountsOfReview', 'PublishYear', 'RatingDistTotal', 'tech_score', 'CountsOfReview_log',
    'Publisher', 'Authors'
]

data = data.drop(columns=columns_to_drop)


In [30]:
print(len(data.columns))
print(data.columns.tolist())

404
['Rating', 'RatingDist5', 'RatingDist4', 'RatingDist3', 'RatingDist2', 'RatingDist1', 'weighted_rating', 'average_rating_5', 'average_rating_4', 'average_rating_2', 'average_rating_1', 'average_high_rating', 'average_low_rating', 'Publisher_frequently', 'Authors_frequently', 'Pages_scaled', 'PublishYear_scaled', 'RatingDistTotal_scaled', 'tech_score_scaled', 'CountsOfReview_scaled', 'emb0', 'emb1', 'emb2', 'emb3', 'emb4', 'emb5', 'emb6', 'emb7', 'emb8', 'emb9', 'emb10', 'emb11', 'emb12', 'emb13', 'emb14', 'emb15', 'emb16', 'emb17', 'emb18', 'emb19', 'emb20', 'emb21', 'emb22', 'emb23', 'emb24', 'emb25', 'emb26', 'emb27', 'emb28', 'emb29', 'emb30', 'emb31', 'emb32', 'emb33', 'emb34', 'emb35', 'emb36', 'emb37', 'emb38', 'emb39', 'emb40', 'emb41', 'emb42', 'emb43', 'emb44', 'emb45', 'emb46', 'emb47', 'emb48', 'emb49', 'emb50', 'emb51', 'emb52', 'emb53', 'emb54', 'emb55', 'emb56', 'emb57', 'emb58', 'emb59', 'emb60', 'emb61', 'emb62', 'emb63', 'emb64', 'emb65', 'emb66', 'emb67', 'emb68',

In [31]:
data.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,Rating,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,weighted_rating,average_rating_5,average_rating_4,average_rating_2,...,emb374,emb375,emb376,emb377,emb378,emb379,emb380,emb381,emb382,emb383
0,5.0,3,0,0,0,0,5.0,100.0,0.0,0.0,...,0.080532,0.01215,0.032719,0.01712,-0.087718,0.072108,0.046356,0.13046,-0.033658,-0.037323
1,5.0,1,0,0,0,0,5.0,100.0,0.0,0.0,...,0.026014,0.049502,0.051143,0.057057,-0.05134,0.050311,0.005685,0.018655,0.004255,0.060309
2,5.0,1,0,0,0,0,5.0,100.0,0.0,0.0,...,-0.024516,0.018459,-0.025158,0.031864,-0.049244,-0.088706,0.041197,-0.036828,-0.031374,-0.031394
3,5.0,1,0,0,0,0,5.0,100.0,0.0,0.0,...,0.02387,0.062438,0.008619,0.072061,-0.063944,0.024584,0.032356,-0.020164,0.008109,-0.042561
4,5.0,1,0,0,0,0,5.0,100.0,0.0,0.0,...,-0.047381,-0.054794,0.028854,0.044863,0.041969,-0.036443,0.000149,-0.049355,0.003883,-0.016954


In [32]:
data.shape

(415226, 404)

# Save Data For Model

In [33]:
data.to_csv("/home/mohamedelawakey/Desktop/Programming Books Recommendation System /ml/data/processed/v1/books_with_rating_percentages.csv", index=False, encoding='utf-8')
print('saved is successfully')

saved is successfully
