# **1-Dataset Analysis:**


## *i) Cleaning Dataset:* <br />
   Jan 23 Last edits

Latest check 30th of Jan. 2023 at 5:00PM

### **Importings:**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from datetime import datetime 
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from ast import literal_eval




In [None]:
data= pd.read_csv("../data-history/up-to-date-MAL/anime_Feb23.csv")


print(data.shape)
data.head(5).T

### Taking care of nulls and drops:

In [None]:
drops=["main_picture_medium","main_picture_large","broadcast_day_of_the_week","broadcast_start_time","alternative_titles_en","alternative_titles_ja","alternative_titles_synonyms"]
data['fav_percent'] = data['num_favorites'] / data['num_list_users']
data_main=data.drop(drops,axis=1)[['id','title','media_type','mean','num_scoring_users','num_episodes',"source",'popularity','fav_percent','rank','rating',"genres","studios",'synopsis',"nsfw"]]

In [None]:
data_main.isnull().sum()

In [None]:
df = data_main[data_main.synopsis.isna()]

In [None]:
(df.groupby('media_type')['media_type'].count()/data_main.groupby('media_type')['media_type'].count()).plot(kind='bar')

In [None]:
(df.groupby('source')['source'].count()/data_main.groupby('source')['source'].count()).plot(kind='bar')

In [None]:
(df.groupby('rating')['rating'].count()/data_main.groupby('rating')['rating'].count()).plot(kind='bar')

In [None]:
(df.groupby('nsfw')['nsfw'].count()/data_main.groupby('nsfw')['nsfw'].count()).plot(kind='bar')

In [None]:
def fill_na(df, col, rng=None):
    """Impute the missing values of ``df[col]`` in place with random draws.

    Every missing entry is replaced by ``abs(N(mean, std))``, where ``mean``
    and ``std`` are taken from the observed values of the column. Observed
    values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    col : str
        Name of the numeric column to impute.
    rng : None, int or numpy.random.Generator, optional
        ``None`` (default) draws from NumPy's global random state, exactly as
        before. Pass a seed or a ``Generator`` to make the imputation
        reproducible across kernel restarts.

    Returns
    -------
    None
    """
    missing = df[col].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return

    sampler = np.random if rng is None else np.random.default_rng(rng)
    draws = np.abs(
        sampler.normal(loc=df[col].mean(), scale=df[col].std(), size=n_missing)
    )
    # A boolean mask assigns by position, so this also works when the frame
    # carries duplicated index labels (fillna with an indexed Series does not).
    df.loc[missing, col] = draws

In [None]:
fill_na(data_main,'mean')
fill_na(data_main,'rank')
data_main['fav_percent'] = data_main['fav_percent'].fillna(value=round(data_main['fav_percent'].mean(),3))
data_main['num_episodes'] = data_main['num_episodes'].fillna(value=round(data_main['num_episodes'].mean()))
data_main['source'] = data_main['source'].fillna(value=data_main['source'].mode()[0])
data_main['rating'] = data_main['rating'].fillna(value=data_main['rating'].mode()[0])
data_main['synopsis'] = data_main['synopsis'].fillna('')

In [None]:
data_main.isnull().sum()


In [None]:
data_main.nunique()

In [None]:
data_main.columns

## Preprocessing:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.notebook import tqdm
import ast
import re
import spacy as sp
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
data_main.title.head(10)

## *ii)EDA:*

In [None]:
data_main.columns

In [None]:
data_main['mean'].values[0]

In [None]:
sns.set_style("dark")
plt.figure(figsize=(12,6))
plt.hist(data['mean'], bins=100)
plt.show()

In [None]:
fig = px.pie(data_main, 'media_type')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

*Notes:* As expected, TV series make up the largest share of anime media types.


In [None]:
# Pairwise correlations of the numeric columns only. data_main also holds text
# columns (title, genres, synopsis, ...); without numeric_only=True,
# DataFrame.corr() raises on them in pandas >= 2.0.
corr = data_main.corr(numeric_only=True)

# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(16, 10))

# Mask the upper triangle (diagonal included): the matrix is symmetric, so
# only the lower half needs to be drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Configure a custom diverging colormap.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap on the axes created above.
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, ax=ax)

*Notes:* The features most strongly related to `mean` are `rank`, `popularity` and `num_scoring_users`. `num_list_users` is ignored for now, until the difference between it and `num_scoring_users` has been investigated further.

In [None]:
fig = px.histogram(data[pd.to_datetime(data['start_date']).dt.year >= 1980], x='start_date', color='media_type')
fig.update_layout(bargap=0.1)

*Notes:* Obviously 2016 was a good year for Otakus :3, especially the summer and autumn seasons, with 119 TV, 45 movie, 23 OVA, 61 ONA, 60 special and 41 music entries. (Gotta check the watching list lmao.)

In [None]:
data_main.groupby('num_episodes')['id'].count().sort_values(ascending=False).head(30).plot(kind='bar', figsize=(8,6))
plt.show()

*Notes:* There are a lot of movies (1 episode), which explains the spike; the remaining bars add up to the other media types (TV, OVA, ONA, ... etc.). Most TV series/specials are short: about 12 episodes per season/title. </br>
*The fans of "When you have eliminated the impossible" teenager for 22+ years don't give up :(* </br>
*Gomu Gomu no guys don't be Sadge :(*

In [None]:
#data_main[['title','fav_percent']].sort_values(by=['fav_percent'],ascending=False).head(30).plot(kind='bar', figsize=(15,10))
data_main.sort_values(by=['fav_percent'],ascending=False)[1:21].plot(kind='bar',x='title',y='fav_percent');

One Piece in the top as expected

###  NLP Pre-processing

### Synopsis Keyword Analysis:
*(NLP)* :
* KeyBERT.
* Spacy.


* Creating clean text, nouns and keywords from synopsis.
* Separate in new df for data analysis.
* Delete Syns entries from main df.


In [None]:
NLP = sp.load("en_core_web_lg")
TITLE = 'Death Note'
key_model = KeyBERT()
data_main = data_main[~data_main.title.duplicated(keep='first')]
text = data_main[data_main['title'] == TITLE].synopsis.values[0]
def clean_text(text):
    """Normalise a synopsis and parse it with the module-level spaCy pipeline.

    Steps: drop parenthesised / bracketed asides, turn every non-letter
    (digits, punctuation, line breaks) into a space, collapse whitespace and
    lower-case the result.

    Parameters
    ----------
    text : str
        Raw synopsis text.

    Returns
    -------
    The object returned by ``NLP(cleaned_text)``, i.e. a parsed document
    rather than a plain string.
    """
    # Remove "(...)" and "[...]" asides (non-greedy, within a single line).
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)
    # Line breaks are replaced by spaces like any other non-letter. Deleting
    # them outright, as before, glued the last word of one line to the first
    # word of the next.
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = " ".join(text.split()).lower()
    return NLP(text)

doc = clean_text(text)
print(doc)

In [None]:
data_main.loc[:,'cleaned_syn'] = data_main.loc[:,'synopsis'].astype(str).apply(clean_text)

In [None]:
data_main.columns

In [None]:
df_string=data_main[['title','synopsis','cleaned_syn']]

In [None]:
df_num=pd.get_dummies(data_main[["media_type","source","nsfw","genres","rating","studios"]], columns=["media_type","source","nsfw","genres","rating","studios"], prefix=["media_type","source","nsfw","genres","rating","studios"])
df_num[['id','mean','num_scoring_users','num_episodes','popularity','fav_percent','rank']]=data_main[['id','mean','num_scoring_users','num_episodes','popularity','fav_percent','rank']]

In [None]:
# Based on https://stackoverflow.com/questions/48925328/how-to-get-all-noun-phrases-in-spacy
def get_candidates(doc):
    """Collect short noun-centred phrases from a parsed document.

    For every token tagged ``NOUN`` a span is built from the token's
    ``left_edge`` to its ``right_edge``. If an edge token carries an entity
    type, the span is cut at the noun itself on that side. Only spans of 1 to
    6 tokens are kept.

    Parameters
    ----------
    doc : parsed document
        Iterable of tokens exposing ``pos_``, ``left_edge``, ``right_edge``,
        ``ent_type_`` and ``i``; slicing must return an object with ``.text``.

    Returns
    -------
    list of str
        Span texts, in document order of their head noun.
    """
    def span_bounds(position):
        # Token-index bounds [begin, end) of the phrase around one noun.
        noun = doc[position]
        left, right = noun.left_edge, noun.right_edge
        begin = position if left.ent_type_ else left.i
        end = position + 1 if right.ent_type_ else right.i + 1
        return begin, end

    phrases = []
    for position, token in enumerate(doc):
        if token.pos_ != 'NOUN':
            continue
        begin, end = span_bounds(position)
        if 0 < end - begin < 7:
            phrases.append(doc[begin:end].text)
    return phrases

candidates = get_candidates(doc)
print(candidates)


#### **Applying Key-BERT for Keywords extraction:**

In [None]:
key_model = KeyBERT()
def get_keywords(doc):
    """Extract KeyBERT keywords for one parsed synopsis.

    The candidate phrases are the noun phrases of *this* document. Previously
    the module-level ``candidates`` list (built once for the demo title) was
    passed for every synopsis, so each anime could only be assigned keywords
    from that single synopsis.

    Parameters
    ----------
    doc : parsed document
        Must expose ``.text`` and be accepted by ``get_candidates``.

    Returns
    -------
    list
        Whatever ``key_model.extract_keywords`` returns; an empty list when
        the document yields no candidate phrase.
    """
    doc_candidates = get_candidates(doc)
    if not doc_candidates:
        # Nothing to rank; also avoids handing KeyBERT an empty vocabulary.
        return []
    return key_model.extract_keywords(
        doc.text,
        keyphrase_ngram_range=(1, 2),
        candidates=doc_candidates,
        stop_words='english',
        use_mmr=True,
        diversity=0.7,
    )

In [None]:
get_keywords(doc)
df_string.loc[:,'nouns'] = df_string.loc[:,'cleaned_syn'].apply(get_candidates)

In [None]:
df_string.loc[:,'keywords'] = df_string.loc[:,'cleaned_syn'].apply(get_keywords)

In [None]:
df_string.sample(20)

In [None]:
df_num.shape,df_string.shape

In [None]:
df_string.sample(3)

In [None]:
df_string.to_csv(r'M:\Anime Recommender\data-history\up-to-date-MAL\anime_string_latest.csv')

 `df_num` is the Data Frame for the Analytical approach and distance techniques. <br />
 `df_string` is the textual Data Frame for the NLP approaches to get contextual/semantic content based recommendations.

# 2- **MODELS TIME:** :3
![image info](https://hips.hearstapps.com/hmg-prod.s3.amazonaws.com/images/gettyimages-458406992-1538405221.jpg?crop=0.9xw:0.9xh;0,0&resize=256:*) <br />
  July's 2022 Work

## Similarity Analysis :

### Similarity Analysis Using The Numerical Features

#### Model 1 :  Nearest Neighbors

In [None]:
from sklearn.neighbors import KDTree

scaler = StandardScaler()
scaler.fit(df_num.drop(['id'],axis=1).to_numpy())

X = scaler.transform(df_num.drop(['id'],axis=1).to_numpy())

KDTree

In [None]:
kdt = KDTree(X, metric='euclidean')
indices = kdt.query(X, k=15, return_distance=False)

In [None]:
def get_item_recommendations(anime_title, anime_idx=-1):
    """Return the rows of ``data_main`` nearest to an anime in feature space.

    Parameters
    ----------
    anime_title : str
        Title to look up; ignored when ``anime_idx`` is given.
    anime_idx : int, optional
        Row *position* of the anime in ``data_main`` / ``indices``. The
        default ``-1`` means "resolve the position from ``anime_title``".

    Returns
    -------
    pandas.DataFrame
        Neighbour rows from ``data_main``, without the first entry of the
        neighbour list (the query itself).

    Raises
    ------
    IndexError
        If ``anime_title`` does not occur in ``data_main``.
    """
    if anime_idx == -1:
        # `indices` is addressed by row position. After duplicate titles are
        # dropped, data_main's index labels no longer equal row positions,
        # so the position is resolved explicitly rather than via `.index`.
        positions = np.flatnonzero((data_main['title'] == anime_title).to_numpy())
        if positions.size == 0:
            raise IndexError(f"title not found in data_main: {anime_title!r}")
        anime_idx = positions[0]
    return data_main.iloc[indices[anime_idx][1:]]

In [None]:
get_item_recommendations('Sword Art Online')

### Similarity Analysis Using The String Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet


In [None]:
# import sweetviz as sv
# #You could specify which variable in your dataset is the target for your model creation. We can specify it using the target_feat parameter.
# data_report = sv.analyze(data_main)



In [None]:
# data_report.show_notebook(w=1500, h=900, scale=0.8)
# data_report.show_html(scale=0.9)

### **Cos-similarity | TFIDF:**

In [None]:
# Build one text document per row of the raw `data` frame.
# - Missing values become '' so a single NaN field does not turn the whole
#   concatenated document into NaN (which TfidfVectorizer rejects).
# - Fields are joined with a space so the last word of one field is not fused
#   with the first word of the next.
text_columns = ['synopsis', 'genres', 'rating', 'studios', 'media_type']
corpus = data[text_columns].fillna('').astype(str).agg(' '.join, axis=1)

# min_df=1 keeps every term, which is what min_df=0 meant; integer 0 is
# rejected by scikit-learn's parameter validation (>= 1.2).
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tfidf.fit_transform(corpus)
tfidf_matrix.shape

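Cosine similarity gives a numeric score for how similar two anime are. Because `TfidfVectorizer` L2-normalises every row by default, the plain dot product computed by `linear_kernel` below is already the cosine similarity.

$cosine(x,y) = \frac{x \cdot y^\intercal}{\lVert x \rVert \, \lVert y \rVert}$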

In [None]:
cos_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# `cos_sim` was computed from the raw `data` frame, so its row i describes raw
# row i. Capture the raw titles for positional lookups *before* `data` is
# rebound to the cleaned frame; titles taken from the de-duplicated frame
# would be shifted relative to `cos_sim`.
titles = data['title']
data = data_main.reset_index()
# Maps title -> original row label of data_main (the raw row position when
# the raw frame has the default 0..n-1 index).
indices = pd.Series(data_main.index, index=data['title'])

In [None]:
def get_recommendations(title, top_n=30):
    """Return the titles most similar to ``title`` by TF-IDF cosine similarity.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` mapping.
    top_n : int, optional
        Maximum number of recommendations to return (default 30).

    Returns
    -------
    pandas.Series
        Entries of ``titles`` ordered from most to least similar. The queried
        row is never part of the result.
    """
    idx = indices[title]
    scores = cos_sim[idx]
    # Stable descending sort: equal scores keep their original row order.
    ranked = sorted(range(len(scores)), key=lambda j: scores[j], reverse=True)
    # Exclude the query explicitly. Dropping "rank 0" is wrong whenever
    # another row ties with the query at the top score and sorts first.
    anime_indices = [j for j in ranked if j != idx][:top_n]
    return titles.iloc[anime_indices]
data['title'][3]

In [None]:
cos_results=get_recommendations('Death Note').head(10)
cos_results

The recommendations are not very close yet, but it is a good start.


**zenzen wakaranaaaaaiiiii !!!!!!!!!!!!!** </br>
:"D </br>
The pairwise-distance results have nothing in common with the cosine-similarity results: the two recommendation lists do not intersect at all. </br>
Using keywords instead of the full synopsis made no difference for cosine similarity, so keywords are preferable because they need fewer resources.

### **RecommendNet Maybe?** :

**Zenzen heiki janai :"D , Tasukete, Dare ka tasukeeteeeeee !** <br />
  Aug 2022 Work

#### *Normal Recommender features*

In [None]:
# Callbacks
# Import from the public tf.keras namespace. `tensorflow.python.keras` is a
# private path that is not part of the supported API.
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping, ReduceLROnPlateau

# Hyper-parameters read by lrfn() and by model.fit().
start_lr = 0.00001      # learning rate at epoch 0
min_lr = 0.00001        # floor the exponential decay converges to
max_lr = 0.00005        # learning rate at the end of the ramp-up
batch_size = 10000      # passed to model.fit
rampup_epochs = 5       # epochs of linear ramp from start_lr to max_lr
sustain_epochs = 0      # epochs held at max_lr before decay starts
exp_decay = .8          # per-epoch decay factor applied to (max_lr - min_lr)

def lrfn(epoch):
    """Learning rate for ``epoch``: linear ramp-up, optional plateau, decay.

    * ``epoch < rampup_epochs``: straight line from ``start_lr`` towards
      ``max_lr``.
    * next ``sustain_epochs`` epochs: constant ``max_lr``.
    * afterwards: ``(max_lr - min_lr)`` shrinks by ``exp_decay`` each epoch
      on top of ``min_lr``.

    All schedule parameters are read from module-level variables.
    """
    if epoch < rampup_epochs:
        slope = (max_lr - start_lr)/rampup_epochs
        return slope * epoch + start_lr

    if epoch < rampup_epochs + sustain_epochs:
        return max_lr

    decay_steps = epoch-rampup_epochs-sustain_epochs
    return (max_lr - min_lr) * exp_decay**decay_steps + min_lr


lr_callback = LearningRateScheduler(lambda epoch: lrfn(epoch), verbose=0)

checkpoint_filepath = './weights.h5'

model_checkpoints = ModelCheckpoint(filepath=checkpoint_filepath,
                            save_weights_only=True,
                            monitor='val_loss',
                            mode='min',
                            save_best_only=True)

early_stopping = EarlyStopping(patience = 3, monitor='val_loss', 
                            mode='min', restore_best_weights=True)

my_callbacks = [
    model_checkpoints,
    lr_callback,
    early_stopping,   
]

In [None]:
print(len(X_test_array[0]))
print(len(y_test))


In [None]:
# Model training
history = model1.fit(
    x=X_train_array,
    y=y_train,
    batch_size=batch_size,
    epochs=20,
    verbose=1,
    validation_data=(X_test_array, y_test),
    callbacks=my_callbacks
)

model1.load_weights(checkpoint_filepath)

In [None]:
#Training results
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(history.history["loss"][0:-2])
plt.plot(history.history["val_loss"][0:-2])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "test"], loc="upper left")
plt.show()

In [None]:
from tqdm.keras import TqdmCallback


history = model1.fit(
    x=X_train_array,
    y=y_train,
    batch_size=batch_size,
    epochs=30,
    validation_data=(X_test_array, y_test),
    verbose = 0, 
    callbacks=[TqdmCallback(verbose=0)])

model1.load_weights(checkpoint_filepath)



In [None]:
#Training results
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(history.history["loss"][0:-2])
plt.plot(history.history["val_loss"][0:-2])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "test"], loc="upper left")
plt.show()

In [None]:
def extract_weights(name, model):
    """Return the first weight array of layer ``name`` with unit-length rows.

    Parameters
    ----------
    name : str
        Layer name passed to ``model.get_layer``.
    model : object
        Must provide ``get_layer(name)`` returning an object with
        ``get_weights()``.

    Returns
    -------
    numpy.ndarray
        The layer's first weight array, each row divided by its L2 norm.
    """
    kernel = model.get_layer(name).get_weights()[0]
    row_norms = np.linalg.norm(kernel, axis = 1).reshape((-1, 1))
    return kernel / row_norms

anime_weights = extract_weights('anime_embedding', model1)
user_weights = extract_weights('user_embedding', model1)

In [None]:
data_main.columns

In [None]:
name = data[data_main.id == 100].title.values[0]
print(name)

In [None]:
# Fixing Names
def get_animename(anime_id):
    """Return the title of the anime with id ``anime_id``.

    The lookup filters ``data`` with a mask built from ``data`` itself. The
    previous mask came from ``data_main``, whose index is not aligned with
    ``data`` after ``reset_index()``.

    Returns
    -------
    The title, or ``0`` (after printing ``'error'``) when no row has that id.
    """
    matches = data.loc[data['id'] == anime_id, 'title']
    if matches.empty:
        # Same fallback as before, without a bare `except` that would also
        # hide unrelated errors.
        print('error')
        return 0
    return matches.values[0]

data["eng_version"] = data['title']


data_main.sort_values(by=['mean'], 
                inplace=True,
                ascending=False, 
                kind='quicksort',
                na_position='last')

df = data[["id","title", "mean", "genres", "num_episodes", 
        "media_type","synopsis"]]


def get_animeframe(anime):
    """Return the rows of ``df`` matching an anime id or an anime title.

    Parameters
    ----------
    anime : int, numpy integer or str
        Integers are matched against ``df.id``, strings against ``df.title``.
        NumPy integers (e.g. ids taken from an array) are accepted as well;
        they are not instances of the built-in ``int``.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows (possibly empty); ``None`` for any other type.
    """
    if isinstance(anime, (int, np.integer)):
        return df[df.id == anime]
    if isinstance(anime, str):
        return df[df.title == anime]
    return None


def get_sypnopsis(anime):
    """Return the synopsis of the first row matching ``anime``.

    Accepts the same id / title inputs as ``get_animeframe``. Returns ``None``
    for unsupported input types.

    Raises
    ------
    IndexError
        If no row matches.
    """
    frame = get_animeframe(anime)
    if frame is None:
        return None
    return frame.synopsis.values[0]

In [None]:
df.shape

In [None]:
pd.set_option("max_colwidth", None)

def find_similar_animes(name, n, return_dist=False, neg=False):
    """Find the animes whose embedding is closest to that of ``name``.

    Similarity is the dot product between rows of ``anime_weights``.

    Parameters
    ----------
    name : int or str
        Anime id or title, resolved through ``get_animeframe``.
    n : int
        Number of neighbours requested. ``n + 1`` candidates are selected so
        the query itself can be removed from the result.
    return_dist : bool, optional
        If true, return ``(dists, closest)``: the raw similarity vector and
        the encoded indices of the selected candidates.
    neg : bool, optional
        If true, select the *least* similar animes instead.

    Returns
    -------
    pandas.DataFrame or tuple
        Frame with ``title``, ``similarity``, ``genres`` and ``synopsis``
        sorted by decreasing similarity, or the tuple described above.

    Raises
    ------
    IndexError
        If ``name`` matches no anime.
    KeyError
        If the anime has no entry in ``anime2anime_encoded``.
    """
    index = get_animeframe(name).id.values[0]
    encoded_index = anime2anime_encoded.get(index)
    if encoded_index is None:
        raise KeyError(f"anime id {index} has no embedding (missing from anime2anime_encoded)")

    dists = np.dot(anime_weights, anime_weights[encoded_index])
    sorted_dists = np.argsort(dists)

    n = n + 1
    closest = sorted_dists[:n] if neg else sorted_dists[-n:]

    print('animes closest to {}'.format(name))
    if return_dist:
        return dists, closest

    similarityarr = []
    for close in closest:
        decoded_id = anime_encoded2anime.get(close)
        anime_frame = get_animeframe(decoded_id)
        similarityarr.append({"id": decoded_id,
                              "title": anime_frame.title.values[0],
                              "similarity": dists[close],
                              "genres": anime_frame.genres.values[0],
                              'synopsis': get_sypnopsis(decoded_id)})

    # pd.DataFrame (capital F): `pd.Dataframe` does not exist and raised
    # AttributeError on every call that reached this line.
    frame = pd.DataFrame(similarityarr).sort_values(by="similarity", ascending=False)
    return frame[frame.id != index].drop(['id'], axis=1)


In [None]:
find_similar_animes('Death Note', n=10, neg=False)

#### *Features modding* <br />
   Modifying parameters for Recommend NET

In [None]:
# dfdl =pd.DataFrame()

In [None]:
# dfdl_ids = data["id"].tolist()
# dfdlid_encoded = {x: i for i, x in enumerate(dfdl_ids)}
# n_animes = len(dfdlid_encoded)
# id_encoded2id = {i: x for i, x in enumerate(dfdl_ids)}
# dfdl["id"] = data["id"].map(dfdlid_encoded)

# dfdl_mean = data["mean"].tolist()
# dfdl_mean_encoded = {x: i for i, x in enumerate(dfdl_mean)}
# mean_encoded2mean = {i: x for i, x in enumerate(dfdl_mean)}
# n_users = len(dfdl_mean_encoded)
# dfdl["mean"] = data["mean"].map(dfdl_mean_encoded)

# dfdl_pop = data["popularity"].tolist()
# user2user_encoded = {x: i for i, x in enumerate(dfdl_pop)}
# user_encoded2user = {i: x for i, x in enumerate(dfdl_pop)}
# n_users = len(user2user_encoded)
# dfdl["popularity"] = data["popularity"].map(user2user_encoded)

In [None]:
data_main.columns, data_main.shape

In [None]:

# x1 = rdf[['user', 'anime']].values 

# #x2=  data[['id'],['popularity']].values
# x3=data[['mean'],['num_scoring_users']].values
# x4=data['rank'].tolist(),data['num_favorites'].tolist()
# x5= df.filter(regex='^media_type_',axis=1).values[i]
# x6= df.filter(regex='^source_',axis=1).values[i]

# y = rdf["rating"]
# # Split
# test_set_size = 250000 #10k for test set
# train_indices = rdf_sampled.shape[0] - test_set_size 
# len(x1),len(x2),len(x2[1]),len(x3),len(x3[1]),len(x4),len(x4[1]),len(y),


In [None]:
# X3= x3[:,0] + x3[:,2] +x3[:,3] + x3[:,4] + x3[:,5] + x3[:,1] 
# X4=['None']*len(x4)*len(x4[1])
# for i in range(len(x4[1])):
#     X4 =X4 + x4[:,i]

In [None]:
# X1_train, X1_test, y_train, y_test = (
#     x1[:train_indices],
#     x1[train_indices:],
#     y[:train_indices],
#     y[train_indices:],
# )

### *After Reading Some Articles:*

#### **Research at home** <br />
   Dec. 2022 work <br />

Semantic Similarity on synopsis using nlp models.

Potential Models for learning: <br />
* paraphrase-miniLM
* stsb-roberta latest alternatives
* bert-base-nli-mean-tokens

**To_Do:**
- Get embeddings from pretrained for all synopsis ( all paragraphs ).
- Compare Similarity using distance wise / cosine / pairwise whatever the hell will measure similarity of embeddings.
- Worst-case scenario: for each sentence embedding in the requested anime's synopsis, loop over the cosine similarity with every sentence in all other synopses.
- Optimization worth testing: Finding similarity between sentences in the same synopsis to get unique sentences and store those while ignoring sentences that are pretty much similar in embeddings, that leads to having smaller group of sentences for each synopsis to loop on (Still looping bratan).
- 5Head IDEA: Semantic Keyword embeddings similarity analysis to get potential chosen titles to do semantic sentence analysis on.
- **OR JUST USE PARAPHRASE MINING U Fokin IDIOT, anata BAKA ??? hontoni BAKAAAA.**

## *Using Synopsis NLP:*

#### revised pytorch April 23


#### Using the PyTorch universal encoder:

In [None]:
### Pytorch implementation:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import sentence_transformers
from torch.utils.data import Dataset, DataLoader
import time
from IPython.display import display
import sys
import os

In [None]:
# Set the GPU as the device for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using:", device)


In [None]:
# Load the sentence-embedding model: a sentence_transformers.SentenceTransformer
# built from the 'roberta-base' checkpoint (not the Universal Sentence Encoder),
# then move it to the device selected above.
print("Loading model...")
model = sentence_transformers.SentenceTransformer('roberta-base')
model = model.to(device)
print("Loaded model and moved it to device.")
print(f"Memory allocated: {torch.cuda.memory_allocated(device=device)/(1024**2):.2f} MB")  # CUDA memory currently allocated, in MiB

In [None]:
# Load the string data:
df_string = pd.read_csv(r'M:\Anime Recommender\data-history\up-to-date-MAL\anime_string_latest.csv')
df_string.rename(columns={"Unnamed: 0": "ID"}, inplace=True)
df_string.dropna(inplace=True)

In [None]:
# Define a function to get the text encoding using the GPU
@torch.no_grad()
def get_encoding(x):
    """Encode one text with the module-level ``model``.

    Parameters
    ----------
    x : object
        Text to encode. Anything that is not a non-blank string is skipped.

    Returns
    -------
    torch.Tensor or None
        ``model.encode([x])`` wrapped in a tensor on ``device``, or ``None``
        for skipped input. Progress is printed to stdout either way.
    """
    is_text = isinstance(x, str) and x.strip() != ""
    if not is_text:
        print(f"Encoding: None", flush=True)
        sys.stdout.flush()
        return None

    print(f"Encoding: {x}", flush=True)
    sys.stdout.flush()
    embedding = model.encode([x], show_progress_bar=True)
    encoding = torch.tensor(embedding, device=device)
    print(f"Encoded tensor shape: {encoding.shape}", flush=True)
    sys.stdout.flush()
    return encoding

In [None]:
class MyDataset(Dataset):
    """Dataset over the pre-computed synopsis encodings of a DataFrame.

    Construction encodes every ``cleaned_syn`` entry with ``get_encoding``
    and stores the results in a new ``cleaned_syn_encoding`` column of the
    frame that was passed in (the caller's frame is modified).
    """

    ENCODING_COLUMN = 'cleaned_syn_encoding'

    def __init__(self, df, device):
        # Encode first, then keep references; `df` is the caller's object.
        df[self.ENCODING_COLUMN] = df['cleaned_syn'].apply(get_encoding)
        self.df = df
        self.device = device

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encoding stored at row position ``idx``."""
        encodings = self.df[self.ENCODING_COLUMN]
        return encodings.iloc[idx]

In [None]:

from scipy.spatial import distance


def get_top_similar(df, k):
    """Attach to every row the IDs of its ``k`` nearest rows by cosine distance.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``encodings`` column (one embedding per row; any shape,
        flattened to 1-D) and an ``ID`` column. Modified in place.
    k : int
        Number of neighbours to keep per row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with a new ``top_n_similar_ids`` column holding, per
        row, a list of at most ``k`` IDs ordered from nearest to farthest.
        The row itself is never listed.

    Notes
    -----
    Builds the full ``n x n`` distance matrix, so memory grows quadratically
    with the number of rows.
    """
    if len(df) == 0:
        df['top_n_similar_ids'] = pd.Series([], index=df.index, dtype=object)
        return df

    # A column of arrays converts to a 1-D object array, which pdist cannot
    # use; stack the rows into one (n, d) float matrix instead.
    embeddings = np.stack([np.asarray(e, dtype=float).ravel() for e in df['encodings']])

    # Pairwise cosine distances, expanded to a square matrix.
    dist_matrix = distance.squareform(distance.pdist(embeddings, metric='cosine'))

    # Push each row's distance to itself to the end of the ordering. Skipping
    # "column 0" is unreliable when another row is at distance 0 as well.
    np.fill_diagonal(dist_matrix, np.inf)
    order = np.argsort(dist_matrix, axis=1, kind='stable')[:, :len(df) - 1][:, :k]

    ids = df['ID'].to_numpy()
    df['top_n_similar_ids'] = pd.Series(
        [list(ids[row]) for row in order], index=df.index, dtype=object
    )
    return df


In [None]:
sys.stdout.flush()
dataset = MyDataset(df_string, device)
# MyDataset has already computed every encoding, so the loader only batches
# existing tensors: no worker processes or pinned memory are needed.
dataloader = DataLoader(dataset, batch_size=64, num_workers=0, pin_memory=False)
result = []
batches = []
for batch in dataloader:
    # The default collate function has already stacked the items into one
    # tensor, so torch.stack(batch) is not applicable here. Flatten every
    # item to 1-D and keep *all* batches, not just the last one.
    batches.append(batch.reshape(batch.shape[0], -1).cpu())
    print(f"Memory allocated: {torch.cuda.memory_allocated(device=device)/(1024**2):.2f} MB", flush=True)
    sys.stdout.flush()
# One 1-D NumPy embedding per row, aligned with df_string's index.
df_string['encodings'] = pd.Series(list(torch.cat(batches).numpy()), index=df_string.index, dtype=object)


In [None]:
# get_top_similar returns the frame with a `top_n_similar_ids` column; copy
# that column instead of the (empty) `result` list, whose length never matches.
closest_IDs = get_top_similar(df_string, k=15)
df_string['similar_IDs'] = closest_IDs['top_n_similar_ids']

In [None]:



# Define a function to get the top k most similar IDs using PyTorch
def get_top_k_similar_IDs(x, df, k=15):
    """Return the IDs of the ``k`` rows of ``df`` most similar to ``x``.

    Parameters
    ----------
    x : torch.Tensor or None
        Query encoding; any shape, flattened to 1-D.
    df : pandas.DataFrame
        Needs a ``cleaned_syn_encoding`` column of tensors (each flattened to
        1-D) and an ``ID`` column.
    k : int, optional
        Number of IDs to return.

    Returns
    -------
    numpy.ndarray or None
        ``ID`` values ordered by decreasing cosine similarity, skipping the
        single best match (assumed to be the query itself); ``None`` when
        ``x`` is ``None``.
    """
    encoding = x
    if encoding is None:
        print("Encoding is None, returning None.", flush=True)
        sys.stdout.flush()
        return None

    print("Computing cosine similarity distances...", flush=True)
    sys.stdout.flush()
    # torch.stack needs a list/tuple of tensors, not a NumPy object array, and
    # every encoding is flattened so the similarity runs over the embedding
    # axis, giving one score per corpus row.
    corpus = torch.stack([torch.as_tensor(e).reshape(-1) for e in df['cleaned_syn_encoding']])
    query = torch.as_tensor(encoding).reshape(1, -1).to(corpus.device, corpus.dtype)
    distances = F.cosine_similarity(query, corpus, dim=1)
    print(f"Distances tensor shape: {distances.shape}", flush=True)
    sys.stdout.flush()
    closest_paragraphs_indices = distances.argsort(descending=True)[:k + 1][1:]
    print(f"Top {k} indices: {closest_paragraphs_indices}", flush=True)
    sys.stdout.flush()
    # .iloc needs plain integer positions, not a (possibly CUDA) tensor.
    return df.iloc[closest_paragraphs_indices.cpu().numpy()]['ID'].values



# Define the function to process each chunk
def process_chunk(chunk):
    """Encode one chunk and attach each row's most similar IDs.

    Neighbours are searched *within the chunk only*, because ``chunk`` is the
    corpus passed to ``get_top_k_similar_IDs``.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Rows to process; gains ``cleaned_syn_encoding`` (added by
        ``MyDataset``) and ``similar_IDs`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``chunk`` object.
    """
    print(f"Processing chunk {chunk.index[0]}-{chunk.index[-1]}...", flush=True)
    sys.stdout.flush()
    dataset = MyDataset(chunk, device)
    # Encodings are computed in MyDataset.__init__, so the loader only batches
    # existing tensors: worker processes and pinned memory are not needed.
    dataloader = DataLoader(dataset, batch_size=64, num_workers=0, pin_memory=False)
    result = []
    for batch in dataloader:
        with torch.cuda.amp.autocast():
            # The batch is already one stacked tensor; torch.stack(batch)
            # rejects a plain tensor argument.
            encodings = batch
            print(f"Memory allocated: {torch.cuda.memory_allocated(device=device)/(1024**2):.2f} MB", flush=True)
            sys.stdout.flush()
            closest_IDs = [get_top_k_similar_IDs(encodings[i], chunk, k=15) for i in range(len(batch))]
        result.extend(closest_IDs)
    # Object dtype keeps one array of IDs per row.
    chunk['similar_IDs'] = pd.Series(result, index=chunk.index, dtype=object)
    print(f"Memory allocated: {torch.cuda.memory_allocated(device=device)/(1024**2):.2f} MB", flush=True)
    sys.stdout.flush()
    return chunk
def init_child(model_, device_):
    """Pool initializer: publish the model and device as module globals.

    Rebinds the module-level ``model`` and ``device`` to the given objects
    and reports progress on stdout.
    """
    global model, device
    print("Initializing child process with model and device...", flush=True)
    sys.stdout.flush()
    model, device = model_, device_
    print("Initialized child process with model and device.", flush=True)
    sys.stdout.flush()

In [None]:
df_string.shape

In [None]:
df_string.sample(5)

In [None]:


# Number of chunks to split the data into (one per CPU core).
num_processes = torch.multiprocessing.cpu_count()
print(f"Splitting work into {num_processes} chunks.")
# Split by row position and copy, so each chunk is an independent frame that
# process_chunk can add columns to. Empty chunks (more cores than rows) are
# dropped because process_chunk reads chunk.index[0].
row_groups = np.array_split(np.arange(len(df_string)), num_processes)
chunks = [df_string.iloc[rows].copy() for rows in row_groups if len(rows) > 0]
print(f"Data split into {len(chunks)} chunks.")
# The chunks are processed sequentially in this process. A multiprocessing
# pool used to be created here, but no work was ever submitted to it; it only
# spawned idle workers that each received a copy of the model.
# NOTE(review): process_chunk searches neighbours inside each chunk only, so
# results depend on how the rows are split.
results = []
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}...", flush=True)
    sys.stdout.flush()
    start_time = time.time()  # restart per chunk so the figure below is per-chunk, not cumulative
    result = process_chunk(chunk)
    results.append(result)
    elapsed_time = time.time() - start_time
    print(f"Chunk {i+1}/{len(chunks)} processed in {elapsed_time:.2f} seconds.", flush=True)
    print(f"Memory allocated: {torch.cuda.memory_allocated(device=device)/(1024**2):.2f} MB", flush=True)
    sys.stdout.flush()

# Combine the per-chunk results
final_result = pd.concat(results)
print("Results combined.")
# Save the final result to a CSV file
final_result.to_csv('similar_animes.csv', index=False)
print("Result saved to file.")

This is still taking a very long time.


In [None]:
import faiss
import cProfile
import pstats


In [None]:
df_string = pd.read_csv(r'M:\Anime Recommender\data-history\up-to-date-MAL\anime_string_encodings_bert.csv',index_col=[0])

In [None]:
type(df_string['cleaned_syn_encoding'].values[0])

In [None]:
df_string.head(5)

In [None]:
df=df_string.sample(1000)
df.sample(10)

In [None]:
def get_top_k_similar_paragraphs(x,df, k=10):
    """Return the IDs of the ``k`` rows of ``df`` closest to encoding ``x``.

    Uses an exact faiss inner-product index, so scores equal cosine similarity
    only when the encodings are unit-normalised.

    Parameters
    ----------
    x : array-like or None
        Query embedding (numeric; flattened to 1-D).
    df : pandas.DataFrame
        Needs a ``cleaned_syn_encoding`` column of numeric embeddings and an
        ``ID`` column.
    k : int, optional
        Number of IDs to return.

    Returns
    -------
    numpy.ndarray or None
        ``ID`` values of the best matches, skipping the top hit (assumed to
        be the query itself); ``None`` when ``x`` is ``None``.

    Notes
    -----
    The index is rebuilt on every call. NOTE(review): encodings read back from
    CSV appear to arrive as strings and would have to be parsed into numeric
    arrays first — confirm.
    """
    # Local imports keep the function self-contained; presumably needed when
    # it is shipped to worker processes by parallel_apply.
    import numpy as np
    import faiss
    encoding = x
    if encoding is None:
        return None

    # faiss works on C-contiguous float32 matrices.
    query_embedding = np.ascontiguousarray(encoding, dtype=np.float32).reshape(1, -1)
    corpus_embeddings = np.ascontiguousarray(
        np.stack([np.asarray(e, dtype=np.float32).ravel() for e in df['cleaned_syn_encoding'].values])
    )

    # Use Faiss to index the corpus embeddings
    index = faiss.IndexFlatIP(query_embedding.shape[1])
    index.add(corpus_embeddings)

    # Search the index to get the top k similar paragraphs
    distances, indices = index.search(query_embedding, k+1)
    indices = indices.ravel()[1:]
    # faiss pads with -1 when the corpus holds fewer than k+1 vectors.
    indices = indices[indices >= 0]

    # faiss returns row *positions*; df may carry arbitrary index labels
    # (e.g. after .sample()), so select by position.
    return df['ID'].iloc[indices].values

In [None]:
float(df['cleaned_syn_encoding'].values[100])

In [None]:
df.head()

In [None]:
from pandarallel import pandarallel
import re
pandarallel.initialize(progress_bar=True)

# # df.apply(func)
# match_number = re.compile('-?\ *[0-9]+\.?[0-9]*(?:[Ee]\ *-?\ *[0-9]+)?')
# x=  [float(x) for x in re.findall(match_number, string(df['cleaned_syn_encoding'].to_list)] 

df['similar_paragraphs'] = df['cleaned_syn_encoding'].parallel_apply( get_top_k_similar_paragraphs, df =df)

In [None]:
# Break down the data into smaller chunks
from multiprocessing import Pool  # Pool was used below without being imported

chunk_size = 5
# Copies, so process_chunk can add a column without writing into a view of df.
chunks = [df.iloc[i:i + chunk_size].copy() for i in range(0, len(df), chunk_size)]


# Define the function to process each chunk
def process_chunk(chunk):
    """Add a ``similar_paragraphs`` column to ``chunk``, searching all of ``df``."""
    chunk['similar_paragraphs'] = chunk['cleaned_syn_encoding'].apply(lambda x: get_top_k_similar_paragraphs(x, df))
    return chunk

# One worker per chunk would start len(df) / chunk_size processes; cap the
# pool at the number of CPU cores instead.
n_workers = max(1, min(len(chunks), os.cpu_count() or 1))
with Pool(n_workers) as p:
    # runctx executes in this namespace, so `result` is defined afterwards.
    cProfile.runctx("result = p.map(process_chunk, chunks)", globals(), locals(), "Profile.prof")
    
# Concatenate the results
df = pd.concat(result)

In [None]:
# pstats.Stats expects the dump file written by cProfile.runctx above
# ("Profile.prof"), not the cProfile module itself.
p = pstats.Stats("Profile.prof")
p.strip_dirs().sort_stats("time").print_stats()

In [None]:
df.head(20)

In [None]:
def get_top_similar_animes(title, top_n=5):
    """Return the titles of the first ``top_n`` stored neighbours of ``title``.

    Reads the module-level ``df_string``: the first row whose ``title``
    matches supplies the ``similar_paragraphs`` entry, whose leading
    ``top_n`` values are used as row positions into ``df_string``.

    Returns
    -------
    numpy.ndarray
        The ``title`` values at those positions.
    """
    row_label = df_string.index[df_string['title'] == title][0]
    neighbours = df_string['similar_paragraphs'][row_label][:top_n]
    return df_string.iloc[neighbours]['title'].values

In [None]:
title = "Naruto"
# get_top_similar_animes(title, top_n=5) reads df_string itself. Passing the
# frame positionally bound it to top_n and raised
# "got multiple values for argument 'top_n'".
top_similar_animes = get_top_similar_animes(title, top_n=10)
print(top_similar_animes)

It took so long; let's find out why.

In [None]:

import cProfile

In [None]:


# Define the function to process each chunk
def process_chunk(chunk):
    df['similar_paragraphs'] = df['cleaned_syn_encoding'].apply(lambda x: get_top_k_similar_paragraphs(x, df))
    return df



In [None]:
# Create a Pool with the number of processes equal to the number of chunks
with Pool(2) as p:
    cProfile.runctx("result = p.map(process_chunk, df)", globals(), locals(), "Profile.prof")

In [None]:
import pstats

# Load the dump written by cProfile.runctx(..., "Profile.prof"). No file named
# "profile_results" is produced anywhere in this notebook, and the cProfile
# module itself is not a valid Stats source.
p = pstats.Stats("Profile.prof")
p.strip_dirs().sort_stats("time").print_stats()

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine


# set the GPU as the device for TensorFlow
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    tf.config.experimental.set_visible_devices(physical_devices[0], 'GPU')
    print("Using GPU")
else:
    print("Using CPU")


In [None]:
# Load the string data:
df_string=pd.read_csv(r'M:\Anime Recommender\data-history\up-to-date-MAL\anime_string_latest.csv')

In [None]:
df_string.sample(3).T

In [None]:
import tensorflow_hub as hub

class UniversalSentenceEncoder:
    """Keras wrapper around the TF-Hub Universal Sentence Encoder v4.

    If the model cannot be built, the error is printed and ``model`` is set
    to ``None``; calling the instance then prints a notice and returns
    ``None``.

    NOTE: a later cell defines a class with the same name, which replaces
    this one once that cell has run.
    """

    def __init__(self, dimension=512):
        # dimension is forwarded as the hub layer's declared output size.
        self.dimension = dimension
        try:
            self.model = tf.keras.Sequential([
                # The encoder takes a 1-D batch of strings, i.e. each sample
                # is a scalar string: shape=(). shape=(1,) would feed it a
                # rank-2 tensor.
                tf.keras.layers.Input(shape=(), dtype=tf.string),
                hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                output_shape=[self.dimension],
                                trainable=False)
            ])
        except Exception as e:
            print("An error occurred while creating the UniversalSentenceEncoder model:", e)
            self.model = None

    def __call__(self, x):
        """Embed a batch of strings; returns ``None`` if the model is unavailable."""
        if self.model is not None:
            return self.model(x)
        print("The UniversalSentenceEncoder model is not available for use.")
        return None

In [None]:
import tensorflow_hub as hub
class UniversalSentenceEncoder:
    """Keras wrapper around the TF-Hub Universal Sentence Encoder v4.

    Calling an instance with a 1-D batch of strings returns the output of
    the underlying Keras model.
    """

    def __init__(self, dimension=512):
        # dimension is forwarded as the hub layer's declared output size.
        self.dimension = dimension
        self.model = tf.keras.Sequential([
            # The encoder takes a 1-D batch of strings, so each sample is a
            # scalar string (shape=()). With that input no expand_dims Lambda
            # is needed; the previous one only added a further axis in front
            # of an already rank-2 input.
            tf.keras.layers.Input(shape=(), dtype=tf.string),
            hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                            output_shape=[self.dimension],
                            trainable=False)
        ])

    def __call__(self, x):
        """Embed a batch of strings with the wrapped model."""
        return self.model(x)


In [None]:
# Initialize the encoding model
encoding_model = UniversalSentenceEncoder(512)

In [None]:
def get_encoding(model, x):
    """Run ``model`` on ``x`` and return the result as a flat NumPy array.

    ``x`` is wrapped in ``tf.constant`` before the call; the model output is
    converted with ``.numpy()`` and flattened to 1-D.
    """
    embedded = model(tf.constant(x))
    return embedded.numpy().flatten()

def get_cosine_similarity(vec1, vec2):
    """Cosine similarity of two vectors: dot product over product of L2 norms."""
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    return dot_product / norm_product

def get_top_k_similar_paragraphs(x, df, encoding_model, k=15):
    encoding = get_encoding(encoding_model, [x])
    distances = np.array([get_cosine_similarity(encoding, vec) for vec in df_string['cleaned_syn_encoding']])
    closest_paragraphs_indices = np.argsort(-distances)[:k + 1][1:]
    return df_string.iloc[closest_paragraphs_indices]['cleaned_syn']

In [None]:
# Get the encodings for the paragraphs in the 'cleaned_syn' column
df_string.loc[:,'cleaned_syn_encoding'] = df_string.loc[df_string['cleaned_syn'] != '', 'cleaned_syn'].apply(lambda x: get_encoding(encoding_model, [x]))

In [None]:
# Generate a new column 'similar_paragraphs' containing the top k similar paragraphs for each paragraph in the 'cleaned_syn' column
df_string.loc[df_string['cleaned_syn'] != '', 'similar_paragraphs'] = df_string.loc[df_string['cleaned_syn'] != '', 'cleaned_syn'].apply(lambda x: get_top_k_similar_paragraphs(x, df, encoding_model))