# Sistema de Recomendación

**Librerías**

In [1]:
import pandas as pd
import numpy as np
import neattext.functions  as nfx
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#avoid Warnings
import warnings
warnings.filterwarnings('ignore')

## Cargar los datos

In [3]:
df = pd.read_csv('data.csv')
df.head(2) #show first 2 rows

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance


In [4]:
#rows and columns
df.shape

(3678, 12)

## EDA

#### Información General

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3678 entries, 0 to 3677
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3678 non-null   int64  
 1   course_title         3678 non-null   object 
 2   url                  3678 non-null   object 
 3   is_paid              3678 non-null   bool   
 4   price                3678 non-null   int64  
 5   num_subscribers      3678 non-null   int64  
 6   num_reviews          3678 non-null   int64  
 7   num_lectures         3678 non-null   int64  
 8   level                3678 non-null   object 
 9   content_duration     3678 non-null   float64
 10  published_timestamp  3678 non-null   object 
 11  subject              3678 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 319.8+ KB


#### Verificar Valores nulos

In [6]:
df.isnull().sum()

course_id              0
course_title           0
url                    0
is_paid                0
price                  0
num_subscribers        0
num_reviews            0
num_lectures           0
level                  0
content_duration       0
published_timestamp    0
subject                0
dtype: int64

#### Verificar si existen datos duplicados

In [7]:
df.duplicated().any()

True

In [8]:
#count of Duplicated data
len(df[df.duplicated()])

6

In [9]:
#show the duplicated data
df[df.duplicated()]

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
787,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,0.616667,2016-05-16T18:28:30Z,Business Finance
788,1157298,Introduction to Forex Trading Business For Beg...,https://www.udemy.com/introduction-to-forex-tr...,True,20,0,0,27,Beginner Level,1.5,2017-04-23T16:19:01Z,Business Finance
894,1035638,Understanding Financial Statements,https://www.udemy.com/understanding-financial-...,True,25,0,0,10,All Levels,1.0,2016-12-15T14:56:17Z,Business Finance
1100,1084454,CFA Level 2- Quantitative Methods,https://www.udemy.com/cfa-level-2-quantitative...,True,40,0,0,35,All Levels,5.5,2017-07-02T14:29:35Z,Business Finance
1473,185526,MicroStation - Células,https://www.udemy.com/microstation-celulas/,True,20,0,0,9,Beginner Level,0.616667,2014-04-15T21:48:55Z,Graphic Design
2561,28295,Learn Web Designing & HTML5/CSS3 Essentials in...,https://www.udemy.com/build-beautiful-html5-we...,True,75,43285,525,24,All Levels,4.0,2013-01-03T00:55:31Z,Web Development


In [10]:
df[df['course_id']==837322] # filter a single course_id to confirm that its rows are exact duplicates

Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
453,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,0.616667,2016-05-16T18:28:30Z,Business Finance
787,837322,Essentials of money value: Get a financial Life !,https://www.udemy.com/essentials-of-money-value/,True,20,0,0,20,All Levels,0.616667,2016-05-16T18:28:30Z,Business Finance


**Comentarios** 
- Tenemos 6 registros duplicados, por lo que debemos eliminarlos.

#### Eliminar los datos duplicados

In [11]:
# Drop exact duplicate rows and rebuild a contiguous 0..n-1 index.
# Without reset_index the surviving rows keep their original labels (up to
# 3677 for 3672 rows), so labels stop matching row positions; the similarity
# matrix built later in this notebook is addressed by position.
df = df.drop_duplicates().reset_index(drop=True)
df.shape

(3672, 12)

## Popularity-Based Recommendation System

### ¿Qué es un sistema de recomendación?  
Los sistemas de recomendación, a veces llamados en inglés “recommender systems”, son algoritmos que intentan “predecir” los siguientes ítems (productos, canciones, etc.) que querrá adquirir un usuario en particular.  

- **Popularity:** Aconseja por la “popularidad” de los productos. Por ejemplo, “los más vendidos” globalmente, se ofrecerán a todos los usuarios por igual sin aprovechar la personalización. Es fácil de implementar y en algunos casos es efectiva.


#### Función en base a su popularidad

In [12]:
def popularity_based_recommendation(df, top_n=5):
  """
  Recommends courses based on their popularity score.

  The score is ``0.6 * num_subscribers + 0.4 * num_reviews``. The input
  DataFrame is left untouched: the score is computed on the side instead of
  being written back as a new column of ``df``.

  Args:
      df (pandas.DataFrame): The DataFrame containing course data. Must have
          the columns ``course_title``, ``num_subscribers`` and ``num_reviews``.
      top_n (int, optional): The number of top courses to recommend. Defaults to 5.

  Returns:
      pandas.DataFrame: A DataFrame containing the recommended courses with their
      titles and popularity scores (columns ``course_title`` and
      ``popularity_score``), sorted from most to least popular and keeping the
      index labels of ``df``.
  """

  # Compute the score as a standalone Series so the caller's frame is not mutated
  popularity_score = 0.6 * df['num_subscribers'] + 0.4 * df['num_reviews']

  # Attach the score to a fresh two-column frame and rank it, highest first
  ranked = (
      df[['course_title']]
      .assign(popularity_score=popularity_score)
      .sort_values(by='popularity_score', ascending=False)
  )

  return ranked.head(top_n)

In [13]:
popularity_based_recommendation(df)

Unnamed: 0,course_title,popularity_score
2827,Learn HTML5 Programming From Scratch,164805.4
3032,Coding for Entrepreneurs Basic,96729.0
3230,The Web Developer Bootcamp,83928.4
3232,The Complete Web Developer Course 2.0,77672.0
2783,Build Your First Website in 1 Week with HTML5 ...,74544.2


## Content-Based Recommendation System
- A partir de productos visitados por el usuario, se intenta “adivinar” qué busca el usuario y ofrecer mercancías similares

### Data Cleaning

In [14]:
df.course_title.head()

0                   Ultimate Investment Banking Course
1    Complete GST Course & Certification - Grow You...
2    Financial Modeling for Business Analysts and C...
3    Beginner to Pro - Financial Analysis in Excel ...
4         How To Maximize Your Profits Trading Options
Name: course_title, dtype: object

In [15]:
# Work on a copy so the title cleaning done in the following cells does not alter df
data = df.copy()

In [16]:
# Strip short words from every title with neattext's remove_shortwords
# (in the displayed output tokens such as "GST", "for" and "To" disappear).
# The cleaned text overwrites data['course_title'], so later lookups by title
# must use the cleaned form, not the original title from df.
data['course_title'] = data['course_title'].apply(nfx.remove_shortwords)
data.course_title.head()

0                  Ultimate Investment Banking Course
1    Complete Course Certification Grow Your Practice
2    Financial Modeling Business Analysts Consultants
3              Beginner Financial Analysis Excel 2017
4               Maximize Your Profits Trading Options
Name: course_title, dtype: object

In [17]:
# Remove special characters from the (already shortened) titles
data['course_title'] = data['course_title'].apply(nfx.remove_special_characters)
data.course_title.head()

0                  Ultimate Investment Banking Course
1    Complete Course Certification Grow Your Practice
2    Financial Modeling Business Analysts Consultants
3              Beginner Financial Analysis Excel 2017
4               Maximize Your Profits Trading Options
Name: course_title, dtype: object

#### Crear nueva columna

In [18]:
# Concatenate the cleaned title and the subject into a single text field;
# this combined column is the input of the vectorizer in the next section
data['title_subject'] = data['course_title'] + ' ' +  data['subject']
data[['course_title', 'subject', 'title_subject']].head()

Unnamed: 0,course_title,subject,title_subject
0,Ultimate Investment Banking Course,Business Finance,Ultimate Investment Banking Course Business Fi...
1,Complete Course Certification Grow Your Practice,Business Finance,Complete Course Certification Grow Your Practi...
2,Financial Modeling Business Analysts Consultants,Business Finance,Financial Modeling Business Analysts Consultan...
3,Beginner Financial Analysis Excel 2017,Business Finance,Beginner Financial Analysis Excel 2017 Busines...
4,Maximize Your Profits Trading Options,Business Finance,Maximize Your Profits Trading Options Business...


#### Data Preprocessing

In [19]:
# Vectorize title_subject as token counts; max_features=3000 caps the vocabulary size.
# .toarray() turns the vectorizer output into a dense NumPy array with one row per
# row of `data`, in the same order.
cv = CountVectorizer(max_features=3000)
vectors = cv.fit_transform(data['title_subject']).toarray()

In [20]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
len(cv.get_feature_names_out()) # check that the vocabulary has the requested number of features

3000

### Calcular la similitud del coseno

In [22]:
# Cosine similarity between every pair of course vectors.
# Rows/columns follow the row order of `vectors`, i.e. row POSITIONS of `data`
# (not its index labels).
similarity = cosine_similarity(vectors)

In [29]:
# Content-based recommendation (prints the results)
def recommed_based_contend(course, top_n=5):
    """
    Print the titles of the courses most similar to ``course``.

    Reads the module-level ``data`` DataFrame and ``similarity`` matrix.
    ``similarity`` is addressed by row position, so the course is located by
    position in ``data`` rather than by index label; the two differ whenever
    the index of ``data`` is not a contiguous 0..n-1 range (for example after
    dropping duplicate rows without resetting the index).

    Args:
        course (str): Course title exactly as stored in ``data['course_title']``
            (i.e. the cleaned title). If several rows share the title, the
            first one is used.
        top_n (int, optional): Number of similar courses to print. Defaults to 5.

    Raises:
        IndexError: If no row of ``data`` has that title.
    """
    # Row positions (not index labels) of the rows whose title matches
    matches = (data['course_title'] == course).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError('Course {!r} not found in data'.format(course))
    course_pos = int(matches[0])

    distances = similarity[course_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Exclude the query course explicitly instead of assuming it sorts first
    # (another course with an identical vector can tie with it).
    neighbours = [pos for pos, _ in ranked if pos != course_pos][:top_n]
    for pos in neighbours:
        print(data.iloc[pos]['course_title'])

In [33]:
recommed_based_contend('Complete Investment Banking Course 2017')

Ultimate Investment Banking Course
Complete Financial Analyst Course 2017
Cryptocurrency Investment Trading Course 2017
2017
Complete Short Course Ethereum


## save

In [32]:
import pickle

In [34]:
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed; passing a bare open(...) to pickle.dump leaves the handle
# open until it happens to be garbage-collected.
with open('similarity.pk', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

## GUI

In [39]:
import pandas as pd
import pickle
import tkinter as tk
from tkinter import ttk, messagebox

# 1. Popularity-based recommendation
# NOTE(review): this redefines the function already declared earlier in the
# notebook; the later definition silently replaces the earlier one.
def popularity_based_recommendation(df, top_n=5):
  """
  Return the ``top_n`` most popular courses.

  The score is ``0.6 * num_subscribers + 0.4 * num_reviews``. The input
  DataFrame is not modified: the score is computed on the side instead of
  being written back as a new column of ``df``.

  Args:
      df (pandas.DataFrame): Course data with the columns ``course_title``,
          ``num_subscribers`` and ``num_reviews``.
      top_n (int, optional): Number of courses to return. Defaults to 5.

  Returns:
      pandas.DataFrame: Columns ``course_title`` and ``popularity_score``,
      sorted from most to least popular, keeping the index labels of ``df``.
  """
  # Compute the score as a standalone Series so the caller's frame is not mutated
  popularity_score = 0.6 * df['num_subscribers'] + 0.4 * df['num_reviews']

  # Attach the score to a fresh two-column frame and rank it, highest first
  ranked = (
      df[['course_title']]
      .assign(popularity_score=popularity_score)
      .sort_values(by='popularity_score', ascending=False)
  )

  return ranked.head(top_n)

# 2. Content-based recommendation
def recommed(course):
    """
    Return the titles of the five courses most similar to ``course``.

    Reads the module-level ``data`` DataFrame and ``similarity`` matrix.
    ``similarity`` is addressed by row position, so the course is located by
    position in ``data`` rather than by index label; the two differ whenever
    the index of ``data`` is not a contiguous 0..n-1 range.

    Args:
        course (str): Course title exactly as stored in ``data['course_title']``.
            If several rows share the title, the first one is used.

    Returns:
        list[str] | None: The recommended titles, or ``None`` after showing an
        error dialog when the title is not present in ``data``.
    """
    # Row positions (not index labels) of the rows whose title matches
    matches = (data['course_title'] == course).to_numpy().nonzero()[0]
    if len(matches) == 0:
        # showerror(title, message): the original passed the text as the title
        # only, which left the dialog body empty.
        messagebox.showerror('Error', 'Course {} not found.'.format(course))
        return None
    course_pos = int(matches[0])

    distances = similarity[course_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Exclude the query course explicitly instead of assuming it sorts first
    neighbours = [pos for pos, _ in ranked if pos != course_pos][:5]
    return [data.iloc[pos]['course_title'] for pos in neighbours]


# 3. Click handler of the "Recommend" button
def recommend_button_click():
    """
    Show content-based recommendations for the course selected in the combobox.

    Reads the selected title from ``course_var`` and asks ``recommed`` for
    similar courses. When a non-empty result comes back, the popularity label
    is hidden and the titles are written into ``result_label``; otherwise the
    window is left unchanged.
    """
    suggestions = recommed(course_var.get())
    if not suggestions:
        return

    popularity_label.pack_forget()
    heading = "Recommended Courses:\n"
    result_label.config(text=heading + '\n'.join(suggestions))

# 4. Application
# NOTE(review): this cell reads `data` and `similarity`, which are not defined
# here; they must already exist in the kernel from the earlier cells. `pickle`
# is imported at the top of this cell but not used within it.
# Create the main application window
root = tk.Tk()
root.title('Recommend System')
root.geometry('400x300')

# Font and colour settings (heading_color is not used anywhere in this cell)
font_style = ('Arial', 12)
label_color = 'blue'
heading_color = 'red'
button_color = 'green'
result_label_color = 'black'

# 5. Create and place the GUI elements (pack order = top-to-bottom layout)
# Combobox listing every title in `data`, pre-selecting the first one
course_titles = data['course_title'].tolist()
course_var = tk.StringVar(value = course_titles[0])
course_dropdown = ttk.Combobox(root, textvariable=course_var, values=course_titles, width=40, font=font_style)
course_dropdown.pack(pady=5)

# Popularity ranking shown at start-up; recommend_button_click hides this label
popularity_recommendations = popularity_based_recommendation(data, top_n=5)
popularity_label = tk.Label(root, text="Popularity-based Recommendations:\n" + popularity_recommendations.to_string(index=False),
                             font=font_style, fg=label_color)
popularity_label.pack()

# Button that triggers the content-based recommendation
recommend_button = tk.Button(root, text="Recommend", command= recommend_button_click, width=20, font=font_style, fg=button_color)
recommend_button.pack(pady=10)

# Initially empty label that recommend_button_click fills with the results
result_label = tk.Label(root, text="", wraplength=350, font=font_style, fg=result_label_color)
result_label.pack()

# Start the Tk event loop
root.mainloop()