# Recommender System for Coaching and Academic Advising
#### The goal of the system is to use each user's interests and feedback to recommend specific courses that match their level.

In [3]:
#AUTHOR: Aubry, Nanae & Ruchti, Kilian
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import math
from os import path
import csv


class color:
    """ANSI escape sequences used to colour and emphasise console output."""

    # Foreground colours.
    RED = '\033[91m'
    GREEN = '\033[92m'
    BLUE = '\033[94m'

    # Text emphasis.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every attribute set by the sequences above.
    END = '\033[0m'

#### Natural language processing functions

In [4]:
# Function that lowercases a text and returns the list of its words, each reduced to its singular form.
def getWordlist(text):
    """Return the words of *text*, lowercased and passed through sStemmer.

    A word is a maximal run of the letters a-z delimited by word
    boundaries; the words are returned in the order they appear.
    """
    lowered = text.lower()
    return [sStemmer(token) for token in re.findall(r'\b[a-z]+\b', lowered)]

# Function that returns the singular form of a word given in input.
def sStemmer(wordS):
    """Return a naive singular form of the lowercase word *wordS*.

    Special cases, checked first:
      * a non-string argument prints an error message and yields '';
      * a word of at most four lowercase letters (the empty string
        included) is returned unchanged;
      * anything else that is not made of lowercase a-z letters only
        yields ''.

    Rules for words of five letters or more, applied in order:
      1. '...ies' not preceded by 'a' or 'e'   -> '...y'
      2. '...es' not preceded by 'a', 'e', 'o' -> '...e'
      3. '...s' not preceded by 'u' or 's'     -> final 's' removed
    A word matching none of the rules is returned unchanged.
    """
    # isinstance rather than an exact type comparison, so that str
    # subclasses are stemmed like plain strings.
    if not isinstance(wordS, str):
        print("Error ! The element you gave is not a string.\n")
        return ''
    if re.search(r'^[a-z]{,4}$', wordS):
        return wordS
    if not re.search(r'^[a-z]{5,}$', wordS):
        return ''

    # Rule 1: 'studies' -> 'study'.
    if re.search(r'ies$', wordS) and not re.search(r'[ae]ies$', wordS):
        return re.sub(r'ies$', 'y', wordS)
    # Rule 2: 'courses' -> 'course'.
    if re.search(r'es$', wordS) and not re.search(r'[aeo]es$', wordS):
        return re.sub(r'es$', 'e', wordS)
    # Rule 3: 'guitars' -> 'guitar'; 'business' and 'bonus' are left alone.
    if re.search(r's$', wordS) and not re.search(r'[us]s$', wordS):
        return re.sub(r's$', '', wordS)

    # No rule matched: the word is not treated as a plural.
    return wordS

#### Recommender System functions

In [6]:
# Filter dataframe to find courses containing a keyword
def subjects_filter(error, user_filter):
    """Select the courses whose 'specific_subject' contains *user_filter*.

    Reads the module-level ``all_courses`` DataFrame. The match is
    case-sensitive and literal.

    Parameters:
        error: error flag supplied by the caller; returned unchanged when
            at least one course matches.
        user_filter: the text typed by the user.

    Returns:
        A tuple ``(error, filtered_resources)``. ``error`` is True when no
        course matched (a message is printed in that case);
        ``filtered_resources`` holds the matching rows.
    """
    # regex=False: the keyword comes straight from input(), so characters
    # such as '+' or '(' must be matched literally instead of being
    # compiled as a regular expression (which can raise re.error).
    # na=False: a missing subject counts as "no match" rather than putting
    # NaN into the boolean mask.
    matches = all_courses['specific_subject'].str.contains(
        user_filter, regex=False, na=False
    )
    filtered_resources = all_courses.loc[matches]

    # Error handling
    if filtered_resources.empty:
        error = True
        print(color.BOLD + color.RED + "\nError: Sorry, the subject you requested is not in the list. Please try with another one.\nMind the upper case letters." + color.END)

    return error, filtered_resources


# Separate the filtered data by level of difficulty
def levels_filter(filtered_courses):
    """Split *filtered_courses* into one DataFrame per difficulty level.

    Returns a three-element list ordered beginner, intermediate, expert.
    Rows whose "level" is none of these three labels appear in no element.
    """
    level_names = ('Beginner Level', 'Intermediate Level', 'Expert Level')
    return [
        filtered_courses[filtered_courses["level"].isin([level_name])]
        for level_name in level_names
    ]


# ------------------------------------------------------
# Calculate Cosine Similarity Across the Courses
# We are calculating the similarity by considering their subjects
def similarity_matrix(courses_attributes):
    """Return the pairwise cosine similarity of the rows of *courses_attributes*.

    The result is a square DataFrame whose rows and columns are both
    labelled with the index of *courses_attributes*.
    """
    labels = list(courses_attributes.index)
    scores = cosine_similarity(courses_attributes)
    return pd.DataFrame(scores, index=labels, columns=labels)



# Calculating similarity by taking into account what user already rated
def user_pref_sim_matrix(topic_attr, user_pref):
    """Score the courses a user has not rated yet and return the best five.

    Parameters:
        topic_attr: DataFrame of course attributes, indexed by
            'course_title' and containing an 'id' column; every other
            column is summed into a per-course attribute count.
        user_pref: DataFrame of the user's ratings with the columns 'id',
            'course_title' and 'rating'.

    Returns:
        DataFrame with the columns 'course_title' and 'pred_user' holding
        up to five unrated courses (rating == 0 after the merge) with the
        highest predicted score.

    Neither argument is modified.
    """
    # Use the arguments rather than module-level globals, so the function
    # works for any attribute/rating pair handed to it.
    user = user_pref.sort_values(by=['id'])

    # Work on a copy (DataFrame.append no longer exists in pandas 2.x) so
    # the added 'Num_Attr' column never leaks into the caller's frame.
    df = topic_attr.copy()
    df['Num_Attr'] = df.drop('id', axis=1).sum(axis=1)
    # Left merge keeps every course; courses without a rating get 0.
    df = df.merge(user, on=['id', 'course_title'], how='left').fillna(0)

    user_rating = df[['rating']].copy()

    # Number of columns, used for the positional slices below.
    column_values = len(df.columns)

    # NOTE(review): the slices assume the merged layout
    # [id, course_title, <attributes...>, Num_Attr, rating]. With that
    # layout `2:column_values - 3` stops one column before 'Num_Attr', i.e.
    # the last attribute column is left out -- confirm against the real
    # attribute files whether that column is meant to be excluded.
    attributes = df.iloc[:, 2:column_values - 3].values
    attr_counts = df.iloc[:, column_values - 2].values  # the 'Num_Attr' column

    # Weighted matrix: each row divided by the square root of its
    # attribute count.
    norms = pd.DataFrame([math.sqrt(i) for i in attr_counts]).values
    weighted_matrix = pd.DataFrame(attributes / norms)

    # Like scores: per-attribute sum of the weighted rows, each multiplied
    # by the user's rating of that course.
    user_likes_score = pd.DataFrame((weighted_matrix.values * user_rating.values).sum(axis=0)).T

    # Prediction for a course: its attributes weighted by the like scores.
    pred_user = (attributes * user_likes_score.values).sum(axis=1)
    df["pred_user"] = pred_user

    # Top 5 predictions among the courses without a rating.
    predictions = df.loc[df['rating'] == 0].nlargest(5, 'pred_user')[['course_title', 'pred_user']]

    return predictions



# ------------------------------------------------------
## Courses rating

def update_rating(row):
    """Delete the stored rating of one course from the current user's file.

    Reads the module-level ``user_name`` and ``topic`` to locate the
    ratings CSV, removes every line whose 'id' equals ``row[0]`` and
    writes the file back in place.

    Parameters:
        row: a row of the ratings file whose first element is the course
            id, typically a list of strings produced by ``csv.reader``.

    Returns:
        The value returned by ``DataFrame.to_csv``; this is None because
        a file path is given.
    """
    ratings_path = 'data/users/' + user_name + "_" + topic +".csv"
    df = pd.read_csv(ratings_path)
    # csv.reader yields the id as text while read_csv parses the column as
    # numbers, so a direct comparison never matches and the old rating
    # would be kept. Compare both sides as text instead.
    # NOTE(review): assumes the 'id' column holds whole numbers with no
    # missing values (otherwise it is parsed as float and prints '12.0').
    stale = df['id'].astype(str) == str(row[0])
    updated_csv = df[~stale].to_csv(ratings_path, index=False)
    return updated_csv

#### Asking user's input functions

In [None]:
def AskForTopic():
    """Ask what the user wants to learn and return the answer as a word list.

    The answer is split and stemmed by getWordlist.
    """
    prompt = color.BOLD + color.UNDERLINE + "What do you want to learn about?\n" + color.END + " "
    return getWordlist(input(prompt))

#### User enter name and topic choice

In [None]:
# The name is used later to build the path of the user's ratings file.
user_name = input(color.BOLD + color.UNDERLINE + "What is your name?\n" + color.END + " ")

print("\n--------------------\nTopics:" + color.BOLD + color.GREEN +  "\n Musical Instrument\n Business Finance\n" + color.END)
main_topic_choice = AskForTopic()


##### Read dataset of courses corresponding to the main topic user chose #####
# Keep asking until one of the words typed by the user identifies a topic,
# then load the matching course catalogue.

topic_ok = False

while not topic_ok:
    if not {'music', 'musical', 'instrument'}.isdisjoint(main_topic_choice):
        topic_ok = True
        all_courses = pd.read_csv('data/music_courses.csv', header=0, sep=',')
        topic = "music"

    elif not {'business', 'finance'}.isdisjoint(main_topic_choice):
        topic_ok = True
        all_courses = pd.read_csv('data/business_courses.csv', header=0, sep=',')
        topic = "business"

    else:
        print(color.BOLD + color.RED + "\nError: Sorry, the topic you requested is not in the list. Please try with another one.\n" + color.END)
        main_topic_choice = AskForTopic()


# Courses are looked up by title from here on.
all_courses = all_courses.set_index(["course_title"])

#### Filter through courses to only show rows that are associated with specific subject

In [None]:
error = False
subject_ok = False

# Show a few example subjects for the chosen topic.
if topic == 'music':
    print("Subjects:" + color.BOLD +  color.GREEN +  " \n Drums \n Guitar \n Piano \n Chords \n Harmonica \n Songs \n etc..." + color.END)
elif topic == 'business':
    print("Subjects:" + color.BOLD +  color.GREEN +  " \n Investment \n Trading \n Cryptocurrency \n Stock Market \n Analysis \n Accounting \n etc..." + color.END)

user_filter = input(color.BOLD + color.UNDERLINE + "\nWhat topics would you like to see?\n" + color.END + " ")

# Ask again for as long as the keyword matches no course.
while not subject_ok:
    error, specific_subject = subjects_filter(error, user_filter)
    subject_ok = (not error) and (not specific_subject.empty)

    if not subject_ok:
        user_filter = input(color.BOLD + color.UNDERLINE + "\nWhat topics would you like to see?\n" + color.END + " ")

    # Start the next attempt (if any) with a clean error flag.
    error = False
    

#### Read course_attributes dataset to create similarity matrix

In [None]:
# Load the attribute table of the chosen topic, indexed by course title.
if topic == "music":
    topic_attributes = pd.read_csv(
        'data/music_attributes.csv', header=0, index_col="course_title"
    )
elif topic == "business":
    topic_attributes = pd.read_csv(
        'data/business_attributes.csv', header=0, index_col="course_title"
    )

#### Show courses to user, ordered by level

In [None]:
# Split the courses matching the subject into beginner / intermediate /
# expert frames, then stack them back together in that order.
list_courses = levels_filter(specific_subject)
merged_list = pd.concat(list_courses)
# Bare expression: presumably rendered as the output of the notebook cell.
merged_list

In [None]:
# User chooses a course and program displays the link
course = input(color.BOLD + color.UNDERLINE + "What course do you want to watch?\n" + color.END + " ")

# The title is used as a label into all_courses (and other tables later
# on); a mistyped title would otherwise stop the program with a KeyError.
# Ask again instead, as the topic and subject prompts do.
while course not in all_courses.index:
    print(color.BOLD + color.RED + "\nError: Sorry, this course is not in the list. Please copy the title exactly as it is displayed.\n" + color.END)
    course = input(color.BOLD + color.UNDERLINE + "What course do you want to watch?\n" + color.END + " ")

# First row carrying that title, restricted to its 'url' column.
extract_url = all_courses.loc[[course], ["url"]].values[0]
course_url = extract_url[0]
print("\nHere is the link to the course:\n " + color.BOLD + color.BLUE + color.UNDERLINE + course_url + color.END)

## Ask if user liked course to store rating

In [None]:
user_rating = input(color.BOLD + color.UNDERLINE + "Did you like the video? (yes/no):" + color.END + " ")

# Ignore case and surrounding spaces so that answers such as "Yes" or
# "yes " are not stored as a dislike. Any answer other than yes counts as
# a dislike (-1), as before.
if user_rating.strip().lower() == "yes":
    like = 1
else:
    like = -1

#### Add course and rating to user's file. If new user, create new csv
##### Find course id to store

In [None]:
# Select the 'id' column of the rows labelled with the chosen title and
# keep the first matching row (a one-element array).
extract_id = all_courses.loc[[course], ["id"]].values[0]
# Unwrap that single value: the id written to the user's ratings file.
course_id = extract_id[0]

In [None]:
# One ratings file per user and topic.
file_name = user_name + "_" + topic +".csv"

if path.exists('data/users/' + file_name):
    # Look for an earlier rating of this course. The file is read up to
    # the first matching line and closed BEFORE update_rating rewrites it,
    # so it is never rewritten while still open for reading.
    previous_row = None
    with open('data/users/' + file_name, 'r', newline='') as file:
        for row in csv.reader(file):
            if course in row:
                previous_row = row
                break

    if previous_row is not None:
        print("You already rated this course, we'll update your rating!")
        updated_csv = update_rating(previous_row)

    # newline='' is what the csv module expects of its file objects;
    # without it the appended lines end in '\r\r\n' on Windows.
    with open('data/users/' + file_name, 'a', newline='') as filea:
        writer = csv.writer(filea)
        writer.writerow([course_id, course, like])

else:
    # New user for this topic: create the file with its header line.
    with open('data/users/' + file_name, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["id","course_title", "rating"])
        writer.writerow([course_id, course, like])
# No explicit close needed: every `with` block above closes its file.

##### Open file with user ratings

In [None]:
#Open csv of user preferences if user exists for topic    
file = 'data/users/' + user_name + "_" + topic +".csv"
# Stored ratings when the user already has a file for this topic,
# otherwise an empty frame.
user_preferences = pd.read_csv(file) if path.exists(file) else pd.DataFrame()

#### Recommend next courses based on user ratings

In [None]:
# Heading printed above the recommendations.
print(color.BOLD + "\nCourses recommended based on what you've rated so far: \n" + color.END)
# Score the courses from the topic's attribute table and the user's
# stored ratings.
top_pred = user_pref_sim_matrix(topic_attributes, user_preferences)
# Bare expression: presumably rendered as the output of the notebook cell.
top_pred

#### Show most similar courses to course user chose

In [None]:
print(color.BOLD + "\nCourses recommended based course you just took: \n" + color.END)

# 'id' identifies a course, it does not describe it: leave it out of the
# vectors being compared, otherwise it takes part in the cosine
# similarity like any attribute and distorts every score.
similarity = similarity_matrix(topic_attributes.drop(columns=['id'], errors='ignore'))

# Remove the chosen course by label instead of assuming it sorts first:
# a course with identical attributes ties with it at the top, so skipping
# "the first entry" could drop the wrong course.
# NOTE(review): assumes course titles are unique in topic_attributes.
corr_top5 = pd.DataFrame(
    similarity.loc[course].drop(course).sort_values(ascending=False).head(5)
)
corr_top5