In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import DistanceMetric
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import nltk
import numpy as np

loading data

In [2]:
colnames = ['course_id','course_name', 'link', 'course_details']
data = pd.read_csv('courses.csv',skiprows=[0],names=colnames)
data

Unnamed: 0,course_id,course_name,link,course_details
0,CMPE 030,Programming Concepts and Methodology,http://info.sjsu.edu/web-dbgen/catalog/courses...,Introduction to programming; overview of compu...
1,CMPE 050,Object-Oriented Concepts and Methodology,http://info.sjsu.edu/web-dbgen/catalog/courses...,Application of object-oriented software engine...
2,CMPE 102,Assembly Language Programming,http://info.sjsu.edu/web-dbgen/catalog/courses...,Assembly programming; assembly-C interface; CP...
3,CMPE 110,Electronics for Computing Systems,http://info.sjsu.edu/web-dbgen/catalog/courses...,"RC, RL and RLC circuit analysis, diodes and di..."
4,CMPE 120,Computer Organization and Architecture,http://info.sjsu.edu/web-dbgen/catalog/courses...,Introduction to computer organization and arch...
5,CMPE 124,Digital Design I,http://info.sjsu.edu/web-dbgen/catalog/courses...,Combinational and sequential logic theory and ...
6,CMPE 125,Digital Design II,http://info.sjsu.edu/web-dbgen/catalog/courses...,"Digital system building blocks, data path and ..."
7,CMPE 126,Algorithms and Data Structure Design,http://info.sjsu.edu/web-dbgen/catalog/courses...,Object-oriented data organization and represen...
8,CMPE 127,Microprocessor Design I,http://info.sjsu.edu/web-dbgen/catalog/courses...,Microprocessor architecture and assembly langu...
9,CMPE 130,Advanced Algorithm Design,http://info.sjsu.edu/web-dbgen/catalog/courses...,Design and analysis of data structures and alg...


converting to list

In [17]:
new_df= data[['course_details']]

col_course_details = data.course_details.tolist ()

col_course_id=data.course_id.tolist()

col_course_name=data.course_name.tolist()
new_df

Unnamed: 0,course_details
0,Introduction to programming; overview of compu...
1,Application of object-oriented software engine...
2,Assembly programming; assembly-C interface; CP...
3,"RC, RL and RLC circuit analysis, diodes and di..."
4,Introduction to computer organization and arch...
5,Combinational and sequential logic theory and ...
6,"Digital system building blocks, data path and ..."
7,Object-oriented data organization and represen...
8,Microprocessor architecture and assembly langu...
9,Design and analysis of data structures and alg...


In [4]:
pre_processing = ""
pre_processing = pre_processing.join ( col_course_details )

feature engineering-remove punctuation

In [5]:
tokenizer = RegexpTokenizer ( r'\w+' )
tokenizer.tokenize ( pre_processing )
sentences = nltk.sent_tokenize ( pre_processing )

text processing

In [6]:
stemmer = PorterStemmer ()
for i in range ( len ( sentences ) ):
    wordsStemmer = nltk.word_tokenize ( sentences[i] )
    wordsStemmer = [stemmer.stem ( word ) for word in wordsStemmer]
    sentences[i] = ' '.join ( wordsStemmer )

text processing two words are same then it will normalization

In [7]:
lemmatizer = WordNetLemmatizer ()
for i in range ( len ( sentences ) ):
    wordslemmatizer = nltk.word_tokenize ( sentences[i] )
    wordslemmatizer = [lemmatizer.lemmatize ( word ) for word in wordslemmatizer]
    sentences[i] = ' '.join ( wordslemmatizer )

vector generation

In [8]:
sentences = sentences[0].split ( '.' )
del sentences[-1]
stopWords = stopwords.words ( 'english' )

vectorizer = CountVectorizer ( stop_words=stopWords )

featurevectors = vectorizer.fit_transform ( col_course_details ).todense ()


cosine similarity

In [9]:
def cosine(test2):
    global featurevectors
    cosine_similarities = linear_kernel ( test2, featurevectors ).flatten ()
    related_docs_indices = cosine_similarities.argsort ()[:-5:-1]
    related_docs_indices_list = related_docs_indices.tolist()

    course_name={}
    for i in related_docs_indices:
        course_name[col_course_id[i]]=col_course_name[i]
        
    result = []
    for i in related_docs_indices_list:
        result.append(col_course_details[i])
        
    return (course_name)

Knn model

In [10]:
def build_model_knn(test2):
    neigh = NearestNeighbors ( n_neighbors=5 )
    global featurevectors
    neigh.fit ( featurevectors )
    NearestNeighbors ( algorithm='auto', leaf_size=30 )

    final_knn = neigh.kneighbors ( test2, return_distance=False )
    final_knn_list = final_knn.tolist()
    return final_knn_list

tf-idf

In [11]:
#getting the data
data_frame = pd.read_csv('courses.csv' , index_col = False)
data_frame = data_frame.loc[:, ~data_frame.columns.str.match('Unnamed')]

new_data_frame= data_frame[['course id','description','name']]


#Making into vectors
tfidfvectorizer = TfidfVectorizer()
tfidfmatrix = tfidfvectorizer.fit_transform(new_data_frame['description'])

data_frame = pd.DataFrame(tfidfmatrix.toarray())

# Caluculating similarity 
cosine_sim = cosine_similarity(data_frame)
df_cosineSim = pd.DataFrame(cosine_sim)

#Recommendations
def recommendations(title, cosine_sim = cosine_sim):
    recommended_course_name={}
    idx = new_data_frame[new_data_frame['description'].str.contains(title, case=False)].index[0]  

    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    top_5_indexes = list(score_series.iloc[1:6].index)
    
    for i in top_5_indexes:
        recommended_course_name[list(new_data_frame['course id'])[i]]=(list(new_data_frame['name'])[i])
        
    
    return (recommended_course_name)

In [12]:
#print(recommendations("cloud computing"))

In [16]:
input=["software testing"]
vector_input = vectorizer.transform ( input ).toarray ()
build_model_knn ( vector_input )
result_cosine =cosine ( vector_input )
result_tfidf=recommendations(input[0])
#print(result_cosine)
final_output={}
for i in result_cosine:
    if i in result_tfidf:
        final_output[i]=result_tfidf[i]
print(final_output)

{'CMPE 287': 'Software Quality Assurance and Testing', 'CMPE 187': 'Software Quality Engineering', 'CMPE 133': 'Software Engineering II'}
