In [1]:
# NOTE(review): `mp` and `np` are not referenced in any cell visible here -- confirm they are still needed.
import multiprocessing as mp
import numpy as np
import pandas as pd
# Display every row and the full text of every cell when a DataFrame is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
import nltk
# Fetch the NLTK stop-word corpus that preprocess_text reads via stopwords.words('english').
# NOTE(review): preprocess_text also calls nltk's word_tokenize, which presumably needs its
# own tokenizer data to be installed -- confirm on a fresh environment.
nltk.download('stopwords', quiet=True)



True

In [2]:
def preprocess_text(text):
    """Run a single document through the text-cleaning pipeline.

    Stages, in order: lowercase, strip punctuation and surrounding
    whitespace, tokenize, drop English stop words, Porter-stem.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    pandas.DataFrame
        A one-row frame with one column per stage: 'DOCUMENT', 'LOWERCASE',
        'CLEANING', 'TOKENIZATION', 'STOP-WORDS' and 'STEMMING'. The last
        three columns each hold a list of tokens.
    """
    import re
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk.tokenize import word_tokenize

    lowered = text.lower()

    # Remove every character that is neither a word character nor whitespace,
    # then trim leading/trailing whitespace.
    cleaned = re.sub(r'[^\w\s]', '', lowered).strip()

    # Split the cleaned sentence into a list of tokens.
    tokens = word_tokenize(cleaned)

    # Keep only the tokens that are not English stop words.
    english_stops = set(stopwords.words('english'))
    kept = list(filter(lambda tok: tok not in english_stops, tokens))

    # Reduce each remaining token to its stem.
    stemmer = PorterStemmer()
    stems = list(map(stemmer.stem, kept))

    # One single-element list per stage, so the frame has exactly one row.
    stages = {
        'DOCUMENT': [text],
        'LOWERCASE': [lowered],
        'CLEANING': [cleaned],
        'TOKENIZATION': [tokens],
        'STOP-WORDS': [kept],
        'STEMMING': [stems],
    }
    return pd.DataFrame(stages)
        
def calculate_tfidf(corpus):
    """Preprocess a corpus and append TF-IDF weights for its stemmed tokens.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must have a 'DOCUMENT' column of raw text. Any other columns (for
        example the output of an earlier preprocessing run) are ignored,
        because the documents are always preprocessed again here.

    Returns
    -------
    pandas.DataFrame
        The preprocessing columns followed by one column per vocabulary term
        of the fitted TfidfVectorizer, holding that term's weight in each
        document.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    # `preprocessing` is defined in a later notebook cell; that cell must have
    # been run before this function is called.
    df = preprocessing(corpus)

    # Join each row's stem list into one space-separated string. The stems are
    # read from the frame just built rather than from `corpus`, because a raw
    # corpus has no 'STEMMING' column and would raise KeyError.
    stemming = df['STEMMING'].apply(' '.join)

    # Fit TF-IDF on the stemmed documents.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(stemming)

    # The vectorizer's vocabulary terms become the column headers.
    feature_names = vectorizer.get_feature_names_out()

    # Put the weights side by side with the preprocessing columns; sharing the
    # index keeps the two frames aligned row for row.
    df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=df.index)
    df_tfidf = pd.concat([df, df_tfidf], axis=1)

    return df_tfidf

def cosineSimilarity(corpus):
    """Score every document against the first one by TF-IDF cosine similarity.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Forwarded unchanged to calculate_tfidf.

    Returns
    -------
    pandas.DataFrame
        The calculate_tfidf result with one extra column, 'COSIM', holding
        each row's cosine similarity to row 0. Row 0 is compared with itself
        as well.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    weighted = calculate_tfidf(corpus)

    # The first six columns are skipped; everything from position 6 onward is
    # treated as the TF-IDF weights.
    first_weight_col = 6

    # Row 0 as a 1 x n_terms matrix, and the weights of all rows (row 0 included).
    reference = weighted.iloc[0, first_weight_col:].values.reshape(1, -1)
    all_rows = weighted.iloc[:, first_weight_col:].values

    # The similarity result has a single row (one per reference vector);
    # flatten it to one score per document.
    scores = cosine_similarity(reference, all_rows).flatten()

    # Attach the scores as a new trailing column.
    score_column = pd.DataFrame({'COSIM': scores})
    return pd.concat([weighted, score_column], axis=1)

In [3]:
# File with all proposal titles from the Information Systems study program at UINSU Medan.
# NOTE(review): the rows displayed below look like English sentences about hobbies rather
# than proposal titles -- confirm this description is still accurate.
data = pd.read_csv('data.csv', delimiter=';', encoding='latin')
data

Unnamed: 0,DOCUMENT
0,"Being into sports is more than just exercising. It's a lifestyle that gives me energy, discipline, and a sense of achievement every day."
1,Playing the guitar and singing helps me express my feelings. It's like creating my own language through music that speaks to my heart.
2,"I really like watching movies. Whether it's a funny one or a touching story, I enjoy the different feelings they bring."
3,"Whether it's hitting the gym or doing outdoor activities, staying active and feeling that exercise buzz is a crucial part of my commitment to a healthy lifestyle."


In [4]:
def preprocessing(corpus):
    """Run preprocess_text over every document and stack the results.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must have a 'DOCUMENT' column; each value is passed to
        preprocess_text.

    Returns
    -------
    pandas.DataFrame
        The frames returned by preprocess_text, concatenated in document
        order under a fresh 0..n-1 index. When the corpus has no documents,
        an empty frame with only a 'DOCUMENT' column.
    """
    # Collect the per-document frames first and concatenate once: calling
    # concat inside the loop recopies all earlier rows on every pass and
    # starts from an empty frame.
    frames = [preprocess_text(doc) for doc in corpus['DOCUMENT']]

    if not frames:
        # pd.concat rejects an empty list; return the same empty
        # single-column frame the loop-based version started from.
        return pd.DataFrame(columns=['DOCUMENT'])

    return pd.concat(frames, ignore_index=True)

# Preprocess every document loaded into `data` and display the stage-by-stage result.
result_preprocessing = preprocessing(data)
result_preprocessing

Unnamed: 0,DOCUMENT,LOWERCASE,CLEANING,TOKENIZATION,STOP-WORDS,STEMMING
0,"Being into sports is more than just exercising. It's a lifestyle that gives me energy, discipline, and a sense of achievement every day.","being into sports is more than just exercising. it's a lifestyle that gives me energy, discipline, and a sense of achievement every day.",being into sports is more than just exercising its a lifestyle that gives me energy discipline and a sense of achievement every day,"[being, into, sports, is, more, than, just, exercising, its, a, lifestyle, that, gives, me, energy, discipline, and, a, sense, of, achievement, every, day]","[sports, exercising, lifestyle, gives, energy, discipline, sense, achievement, every, day]","[sport, exercis, lifestyl, give, energi, disciplin, sens, achiev, everi, day]"
1,Playing the guitar and singing helps me express my feelings. It's like creating my own language through music that speaks to my heart.,playing the guitar and singing helps me express my feelings. it's like creating my own language through music that speaks to my heart.,playing the guitar and singing helps me express my feelings its like creating my own language through music that speaks to my heart,"[playing, the, guitar, and, singing, helps, me, express, my, feelings, its, like, creating, my, own, language, through, music, that, speaks, to, my, heart]","[playing, guitar, singing, helps, express, feelings, like, creating, language, music, speaks, heart]","[play, guitar, sing, help, express, feel, like, creat, languag, music, speak, heart]"
2,"I really like watching movies. Whether it's a funny one or a touching story, I enjoy the different feelings they bring.","i really like watching movies. whether it's a funny one or a touching story, i enjoy the different feelings they bring.",i really like watching movies whether its a funny one or a touching story i enjoy the different feelings they bring,"[i, really, like, watching, movies, whether, its, a, funny, one, or, a, touching, story, i, enjoy, the, different, feelings, they, bring]","[really, like, watching, movies, whether, funny, one, touching, story, enjoy, different, feelings, bring]","[realli, like, watch, movi, whether, funni, one, touch, stori, enjoy, differ, feel, bring]"
3,"Whether it's hitting the gym or doing outdoor activities, staying active and feeling that exercise buzz is a crucial part of my commitment to a healthy lifestyle.","whether it's hitting the gym or doing outdoor activities, staying active and feeling that exercise buzz is a crucial part of my commitment to a healthy lifestyle.",whether its hitting the gym or doing outdoor activities staying active and feeling that exercise buzz is a crucial part of my commitment to a healthy lifestyle,"[whether, its, hitting, the, gym, or, doing, outdoor, activities, staying, active, and, feeling, that, exercise, buzz, is, a, crucial, part, of, my, commitment, to, a, healthy, lifestyle]","[whether, hitting, gym, outdoor, activities, staying, active, feeling, exercise, buzz, crucial, part, commitment, healthy, lifestyle]","[whether, hit, gym, outdoor, activ, stay, activ, feel, exercis, buzz, crucial, part, commit, healthi, lifestyl]"


In [5]:
# calculate_tfidf preprocesses the 'DOCUMENT' column itself, so the preprocessing
# step is repeated inside this call.
result_tfidf = calculate_tfidf(result_preprocessing)
result_tfidf

Unnamed: 0,DOCUMENT,LOWERCASE,CLEANING,TOKENIZATION,STOP-WORDS,STEMMING,achiev,activ,bring,buzz,...,realli,sens,sing,speak,sport,stay,stori,touch,watch,whether
0,"Being into sports is more than just exercising. It's a lifestyle that gives me energy, discipline, and a sense of achievement every day.","being into sports is more than just exercising. it's a lifestyle that gives me energy, discipline, and a sense of achievement every day.",being into sports is more than just exercising its a lifestyle that gives me energy discipline and a sense of achievement every day,"[being, into, sports, is, more, than, just, exercising, its, a, lifestyle, that, gives, me, energy, discipline, and, a, sense, of, achievement, every, day]","[sports, exercising, lifestyle, gives, energy, discipline, sense, achievement, every, day]","[sport, exercis, lifestyl, give, energi, disciplin, sens, achiev, everi, day]",0.328919,0.0,0.0,0.0,...,0.0,0.328919,0.0,0.0,0.328919,0.0,0.0,0.0,0.0,0.0
1,Playing the guitar and singing helps me express my feelings. It's like creating my own language through music that speaks to my heart.,playing the guitar and singing helps me express my feelings. it's like creating my own language through music that speaks to my heart.,playing the guitar and singing helps me express my feelings its like creating my own language through music that speaks to my heart,"[playing, the, guitar, and, singing, helps, me, express, my, feelings, its, like, creating, my, own, language, through, music, that, speaks, to, my, heart]","[playing, guitar, singing, helps, express, feelings, like, creating, language, music, speaks, heart]","[play, guitar, sing, help, express, feel, like, creat, languag, music, speak, heart]",0.0,0.0,0.0,0.0,...,0.0,0.0,0.301115,0.301115,0.0,0.0,0.0,0.0,0.0,0.0
2,"I really like watching movies. Whether it's a funny one or a touching story, I enjoy the different feelings they bring.","i really like watching movies. whether it's a funny one or a touching story, i enjoy the different feelings they bring.",i really like watching movies whether its a funny one or a touching story i enjoy the different feelings they bring,"[i, really, like, watching, movies, whether, its, a, funny, one, or, a, touching, story, i, enjoy, the, different, feelings, they, bring]","[really, like, watching, movies, whether, funny, one, touching, story, enjoy, different, feelings, bring]","[realli, like, watch, movi, whether, funni, one, touch, stori, enjoy, differ, feel, bring]",0.0,0.0,0.292972,0.0,...,0.292972,0.0,0.0,0.0,0.0,0.0,0.292972,0.292972,0.292972,0.230982
3,"Whether it's hitting the gym or doing outdoor activities, staying active and feeling that exercise buzz is a crucial part of my commitment to a healthy lifestyle.","whether it's hitting the gym or doing outdoor activities, staying active and feeling that exercise buzz is a crucial part of my commitment to a healthy lifestyle.",whether its hitting the gym or doing outdoor activities staying active and feeling that exercise buzz is a crucial part of my commitment to a healthy lifestyle,"[whether, its, hitting, the, gym, or, doing, outdoor, activities, staying, active, and, feeling, that, exercise, buzz, is, a, crucial, part, of, my, commitment, to, a, healthy, lifestyle]","[whether, hitting, gym, outdoor, activities, staying, active, feeling, exercise, buzz, crucial, part, commitment, healthy, lifestyle]","[whether, hit, gym, outdoor, activ, stay, activ, feel, exercis, buzz, crucial, part, commit, healthi, lifestyl]",0.0,0.511775,0.0,0.255888,...,0.0,0.0,0.0,0.0,0.0,0.255888,0.0,0.0,0.0,0.201745


In [6]:
# cosineSimilarity calls calculate_tfidf internally, so the TF-IDF step is run
# again inside this call; the added 'COSIM' column is each row's similarity to row 0.
cosim_result = cosineSimilarity(result_tfidf)
cosim_result

Unnamed: 0,DOCUMENT,LOWERCASE,CLEANING,TOKENIZATION,STOP-WORDS,STEMMING,achiev,activ,bring,buzz,...,sens,sing,speak,sport,stay,stori,touch,watch,whether,COSIM
0,"Being into sports is more than just exercising. It's a lifestyle that gives me energy, discipline, and a sense of achievement every day.","being into sports is more than just exercising. it's a lifestyle that gives me energy, discipline, and a sense of achievement every day.",being into sports is more than just exercising its a lifestyle that gives me energy discipline and a sense of achievement every day,"[being, into, sports, is, more, than, just, exercising, its, a, lifestyle, that, gives, me, energy, discipline, and, a, sense, of, achievement, every, day]","[sports, exercising, lifestyle, gives, energy, discipline, sense, achievement, every, day]","[sport, exercis, lifestyl, give, energi, disciplin, sens, achiev, everi, day]",0.328919,0.0,0.0,0.0,...,0.328919,0.0,0.0,0.328919,0.0,0.0,0.0,0.0,0.0,1.0
1,Playing the guitar and singing helps me express my feelings. It's like creating my own language through music that speaks to my heart.,playing the guitar and singing helps me express my feelings. it's like creating my own language through music that speaks to my heart.,playing the guitar and singing helps me express my feelings its like creating my own language through music that speaks to my heart,"[playing, the, guitar, and, singing, helps, me, express, my, feelings, its, like, creating, my, own, language, through, music, that, speaks, to, my, heart]","[playing, guitar, singing, helps, express, feelings, like, creating, language, music, speaks, heart]","[play, guitar, sing, help, express, feel, like, creat, languag, music, speak, heart]",0.0,0.0,0.0,0.0,...,0.0,0.301115,0.301115,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"I really like watching movies. Whether it's a funny one or a touching story, I enjoy the different feelings they bring.","i really like watching movies. whether it's a funny one or a touching story, i enjoy the different feelings they bring.",i really like watching movies whether its a funny one or a touching story i enjoy the different feelings they bring,"[i, really, like, watching, movies, whether, its, a, funny, one, or, a, touching, story, i, enjoy, the, different, feelings, they, bring]","[really, like, watching, movies, whether, funny, one, touching, story, enjoy, different, feelings, bring]","[realli, like, watch, movi, whether, funni, one, touch, stori, enjoy, differ, feel, bring]",0.0,0.0,0.292972,0.0,...,0.0,0.0,0.0,0.0,0.0,0.292972,0.292972,0.292972,0.230982,0.0
3,"Whether it's hitting the gym or doing outdoor activities, staying active and feeling that exercise buzz is a crucial part of my commitment to a healthy lifestyle.","whether it's hitting the gym or doing outdoor activities, staying active and feeling that exercise buzz is a crucial part of my commitment to a healthy lifestyle.",whether its hitting the gym or doing outdoor activities staying active and feeling that exercise buzz is a crucial part of my commitment to a healthy lifestyle,"[whether, its, hitting, the, gym, or, doing, outdoor, activities, staying, active, and, feeling, that, exercise, buzz, is, a, crucial, part, of, my, commitment, to, a, healthy, lifestyle]","[whether, hitting, gym, outdoor, activities, staying, active, feeling, exercise, buzz, crucial, part, commitment, healthy, lifestyle]","[whether, hit, gym, outdoor, activ, stay, activ, feel, exercis, buzz, crucial, part, commit, healthi, lifestyl]",0.0,0.511775,0.0,0.255888,...,0.0,0.0,0.0,0.0,0.255888,0.0,0.0,0.0,0.201745,0.104634
