In [4]:
import numpy as np
import math
import pandas as pd
import re
from collections import Counter
import os

def readData(file):
    with open(file, encoding="utf-8") as file:
        content = file.read()
    return content

def buildUnigramModel(text):                                
    data = []
    
    content_lower = text.lower()                            #mengecilkan huruf pada teks
    for word in content_lower.split():                      #perulangan sebanyak teks yang sudah dikecilkan dan di split menjadi token
        cleanContent = re.sub('["()&+,./;“”\"]', '', word)  #menghilangkan punctuation
        data.append(cleanContent)                           #menyimpan unigram ke list 'data[]'
    
    pd.DataFrame(data).to_csv('files/unigramModel.csv', index=False, header=False) #menyimpan model unigram ke csv agar memudahkan penghitungan
    return data

def buildBigramModel(data):                                 #fungsi bigram modelnya
    key = []                                                #key merupakan list yang akan menyimpan kumpulan bigram
    countBigram = {}                                        #countBigram dan countUnigram, array yang menyimpan jumlah bigram dan unigram yang ada
    countUnigram = {}

    i = 1
    for i in range(len(data)):
        if i < len(data) - 1:
            key.append((data[i], data[i + 1]))              #menyimpan bigram(i, i+1) ke list key[]
            if (data[i], data[i + 1]) in countBigram:
                countBigram[(data[i], data[i + 1])] += 1
            else:
                countBigram[(data[i], data[i + 1])] = 1
        if data[i] in countUnigram:
            countUnigram[data[i]] += 1                       
        else:                                              
            countUnigram[data[i]] = 1

    return key, countBigram, countUnigram

def probBigram(key, countBigram, countUnigram):
    valueBigram = {}                                                                     
    for bigramNew in key:
        word1 = bigramNew[0]                                                              #men-assign nilai Ci
        valueBigram[bigramNew] = (countBigram.get(bigramNew) / (countUnigram.get(word1))) #P() = P(Ci,Ci+1)/P(Ci)

    pd.Series(valueBigram).to_csv('files/bigramModel.csv', header=False)                        #menyimpan model bigram dan probabilitasnya ke csv agar memudahkan penghitungan

    return valueBigram

def laplaceSmoothing(key, countBigram, countUnigram):       #untuk menghindari "Zero Probability"
    valueBigram = {}
    cStar = {}

    for bigram in key:
        word1 = bigram[0]
        valueBigram[bigram] = (countBigram.get(bigram) + 1) / (countUnigram.get(word1) + len(countUnigram))          #perhitungan P = (Ci + 1)/N + V
        cStar[bigram] = (countBigram[bigram] + 1) * countUnigram[word1] / (countUnigram[word1] + len(countUnigram))  #cStar = (Ci + 1)*(N/N+V)
                                                                                                                     #N merupakan countUnigram dan V merupakan vocab dari unigram
    pd.Series(valueBigram).to_csv('files/laplaceSmoothingResult.csv', header=False)

    return valueBigram, cStar

def nextBestWord():

    dataUnigram = pd.read_csv('files/unigramModel.csv', names=['words'])['words'].tolist()         #membaca data model unigram dari csv
    dataBigram = pd.read_csv('files/laplaceSmoothingResult.csv', header=None).values.tolist()      #membaca data model bigram dari csv

    word1 = 'of'
    word2 = 'update'
    word3 = 'hopes'
    while (word1 == 'of'):

        if (word1 in dataUnigram):
            nextBest = []                                    #variable untuk menyimpan list kata selanjutnya

            for row in dataBigram:
                if word1 == row[0]:                          #ketika kata yang diinputkan sama dengan kata pada baris 1 di file csv
                    nextBest.append((row[2], row[1]))        #memasukan nilai ke list nextBest
            
            nextBest.sort(reverse = True)                    #sort berdasar value tertinggi
            
            print('Next best words of "' + word1 + '" is: \n')
            print(nextBest[0])
            print('---------')
            break
        else:
            print('No data in dataframe.')
    
    while (word2 == 'update'):

        if (word2 in dataUnigram):
            nextBest = []

            for row in dataBigram:
                if word2 == row[0]:
                    nextBest.append((row[2], row[1]))
            
            nextBest.sort(reverse = True)                      #sort berdasar value tertinggi
            
            print('Next best words of "' + word2 + '" is: \n')
            print(nextBest[0])
            print('---------')
            break
        else:
            print('No data in dataframes.')
    
    while (word3 == 'hopes'):

        if (word3 in dataUnigram):
            nextBest = []

            for row in dataBigram:
                if word3 == row[0]:
                    nextBest.append((row[2], row[1]))
            
            nextBest.sort(reverse = True)                       #sort berdasar value tertinggi
            
            print('Next best words of "' + word3 + '" is: \n')
            print(nextBest[0])
            print('---------')
            break
        else:
            print('No data in dataframes.')

def nextTenBestWords():
    unigram = pd.read_csv('unigramModel.csv', names=['words'])['words'].tolist()             #membaca data dari csv untuk penghitungan
    dataBigram = pd.read_csv('laplaceSmoothingResult.csv', header=None).values.tolist()      #membaca data dari csv untuk penghitungan

    word1 = 'of'
    word2 = 'update'
    word3 = 'hopes'
    while (word1 == 'of'):
        
        if (word1 in dataUnigram):
            bestTen = []

            for row in dataBigram:
                if word1 == row[0]:
                    bestTen.append((row[2], row[1]))

            bestTen.sort(reverse = True)                        #sort berdasar value tertinggi
            
            print('Next best TEN words of "' + word1 + '" is: \n')
            print(bestTen[0:10])
            print('---------')
            break
        else:
            print('No data in dataframes.')
    
    while (word2 == 'update'):
        
        if (word2 in dataUnigram):
            bestTen = []

            for row in dataBigram:
                if word2 == row[0]:
                    bestTen.append((row[2], row[1]))

            bestTen.sort(reverse = True)                        #sort berdasar value tertinggi
            
            print('Next best TEN words of "' + word2 + '" is: \n')
            print(bestTen[0:10])
            print('---------')
            break
        else:
            print('No data in dataframes.')
    
    while (word3 == 'hopes'):
        
        if (word3 in dataUnigram):
            bestTen = []

            for row in dataBigram:
                if word3 == row[0]:
                    bestTen.append((row[2], row[1]))

            bestTen.sort(reverse = True)                        #sort berdasar value tertinggi
            
            print('Next best TEN words of "' + word3 + '" is: ')
            print(bestTen[0:10])
            print('---------')
            break
        else:
            print('No data in dataframes.')

if __name__ == '__main__':
    
    data = pd.read_csv("files/text.csv") #khusus no. 1
    
    text = readData("files/text.csv")    #untuk pendefinisian argumen pada functions
    text = buildUnigramModel(text)
    
    key, countBigram, countUnigram = buildBigramModel(text)                                          
    bigramProb = probBigram(key, countBigram, countUnigram)                       
    bigramAddOne, addOneCstar = laplaceSmoothing(key,countBigram,countUnigram)
    
    unigramModel = pd.read_csv("files/unigramModel.csv")
    bigramModel = pd.read_csv("files/bigramModel.csv", names=['W1', 'W2', 'Prob'])
    
    print("TUGAS LANGUAGE MODELING NLP - SFY")
    print("SILAKAN MASUKKAN IDENTITAS ANDA\n")
    
    Nama = 'Naufal HIlmiaji'
    NIM = '1301174314'

    os.system("pause")
    os.system("cls")
    
    print("\nTUGAS 1. TAMPILKAN 5 BARIS PERTAMA DARI DATASET")
    print()
    print("HASIL : ")
    print(data.head())

    os.system("pause")
    os.system("cls")

    print("\nTUGAS 2. BUAT MODEL UNIGRAM")
    print()
    print("HASIL : ")
    print(unigramModel)

    os.system("pause")
    os.system("cls")

    print("\nTUGAS 3. BUAT MODEL BIGRAM")
    print()
    print("HASIL : ")
    print(bigramModel[1:])    

    os.system("pause")
    os.system("cls")
    
    print("\nTUGAS 4. MENAMPILKAN NEXT BEST WORD")
    print()
    print("HASIL : ")
    nextBestWord()

    os.system("pause")
    os.system("cls")

    print("SELAMAT", Nama , "dengan NIM ", NIM, " ANDA SUDAH MENYELESAIKAN TUGAS LANGUAGE MODELING NLP-SFY")

TUGAS LANGUAGE MODELING NLP - SFY
SILAKAN MASUKKAN IDENTITAS ANDA


TUGAS 1. TAMPILKAN 5 BARIS PERTAMA DARI DATASET

HASIL : 
                                                text
0  Oh, how the headlines blared: Chatbots were Th...
1  If you’ve ever found yourself looking up the s...
2  Machine learning is increasingly moving from h...
3  If your understanding of A.I. and Machine Lear...
4  Want to learn about applied Artificial Intelli...

TUGAS 2. BUAT MODEL UNIGRAM

HASIL : 
             text
0              oh
1             how
2             the
3       headlines
4         blared:
...           ...
190784     design
190785        and
190786       data
190787    science
190788        NaN

[190789 rows x 1 columns]

TUGAS 3. BUAT MODEL BIGRAM

HASIL : 
              W1         W2      Prob
1             oh        how  0.400000
2            how        the  0.049844
3            the  headlines  0.000332
4      headlines    blared:  0.500000
5        blared:   chatbots  1.000000
...     