<a href="https://colab.research.google.com/github/muhajirakbarhsb/NLP_class_2023/blob/main/Class_Meeting_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Representation

## OneHot Encoding

In [1]:
from sklearn.preprocessing import OneHotEncoder
import itertools

# two example documents
docs = ["cat", "dog", "bat", "ate"]

# Split documents into tokens
tokens_docs = [doc.split(" ") for doc in docs]

# Convert list of token-lists to one flat list of tokens
# and then create a dictionary that maps word to id of word
all_tokens = itertools.chain.from_iterable(tokens_docs)
word_to_id = {token: idx for idx, token in enumerate(set(all_tokens))}

# Convert token lists to token-id lists
token_ids = [[word_to_id[token] for token in tokens_doc] for tokens_doc in tokens_docs]

# Convert list of token-id lists to one-hot representation
vec = OneHotEncoder(categories="auto")
X = vec.fit_transform(token_ids)

# Convert the one-hot encoded vectors back to text representations
inverse_mapping = {idx: token for token, idx in word_to_id.items()}

# Display the original text representations
for i, row in enumerate(X.toarray()):
    print(f"Original Text: {docs[i]}")
    print("One-Hot Encoded Vector:", row)
    decoded_text = [inverse_mapping[idx] for idx, val in enumerate(row) if val == 1]
    print("Decoded Text:", " ".join(decoded_text))
    print()


Original Text: cat
One-Hot Encoded Vector: [0. 1. 0. 0.]
Decoded Text: cat

Original Text: dog
One-Hot Encoded Vector: [0. 0. 0. 1.]
Decoded Text: dog

Original Text: bat
One-Hot Encoded Vector: [1. 0. 0. 0.]
Decoded Text: bat

Original Text: ate
One-Hot Encoded Vector: [0. 0. 1. 0.]
Decoded Text: ate



In [2]:
tokens_docs

[['cat'], ['dog'], ['bat'], ['ate']]

In [3]:
word_to_id

{'bat': 0, 'cat': 1, 'ate': 2, 'dog': 3}

## Bag of Words countVectorizer


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
text = ["i love nlp. nlp is so cool"]
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
print(vectorizer.vocabulary_)
# Output: {'love': 2, 'nlp': 3, 'is': 1, 'so': 4, 'cool': 0}
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape) # Output: (1, 5)
print(vector.toarray())

{'love': 2, 'nlp': 3, 'is': 1, 'so': 4, 'cool': 0}
(1, 5)
[[1 1 1 2 1]]


## TF-IDF

In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re


paragraph = """The news mentioned here is fake. Audience do not encourage fake news. Fake news is false or misleading"""

sentences = nltk.sent_tokenize(paragraph)

lemmatizer = WordNetLemmatizer()

corpus = []

# Lemmatization
for i in range(len(sentences)):
    sent = re.sub('[^a-zA-Z]', ' ', sentences[i])
    sent = sent.lower()
    sent= sent.split()
    sent = [lemmatizer.lemmatize(word) for word in sent if not word in set(stopwords.words('english'))]
    sent = ' '.join(sent)
    corpus.append(sent)


print(corpus)


words_unique = []
for i in range(len(corpus)):
    unique = nltk.word_tokenize(corpus[i])
    words_unique.append(unique)

print(words_unique)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


['news mentioned fake', 'audience encourage fake news', 'fake news false misleading']
[['news', 'mentioned', 'fake'], ['audience', 'encourage', 'fake', 'news'], ['fake', 'news', 'false', 'misleading']]


In [7]:
# Creating the TF-IDF model
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
independentFeatures_tfIDF = tfidf.fit_transform(corpus).toarray()
tfidf_df = pd.DataFrame(data=independentFeatures_tfIDF, columns=tfidf.get_feature_names_out())

# Add an index column to represent the documents
tfidf_df.index = [f"Document {i+1}" for i in range(len(corpus))]

# Display the TF-IDF DataFrame
print(tfidf_df)

            audience  encourage      fake     false  mentioned  misleading  \
Document 1  0.000000   0.000000  0.453295  0.000000   0.767495    0.000000   
Document 2  0.608845   0.608845  0.359594  0.000000   0.000000    0.000000   
Document 3  0.000000   0.000000  0.359594  0.608845   0.000000    0.608845   

                news  
Document 1  0.453295  
Document 2  0.359594  
Document 3  0.359594  


## n-gram

In [8]:
import re
from nltk.util import ngrams

def generate_ngrams(text, n):
    # Tokenize the text into words
    words = re.findall(r'\w+', text)

    # Generate n-grams using NLTK's ngrams function
    ngrams_list = list(ngrams(words, n))

    return ngrams_list

# Example text
text = "This is an example sentence for generating n-grams."

# Generate bi-grams (2-grams)
bigrams = generate_ngrams(text, 2)
print("Bi-grams:")
for gram in bigrams:
    print(gram)

# Generate tri-grams (3-grams)
trigrams = generate_ngrams(text, 3)
print("\nTri-grams:")
for gram in trigrams:
    print(gram)

Bi-grams:
('This', 'is')
('is', 'an')
('an', 'example')
('example', 'sentence')
('sentence', 'for')
('for', 'generating')
('generating', 'n')
('n', 'grams')

Tri-grams:
('This', 'is', 'an')
('is', 'an', 'example')
('an', 'example', 'sentence')
('example', 'sentence', 'for')
('sentence', 'for', 'generating')
('for', 'generating', 'n')
('generating', 'n', 'grams')


## Load Data

In [9]:
import pandas as pd
import numpy as np

In [10]:
df = pd.read_csv('clean_review.csv')
df.head(10)

Unnamed: 0,at,content,score,userName,contentp,contentp_clean,text_length
0,2023-09-22 04:23:14,Baik,5,Athar Rl,baik,baik,4
1,2023-09-22 03:45:31,"Gak bisa log in bos, gimna nih",1,PRO GAMING,gak bisa log in bos gimna nih,bisa log in bos gimna nih,25
2,2023-09-22 01:42:30,Sering kembali. Selalu minta login. Minta otp....,1,Iksan Fahrozi,sering kembali selalu minta login minta otp ha...,sering selalu minta login minta otp bongkar pa...,115
3,2023-09-21 18:23:05,Ini kenapa kalau mau priksa perangkat terhubun...,2,Kipuy Sama,ini kenapa kalau mau priksa perangkat terhubun...,kenapa kalau mau priksa perangkat hubung selal...,86
4,2023-09-21 11:52:14,Mengapa kouta bonus 25gb/bulan selama 6 bulan ...,2,Daniel Chainci,mengapa kouta bonus gb bulan selama bulan saya...,kouta bonus gb bulan lama bulan tidak masuk ba...,65
5,2023-09-21 07:31:30,Sering muncul peringatan 'anda tidak terhubung...,5,Wilson Nts,sering muncul peringatan anda tidak terhubung ...,sering muncul ingat tidak hubung jaring orbit ...,238
6,2023-09-21 06:15:56,nyesel banget beli orbit paketan nya malah tam...,1,Saprol Firgiawan,nyesel banget beli orbit paketan nya malah tam...,nyesal banget beli orbit paket nya malah tamba...,53
7,2023-09-21 01:53:33,"Sementara bintang 1 dlu ya, setelah apps diupd...",1,Arditya Purwa F,sementara bintang dlu ya setelah apps diupdate...,bintang dulu telah aplikasi diupdate baru seri...,173
8,2023-09-20 23:19:47,"Mahal banget, mending klo bagus ini mah boro2",1,Choerul Anwar,mahal banget mending klo bagus ini mah boro,mahal banget lebih baik kalau bagus mah boro,44
9,2023-09-20 16:11:34,"Aplikasi jelek, mau buka menu perangkat terhub...",1,benny panca,aplikasi jelek mau buka menu perangkat terhubu...,aplikasi jelek mau buka menu perangkat hubung ...,61


In [21]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from nltk.util import ngrams

def text_representation(df, text_column, method='one-hot', ngram_range=(1, 1)):
    """
    Apply various text representation techniques to a DataFrame.

    Parameters:
    - df: DataFrame with 'date' and 'text' columns.
    - text_column: Name of the text column.
    - method: Text representation method ('one-hot', 'bag-of-words', 'ngram', 'countvectorize', 'tfidf').
    - ngram_range: Tuple specifying the n-gram range (e.g., (1, 1) for unigrams, (1, 2) for unigrams and bigrams).

    Returns:
    - Transformed DataFrame.
    """

    if method == 'one-hot':
        mlb = MultiLabelBinarizer()
        words = [text.split() for text in df[text_column]]
        one_hot_encoded = mlb.fit_transform(words)
        one_hot_df = pd.DataFrame(one_hot_encoded, columns=mlb.classes_)
        return pd.concat([df, one_hot_df], axis=1)

    elif method == 'bag-of-words':
        vectorizer = CountVectorizer()
        bow = vectorizer.fit_transform(df[text_column])
        bow_df = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())
        return pd.concat([df, bow_df], axis=1)

    elif method == 'ngram':
        vectorizer = CountVectorizer(ngram_range=ngram_range)
        ngram_features = vectorizer.fit_transform(df[text_column])
        ngram_df = pd.DataFrame(ngram_features.toarray(), columns=vectorizer.get_feature_names_out())
        return pd.concat([df, ngram_df], axis=1)

    elif method == 'countvectorize':
        vectorizer = CountVectorizer()
        count_vectorized = vectorizer.fit_transform(df[text_column])
        countvectorize_df = pd.DataFrame(count_vectorized.toarray(), columns=vectorizer.get_feature_names_out())
        return pd.concat([df, countvectorize_df], axis=1)

    elif method == 'tfidf':
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform(df[text_column])
        tfidf_df = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names_out())
        return pd.concat([df, tfidf_df], axis=1)

    else:
        raise ValueError("Invalid method. Supported methods are 'one-hot', 'bag-of-words', 'ngram', 'countvectorize', and 'tfidf'.")

# Example usage:
# Assuming you have a DataFrame called 'df' with 'date' and 'text' columns
# transformed_df = text_representation(df, 'text', method='one-hot')


In [25]:
# Example usage:
# Assuming you have a DataFrame called 'df' with 'date' and 'text' columns
transformed_df = text_representation(df, 'contentp_clean', method='tfidf')

In [26]:
transformed_df

Unnamed: 0,at,content,score,userName,contentp,contentp_clean,text_length,abai,abang,account,...,wifii,wifinya,wilayah,woi,wror,xiaomi,yah,yaitu,yang,youtube
0,2023-09-22 04:23:14,Baik,5,Athar Rl,baik,baik,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-09-22 03:45:31,"Gak bisa log in bos, gimna nih",1,PRO GAMING,gak bisa log in bos gimna nih,bisa log in bos gimna nih,25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,2023-09-22 01:42:30,Sering kembali. Selalu minta login. Minta otp....,1,Iksan Fahrozi,sering kembali selalu minta login minta otp ha...,sering selalu minta login minta otp bongkar pa...,115,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,2023-09-21 18:23:05,Ini kenapa kalau mau priksa perangkat terhubun...,2,Kipuy Sama,ini kenapa kalau mau priksa perangkat terhubun...,kenapa kalau mau priksa perangkat hubung selal...,86,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,2023-09-21 11:52:14,Mengapa kouta bonus 25gb/bulan selama 6 bulan ...,2,Daniel Chainci,mengapa kouta bonus gb bulan selama bulan saya...,kouta bonus gb bulan lama bulan tidak masuk ba...,65,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,2023-09-05 16:34:16,"Gini ya, ini produk udh lelet mahal pulak woi ...",1,Qouqly,gini ya ini produk udh lelet mahal pulak woi k...,gin produk lambat mahal pulak woi kuota kau ma...,101,0.0,0.0,0.0,...,0.0,0.0,0.0,0.209662,0.0,0.0,0.0,0.0,0.0,0.0
196,2023-09-05 14:13:15,ini jaringan ngeleg kali padahal paket data ma...,1,Herni Juliet Butar Butar,ini jaringan ngeleg kali padahal paket data ma...,jaring lag kali padahal paket data banyak siny...,70,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
197,2023-09-05 14:08:56,kenapa kalo pencet perangkat terhubung selalu ...,1,Xicely yfavor,kenapa kalo pencet perangkat terhubung selalu ...,kalau pencet perangkat hubung selalu keluar se...,51,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
198,2023-09-05 14:01:56,"Paket datanya makin mahal , hanya pertama paka...",3,Ishak Gd,paket datanya makin mahal hanya pertama pakai ...,paket data makin mahal pertama pakai murah ria...,68,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Calculate the sparsity of the TF-IDF matrix
tfidf_df =  transformed_df.drop(['at'	,'content','score','userName','contentp','contentp_clean'], axis=1)
sparsity = 1.0 - (np.count_nonzero(tfidf_df) / tfidf_df.size)

print(f"Sparsity of the TF-IDF matrix: {sparsity:.4f}")

Sparsity of the TF-IDF matrix: 0.9819
