In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import silhouette_score

def main():
    # Load the dataset (replace 'file_path' with your file's path)
    file_path = '/content/drive/My Drive/UAS_PPW/tugas/Label_with_Features.xlsx'
    df = pd.read_excel(file_path)

    print("Pemodelan LDA")
    print("Ini adalah contoh aplikasi untuk Latent Dirichlet Allocation (LDA).")

    # Mengubah teks menjadi fitur numerik (misalnya, TF-IDF)
    lda_results = []

    for n in range(1, 51):
        lda = LatentDirichletAllocation(n_components=n, doc_topic_prior=0.2, topic_word_prior=0.1, random_state=42, max_iter=1)
        lda_top = lda.fit_transform(df)
        lda_results.append(lda_top)

    n_components = 50
    column_names = [f'Topik {i+1}' for i in range(n_components)]
    topik50 = pd.DataFrame(lda_top, columns=column_names)

    print("Menampilkan DataFrame 'topik50'")
    print(topik50)

    # Menambahkan kolom 'Label' pada DataFrame
    print("Menambahkan kolom 'Label' pada DataFrame")
    df_gabungan = pd.concat([topik50, df['Label']], axis=1)
    print(df_gabungan)

    # Menghapus baris dengan nilai NaN
    df_gabungan = df_gabungan.dropna()

    # # Menampilkan DataFrame 'df_gabungan' menggunakan st.dataframe pada Streamlit
    # print("Menampilkan DataFrame 'df_gabungan'")
    # print(df_gabungan)
    print("Klasifikasi KNN")
    X = df_gabungan.drop(columns=['Label']).values.astype('U') # Convert input data to string
    y = df_gabungan['Label'].values

    # Vectorize the text data
    tfidfvectorizer = TfidfVectorizer()
    tfidf_wm = tfidfvectorizer.fit_transform([str(x) for x in X])


    # Save the vectorizer for future use
    with open('tfidf.pkl', 'wb') as f:
        pickle.dump(tfidfvectorizer, f)

    # Reduce the dimensionality of the data using PCA
    pca = PCA(n_components=3)
    X_pca = pca.fit_transform(tfidf_wm.toarray())

    # Split the data into training and testing sets
    training, test, training_label, test_label = train_test_split(X_pca, y, test_size=0.2, random_state=10)

    # Train the KNN model
    modelKNN = KNeighborsClassifier(n_neighbors=3)
    modelKNN.fit(training, training_label)

    # Test the KNN model
    test_pred = modelKNN.predict(test)
    print(test_pred)
    accuracy = accuracy_score(test_label, test_pred)
    report = classification_report(test_label, test_pred, output_dict=True)
    # Print the accuracy and classification report
    print("KNN Model Evaluation")
    print("Accuracy:", accuracy)
    print("Classification Report:")
    print(report)

    print("K-Means Klastering")
    X_std = df_gabungan.values
    sklearn_pca = PCA(n_components=2)
    Y_sklearn = sklearn_pca.fit_transform(X_std)

    # Perform K-means clustering
    n_clusters = 2
    kmeans = KMeans(n_clusters=n_clusters, max_iter=400, algorithm='auto')
    fitted = kmeans.fit(Y_sklearn)
    prediction = kmeans.predict(Y_sklearn)

    # Define the elbow method function
    def elbow_method(Y_sklearn):
        number_clusters = range(1, 7)
        kmeans = [KMeans(n_clusters=i, max_iter=600) for i in number_clusters]
        score = [kmeans[i].fit(Y_sklearn).score(Y_sklearn) for i in range(len(kmeans))]
        score = [i * -1 for i in score]

        fig, ax = plt.subplots()
        ax.plot(number_clusters, score)
        ax.set(xlabel='Number of Clusters', ylabel='Score', title='Elbow Method')
        st.pyplot(fig)

    # Define the K-means clustering function
    def kmeans_clustering(Y_sklearn, fitted):
        plt.scatter(Y_sklearn[:, 0], Y_sklearn[:, 1], c=prediction, s=50, cmap='viridis')
        centers2 = fitted.cluster_centers_
        plt.scatter(centers2[:, 0], centers2[:, 1], c='black', s=300, alpha=0.6)
        st.pyplot()

    # Get top features for each cluster
    def get_top_features_cluster(X_std, prediction, n_feats):
        features = df.columns[:-1]  # Assuming the last column is the label
        labels = np.unique(prediction)
        dfs = []
        for label in labels:
            id_temp = np.where(prediction == label)
            x_means = np.mean(X_std[id_temp], axis=0)
            sorted_means = np.argsort(x_means)[::-1][:n_feats]
            best_features = [(features[i], x_means[i]) for i in sorted_means if i < len(features) and i < len(x_means)]
            Df = pd.DataFrame(best_features, columns=['features', 'score'])
            dfs.append(Df)
        return dfs
        # Elbow Method
        print('Elbow Method')
        elbow_method(Y_sklearn)

        # K-means Clustering
        print('K-means Clustering')
        kmeans_clustering(Y_sklearn, fitted)

        # Silhouette Score
        silhouette_avg = silhouette_score(Y_sklearn, prediction)
        print(f"Silhouette Score: {silhouette_avg}")

        # Melihat cluster fitur teratas yang diperoleh
        dfs = get_top_features_cluster(X_std, prediction, 20)
        plt.figure(figsize=(8, 6))
        sns.barplot(x='score', y='features', orient='h', data=dfs[0])
        plt.show()

if __name__ == "__main__":
    main()


Pemodelan LDA
Ini adalah contoh aplikasi untuk Latent Dirichlet Allocation (LDA).
Menampilkan DataFrame 'topik50'
      Topik 1   Topik 2   Topik 3   Topik 4   Topik 5   Topik 6   Topik 7  \
0    0.002171  0.002140  0.002123  0.002128  0.002122  0.002132  0.002157   
1    0.001710  0.001707  0.001699  0.001705  0.001704  0.916440  0.001709   
2    0.001649  0.001650  0.001644  0.001648  0.001650  0.919126  0.001653   
3    0.002536  0.002522  0.002521  0.002519  0.002527  0.002536  0.002533   
4    0.002041  0.002040  0.002037  0.002032  0.002050  0.002043  0.002037   
..        ...       ...       ...       ...       ...       ...       ...   
853  0.001482  0.001479  0.001472  0.001465  0.001473  0.001475  0.001480   
854  0.002122  0.002098  0.002077  0.002104  0.002082  0.002119  0.002101   
855  0.001177  0.001179  0.001170  0.001163  0.001173  0.001166  0.001168   
856  0.001413  0.001405  0.001395  0.001392  0.001400  0.001402  0.001393   
857  0.001928  0.001911  0.001904  0.00

